From mboxrd@z Thu Jan 1 00:00:00 1970 From: Avi Kivity Subject: Re: [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume Date: Thu, 17 Dec 2009 12:10:50 +0200 Message-ID: <4B2A03AA.5070202@redhat.com> References: <1260470309-7166-1-git-send-email-oritw@il.ibm.com> <1260470309-7166-2-git-send-email-oritw@il.ibm.com> <1260470309-7166-3-git-send-email-oritw@il.ibm.com> <1260470309-7166-4-git-send-email-oritw@il.ibm.com> <1260470309-7166-5-git-send-email-oritw@il.ibm.com> <1260470309-7166-6-git-send-email-oritw@il.ibm.com> <1260470309-7166-7-git-send-email-oritw@il.ibm.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: kvm@vger.kernel.org, benami@il.ibm.com, abelg@il.ibm.com, muli@il.ibm.com, aliguori@us.ibm.com, mdday@us.ibm.com To: oritw@il.ibm.com Return-path: Received: from mx1.redhat.com ([209.132.183.28]:40083 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1761380AbZLQKLI (ORCPT ); Thu, 17 Dec 2009 05:11:08 -0500 In-Reply-To: <1260470309-7166-7-git-send-email-oritw@il.ibm.com> Sender: kvm-owner@vger.kernel.org List-ID: On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote: > From: Orit Wasserman > > > @@ -287,7 +297,7 @@ static inline int vmcs_field_type(unsigned long field) > } > > /* > - Returncs VMCS field size in bits > + Returns VMCS field size in bits > */ > Fold. > static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu) > { > @@ -313,6 +323,10 @@ static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu) > return 0; > } > > +#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \ > + VM_EXIT_SAVE_IA32_PAT)) > +#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \ > + I think a whitelist is better here, so if we add a new feature and forget it here, we don't introduce a vulnerability. > > +static inline int nested_cpu_has_vmx_tpr_shadow(struct kvm_vcpu *vcpu) > +{ > + return cpu_has_vmx_tpr_shadow()&& > + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control& > + CPU_BASED_TPR_SHADOW; > +} > bools are better. > + > +static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu) > +{ > + return cpu_has_secondary_exec_ctrls()&& > + get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control& > + CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; > +} > + > +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu > + *vcpu) > +{ > + return get_shadow_vmcs(vcpu)->secondary_vm_exec_control& > + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; > +} > Need to check for secondary controls first. > + > +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu) > +{ > + return get_shadow_vmcs(vcpu)-> > + secondary_vm_exec_control& SECONDARY_EXEC_ENABLE_EPT; > +} > + > +static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu) > +{ > + return get_shadow_vmcs(vcpu)->secondary_vm_exec_control& > + SECONDARY_EXEC_ENABLE_VPID; > +} > A helper nested_check_2ndary_control() can help reduce duplication. > + > +static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu) > +{ > + return get_shadow_vmcs(vcpu)->vm_entry_controls& > + VM_ENTRY_LOAD_IA32_PAT; > +} > Suggest dropping PAT support for now (it's optional in the spec IIRC, and doesn't help much). > > + > +void prepare_vmcs_12(struct kvm_vcpu *vcpu) > +{ > Not sure what this does. From the name, it appears to prepare a vmcs. From the contents, it appears to read the vmcs and save it into a shadow vmcs. > + struct shadow_vmcs *l2_shadow_vmcs = > + get_shadow_vmcs(vcpu); > + > + l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); > + l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); > + l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); > + l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); > + l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); > + l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); > + l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); > + l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); > + > + l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET); > + l2_shadow_vmcs->guest_physical_address = > + vmcs_read64(GUEST_PHYSICAL_ADDRESS); > + l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER); > + l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); > + if (vmcs_config.vmentry_ctrl& VM_ENTRY_LOAD_IA32_PAT) > + l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); > + l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT); > + l2_shadow_vmcs->vm_entry_intr_info_field = > + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); > + l2_shadow_vmcs->vm_entry_exception_error_code = > + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE); > + l2_shadow_vmcs->vm_entry_instruction_len = > + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN); > + l2_shadow_vmcs->vm_instruction_error = > + vmcs_read32(VM_INSTRUCTION_ERROR); > + l2_shadow_vmcs->vm_exit_reason = vmcs_read32(VM_EXIT_REASON); > + l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); > + l2_shadow_vmcs->vm_exit_intr_error_code = > + vmcs_read32(VM_EXIT_INTR_ERROR_CODE); > + l2_shadow_vmcs->idt_vectoring_info_field = > + vmcs_read32(IDT_VECTORING_INFO_FIELD); > + l2_shadow_vmcs->idt_vectoring_error_code = > + vmcs_read32(IDT_VECTORING_ERROR_CODE); > + l2_shadow_vmcs->vm_exit_instruction_len = > + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); > + l2_shadow_vmcs->vmx_instruction_info = > + vmcs_read32(VMX_INSTRUCTION_INFO); > + l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); > + l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); > + l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); > + l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); > + l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); > + l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); > + l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); > + l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); > + l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); > + l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); > + l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); > + l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); > + l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); > + l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); > + l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); > + l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES); > + l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); > + l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); > + l2_shadow_vmcs->guest_interruptibility_info = > + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); > + l2_shadow_vmcs->guest_activity_state = > + vmcs_read32(GUEST_ACTIVITY_STATE); > + l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS); > + > + l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW); > + l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); > If EXIT_QUALIFICATION is a physical address, don't you need to translate it? vmx will store a host physical address and we need a guest physical address. > + l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); > Not all processors support this. > + > + if (l2_shadow_vmcs->cr0_guest_host_mask& X86_CR0_TS) > + l2_shadow_vmcs->guest_cr0 = vmcs_readl(GUEST_CR0); > + else /* if CR0_GUEST_HOST_MASK[TS]=0 l1 should think TS was really written to CR0 */ > + l2_shadow_vmcs->guest_cr0 = > + (vmcs_readl(GUEST_CR0)&~X86_CR0_TS) | (vmcs_readl(CR0_READ_SHADOW)& X86_CR0_TS); > + > + l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4); > GUEST_CR4 will be different from the guest's view of it (for example, without EPT CR4.PAE will always be set. We also use CR4_GUEST_HOST_MASK, I think you need to account for that. > + l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE); > + l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE); > + l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE); > + l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE); > + l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE); > + l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE); > + l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); > + l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE); > + l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); > + l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); > + l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7); > + l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP); > + l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP); > + l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS); > + l2_shadow_vmcs->guest_pending_dbg_exceptions = > + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); > + l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP); > + l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP); > +} > + > +int load_vmcs_common(struct shadow_vmcs *src) > +{ > + vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector); > + vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector); > + vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector); > + vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector); > + vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector); > + vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector); > + vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector); > + vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector); > + > + vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl); > + > + if (vmcs_config.vmentry_ctrl& VM_ENTRY_LOAD_IA32_PAT) > + vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat); > This is dangerous, the guest can cause inconsistent page types and processor lockups. > + > + if (src->vm_entry_msr_load_count< 512) > + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count); > Ditto. MSRs have to be validated. This feature is optional, right? Suggest we don't support it for now. > + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, > + src->guest_pending_dbg_exceptions); > I think this is a new field. > static struct level_state *create_state(void) > { > struct level_state *state = NULL; > @@ -2301,8 +2578,6 @@ static void free_l1_state(struct kvm_vcpu *vcpu) > kfree(list_item); > } > } > - > - > ? > @@ -3574,6 +3849,10 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) > { > struct vcpu_vmx *vmx = to_vmx(vcpu); > > + if (vmx->nested.nested_mode) { > + return; > + } > + > No braces on single-line if statements in kernel code. > if (!cpu_has_virtual_nmis()) { > /* > * Tracking the NMI-blocked state in software is built upon > @@ -3759,7 +4038,12 @@ static int handle_exception(struct kvm_vcpu *vcpu) > return 1; /* already handled by vmx_vcpu_run() */ > > if (is_no_device(intr_info)) { > + /* if l0 handled an fpu operation for l2 it's because l1 is > + not interested (exception bitmap 12 does not include NM_VECTOR) > + enable fpu and resume l2 (avoid switching to l1) > + */ > Use standard comment style please. > vmx_fpu_activate(vcpu); > + > return 1; > } > > > > @@ -4504,7 +4793,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) > static int handle_vmptrld(struct kvm_vcpu *vcpu) > { > struct vcpu_vmx *vmx = to_vmx(vcpu); > - u64 guest_vmcs_addr; > + gpa_t guest_vmcs_addr; > Fold. > @@ -4895,7 +5184,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) > = vmcs_read32(VM_INSTRUCTION_ERROR); > return 0; > } > - > ? > if ((vectoring_info& VECTORING_INFO_VALID_MASK)&& > (exit_reason != EXIT_REASON_EXCEPTION_NMI&& > exit_reason != EXIT_REASON_EPT_VIOLATION&& > @@ -4903,8 +5191,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) > printk(KERN_WARNING "%s: unexpected, valid vectoring info " > "(0x%x) and exit reason is 0x%x\n", > __func__, vectoring_info, exit_reason); > - > - if (unlikely(!cpu_has_virtual_nmis()&& vmx->soft_vnmi_blocked)) { > + if (!vmx->nested.nested_mode&& unlikely(!cpu_has_virtual_nmis()&& vmx->soft_vnmi_blocked)) { > Why is this different for nested mode? At least, you have to check if the guest is intercepting nmis. > if (vmx_interrupt_allowed(vcpu)) { > vmx->soft_vnmi_blocked = 0; > } else if (vmx->vnmi_blocked_time> 1000000000LL&& > @@ -4951,10 +5238,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) > int type; > bool idtv_info_valid; > > - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); > - > vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); > > + if (vmx->nested.nested_mode) > + return; > Again, I think you have to check if the guest is intercepting interrupts. > + > + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); > + > /* Handle machine checks before interrupts are enabled */ > if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) > || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI > @@ -5068,6 +5358,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) > static void vmx_vcpu_run(struct kvm_vcpu *vcpu) > { > struct vcpu_vmx *vmx = to_vmx(vcpu); > + u32 nested_exception_bitmap = 0; > > /* Record the guest's net vcpu time for enforced NMI injections. */ > if (unlikely(!cpu_has_virtual_nmis()&& vmx->soft_vnmi_blocked)) > @@ -5099,6 +5390,37 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) > if (vcpu->arch.switch_db_regs) > set_debugreg(vcpu->arch.dr6, 6); > > + if (vcpu->fpu_active) { > + if (vmcs_readl(CR0_READ_SHADOW)& X86_CR0_TS) > + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); > + else > + vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); > + > + if (vmx->nested.nested_mode) { > + if (!nested_map_current(vcpu)) { > + vmx->fail = 1; > + return; > + } > + > + nested_exception_bitmap = get_shadow_vmcs(vcpu)-> > + exception_bitmap; > + > + nested_unmap_current(vcpu); > + } > + > + if (vmx->nested.nested_mode&& > + (nested_exception_bitmap& (1u<< NM_VECTOR))) > + vmcs_write32(EXCEPTION_BITMAP, > + vmcs_read32(EXCEPTION_BITMAP) | (1u<< NM_VECTOR)); > + else > + vmcs_write32(EXCEPTION_BITMAP, > + vmcs_read32(EXCEPTION_BITMAP)& ~(1u<< NM_VECTOR)); > I'd like to see generalized handling of the exception bitmap in update_exception_bitmap(), not something ad-hoc. > + } else { > + vmcs_set_bits(GUEST_CR0, X86_CR0_TS); > + vmcs_write32(EXCEPTION_BITMAP, > + vmcs_read32(EXCEPTION_BITMAP) | (1u<< NM_VECTOR)); > + } > This looks confused wrt the previous patch. > > +void save_vmcs(struct shadow_vmcs *dst) > +{ > + dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); > + dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); > + dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); > + dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); > + dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); > + dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); > + dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); > + dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); > + dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR); > + dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR); > + dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR); > + dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR); > + dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR); > + dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR); > + dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR); > + dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A); > + dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B); > + if (cpu_has_vmx_msr_bitmap()) > + dst->msr_bitmap = vmcs_read64(MSR_BITMAP); > Instead of reading and writing the host parts, which don't change, I'd like to see the vmcs host initialization factored out and reused. > + > + dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR); > + dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR); > + dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR); > We don't use those. > +} > + > +int prepare_vmcs_02(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + struct shadow_vmcs *src , *l1_shadow_vmcs = vmx->nested.l1_shadow_vmcs; > + struct level_state *l2_state; > + u32 exec_control; > + > + src = get_shadow_vmcs(vcpu); > + if (!src) { > + nested_unmap_current(vcpu); > + printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__); > + return 1; > + } > + > + load_vmcs_common(src); > + > + l2_state =&(vmx->nested.current_l2_page->l2_state); > + > + if (l2_state->first_launch) { > + > + vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer); > Looks wrong. The address needs translation at least. Not sure what it does? > + > + if (l2_state->io_bitmap_a) > + vmcs_write64(IO_BITMAP_A, l2_state->io_bitmap_a); > + > + if (l2_state->io_bitmap_b) > + vmcs_write64(IO_BITMAP_B, l2_state->io_bitmap_b); > Address translation? Also, need to mask with the host's I/O bitmap. > + > + if (l2_state->msr_bitmap) > + vmcs_write64(MSR_BITMAP, l2_state->msr_bitmap); > Need to mask with the host's msr bitmap. > + > + if (src->vm_entry_msr_load_count> 0) { > + struct page *page; > + > + page = nested_get_page(vcpu, > + src->vm_entry_msr_load_addr); > + if (!page) > + return 1; > + > + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page)); > + > + kvm_release_page_clean(page); > I don't see how we can trust the guest's msr autoload. > + > + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, > + (l1_shadow_vmcs->page_fault_error_code_mask& > + src->page_fault_error_code_mask)); > + > + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, > + (l1_shadow_vmcs->page_fault_error_code_match& > + src->page_fault_error_code_match)); > + > I don't think this is right. If both masks are 1 but host match is 1 and guest match is 0, we will trap only on the guest's idea of PFEC matching. Also, if the guest has bit 14 clear in EXCEPTION_BITMAP, this needs to be done completely differently. I didn't see where PFEC matchin is handled in the exception handler? > + if (cpu_has_secondary_exec_ctrls()) { > + > + exec_control = > + l1_shadow_vmcs->secondary_vm_exec_control; > + > + if (nested_cpu_has_secondary_exec_ctrls(vcpu)) { > + > + exec_control |= src->secondary_vm_exec_control; > + > + if (!vm_need_virtualize_apic_accesses(vcpu->kvm) || > + !nested_vm_need_virtualize_apic_accesses(vcpu)) > + exec_control&= > + ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; > + } > + > + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); > + } > + > + load_vmcs_host_state(l1_shadow_vmcs); > + > + l2_state->first_launch = false; > + } > How are all those calculations redone if the guest changes one of those controls? > + > + if (vm_need_tpr_shadow(vcpu->kvm)&& > + nested_cpu_has_vmx_tpr_shadow(vcpu)) > + vmcs_write32(TPR_THRESHOLD, src->tpr_threshold); > If the guest doesn't trap interrupts, we need to stay with the host tpr threshold. > + > + if (enable_ept) { > + if (!nested_cpu_has_vmx_ept(vcpu)) { > + vmcs_write64(EPT_POINTER, > + l1_shadow_vmcs->ept_pointer); > + vmcs_write64(GUEST_PDPTR0, > + l1_shadow_vmcs->guest_pdptr0); > + vmcs_write64(GUEST_PDPTR1, > + l1_shadow_vmcs->guest_pdptr1); > + vmcs_write64(GUEST_PDPTR2, > + l1_shadow_vmcs->guest_pdptr2); > + vmcs_write64(GUEST_PDPTR3, > + l1_shadow_vmcs->guest_pdptr3); > + } > + } > This version doesn't support ept, so please drop. > + > + exec_control = l1_shadow_vmcs->cpu_based_vm_exec_control; > + > + exec_control&= ~CPU_BASED_VIRTUAL_INTR_PENDING; > + > + exec_control&= ~CPU_BASED_VIRTUAL_NMI_PENDING; > + > + exec_control&= ~CPU_BASED_TPR_SHADOW; > + > + exec_control |= src->cpu_based_vm_exec_control; > Need to whitelist good bits. > +void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu) > +{ > + unsigned long mask; > + > + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) > + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); > + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) > + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); > + > + mask = ~((1<< VCPU_REGS_RSP) | (1<< VCPU_REGS_RIP)); > + > + if (vcpu->arch.regs_dirty& mask) { > + printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n", > + vcpu->arch.regs_dirty, mask); > + WARN_ON(1); > + } > + > + vcpu->arch.regs_dirty = 0; > +} > Split into a separate patch (and use in existing code which already does this). -- error compiling committee.c: too many arguments to function