Re: [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Avi Kivity <avi@redhat.com>
To: oritw@il.ibm.com
Cc: kvm@vger.kernel.org, benami@il.ibm.com, abelg@il.ibm.com,
	muli@il.ibm.com, aliguori@us.ibm.com, mdday@us.ibm.com
Subject: Re: [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume
Date: Thu, 17 Dec 2009 12:10:50 +0200	[thread overview]
Message-ID: <4B2A03AA.5070202@redhat.com> (raw)
In-Reply-To: <1260470309-7166-7-git-send-email-oritw@il.ibm.com>

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman<oritw@il.ibm.com>
>
>
> @@ -287,7 +297,7 @@ static inline int vmcs_field_type(unsigned long field)
>   }
>
>   /*
> -  Returncs VMCS field size in bits
> +  Returns VMCS field size in bits
>   */
>    

Fold.

>   static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
>   {
> @@ -313,6 +323,10 @@ static inline int vmcs_field_size(int field_type, struct kvm_vcpu *vcpu)
>   	return 0;
>   }
>
> +#define NESTED_VM_EXIT_CONTROLS_MASK (~(VM_EXIT_LOAD_IA32_PAT | \
> +					VM_EXIT_SAVE_IA32_PAT))
> +#define NESTED_VM_ENTRY_CONTROLS_MASK (~(VM_ENTRY_LOAD_IA32_PAT | \
> +

I think a whitelist is better here, so if we add a new feature and 
forget it here, we don't introduce a vulnerability.

>
> +static inline int nested_cpu_has_vmx_tpr_shadow(struct  kvm_vcpu *vcpu)
> +{
> +	return cpu_has_vmx_tpr_shadow()&&
> +		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control&
> +		CPU_BASED_TPR_SHADOW;
> +}
>    

bools are better.

> +
> +static inline int nested_cpu_has_secondary_exec_ctrls(struct kvm_vcpu *vcpu)
> +{
> +	return cpu_has_secondary_exec_ctrls()&&
> +		get_shadow_vmcs(vcpu)->cpu_based_vm_exec_control&
> +		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +}
> +
> +static inline bool nested_vm_need_virtualize_apic_accesses(struct kvm_vcpu
> +							   *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control&
> +		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +}
>    

Need to check for secondary controls first.

> +
> +static inline int nested_cpu_has_vmx_ept(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->
> +		secondary_vm_exec_control&  SECONDARY_EXEC_ENABLE_EPT;
> +}
> +
> +static inline int nested_cpu_has_vmx_vpid(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->secondary_vm_exec_control&
> +		SECONDARY_EXEC_ENABLE_VPID;
> +}
>    

A helper nested_check_2ndary_control() can help reduce duplication.

> +
> +static inline int nested_cpu_has_vmx_pat(struct kvm_vcpu *vcpu)
> +{
> +	return get_shadow_vmcs(vcpu)->vm_entry_controls&
> +		VM_ENTRY_LOAD_IA32_PAT;
> +}
>    

Suggest dropping PAT support for now (it's optional in the spec IIRC, 
and doesn't help much).

>
> +
> +void prepare_vmcs_12(struct kvm_vcpu *vcpu)
> +{
>    

Not sure what this does.  From the name, it appears to prepare a vmcs.  
 From the contents, it appears to read the vmcs and save it into a 
shadow vmcs.

> +	struct shadow_vmcs *l2_shadow_vmcs =
> +		get_shadow_vmcs(vcpu);
> +
> +	l2_shadow_vmcs->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	l2_shadow_vmcs->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	l2_shadow_vmcs->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	l2_shadow_vmcs->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	l2_shadow_vmcs->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	l2_shadow_vmcs->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	l2_shadow_vmcs->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	l2_shadow_vmcs->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +
> +	l2_shadow_vmcs->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	l2_shadow_vmcs->guest_physical_address =
> +		vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	l2_shadow_vmcs->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	l2_shadow_vmcs->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl&  VM_ENTRY_LOAD_IA32_PAT)
> +		l2_shadow_vmcs->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	l2_shadow_vmcs->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	l2_shadow_vmcs->vm_entry_intr_info_field =
> +		vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	l2_shadow_vmcs->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	l2_shadow_vmcs->vm_entry_instruction_len =
> +		vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vm_instruction_error =
> +		vmcs_read32(VM_INSTRUCTION_ERROR);
> +	l2_shadow_vmcs->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	l2_shadow_vmcs->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	l2_shadow_vmcs->vm_exit_intr_error_code =
> +		vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	l2_shadow_vmcs->idt_vectoring_info_field =
> +		vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	l2_shadow_vmcs->idt_vectoring_error_code =
> +		vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	l2_shadow_vmcs->vm_exit_instruction_len =
> +		vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	l2_shadow_vmcs->vmx_instruction_info =
> +		vmcs_read32(VMX_INSTRUCTION_INFO);
> +	l2_shadow_vmcs->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	l2_shadow_vmcs->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	l2_shadow_vmcs->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	l2_shadow_vmcs->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	l2_shadow_vmcs->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	l2_shadow_vmcs->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	l2_shadow_vmcs->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	l2_shadow_vmcs->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	l2_shadow_vmcs->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	l2_shadow_vmcs->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	l2_shadow_vmcs->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	l2_shadow_vmcs->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	l2_shadow_vmcs->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	l2_shadow_vmcs->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	l2_shadow_vmcs->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	l2_shadow_vmcs->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	l2_shadow_vmcs->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	l2_shadow_vmcs->guest_activity_state =
> +		vmcs_read32(GUEST_ACTIVITY_STATE);
> +	l2_shadow_vmcs->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +
> +	l2_shadow_vmcs->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	l2_shadow_vmcs->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
>    

If EXIT_QUALIFICATION is a physical address, don't you need to translate 
it?  vmx will store a host physical address and we need a guest physical 
address.

> +	l2_shadow_vmcs->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
>    

Not all processors support this.

> +
> +	if (l2_shadow_vmcs->cr0_guest_host_mask&  X86_CR0_TS)
> +		l2_shadow_vmcs->guest_cr0 =  vmcs_readl(GUEST_CR0);
> +	else /* if CR0_GUEST_HOST_MASK[TS]=0 l1 should think TS was really written to CR0 */
> +		l2_shadow_vmcs->guest_cr0 =
> +			(vmcs_readl(GUEST_CR0)&~X86_CR0_TS) | (vmcs_readl(CR0_READ_SHADOW)&  X86_CR0_TS);
> +
> +	l2_shadow_vmcs->guest_cr4 = vmcs_readl(GUEST_CR4);
>    

GUEST_CR4 will be different from the guest's view of it (for example, 
without EPT CR4.PAE will always be set.

We also use CR4_GUEST_HOST_MASK, I think you need to account for that.

> +	l2_shadow_vmcs->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	l2_shadow_vmcs->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	l2_shadow_vmcs->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	l2_shadow_vmcs->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	l2_shadow_vmcs->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	l2_shadow_vmcs->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	l2_shadow_vmcs->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	l2_shadow_vmcs->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	l2_shadow_vmcs->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	l2_shadow_vmcs->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	l2_shadow_vmcs->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	l2_shadow_vmcs->guest_rsp = vmcs_readl(GUEST_RSP);
> +	l2_shadow_vmcs->guest_rip = vmcs_readl(GUEST_RIP);
> +	l2_shadow_vmcs->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	l2_shadow_vmcs->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	l2_shadow_vmcs->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	l2_shadow_vmcs->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +}
> +
> +int load_vmcs_common(struct shadow_vmcs *src)
> +{
> +	vmcs_write16(GUEST_ES_SELECTOR, src->guest_es_selector);
> +	vmcs_write16(GUEST_CS_SELECTOR, src->guest_cs_selector);
> +	vmcs_write16(GUEST_SS_SELECTOR, src->guest_ss_selector);
> +	vmcs_write16(GUEST_DS_SELECTOR, src->guest_ds_selector);
> +	vmcs_write16(GUEST_FS_SELECTOR, src->guest_fs_selector);
> +	vmcs_write16(GUEST_GS_SELECTOR, src->guest_gs_selector);
> +	vmcs_write16(GUEST_LDTR_SELECTOR, src->guest_ldtr_selector);
> +	vmcs_write16(GUEST_TR_SELECTOR, src->guest_tr_selector);
> +
> +	vmcs_write64(GUEST_IA32_DEBUGCTL, src->guest_ia32_debugctl);
> +
> +	if (vmcs_config.vmentry_ctrl&  VM_ENTRY_LOAD_IA32_PAT)
> +		vmcs_write64(GUEST_IA32_PAT, src->guest_ia32_pat);
>    

This is dangerous, the guest can cause inconsistent page types and 
processor lockups.

> +
> +	if (src->vm_entry_msr_load_count<  512)
> +		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, src->vm_entry_msr_load_count);
>    

Ditto.  MSRs have to be validated.  This feature is optional, right?  
Suggest we don't support it for now.

> +	vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
> +		    src->guest_pending_dbg_exceptions);
>    

I think this is a new field.

>   static struct level_state *create_state(void)
>   {
>   	struct level_state *state = NULL;
> @@ -2301,8 +2578,6 @@ static void free_l1_state(struct kvm_vcpu *vcpu)
>   		kfree(list_item);
>   	}
>   }
> -
> -
>    

?

> @@ -3574,6 +3849,10 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
>
> +	if (vmx->nested.nested_mode) {
> +		return;
> +	}
> +
>    

No braces on single-line if statements in kernel code.

>   	if (!cpu_has_virtual_nmis()) {
>   		/*
>   		 * Tracking the NMI-blocked state in software is built upon
> @@ -3759,7 +4038,12 @@ static int handle_exception(struct kvm_vcpu *vcpu)
>   		return 1;  /* already handled by vmx_vcpu_run() */
>
>   	if (is_no_device(intr_info)) {
> +		/* if l0 handled an fpu operation for l2 it's because l1 is
> +		   not interested (exception bitmap 12 does not include NM_VECTOR)
> +		   enable fpu and resume l2 (avoid switching to l1)
> +		*/
>    

Use standard comment style please.

>   		vmx_fpu_activate(vcpu);
> +
>   		return 1;
>   	}
>
>
>
> @@ -4504,7 +4793,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
>   static int handle_vmptrld(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> -	u64 guest_vmcs_addr;
> +	gpa_t guest_vmcs_addr;
>    

Fold.

> @@ -4895,7 +5184,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>   			= vmcs_read32(VM_INSTRUCTION_ERROR);
>   		return 0;
>   	}
> -
>    

?

>   	if ((vectoring_info&  VECTORING_INFO_VALID_MASK)&&
>   			(exit_reason != EXIT_REASON_EXCEPTION_NMI&&
>   			exit_reason != EXIT_REASON_EPT_VIOLATION&&
> @@ -4903,8 +5191,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
>   		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
>   		       "(0x%x) and exit reason is 0x%x\n",
>   		       __func__, vectoring_info, exit_reason);
> -
> -	if (unlikely(!cpu_has_virtual_nmis()&&  vmx->soft_vnmi_blocked)) {
> +	if (!vmx->nested.nested_mode&&  unlikely(!cpu_has_virtual_nmis()&&  vmx->soft_vnmi_blocked)) {
>    

Why is this different for nested mode?  At least, you have to check if 
the guest is intercepting nmis.

>   		if (vmx_interrupt_allowed(vcpu)) {
>   			vmx->soft_vnmi_blocked = 0;
>   		} else if (vmx->vnmi_blocked_time>  1000000000LL&&
> @@ -4951,10 +5238,13 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>   	int type;
>   	bool idtv_info_valid;
>
> -	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> -
>   	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
>
> +	if (vmx->nested.nested_mode)
> +		return;
>    

Again, I think you have to check if the guest is intercepting interrupts.

> +
> +	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +
>   	/* Handle machine checks before interrupts are enabled */
>   	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
>   	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
> @@ -5068,6 +5358,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
>   static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u32 nested_exception_bitmap = 0;
>
>   	/* Record the guest's net vcpu time for enforced NMI injections. */
>   	if (unlikely(!cpu_has_virtual_nmis()&&  vmx->soft_vnmi_blocked))
> @@ -5099,6 +5390,37 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   	if (vcpu->arch.switch_db_regs)
>   		set_debugreg(vcpu->arch.dr6, 6);
>
> +	if (vcpu->fpu_active) {
> +		if (vmcs_readl(CR0_READ_SHADOW)&  X86_CR0_TS)
> +			vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
> +		else
> +			vmcs_clear_bits(GUEST_CR0, X86_CR0_TS);
> +
> +		if (vmx->nested.nested_mode) {
> +			if (!nested_map_current(vcpu)) {
> +				vmx->fail = 1;
> +				return;
> +			}
> +
> +			nested_exception_bitmap =  get_shadow_vmcs(vcpu)->
> +				exception_bitmap;
> +
> +			nested_unmap_current(vcpu);
> +		}
> +
> +		if (vmx->nested.nested_mode&&
> +		    (nested_exception_bitmap&  (1u<<  NM_VECTOR)))
> +			vmcs_write32(EXCEPTION_BITMAP,
> +				     vmcs_read32(EXCEPTION_BITMAP) |  (1u<<  NM_VECTOR));
> +		else
> +			vmcs_write32(EXCEPTION_BITMAP,
> +				     vmcs_read32(EXCEPTION_BITMAP)&   ~(1u<<  NM_VECTOR));
>    

I'd like to see generalized handling of the exception bitmap in 
update_exception_bitmap(), not something ad-hoc.

> +	} else {
> +		vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
> +		vmcs_write32(EXCEPTION_BITMAP,
> +			     vmcs_read32(EXCEPTION_BITMAP) |  (1u<<  NM_VECTOR));
> +	}
>    

This looks confused wrt the previous patch.

>
> +void save_vmcs(struct shadow_vmcs *dst)
> +{
> +	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	if (cpu_has_vmx_msr_bitmap())
> +		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
>    

Instead of reading and writing the host parts, which don't change, I'd 
like to see the vmcs host initialization factored out and reused.

> +
> +	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> +	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> +	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
>    

We don't use those.

> +}
> +
> +int prepare_vmcs_02(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct shadow_vmcs *src , *l1_shadow_vmcs = vmx->nested.l1_shadow_vmcs;
> +	struct level_state *l2_state;
> +	u32 exec_control;
> +
> +	src = get_shadow_vmcs(vcpu);
> +	if (!src) {
> +		nested_unmap_current(vcpu);
> +		printk(KERN_INFO "%s: Error no shadow vmcs\n", __func__);
> +		return 1;
> +	}
> +
> +	load_vmcs_common(src);
> +
> +	l2_state =&(vmx->nested.current_l2_page->l2_state);
> +
> +	if (l2_state->first_launch) {
> +
> +		vmcs_write64(VMCS_LINK_POINTER, src->vmcs_link_pointer);
>    

Looks wrong.  The address needs translation at least.  Not sure what it 
does?

> +
> +		if (l2_state->io_bitmap_a)
> +			vmcs_write64(IO_BITMAP_A, l2_state->io_bitmap_a);
> +
> +		if (l2_state->io_bitmap_b)
> +			vmcs_write64(IO_BITMAP_B, l2_state->io_bitmap_b);
>    

Address translation?  Also, need to mask with the host's I/O bitmap.

> +
> +		if (l2_state->msr_bitmap)
> +			vmcs_write64(MSR_BITMAP, l2_state->msr_bitmap);
>    

Need to mask with the host's msr bitmap.

> +
> +		if (src->vm_entry_msr_load_count>  0) {
> +			struct page *page;
> +
> +			page = nested_get_page(vcpu,
> +					       src->vm_entry_msr_load_addr);
> +			if (!page)
> +				return 1;
> +
> +			vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, page_to_phys(page));
> +
> +			kvm_release_page_clean(page);
>    

I don't see how we can trust the guest's msr autoload.

> +
> +		vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
> +			     (l1_shadow_vmcs->page_fault_error_code_mask&
> +			      src->page_fault_error_code_mask));
> +
> +		vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
> +			     (l1_shadow_vmcs->page_fault_error_code_match&
> +			      src->page_fault_error_code_match));
> +
>    

I don't think this is right.  If both masks are 1 but host match is 1 
and guest match is 0, we will trap only on the guest's idea of PFEC 
matching.

Also, if the guest has bit 14 clear in EXCEPTION_BITMAP, this needs to 
be done completely differently.

I didn't see where PFEC matchin is handled in the exception handler?

> +		if (cpu_has_secondary_exec_ctrls()) {
> +
> +			exec_control =
> +				l1_shadow_vmcs->secondary_vm_exec_control;
> +
> +			if (nested_cpu_has_secondary_exec_ctrls(vcpu)) {
> +
> +				exec_control |= src->secondary_vm_exec_control;
> +
> +				if (!vm_need_virtualize_apic_accesses(vcpu->kvm) ||
> +				    !nested_vm_need_virtualize_apic_accesses(vcpu))
> +					exec_control&=
> +						~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
> +			}
> +
> +			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
> +		}
> +
> +		load_vmcs_host_state(l1_shadow_vmcs);
> +
> +		l2_state->first_launch = false;
> +	}
>    

How are all those calculations redone if the guest changes one of those 
controls?

> +
> +	if (vm_need_tpr_shadow(vcpu->kvm)&&
> +	    nested_cpu_has_vmx_tpr_shadow(vcpu))
> +		vmcs_write32(TPR_THRESHOLD, src->tpr_threshold);
>    

If the guest doesn't trap interrupts, we need to stay with the host tpr 
threshold.

> +
> +	if (enable_ept) {
> +		if (!nested_cpu_has_vmx_ept(vcpu)) {
> +			vmcs_write64(EPT_POINTER,
> +				     l1_shadow_vmcs->ept_pointer);
> +			vmcs_write64(GUEST_PDPTR0,
> +				     l1_shadow_vmcs->guest_pdptr0);
> +			vmcs_write64(GUEST_PDPTR1,
> +				     l1_shadow_vmcs->guest_pdptr1);
> +			vmcs_write64(GUEST_PDPTR2,
> +				     l1_shadow_vmcs->guest_pdptr2);
> +			vmcs_write64(GUEST_PDPTR3,
> +				     l1_shadow_vmcs->guest_pdptr3);
> +		}
> +	}
>    

This version doesn't support ept, so please drop.

> +
> +	exec_control = l1_shadow_vmcs->cpu_based_vm_exec_control;
> +
> +	exec_control&= ~CPU_BASED_VIRTUAL_INTR_PENDING;
> +
> +	exec_control&= ~CPU_BASED_VIRTUAL_NMI_PENDING;
> +
> +	exec_control&= ~CPU_BASED_TPR_SHADOW;
> +
> +	exec_control |= src->cpu_based_vm_exec_control;
>    

Need to whitelist good bits.

> +void sync_cached_regs_to_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long mask;
> +
> +	if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
> +		vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
> +	if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
> +		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
> +
> +	mask = ~((1<<  VCPU_REGS_RSP) | (1<<  VCPU_REGS_RIP));
> +
> +	if (vcpu->arch.regs_dirty&  mask) {
> +		printk(KERN_INFO "WARNING: dirty cached registers regs_dirty 0x%x mask 0x%lx\n",
> +		       vcpu->arch.regs_dirty, mask);
> +		WARN_ON(1);
> +	}
> +
> +	vcpu->arch.regs_dirty = 0;
> +}
>    

Split into a separate patch (and use in existing code which already does 
this).


-- 
error compiling committee.c: too many arguments to function

next prev parent reply	other threads:[~2009-12-17 10:11 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-12-10 18:38 Nested VMX support v4 oritw
2009-12-10 18:38 ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff oritw
2009-12-10 18:38   ` [PATCH 2/7] Nested VMX patch 2 implements vmclear oritw
2009-12-10 18:38     ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst oritw
2009-12-10 18:38       ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite oritw
2009-12-10 18:38         ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling oritw
2009-12-10 18:38           ` [PATCH 6/7] Nested VMX patch 6 implements vmlaunch and vmresume oritw
2009-12-10 18:38             ` [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits oritw
2009-12-17 13:46               ` Avi Kivity
2009-12-17 10:10             ` Avi Kivity [this message]
2009-12-17  9:10           ` [PATCH 5/7] Nested VMX patch 5 Simplify fpu handling Avi Kivity
2009-12-16 14:44         ` [PATCH 4/7] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
2009-12-16 14:32       ` [PATCH 3/7] Nested VMX patch 3 implements vmptrld and vmptrst Avi Kivity
2009-12-16 13:59     ` [PATCH 2/7] Nested VMX patch 2 implements vmclear Avi Kivity
2009-12-28 14:57     ` Gleb Natapov
2009-12-16 13:34   ` [PATCH 1/7] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
2009-12-20 14:20   ` Gleb Natapov
2009-12-20 14:23     ` Avi Kivity
2009-12-20 14:25       ` Gleb Natapov
2009-12-20 17:08     ` Andi Kleen
2009-12-20 19:04       ` Avi Kivity
2009-12-21 15:52         ` Muli Ben-Yehuda
2009-12-21 16:00           ` Avi Kivity
2009-12-17 13:49 ` Nested VMX support v4 Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4B2A03AA.5070202@redhat.com \
    --to=avi@redhat.com \
    --cc=abelg@il.ibm.com \
    --cc=aliguori@us.ibm.com \
    --cc=benami@il.ibm.com \
    --cc=kvm@vger.kernel.org \
    --cc=mdday@us.ibm.com \
    --cc=muli@il.ibm.com \
    --cc=oritw@il.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.