From mboxrd@z Thu Jan  1 00:00:00 1970
From: Avi Kivity <avi@redhat.com>
Subject: Re: [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits
Date: Thu, 17 Dec 2009 15:46:44 +0200
Message-ID: <4B2A3644.8020806@redhat.com>
References: <1260470309-7166-1-git-send-email-oritw@il.ibm.com> <1260470309-7166-2-git-send-email-oritw@il.ibm.com> <1260470309-7166-3-git-send-email-oritw@il.ibm.com> <1260470309-7166-4-git-send-email-oritw@il.ibm.com> <1260470309-7166-5-git-send-email-oritw@il.ibm.com> <1260470309-7166-6-git-send-email-oritw@il.ibm.com> <1260470309-7166-7-git-send-email-oritw@il.ibm.com> <1260470309-7166-8-git-send-email-oritw@il.ibm.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit
Cc: kvm@vger.kernel.org, benami@il.ibm.com, abelg@il.ibm.com,
	muli@il.ibm.com, aliguori@us.ibm.com, mdday@us.ibm.com
To: oritw@il.ibm.com
Return-path: <kvm-owner@vger.kernel.org>
Received: from mx1.redhat.com ([209.132.183.28]:29661 "EHLO mx1.redhat.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S935705AbZLQNrA (ORCPT <rfc822;kvm@vger.kernel.org>);
	Thu, 17 Dec 2009 08:47:00 -0500
In-Reply-To: <1260470309-7166-8-git-send-email-oritw@il.ibm.com>
Sender: kvm-owner@vger.kernel.org
List-ID: <kvm.vger.kernel.org>

On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote:
> From: Orit Wasserman<oritw@il.ibm.com>
>
>    

(changelog)

> @@ -1525,6 +1539,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>   			new_offset = vmcs_read64(TSC_OFFSET) + delta;
>   			vmcs_write64(TSC_OFFSET, new_offset);
>   		}
> +
> +		if (l1_shadow_vmcs != NULL) {
> +			l1_shadow_vmcs->host_tr_base =
> +				vmcs_readl(HOST_TR_BASE);
> +			l1_shadow_vmcs->host_gdtr_base =
> +				vmcs_readl(HOST_GDTR_BASE);
> +			l1_shadow_vmcs->host_ia32_sysenter_esp =
> +				vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +
> +			if (tsc_this<  vcpu->arch.host_tsc)
> +				l1_shadow_vmcs->tsc_offset =
> +					vmcs_read64(TSC_OFFSET);
> +
> +			if (vmx->nested.nested_mode)
> +				load_vmcs_host_state(l1_shadow_vmcs);
> +		}
>    

Please share this code with non-nested vmcs setup.

> @@ -3794,6 +3824,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
>   {
>   	u32 cpu_based_vm_exec_control;
>
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		nested_vmx_intr(vcpu);
> +		return;
> +	}
>    

I think this happens too late?  enable_irq_window() is called after 
we've given up on injecting the interrupt because interrupts are 
disabled.  But if we're running a guest, we can vmexit and inject the 
interrupt.  This code will only vmexit.

Hm, I see the vmexit code has an in_interrupt case, but I'd like this to 
be more regular: adjust vmx_interrupt_allowed() to allow interrupts if 
in a guest, and vmx_inject_irq() to force the vmexit.  This way 
interrupts have a single code path.

>
>   static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
>   {
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		if (!nested_vmx_intr(vcpu))
> +			return 0;
> +	}
>    

... and you do that... so I wonder why the changes to 
enable_irq_window() are needed?

> +
>   	return (vmcs_readl(GUEST_RFLAGS)&  X86_EFLAGS_IF)&&
>   		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)&
>   			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
> @@ -4042,6 +4082,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
>   		   not interested (exception bitmap 12 does not include NM_VECTOR)
>   		   enable fpu and resume l2 (avoid switching to l1)
>   		*/
> +
> +		if (vmx->nested.nested_mode)
> +			vmx->nested.nested_run_pending = 1; /* removing this line cause hung on boot of l2*/
> +
>    

This indicates a hack?

>   		vmx_fpu_activate(vcpu);
>
>   		return 1;
> @@ -4169,7 +4213,33 @@ static int handle_cr(struct kvm_vcpu *vcpu)
>   		trace_kvm_cr_write(cr, val);
>   		switch (cr) {
>   		case 0:
> -			kvm_set_cr0(vcpu, val);
> +			if (to_vmx(vcpu)->nested.nested_mode) {
> +				/* assume only X86_CR0_TS is handled by l0 */
> +				long new_cr0 = vmcs_readl(GUEST_CR0);
> +				long new_cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +
> +				vmx_fpu_deactivate(vcpu);
>    
> +
> +				if (val&  X86_CR0_TS) {
> +					new_cr0 |= X86_CR0_TS;
> +					new_cr0_read_shadow |= X86_CR0_TS;
> +					vcpu->arch.cr0 |= X86_CR0_TS;
> +				} else {
> +					new_cr0&= ~X86_CR0_TS;
> +					new_cr0_read_shadow&= ~X86_CR0_TS;
> +					vcpu->arch.cr0&= X86_CR0_TS;
> +				}
> +
> +				vmcs_writel(GUEST_CR0, new_cr0);
> +				vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow);
>    

Don't you need to #vmexit if the new cr0 violates the cr0_bits_always_on 
constraint, or if it changes bits in cr0 that the guest intercepts?

> +
> +				if (!(val&  X86_CR0_TS) || !(val&  X86_CR0_PE))
> +					vmx_fpu_activate(vcpu);
> +
> +				to_vmx(vcpu)->nested.nested_run_pending = 1;
>    

Please split into a function.

> +			} else
> +				kvm_set_cr0(vcpu, val);
> +
>   			skip_emulated_instruction(vcpu);
>   			return 1;
>   		case 3:
> @@ -4196,8 +4266,15 @@ static int handle_cr(struct kvm_vcpu *vcpu)
>   		break;
>   	case 2: /* clts */
>   		vmx_fpu_deactivate(vcpu);
> -		vcpu->arch.cr0&= ~X86_CR0_TS;
> -		vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
> +		if (to_vmx(vcpu)->nested.nested_mode) {
> +			vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0)&  ~X86_CR0_TS);
> +			vmcs_writel(CR0_READ_SHADOW, vmcs_readl(CR0_READ_SHADOW)&  ~X86_CR0_TS);
> +			vcpu->arch.cr0&= ~X86_CR0_TS;
> +			to_vmx(vcpu)->nested.nested_run_pending = 1;
>    

Won't the guest want to intercept this some time?

>   	/* Access CR3 don't cause VMExit in paging mode, so we need
>   	 * to sync with guest real CR3. */
>   	if (enable_ept&&  is_paging(vcpu))
> @@ -5347,6 +5435,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
>   		| vmx->rmode.irq.vector;
>   }
>
> +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu)
>    

I asked for this to be renamed.

>   #ifdef CONFIG_X86_64
>   #define R "r"
>   #define Q "q"
> @@ -5358,8 +5500,17 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
>   static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
>   {
>   	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	int r;
>   	u32 nested_exception_bitmap = 0;
>
> +	if (vmx->nested.nested_mode) {
> +		r = nested_handle_valid_idt(vcpu);
>    

This will cause vmread()s before the launch of state that is not saved.  
This means it is broken on migration or after set_regs().

In general we follow the following pattern:

   read from memory
   vmwrite
   vmlaunch/vmresume
   vmread
   write to memory
   loop

There are exceptions where we allow state to be cached, mostly 
registers.  But we keep accessors for them so that save/restore works.

> +
> +static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu)
> +{
> +	if (to_vmx(vcpu)->nested.nested_mode) {
> +		struct page *msr_page = NULL;
> +		u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> +		u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> +		struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu);
> +
> +		if (!cpu_has_vmx_msr_bitmap()
> +		    || !nested_cpu_has_vmx_msr_bitmap(vcpu))
> +			return 1;
> +
> +		msr_page = nested_get_page(vcpu,
> +					   l2svmcs->msr_bitmap);
> +
> +		if (!msr_page) {
> +			printk(KERN_INFO "%s error in nested_get_page\n",
> +			       __func__);
> +			return 0;
> +		}
> +
> +		switch (exit_code) {
> +		case EXIT_REASON_MSR_READ:
> +			if (msr_index<= 0x1fff) {
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x000)))
> +					return 1;
> +			} else if ((msr_index>= 0xc0000000)&&
> +				   (msr_index<= 0xc0001fff)) {
> +				msr_index&= 0x1fff;
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x400)))
> +					return 1;
> +			}
> +			break;
> +		case EXIT_REASON_MSR_WRITE:
> +			if (msr_index<= 0x1fff) {
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0x800)))
> +						return 1;
> +			} else if ((msr_index>= 0xc0000000)&&
> +				   (msr_index<= 0xc0001fff)) {
> +				msr_index&= 0x1fff;
> +				if (test_bit(msr_index,
> +					     (unsigned long *)(msr_page +
> +							       0xc00)))
> +					return 1;
> +			}
> +			break;
> +		}
> +	}
> +
>    

Please refactor with a single test_bit, just calculate the offsets 
differently (+400*8 for high msrs, +800*8 for writes).

> +
> +static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override)
> +{
> +	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	struct shadow_vmcs *l2svmcs;
> +
> +	int r = 0;
> +
> +	if (vmx->nested.nested_run_pending)
> +		return 0;
> +
> +	if (unlikely(vmx->fail)) {
> +		printk(KERN_INFO "%s failed vm entry %x\n",
> +		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> +		return 1;
> +	}
> +
> +	if (kvm_override) {
>    

What's kvm_override?

> +		switch (exit_code) {
> +		case EXIT_REASON_EXTERNAL_INTERRUPT:
> +			return 0;
> +		case EXIT_REASON_EXCEPTION_NMI:
> +			if (!is_exception(intr_info))
> +				return 0;
> +
> +			if (is_page_fault(intr_info)&&  (!enable_ept))
> +				return 0;
> +
> +			break;
> +		case EXIT_REASON_EPT_VIOLATION:
> +			if (enable_ept)
> +				return 0;
> +
> +			break;
> +		}
> +	}
> +
> +
> +	if (!nested_map_current(vcpu))
> +		return 0;
> +
> +	l2svmcs = get_shadow_vmcs(vcpu);
> +
> +	switch (exit_code) {
> +	case EXIT_REASON_INVLPG:
> +		if (l2svmcs->cpu_based_vm_exec_control&
> +		    CPU_BASED_INVLPG_EXITING)
> +			r = 1;
> +		break;
> +	case EXIT_REASON_MSR_READ:
> +	case EXIT_REASON_MSR_WRITE:
> +		r = nested_vmx_exit_handled_msr(vcpu);
> +		break;
> +	case EXIT_REASON_CR_ACCESS: {
> +		unsigned long exit_qualification =
> +			vmcs_readl(EXIT_QUALIFICATION);
> +		int cr = exit_qualification&  15;
> +		int reg = (exit_qualification>>  8)&  15;
> +		unsigned long val = kvm_register_read(vcpu, reg);
> +
> +		switch ((exit_qualification>>  4)&  3) {
> +		case 0: /* mov to cr */
> +			switch (cr) {
> +			case 0:
> +				if (l2svmcs->cr0_guest_host_mask&
> +				    (val ^ l2svmcs->cr0_read_shadow))
> +					r = 1;
>    
> +				break;
> +			case 3:
> +				if (l2svmcs->cpu_based_vm_exec_control&
> +				    CPU_BASED_CR3_LOAD_EXITING)
> +					r = 1;
> +				break;
> +			case 4:
> +				if (l2svmcs->cr4_guest_host_mask&
> +				    (l2svmcs->cr4_read_shadow ^ val))
> +					r = 1;
> +				break;
> +			case 8:
> +				if (l2svmcs->cpu_based_vm_exec_control&
> +				    CPU_BASED_CR8_LOAD_EXITING)
> +					r = 1;
> +				break;
> +			}
> +			break;
> +		case 2: /* clts */
> +			if (l2svmcs->cr0_guest_host_mask&  X86_CR0_TS)
> +				r = 1;
> +			break;
> +		case 1: /*mov from cr*/
> +			switch (cr) {
> +			case 0:
> +				r = 1;
> +			case 3:
> +				if (l2svmcs->cpu_based_vm_exec_control&
> +				    CPU_BASED_CR3_STORE_EXITING)
> +					r = 1;
> +				break;
> +			case 4:
> +				r = 1;
> +				break;
> +			case 8:
> +				if (l2svmcs->cpu_based_vm_exec_control&
> +				    CPU_BASED_CR8_STORE_EXITING)
> +					r = 1;
> +				break;
> +			}
> +			break;
> +		case 3: /* lmsw */
> +			if (l2svmcs->cr0_guest_host_mask&
> +			    (val ^ l2svmcs->cr0_read_shadow))
> +				r = 1;
> +			break;
> +		}
> +		break;
> +	}
> +	case EXIT_REASON_DR_ACCESS: {
> +		if (l2svmcs->cpu_based_vm_exec_control&
> +		    CPU_BASED_MOV_DR_EXITING)
> +			r = 1;
> +		break;
> +	}
> +
> +	case EXIT_REASON_EXCEPTION_NMI: {
> +
> +		if (is_external_interrupt(intr_info)&&
> +		    (l2svmcs->pin_based_vm_exec_control&
> +		     PIN_BASED_EXT_INTR_MASK))
> +			r = 1;
> +		else if (is_nmi(intr_info)&&
> +		    (l2svmcs->pin_based_vm_exec_control&
> +		     PIN_BASED_NMI_EXITING))
> +			r = 1;
> +		else if (is_exception(intr_info)&&
> +		    (l2svmcs->exception_bitmap&
> +		     (1u<<  (intr_info&  INTR_INFO_VECTOR_MASK))))
> +			r = 1;
> +		else if (is_page_fault(intr_info))
> +			r = 1;
> +		break;
> +	}
> +
> +	case EXIT_REASON_EXTERNAL_INTERRUPT:
> +		if (l2svmcs->pin_based_vm_exec_control&
> +		    PIN_BASED_EXT_INTR_MASK)
> +			r = 1;
> +		break;
> +	default:
> +		r = 1;
> +	}
> +	nested_unmap_current(vcpu);
> +
>    

Please move these to the normal handlers so it is possible to follow the 
code.

-- 
error compiling committee.c: too many arguments to function