From mboxrd@z Thu Jan  1 00:00:00 1970
From: Avi Kivity <avi@redhat.com>
Subject: Re: [PATCH 19/24] Deciding if L0 or L1 should handle an L2 exit
Date: Mon, 14 Jun 2010 15:24:02 +0300
Message-ID: <4C161F62.9070506@redhat.com>
References: <1276431753-nyh@il.ibm.com> <201006131232.o5DCWIHl013120@rice.haifa.ibm.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit
Cc: kvm@vger.kernel.org
To: "Nadav Har'El" <nyh@il.ibm.com>
Return-path: <kvm-owner@vger.kernel.org>
Received: from mx1.redhat.com ([209.132.183.28]:39735 "EHLO mx1.redhat.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1751743Ab0FNMYI (ORCPT <rfc822;kvm@vger.kernel.org>);
	Mon, 14 Jun 2010 08:24:08 -0400
In-Reply-To: <201006131232.o5DCWIHl013120@rice.haifa.ibm.com>
Sender: kvm-owner@vger.kernel.org
List-ID: <kvm.vger.kernel.org>

On 06/13/2010 03:32 PM, Nadav Har'El wrote:
> This patch contains the logic of whether an L2 exit should be handled by L0
> and then L2 should be resumed, or whether L1 should be run to handle this
> exit (using the nested_vmx_vmexit() function of the previous patch).
>
> The basic idea is to let L1 handle the exit only if it actually asked to
> trap this sort of event. For example, when L2 exits on a change to CR0,
> we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any
> bit which changed; if it did, we exit to L1. But if it didn't, it means that
> it is we (L0) that wished to trap this event, so we handle it ourselves.
>
> The next two patches add additional logic of what to do when an interrupt or
> exception is injected: Does L0 need to do it, should we exit to L1 to do it,
> or should we resume L2 and keep the exception to be injected later.
>
> We keep a new flag, "nested_run_pending", which can override the decision of
> which should run next, L1 or L2. nested_run_pending=1 means that we *must* run
> L2 next, not L1. This is necessary in several situations where had L1 run on
> bare metal it would not have expected to be resumed at this stage. One
> example is when L1 did a VMLAUNCH of L2 and therefore expects L2 to be run.
> Another example is when L2 exits on an #NM exception that L0 asked for
> (because of lazy FPU loading), and L0 must deal with the exception and resume
> L2 which was in the middle of an instruction, and not resume L1 which does not
> expect to see an exit from L2 at this point. nested_run_pending is especially
> intended to avoid switching to L1 in the injection decision-point described
> above.
>
> @@ -3819,6 +3841,8 @@ static int handle_exception(struct kvm_v
>
>   	if (is_no_device(intr_info)) {
>   		vmx_fpu_activate(vcpu);
> +		if (vmx->nested.nested_mode)
> +			vmx->nested.nested_run_pending = 1;
>   		return 1;
>   	}
>    

Isn't this true for many other exceptions?  #UD which we emulate (but 
the guest doesn't trap), page faults which we handle completely...

>
> +
> +/* Return 1 if we should exit from L2 to L1 to handle a CR access exit,
> + * rather than handle it ourselves in L0. I.e., check if L1 wanted to
> + * intercept (via guest_host_mask etc.) the current event.
> + */
> +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
> +	struct shadow_vmcs *l2svmcs)
> +{
> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	int cr = exit_qualification&  15;
> +	int reg = (exit_qualification>>  8)&  15;
> +	unsigned long val = kvm_register_read(vcpu, reg);
> +
> +	switch ((exit_qualification>>  4)&  3) {
> +	case 0: /* mov to cr */
> +		switch (cr) {
> +		case 0:
> +			if (l2svmcs->cr0_guest_host_mask&
> +			    (val ^ l2svmcs->cr0_read_shadow))
> +				return 1;
> +			break;
> +		case 3:
> +			if (l2svmcs->cpu_based_vm_exec_control&
> +			    CPU_BASED_CR3_LOAD_EXITING)
> +				return 1;
> +			break;
> +		case 4:
> +			if (l2svmcs->cr4_guest_host_mask&
> +			    (l2svmcs->cr4_read_shadow ^ val))
> +				return 1;
> +			break;
> +		case 8:
> +			if (l2svmcs->cpu_based_vm_exec_control&
> +			    CPU_BASED_CR8_LOAD_EXITING)
> +				return 1;
>    

Should check TPR threshold here too if enabled.


> +	case 3: /* lmsw */
> +		if (l2svmcs->cr0_guest_host_mask&
> +		    (val ^ l2svmcs->cr0_read_shadow))
> +			return 1;
>    

Need to mask off bit 0 (cr0.pe) of val, since lmsw can't clear it.

> +		break;
> +	}
> +	return 0;
> +}
> +
> +/* Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
> + * should handle it ourselves in L0. Only call this when in nested_mode (L2).
> + */
> +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool afterexit)
> +{
> +	u32 exit_code = vmcs_read32(VM_EXIT_REASON);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	struct shadow_vmcs *l2svmcs;
> +	int r = 0;
> +
> +	if (vmx->nested.nested_run_pending)
> +		return 0;
> +
> +	if (unlikely(vmx->fail)) {
> +		printk(KERN_INFO "%s failed vm entry %x\n",
> +		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> +		return 1;
> +	}
> +
> +	if (afterexit) {
> +		/* There are some cases where we should let L1 handle certain
> +		 * events when these are injected (afterexit==0) but we should
> +		 * handle them in L0 on an exit (afterexit==1).
> +		 */
> +		switch (exit_code) {
> +		case EXIT_REASON_EXTERNAL_INTERRUPT:
> +			return 0;
> +		case EXIT_REASON_EXCEPTION_NMI:
> +			if (!is_exception(intr_info))
> +				return 0;
> +			if (is_page_fault(intr_info)&&  (!enable_ept))
> +				return 0;
>    

Some page faults do need an L2->L1 transition.  Maybe I'll see this later.

> +			break;
> +		case EXIT_REASON_EPT_VIOLATION:
> +			if (enable_ept)
> +				return 0;
> +			break;
> +		}
> +	}
> +
> +	if (!nested_map_current(vcpu))
> +		return 0;
> +	l2svmcs = get_shadow_vmcs(vcpu);
> +
> +	switch (exit_code) {
> +	case EXIT_REASON_INVLPG:
> +		if (l2svmcs->cpu_based_vm_exec_control&
> +		    CPU_BASED_INVLPG_EXITING)
> +			r = 1;
> +		break;
> +	case EXIT_REASON_MSR_READ:
> +	case EXIT_REASON_MSR_WRITE:
> +		r = nested_vmx_exit_handled_msr(vcpu, l2svmcs, exit_code);
> +		break;
> +	case EXIT_REASON_CR_ACCESS:
> +		r = nested_vmx_exit_handled_cr(vcpu, l2svmcs);
> +		break;
> +	case EXIT_REASON_DR_ACCESS:
> +		if (l2svmcs->cpu_based_vm_exec_control&
> +		    CPU_BASED_MOV_DR_EXITING)
> +			r = 1;
> +		break;
> +	case EXIT_REASON_EXCEPTION_NMI:
> +		if (is_external_interrupt(intr_info)&&
> +		    (l2svmcs->pin_based_vm_exec_control&
> +		     PIN_BASED_EXT_INTR_MASK))
> +			r = 1;
>    

A real external interrupt should never be handled by the guest, only a 
virtual external interrupt.

> +		else if (is_nmi(intr_info)&&
> +		    (l2svmcs->pin_based_vm_exec_control&
> +		     PIN_BASED_NMI_EXITING))
> +			r = 1;
>    

Ditto for NMI.

> +		else if (is_exception(intr_info)&&
> +		    (l2svmcs->exception_bitmap&
> +		     (1u<<  (intr_info&  INTR_INFO_VECTOR_MASK))))
> +			r = 1;
>    

Bit 14 of the exception bitmap is special, need special treatment.

> +		else if (is_page_fault(intr_info))
> +			r = 1;
>    

Still looking for magic page fault handling...

> +		break;
> +	case EXIT_REASON_EXTERNAL_INTERRUPT:
> +		if (l2svmcs->pin_based_vm_exec_control&
> +		    PIN_BASED_EXT_INTR_MASK)
> +			r = 1;
> +		break;
> +	default:
> +		r = 1;
> +	}
> +	nested_unmap_current(vcpu);
> +
> +	return r;
> +}
> +
>   /*
>    * The guest has exited.  See if we can fix it or if we need userspace
>    * assistance.
>    

-- 
error compiling committee.c: too many arguments to function