From mboxrd@z Thu Jan  1 00:00:00 1970
From: Avi Kivity <avi@redhat.com>
Subject: Re: [PATCH 20/27] nVMX: Deciding if L0 or L1 should handle an L2
 exit
Date: Wed, 20 Oct 2010 14:13:30 +0200
Message-ID: <4CBEDCEA.1020507@redhat.com>
References: <1287309814-nyh@il.ibm.com> <201010171013.o9HADnE1029533@rice.haifa.ibm.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=ISO-8859-1; format=flowed
Content-Transfer-Encoding: 7bit
Cc: kvm@vger.kernel.org, gleb@redhat.com
To: "Nadav Har'El" <nyh@il.ibm.com>
Return-path: <kvm-owner@vger.kernel.org>
Received: from mx1.redhat.com ([209.132.183.28]:13172 "EHLO mx1.redhat.com"
	rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP
	id S1752598Ab0JTMNg (ORCPT <rfc822;kvm@vger.kernel.org>);
	Wed, 20 Oct 2010 08:13:36 -0400
In-Reply-To: <201010171013.o9HADnE1029533@rice.haifa.ibm.com>
Sender: kvm-owner@vger.kernel.org
List-ID: <kvm.vger.kernel.org>

  On 10/17/2010 12:13 PM, Nadav Har'El wrote:
> This patch contains the logic of whether an L2 exit should be handled by L0
> and then L2 should be resumed, or whether L1 should be run to handle this
> exit (using the nested_vmx_vmexit() function of the previous patch).
>
> The basic idea is to let L1 handle the exit only if it actually asked to
> trap this sort of event. For example, when L2 exits on a change to CR0,
> we check L1's CR0_GUEST_HOST_MASK to see if L1 expressed interest in any
> bit which changed; If it did, we exit to L1. But if it didn't it means that
> it is we (L0) that wished to trap this event, so we handle it ourselves.
>
> The next two patches add additional logic of what to do when an interrupt or
> exception is injected: Does L0 need to do it, should we exit to L1 to do it,
> or should we resume L2 and keep the exception to be injected later.
>
> We keep a new flag, "nested_run_pending", which can override the decision of
> which should run next, L1 or L2. nested_run_pending=1 means that we *must* run
> L2 next, not L1. This is necessary in particular when L1 did a VMLAUNCH of L2
> and therefore expects L2 to be run (and perhaps be injected with an event it
> specified, etc.). Nested_run_pending is especially intended to avoid switching
> to L1 in the injection decision-point described above.
>
>
>   /*
> + * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
> + * rather than handle it ourselves in L0. I.e., check L1's MSR bitmap whether
> + * it expressed interest in the current event (read or write a specific MSR).
> + */
> +static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
> +	struct vmcs_fields *vmcs12, u32 exit_reason)
> +{
> +	u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX];
> +	struct page *msr_bitmap_page;
> +	void *va;
> +	bool ret;
> +
> +	if (!cpu_has_vmx_msr_bitmap() || !nested_cpu_has_vmx_msr_bitmap(vcpu))
> +		return 1;
> +
> +	msr_bitmap_page = nested_get_page(vcpu, vmcs12->msr_bitmap);
> +	if (!msr_bitmap_page) {
> +		printk(KERN_INFO "%s error in nested_get_page\n", __func__);
> +		return 0;

return leaks the page.

> +	}
> +
> +	va = kmap_atomic(msr_bitmap_page, KM_USER1);
> +	if (exit_reason == EXIT_REASON_MSR_WRITE)
> +		va += 0x800;
> +	if (msr_index>= 0xc0000000) {
> +		msr_index -= 0xc0000000;
> +		va += 0x400;
> +	}
> +	if (msr_index>  0x1fff)
> +		return 0;

return leaks the kmap.

> +	ret = test_bit(msr_index, va);
> +	kunmap_atomic(va, KM_USER1);
> +	return ret;
> +}

How about using kvm_read_guest() instead?  Much simpler and safer.

> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
> + * rather than handle it ourselves in L0. I.e., check if L1 wanted to
> + * intercept (via guest_host_mask etc.) the current event.
> + */
> +static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
> +	struct vmcs_fields *vmcs12)
> +{
> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	int cr = exit_qualification&  15;
> +	int reg = (exit_qualification>>  8)&  15;
> +	unsigned long val = kvm_register_read(vcpu, reg);
> +
> +	switch ((exit_qualification>>  4)&  3) {
> +	case 0: /* mov to cr */
> +		switch (cr) {
> +		case 0:
> +			if (vmcs12->cr0_guest_host_mask&
> +			    (val ^ vmcs12->cr0_read_shadow))
> +				return 1;
> +			break;
> +		case 3:
> +			if (vmcs12->cpu_based_vm_exec_control&
> +			    CPU_BASED_CR3_LOAD_EXITING)
> +				return 1;
> +			break;
> +		case 4:
> +			if (vmcs12->cr4_guest_host_mask&
> +			    (vmcs12->cr4_read_shadow ^ val))
> +				return 1;
> +			break;
> +		case 8:
> +			if (vmcs12->cpu_based_vm_exec_control&
> +			    CPU_BASED_CR8_LOAD_EXITING)
> +				return 1;
> +			/*
> +			 * TODO: missing else if control&  CPU_BASED_TPR_SHADOW
> +			 * then set tpr shadow and if below tpr_threshold, exit.
> +			 */
> +			break;
> +		}
> +		break;
> +	case 2: /* clts */
> +		if (vmcs12->cr0_guest_host_mask&  X86_CR0_TS)
> +			return 1;

If TS is already clear in the guest's cr0_read_shadow, an L2->L1 exit is 
not needed.

> +		break;
> +	case 1: /* mov from cr */
> +		switch (cr) {
> +		case 0:
> +			return 1;

Cannot happen.

> +		case 3:
> +			if (vmcs12->cpu_based_vm_exec_control&
> +			    CPU_BASED_CR3_STORE_EXITING)
> +				return 1;
> +			break;
> +		case 4:
> +			return 1;
> +			break;

Cannot happen.

> +		case 8:
> +			if (vmcs12->cpu_based_vm_exec_control&
> +			    CPU_BASED_CR8_STORE_EXITING)
> +				return 1;

What about TPR threshold?  Or is it not supported yet?

> +			break;
> +		}
> +		break;
> +	case 3: /* lmsw */
> +		/*
> +		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
> +		 * cr0. Other attempted changes are ignored, with no exit.
> +		 */
> +		if (vmcs12->cr0_guest_host_mask&  0xe&
> +		    (val ^ vmcs12->cr0_read_shadow))
> +			return 1;
> +		if ((vmcs12->cr0_guest_host_mask&  0x1)&&
> +		    !(vmcs12->cr0_read_shadow&  0x1)&&
> +		    (val&  0x1))
> +		    	return 1;
> +		break;
> +	}
> +	return 0;
> +}

I'd prefer to move the intercept checks to kvm_set_cr(), but that can be 
done later (much later).

> +
> +/*
> + * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
> + * should handle it ourselves in L0 (and then continue L2). Only call this
> + * when in nested_mode (L2).
> + */
> +static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
> +{
> +	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
> +	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct vmcs_fields *vmcs12 = get_vmcs12_fields(vcpu);
> +
> +	if (vmx->nested.nested_run_pending)
> +		return 0;
> +
> +	if (unlikely(vmx->fail)) {
> +		printk(KERN_INFO "%s failed vm entry %x\n",
> +		       __func__, vmcs_read32(VM_INSTRUCTION_ERROR));
> +		return 1;
> +	}
> +
> +	switch (exit_reason) {
> +	case EXIT_REASON_EXTERNAL_INTERRUPT:
> +		return 0;
> +	case EXIT_REASON_EXCEPTION_NMI:
> +		if (!is_exception(intr_info))
> +			return 0;
> +		else if (is_page_fault(intr_info)&&  (!enable_ept))
> +			return 0;

We may still find out later that the page fault needs to be intercepted 
by the guest, yes?

> +		return (vmcs12->exception_bitmap&
> +				(1u<<  (intr_info&  INTR_INFO_VECTOR_MASK)));
> +	case EXIT_REASON_EPT_VIOLATION:
> +		return 0;
> +	case EXIT_REASON_INVLPG:
> +		return (vmcs12->cpu_based_vm_exec_control&
> +				CPU_BASED_INVLPG_EXITING);
> +	case EXIT_REASON_MSR_READ:
> +	case EXIT_REASON_MSR_WRITE:
> +		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
> +	case EXIT_REASON_CR_ACCESS:
> +		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
> +	case EXIT_REASON_DR_ACCESS:
> +		return (vmcs12->cpu_based_vm_exec_control&
> +				CPU_BASED_MOV_DR_EXITING);
> +	default:
> +		/*
> +		 * One particularly interesting case that is covered here is an
> +		 * exit caused by L2 running a VMX instruction. L2 is guest
> +		 * mode in L1's world, and according to the VMX spec running a
> +		 * VMX instruction in guest mode should cause an exit to root
> +		 * mode, i.e., to L1. This is why we need to return r=1 for
> +		 * those exit reasons too. This enables further nesting: Like
> +		 * L0 emulates VMX for L1, we now allow L1 to emulate VMX for
> +		 * L2, who will then be able to run L3.
> +		 */
> +		return 1;

What about intr/nmi window?

-- 
error compiling committee.c: too many arguments to function