From mboxrd@z Thu Jan 1 00:00:00 1970 From: Avi Kivity Subject: Re: [PATCH 7/7] Nested VMX patch 7 handling of nested guest exits Date: Thu, 17 Dec 2009 15:46:44 +0200 Message-ID: <4B2A3644.8020806@redhat.com> References: <1260470309-7166-1-git-send-email-oritw@il.ibm.com> <1260470309-7166-2-git-send-email-oritw@il.ibm.com> <1260470309-7166-3-git-send-email-oritw@il.ibm.com> <1260470309-7166-4-git-send-email-oritw@il.ibm.com> <1260470309-7166-5-git-send-email-oritw@il.ibm.com> <1260470309-7166-6-git-send-email-oritw@il.ibm.com> <1260470309-7166-7-git-send-email-oritw@il.ibm.com> <1260470309-7166-8-git-send-email-oritw@il.ibm.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: kvm@vger.kernel.org, benami@il.ibm.com, abelg@il.ibm.com, muli@il.ibm.com, aliguori@us.ibm.com, mdday@us.ibm.com To: oritw@il.ibm.com Return-path: Received: from mx1.redhat.com ([209.132.183.28]:29661 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S935705AbZLQNrA (ORCPT ); Thu, 17 Dec 2009 08:47:00 -0500 In-Reply-To: <1260470309-7166-8-git-send-email-oritw@il.ibm.com> Sender: kvm-owner@vger.kernel.org List-ID: On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote: > From: Orit Wasserman > > (changelog) > @@ -1525,6 +1539,22 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) > new_offset = vmcs_read64(TSC_OFFSET) + delta; > vmcs_write64(TSC_OFFSET, new_offset); > } > + > + if (l1_shadow_vmcs != NULL) { > + l1_shadow_vmcs->host_tr_base = > + vmcs_readl(HOST_TR_BASE); > + l1_shadow_vmcs->host_gdtr_base = > + vmcs_readl(HOST_GDTR_BASE); > + l1_shadow_vmcs->host_ia32_sysenter_esp = > + vmcs_readl(HOST_IA32_SYSENTER_ESP); > + > + if (tsc_this< vcpu->arch.host_tsc) > + l1_shadow_vmcs->tsc_offset = > + vmcs_read64(TSC_OFFSET); > + > + if (vmx->nested.nested_mode) > + load_vmcs_host_state(l1_shadow_vmcs); > + } > Please share this code with non-nested vmcs setup. > @@ -3794,6 +3824,11 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) > { > u32 cpu_based_vm_exec_control; > > + if (to_vmx(vcpu)->nested.nested_mode) { > + nested_vmx_intr(vcpu); > + return; > + } > I think this happens too late? enable_irq_window() is called after we've given up on injecting the interrupt because interrupts are disabled. But if we're running a guest, we can vmexit and inject the interrupt. This code will only vmexit. Hm, I see the vmexit code has an in_interrupt case, but I'd like this to be more regular: adjust vmx_interrupt_allowed() to allow interrupts if in a guest, and vmx_inject_irq() to force the vmexit. This way interrupts have a single code path. > > static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) > { > + if (to_vmx(vcpu)->nested.nested_mode) { > + if (!nested_vmx_intr(vcpu)) > + return 0; > + } > ... and you do that... so I wonder why the changes to enable_irq_window() are needed? > + > return (vmcs_readl(GUEST_RFLAGS)& X86_EFLAGS_IF)&& > !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)& > (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); > @@ -4042,6 +4082,10 @@ static int handle_exception(struct kvm_vcpu *vcpu) > not interested (exception bitmap 12 does not include NM_VECTOR) > enable fpu and resume l2 (avoid switching to l1) > */ > + > + if (vmx->nested.nested_mode) > + vmx->nested.nested_run_pending = 1; /* removing this line cause hung on boot of l2*/ > + > This indicates a hack? > vmx_fpu_activate(vcpu); > > return 1; > @@ -4169,7 +4213,33 @@ static int handle_cr(struct kvm_vcpu *vcpu) > trace_kvm_cr_write(cr, val); > switch (cr) { > case 0: > - kvm_set_cr0(vcpu, val); > + if (to_vmx(vcpu)->nested.nested_mode) { > + /* assume only X86_CR0_TS is handled by l0 */ > + long new_cr0 = vmcs_readl(GUEST_CR0); > + long new_cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW); > + > + vmx_fpu_deactivate(vcpu); > > + > + if (val& X86_CR0_TS) { > + new_cr0 |= X86_CR0_TS; > + new_cr0_read_shadow |= X86_CR0_TS; > + vcpu->arch.cr0 |= X86_CR0_TS; > + } else { > + new_cr0&= ~X86_CR0_TS; > + new_cr0_read_shadow&= ~X86_CR0_TS; > + vcpu->arch.cr0&= X86_CR0_TS; > + } > + > + vmcs_writel(GUEST_CR0, new_cr0); > + vmcs_writel(CR0_READ_SHADOW, new_cr0_read_shadow); > Don't you need to #vmexit if the new cr0 violates the cr0_bits_always_on constraint, or if it changes bits in cr0 that the guest intercepts? > + > + if (!(val& X86_CR0_TS) || !(val& X86_CR0_PE)) > + vmx_fpu_activate(vcpu); > + > + to_vmx(vcpu)->nested.nested_run_pending = 1; > Please split into a function. > + } else > + kvm_set_cr0(vcpu, val); > + > skip_emulated_instruction(vcpu); > return 1; > case 3: > @@ -4196,8 +4266,15 @@ static int handle_cr(struct kvm_vcpu *vcpu) > break; > case 2: /* clts */ > vmx_fpu_deactivate(vcpu); > - vcpu->arch.cr0&= ~X86_CR0_TS; > - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); > + if (to_vmx(vcpu)->nested.nested_mode) { > + vmcs_writel(GUEST_CR0, vmcs_readl(GUEST_CR0)& ~X86_CR0_TS); > + vmcs_writel(CR0_READ_SHADOW, vmcs_readl(CR0_READ_SHADOW)& ~X86_CR0_TS); > + vcpu->arch.cr0&= ~X86_CR0_TS; > + to_vmx(vcpu)->nested.nested_run_pending = 1; > Won't the guest want to intercept this some time? > /* Access CR3 don't cause VMExit in paging mode, so we need > * to sync with guest real CR3. */ > if (enable_ept&& is_paging(vcpu)) > @@ -5347,6 +5435,60 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) > | vmx->rmode.irq.vector; > } > > +static int nested_handle_valid_idt(struct kvm_vcpu *vcpu) > I asked for this to be renamed. > #ifdef CONFIG_X86_64 > #define R "r" > #define Q "q" > @@ -5358,8 +5500,17 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) > static void vmx_vcpu_run(struct kvm_vcpu *vcpu) > { > struct vcpu_vmx *vmx = to_vmx(vcpu); > + int r; > u32 nested_exception_bitmap = 0; > > + if (vmx->nested.nested_mode) { > + r = nested_handle_valid_idt(vcpu); > This will cause vmread()s before the launch of state that is not saved. This means it is broken on migration or after set_regs(). In general we follow the following pattern: read from memory vmwrite vmlaunch/vmresume vmread write to memory loop There are exceptions where we allow state to be cached, mostly registers. But we keep accessors for them so that save/restore works. > + > +static int nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu) > +{ > + if (to_vmx(vcpu)->nested.nested_mode) { > + struct page *msr_page = NULL; > + u32 msr_index = vcpu->arch.regs[VCPU_REGS_RCX]; > + u32 exit_code = vmcs_read32(VM_EXIT_REASON); > + struct shadow_vmcs *l2svmcs = get_shadow_vmcs(vcpu); > + > + if (!cpu_has_vmx_msr_bitmap() > + || !nested_cpu_has_vmx_msr_bitmap(vcpu)) > + return 1; > + > + msr_page = nested_get_page(vcpu, > + l2svmcs->msr_bitmap); > + > + if (!msr_page) { > + printk(KERN_INFO "%s error in nested_get_page\n", > + __func__); > + return 0; > + } > + > + switch (exit_code) { > + case EXIT_REASON_MSR_READ: > + if (msr_index<= 0x1fff) { > + if (test_bit(msr_index, > + (unsigned long *)(msr_page + > + 0x000))) > + return 1; > + } else if ((msr_index>= 0xc0000000)&& > + (msr_index<= 0xc0001fff)) { > + msr_index&= 0x1fff; > + if (test_bit(msr_index, > + (unsigned long *)(msr_page + > + 0x400))) > + return 1; > + } > + break; > + case EXIT_REASON_MSR_WRITE: > + if (msr_index<= 0x1fff) { > + if (test_bit(msr_index, > + (unsigned long *)(msr_page + > + 0x800))) > + return 1; > + } else if ((msr_index>= 0xc0000000)&& > + (msr_index<= 0xc0001fff)) { > + msr_index&= 0x1fff; > + if (test_bit(msr_index, > + (unsigned long *)(msr_page + > + 0xc00))) > + return 1; > + } > + break; > + } > + } > + > Please refactor with a single test_bit, just calculate the offsets differently (+400*8 for high msrs, +800*8 for writes). > + > +static int nested_vmx_exit_handled(struct kvm_vcpu *vcpu, bool kvm_override) > +{ > + u32 exit_code = vmcs_read32(VM_EXIT_REASON); > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); > + struct shadow_vmcs *l2svmcs; > + > + int r = 0; > + > + if (vmx->nested.nested_run_pending) > + return 0; > + > + if (unlikely(vmx->fail)) { > + printk(KERN_INFO "%s failed vm entry %x\n", > + __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); > + return 1; > + } > + > + if (kvm_override) { > What's kvm_override? > + switch (exit_code) { > + case EXIT_REASON_EXTERNAL_INTERRUPT: > + return 0; > + case EXIT_REASON_EXCEPTION_NMI: > + if (!is_exception(intr_info)) > + return 0; > + > + if (is_page_fault(intr_info)&& (!enable_ept)) > + return 0; > + > + break; > + case EXIT_REASON_EPT_VIOLATION: > + if (enable_ept) > + return 0; > + > + break; > + } > + } > + > + > + if (!nested_map_current(vcpu)) > + return 0; > + > + l2svmcs = get_shadow_vmcs(vcpu); > + > + switch (exit_code) { > + case EXIT_REASON_INVLPG: > + if (l2svmcs->cpu_based_vm_exec_control& > + CPU_BASED_INVLPG_EXITING) > + r = 1; > + break; > + case EXIT_REASON_MSR_READ: > + case EXIT_REASON_MSR_WRITE: > + r = nested_vmx_exit_handled_msr(vcpu); > + break; > + case EXIT_REASON_CR_ACCESS: { > + unsigned long exit_qualification = > + vmcs_readl(EXIT_QUALIFICATION); > + int cr = exit_qualification& 15; > + int reg = (exit_qualification>> 8)& 15; > + unsigned long val = kvm_register_read(vcpu, reg); > + > + switch ((exit_qualification>> 4)& 3) { > + case 0: /* mov to cr */ > + switch (cr) { > + case 0: > + if (l2svmcs->cr0_guest_host_mask& > + (val ^ l2svmcs->cr0_read_shadow)) > + r = 1; > > + break; > + case 3: > + if (l2svmcs->cpu_based_vm_exec_control& > + CPU_BASED_CR3_LOAD_EXITING) > + r = 1; > + break; > + case 4: > + if (l2svmcs->cr4_guest_host_mask& > + (l2svmcs->cr4_read_shadow ^ val)) > + r = 1; > + break; > + case 8: > + if (l2svmcs->cpu_based_vm_exec_control& > + CPU_BASED_CR8_LOAD_EXITING) > + r = 1; > + break; > + } > + break; > + case 2: /* clts */ > + if (l2svmcs->cr0_guest_host_mask& X86_CR0_TS) > + r = 1; > + break; > + case 1: /*mov from cr*/ > + switch (cr) { > + case 0: > + r = 1; > + case 3: > + if (l2svmcs->cpu_based_vm_exec_control& > + CPU_BASED_CR3_STORE_EXITING) > + r = 1; > + break; > + case 4: > + r = 1; > + break; > + case 8: > + if (l2svmcs->cpu_based_vm_exec_control& > + CPU_BASED_CR8_STORE_EXITING) > + r = 1; > + break; > + } > + break; > + case 3: /* lmsw */ > + if (l2svmcs->cr0_guest_host_mask& > + (val ^ l2svmcs->cr0_read_shadow)) > + r = 1; > + break; > + } > + break; > + } > + case EXIT_REASON_DR_ACCESS: { > + if (l2svmcs->cpu_based_vm_exec_control& > + CPU_BASED_MOV_DR_EXITING) > + r = 1; > + break; > + } > + > + case EXIT_REASON_EXCEPTION_NMI: { > + > + if (is_external_interrupt(intr_info)&& > + (l2svmcs->pin_based_vm_exec_control& > + PIN_BASED_EXT_INTR_MASK)) > + r = 1; > + else if (is_nmi(intr_info)&& > + (l2svmcs->pin_based_vm_exec_control& > + PIN_BASED_NMI_EXITING)) > + r = 1; > + else if (is_exception(intr_info)&& > + (l2svmcs->exception_bitmap& > + (1u<< (intr_info& INTR_INFO_VECTOR_MASK)))) > + r = 1; > + else if (is_page_fault(intr_info)) > + r = 1; > + break; > + } > + > + case EXIT_REASON_EXTERNAL_INTERRUPT: > + if (l2svmcs->pin_based_vm_exec_control& > + PIN_BASED_EXT_INTR_MASK) > + r = 1; > + break; > + default: > + r = 1; > + } > + nested_unmap_current(vcpu); > + > Please move these to the normal handlers so it is possible to follow the code. -- error compiling committee.c: too many arguments to function