From mboxrd@z Thu Jan 1 00:00:00 1970 From: Avi Kivity Subject: Re: [PATCH 2/7] Nested VMX patch 2 implements vmclear Date: Wed, 16 Dec 2009 15:59:49 +0200 Message-ID: <4B28E7D5.6080401@redhat.com> References: <1260470309-7166-1-git-send-email-oritw@il.ibm.com> <1260470309-7166-2-git-send-email-oritw@il.ibm.com> <1260470309-7166-3-git-send-email-oritw@il.ibm.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: kvm@vger.kernel.org, benami@il.ibm.com, abelg@il.ibm.com, muli@il.ibm.com, aliguori@us.ibm.com, mdday@us.ibm.com To: oritw@il.ibm.com Return-path: Received: from mx1.redhat.com ([209.132.183.28]:8356 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S933136AbZLPN7y (ORCPT ); Wed, 16 Dec 2009 08:59:54 -0500 In-Reply-To: <1260470309-7166-3-git-send-email-oritw@il.ibm.com> Sender: kvm-owner@vger.kernel.org List-ID: On 12/10/2009 08:38 PM, oritw@il.ibm.com wrote: > From: Orit Wasserman > > --- > arch/x86/kvm/vmx.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++++++- > arch/x86/kvm/x86.c | 5 +- > arch/x86/kvm/x86.h | 3 + > 3 files changed, 240 insertions(+), 3 deletions(-) > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c > index 2726a6c..a7ffd5e 100644 > --- a/arch/x86/kvm/vmx.c > +++ b/arch/x86/kvm/vmx.c > @@ -93,13 +93,39 @@ struct shared_msr_entry { > }; > > struct __attribute__ ((__packed__)) level_state { > + /* Has the level1 guest done vmclear? */ > + bool vmclear; > +}; > Suggest calling it launch_state and using an enum. We can have three states: uninitialized, clear, and launched. Not sure if this is really required by the spec. Do we need vmclear in l1_state? > +struct __attribute__ ((__packed__)) nested_vmcs_page { > + u32 revision_id; > + u32 abort; > + struct level_state l2_state; > +}; > + > +struct nested_vmcs_list { > + struct list_head list; > 'link' > + gpa_t vmcs_addr; > + struct vmcs *l2_vmcs; > }; > > struct nested_vmx { > /* Has the level1 guest done vmxon? */ > bool vmxon; > + /* What is the location of the current vmcs l1 keeps for l2 */ > + gpa_t current_vmptr; > /* Level 1 state for switching to level 2 and back */ > struct level_state *l1_state; > + /* list of vmcs for each l2 guest created by l1 */ > + struct list_head l2_vmcs_list; > + /* l2 page corresponding to the current vmcs set by l1 */ > + struct nested_vmcs_page *current_l2_page; > }; > > struct vcpu_vmx { > @@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) > return container_of(vcpu, struct vcpu_vmx, vcpu); > } > > +static struct page *nested_get_page(struct kvm_vcpu *vcpu, > + u64 vmcs_addr) > +{ > + struct page *vmcs_page = NULL; > + > + down_read(¤t->mm->mmap_sem); > + vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr>> PAGE_SHIFT); > + up_read(¤t->mm->mmap_sem); > gfn_to_page() doesn't need mmap_sem (and may deadlock if you take it). > + > + if (is_error_page(vmcs_page)) { > + printk(KERN_ERR "%s error allocating page 0x%llx\n", > + __func__, vmcs_addr); > + kvm_release_page_clean(vmcs_page); > + return NULL; > + } > + > + return vmcs_page; > + > +} > + > +static int nested_map_current(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + struct page *vmcs_page = > + nested_get_page(vcpu, vmx->nested.current_vmptr); > + struct nested_vmcs_page *mapped_page; > + > + if (vmcs_page == NULL) { > + printk(KERN_INFO "%s: failure in nested_get_page\n", __func__); > + return 0; > + } > + > + if (vmx->nested.current_l2_page) { > + printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__); > + WARN_ON(1); > + return 0; > + } > + > + mapped_page = kmap_atomic(vmcs_page, KM_USER0); > + > + if (!mapped_page) { > + printk(KERN_INFO "%s: error in kmap_atomic\n", __func__); > + return 0; > + } > kmap_atomic() can't fail. > + > + vmx->nested.current_l2_page = mapped_page; > + > + return 1; > +} > + > +static void nested_unmap_current(struct kvm_vcpu *vcpu) > +{ > + struct page *page; > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + > + if (!vmx->nested.current_l2_page) { > + printk(KERN_INFO "Shadow vmcs already unmapped\n"); > + WARN_ON(1); > Use BUG_ON(), since this can't happen unless there's a bug. > + return; > + } > + > + page = kmap_atomic_to_page(vmx->nested.current_l2_page); > + > + kunmap_atomic(vmx->nested.current_l2_page, KM_USER0); > + > + kvm_release_page_dirty(page); > + > + vmx->nested.current_l2_page = NULL; > +} > + > static int init_rmode(struct kvm *kvm); > static u64 construct_eptp(unsigned long root_hpa); > > @@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) > return 0; > } > > +static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry) > +{ > + int r = 0; > + uint size; > + > + *gentry = 0; > + > + if (is_long_mode(vcpu)) > + size = sizeof(u64); > + else > + size = sizeof(u32); > I think the gpa is always 64 bit, regardless of the current mode. > + > + r = kvm_read_guest_virt(gva, gentry, > + size, vcpu); > + if (r) { > + printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n", > + __func__, vcpu->arch.regs[VCPU_REGS_RAX], r); > RAX may not be relevant. Just return, and the user can disassemble the instructions and see for themselves. > + return r; > + } > + > + if (!IS_ALIGNED(*gentry, PAGE_SIZE)) { > + printk(KERN_DEBUG "%s addr %llx not aligned\n", > + __func__, *gentry); > + return 1; > + } > + > + return 0; > +} > + > > +/* > + * Decode the memory address (operand) of a vmx instruction according to Table 23-12/23-11 > + * For additional information regarding offset calculation see 3.7.5 > + */ > +static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu, > + unsigned long exit_qualification, > + u32 vmx_instruction_info) > +{ > + int scaling = vmx_instruction_info& 3; /* bits 0:1 scaling */ > + int addr_size = (vmx_instruction_info>> 7)& 7; /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */ > + bool is_reg = vmx_instruction_info& (1u<< 10); /* bit 10 1=register operand, 0= memory */ > + int seg_reg = (vmx_instruction_info>> 15)& 7; /* bits 15:17 segment register */ > + int index_reg = (vmx_instruction_info>> 18)& 0xf; /* bits 18:21 index register */ > + bool index_is_valid = !(vmx_instruction_info& (1u<< 22)); /* bit 22 index register validity, 0=valid, 1=invalid */ > + int base_reg = (vmx_instruction_info>> 23)& 0xf; /* bits 23:26 index register */ > + bool base_is_valid = !(vmx_instruction_info& (1u<< 27)); /* bit 27 base register validity, 0=valid, 1=invalid */ > + gva_t addr; > + > + if (is_reg) > + return 0; > Should #UD. > + > + switch (addr_size) { > + case 1: > + exit_qualification&= 0xffffffff; /* 32 high bits are undefied according to the spec, page 23-7 */ > + break; > + case 2: > + break; > + default: > + return 0; > + } > + > + /* Addr = segment_base + offset */ > + /* offfset = Base + [Index * Scale] + Displacement, see Figure 3-11 */ > + addr = vmx_get_segment_base(vcpu, seg_reg); > + if (base_is_valid) > + addr += kvm_register_read(vcpu, base_reg); > + if (index_is_valid) > + addr += kvm_register_read(vcpu, index_reg)*scaling; > Shouldn't this be a shift? Wish we had something like that for emulate.c. > + addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */ > + > + return addr; > +} > + > +static int handle_vmclear(struct kvm_vcpu *vcpu) > +{ > + struct vcpu_vmx *vmx = to_vmx(vcpu); > + struct level_state *l2_state; > + gpa_t guest_vmcs_addr; > + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); > + u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); > + gva_t vmcs_gva; > + > + if (!nested_vmx_check_permission(vcpu)) > + return 1; > + > + vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification, > + vmx_instruction_info); > I think you can let get_vmx_mem_address() do the vmread()s, simpler. > + > + if (read_guest_vmcs_gpa(vcpu, vmcs_gva,&guest_vmcs_addr)) > + return 1; > + > + vmx->nested.current_vmptr = guest_vmcs_addr; > + if (!nested_map_current(vcpu)) > + return 1; > + > + l2_state =&(to_vmx(vcpu)->nested.current_l2_page->l2_state); > + l2_state->vmclear = 1; > + nested_free_current_vmcs(vcpu); > Why free? Isn't the purpose of the list to keep those active? > + > + vmx->nested.current_vmptr = -1ull; > + > + nested_unmap_current(vcpu); > + > + skip_emulated_instruction(vcpu); > + clear_rflags_cf_zf(vcpu); > + > + return 1; > +} > + > As usual, if you can split some of the infrastructure into separate patches, it would help review. -- error compiling committee.c: too many arguments to function