Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Avi Kivity <avi@redhat.com>
To: oritw@il.ibm.com
Cc: kvm@vger.kernel.org, benami@il.ibm.com, muli@il.ibm.com,
	abelg@il.ibm.com, aliguori@us.ibm.com, mmday@us.ibm.com
Subject: Re: [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst
Date: Wed, 02 Sep 2009 23:05:09 +0300	[thread overview]
Message-ID: <4A9ECFF5.60701@redhat.com> (raw)
In-Reply-To: <1251905916-2834-4-git-send-email-oritw@il.ibm.com>

On 09/02/2009 06:38 PM, oritw@il.ibm.com wrote:
> +struct __attribute__ ((__packed__)) level_state {
> +	struct shadow_vmcs *shadow_vmcs;
> +
> +	u16 vpid;
> +	u64 shadow_efer;
> +	unsigned long cr2;
> +	unsigned long cr3;
> +	unsigned long cr4;
> +	unsigned long cr8;
> +
> +	u64 io_bitmap_a;
> +	u64 io_bitmap_b;
> +	u64 msr_bitmap;
> +
> +	struct vmcs *vmcs;
> +	int cpu;
> +	int launched;
> +};
>    



> +
>   struct vmcs {
>   	u32 revision_id;
>   	u32 abort;
> @@ -72,6 +217,17 @@ struct nested_vmx {
>   	bool vmon;
>   	/* Has the level1 guest done vmclear? */
>   	bool vmclear;
> +	/* What is the location of the  vmcs l1 keeps for l2? (in level1 gpa) */
> +	u64 l1_cur_vmcs;
>    

This is the vmptr (exactly as loaded by vmptrld), right?  If so, please 
call it vmptr.

> +	/*
> +	 * Level 2 state : includes vmcs,registers and
> +	 * a copy of vmcs12 for vmread/vmwrite
> +	 */
> +	struct level_state *l2_state;
> +
> +	/* Level 1 state for switching to level 2 and back */
> +	struct level_state *l1_state;
>    

Can you explain why we need two of them?  in the guest vmcs we have host 
and guest values, and in l1_state and l2_state we have more copies, and 
in struct vcpu we have yet another set of copies.  We also have a couple 
of copies in the host vmcs.  I'm getting dizzy...


>   static int init_rmode(struct kvm *kvm);
>   static u64 construct_eptp(unsigned long root_hpa);
>
>
>
> +int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, u64 *gentry)
> +{
> +	gpa_t gpa;
> +	struct page *page;
> +	int r = 0;
> +
> +	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
> +
> +	/* checking guest gpa */
> +	page = gfn_to_page(vcpu->kvm, gpa>>  PAGE_SHIFT);
> +	if (is_error_page(page)) {
> +		printk(KERN_ERR "%s Invalid guest vmcs addr %llx\n",
> +		       __func__, gpa);
> +		r = 1;
> +		goto out;
> +	}
> +
> +	r = kvm_read_guest(vcpu->kvm, gpa, gentry, sizeof(u64));
> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %llx : %d\n",
> +		       __func__, gpa, r);
> +		goto out;
> +	}
>    

You can use kvm_read_guest_virt() to simplify this.

> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +out:
> +	kvm_release_page_clean(page);
> +	return r;
> +}
> +
> +static int handle_vmptrld(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page;
> +	u64 guest_vmcs_addr;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	if (read_guest_vmcs_gpa(vcpu,&guest_vmcs_addr))
> +		return 1;
> +
> +	if (create_l1_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l1_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	if (create_l2_state(vcpu)) {
> +		printk(KERN_ERR "%s create_l2_state failed\n", __func__);
> +		return 1;
> +	}
> +
> +	vmx->nested.l2_state->vmcs = alloc_vmcs();
> +	if (!vmx->nested.l2_state->vmcs) {
> +		printk(KERN_ERR "%s error in creating level 2 vmcs", __func__);
> +		return 1;
> +	}
> +
> +	if (vmx->nested.l1_cur_vmcs != guest_vmcs_addr) {
> +		vmcs_page = nested_get_page(vcpu, guest_vmcs_addr);
> +		if (vmcs_page == NULL)
> +			return 1;
> +
> +		/* load nested vmcs to processor */
> +		if (vmptrld(vcpu, page_to_phys(vmcs_page))) {
>    

So, you're loading a guest page as the vmcs.  This is dangerous as the 
guest can play with it.  Much better to use inaccessible memory (and you 
do alloc_vmcs() earlier?)

> +
> +static int handle_vmptrst(struct kvm_vcpu *vcpu)
> +{
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vcpu->arch.regs[VCPU_REGS_RAX] = to_vmx(vcpu)->nested.l1_cur_vmcs;
>    

Should store to mem64 according to the docs?

Better done through the emulator.

> +void save_vmcs(struct shadow_vmcs *dst)
> +{
> +	dst->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
> +	dst->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
> +	dst->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
> +	dst->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
> +	dst->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
> +	dst->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
> +	dst->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
> +	dst->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
> +	dst->host_es_selector = vmcs_read16(HOST_ES_SELECTOR);
> +	dst->host_cs_selector = vmcs_read16(HOST_CS_SELECTOR);
> +	dst->host_ss_selector = vmcs_read16(HOST_SS_SELECTOR);
> +	dst->host_ds_selector = vmcs_read16(HOST_DS_SELECTOR);
> +	dst->host_fs_selector = vmcs_read16(HOST_FS_SELECTOR);
> +	dst->host_gs_selector = vmcs_read16(HOST_GS_SELECTOR);
> +	dst->host_tr_selector = vmcs_read16(HOST_TR_SELECTOR);
> +	dst->io_bitmap_a = vmcs_read64(IO_BITMAP_A);
> +	dst->io_bitmap_b = vmcs_read64(IO_BITMAP_B);
> +	if (cpu_has_vmx_msr_bitmap())
> +		dst->msr_bitmap = vmcs_read64(MSR_BITMAP);
> +
> +	dst->vm_exit_msr_store_addr = vmcs_read64(VM_EXIT_MSR_STORE_ADDR);
> +	dst->vm_exit_msr_load_addr = vmcs_read64(VM_EXIT_MSR_LOAD_ADDR);
> +	dst->vm_entry_msr_load_addr = vmcs_read64(VM_ENTRY_MSR_LOAD_ADDR);
> +	dst->tsc_offset = vmcs_read64(TSC_OFFSET);
> +	dst->virtual_apic_page_addr = vmcs_read64(VIRTUAL_APIC_PAGE_ADDR);
> +	dst->apic_access_addr = vmcs_read64(APIC_ACCESS_ADDR);
> +	if (enable_ept)
> +		dst->ept_pointer = vmcs_read64(EPT_POINTER);
> +
> +	dst->guest_physical_address = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
> +	dst->vmcs_link_pointer = vmcs_read64(VMCS_LINK_POINTER);
> +	dst->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
> +	if (vmcs_config.vmentry_ctrl&  VM_ENTRY_LOAD_IA32_PAT)
> +		dst->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
> +	if (enable_ept) {
> +		dst->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
> +		dst->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
> +		dst->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
> +		dst->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
> +	}
> +	dst->pin_based_vm_exec_control = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
> +	dst->cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
> +	dst->exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
> +	dst->page_fault_error_code_mask =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK);
> +	dst->page_fault_error_code_match =
> +		vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH);
> +	dst->cr3_target_count = vmcs_read32(CR3_TARGET_COUNT);
> +	dst->vm_exit_controls = vmcs_read32(VM_EXIT_CONTROLS);
> +	dst->vm_exit_msr_store_count = vmcs_read32(VM_EXIT_MSR_STORE_COUNT);
> +	dst->vm_exit_msr_load_count = vmcs_read32(VM_EXIT_MSR_LOAD_COUNT);
> +	dst->vm_entry_controls = vmcs_read32(VM_ENTRY_CONTROLS);
> +	dst->vm_entry_msr_load_count = vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT);
> +	dst->vm_entry_intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
> +	dst->vm_entry_exception_error_code =
> +		vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE);
> +	dst->vm_entry_instruction_len = vmcs_read32(VM_ENTRY_INSTRUCTION_LEN);
> +	dst->tpr_threshold = vmcs_read32(TPR_THRESHOLD);
> +	dst->secondary_vm_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
> +	if (enable_vpid&&  dst->secondary_vm_exec_control&
> +	    SECONDARY_EXEC_ENABLE_VPID)
> +		dst->virtual_processor_id = vmcs_read16(VIRTUAL_PROCESSOR_ID);
> +	dst->vm_instruction_error = vmcs_read32(VM_INSTRUCTION_ERROR);
> +	dst->vm_exit_reason  = vmcs_read32(VM_EXIT_REASON);
> +	dst->vm_exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
> +	dst->vm_exit_intr_error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
> +	dst->idt_vectoring_info_field = vmcs_read32(IDT_VECTORING_INFO_FIELD);
> +	dst->idt_vectoring_error_code = vmcs_read32(IDT_VECTORING_ERROR_CODE);
> +	dst->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> +	dst->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	dst->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
> +	dst->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
> +	dst->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
> +	dst->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
> +	dst->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
> +	dst->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
> +	dst->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
> +	dst->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
> +	dst->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
> +	dst->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
> +	dst->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
> +	dst->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
> +	dst->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
> +	dst->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
> +	dst->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
> +	dst->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
> +	dst->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
> +	dst->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
> +	dst->guest_interruptibility_info =
> +		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
> +	dst->guest_activity_state = vmcs_read32(GUEST_ACTIVITY_STATE);
> +	dst->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
> +	dst->host_ia32_sysenter_cs = vmcs_read32(HOST_IA32_SYSENTER_CS);
> +	dst->cr0_guest_host_mask = vmcs_readl(CR0_GUEST_HOST_MASK);
> +	dst->cr4_guest_host_mask = vmcs_readl(CR4_GUEST_HOST_MASK);
> +	dst->cr0_read_shadow = vmcs_readl(CR0_READ_SHADOW);
> +	dst->cr4_read_shadow = vmcs_readl(CR4_READ_SHADOW);
> +	dst->cr3_target_value0 = vmcs_readl(CR3_TARGET_VALUE0);
> +	dst->cr3_target_value1 = vmcs_readl(CR3_TARGET_VALUE1);
> +	dst->cr3_target_value2 = vmcs_readl(CR3_TARGET_VALUE2);
> +	dst->cr3_target_value3 = vmcs_readl(CR3_TARGET_VALUE3);
> +	dst->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	dst->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
> +	dst->guest_cr0 = vmcs_readl(GUEST_CR0);
> +	dst->guest_cr3 = vmcs_readl(GUEST_CR3);
> +	dst->guest_cr4 = vmcs_readl(GUEST_CR4);
> +	dst->guest_es_base = vmcs_readl(GUEST_ES_BASE);
> +	dst->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
> +	dst->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
> +	dst->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
> +	dst->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
> +	dst->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
> +	dst->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
> +	dst->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
> +	dst->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
> +	dst->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
> +	dst->guest_dr7 = vmcs_readl(GUEST_DR7);
> +	dst->guest_rsp = vmcs_readl(GUEST_RSP);
> +	dst->guest_rip = vmcs_readl(GUEST_RIP);
> +	dst->guest_rflags = vmcs_readl(GUEST_RFLAGS);
> +	dst->guest_pending_dbg_exceptions =
> +		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
> +	dst->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
> +	dst->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
> +	dst->host_cr0 = vmcs_readl(HOST_CR0);
> +	dst->host_cr3 = vmcs_readl(HOST_CR3);
> +	dst->host_cr4 = vmcs_readl(HOST_CR4);
> +	dst->host_fs_base = vmcs_readl(HOST_FS_BASE);
> +	dst->host_gs_base = vmcs_readl(HOST_GS_BASE);
> +	dst->host_tr_base = vmcs_readl(HOST_TR_BASE);
> +	dst->host_gdtr_base = vmcs_readl(HOST_GDTR_BASE);
> +	dst->host_idtr_base = vmcs_readl(HOST_IDTR_BASE);
> +	dst->host_ia32_sysenter_esp = vmcs_readl(HOST_IA32_SYSENTER_ESP);
> +	dst->host_ia32_sysenter_eip = vmcs_readl(HOST_IA32_SYSENTER_EIP);
> +	dst->host_rsp = vmcs_readl(HOST_RSP);
> +	dst->host_rip = vmcs_readl(HOST_RIP);
> +	if (vmcs_config.vmexit_ctrl&  VM_EXIT_LOAD_IA32_PAT)
> +		dst->host_ia32_pat = vmcs_read64(HOST_IA32_PAT);
> +}
>    

I see.  You're using the processor's format when reading the guest 
vmcs.  But we don't have to do that, we can use the shadow_vmcs 
structure (and a memcpy).


-- 
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

next prev parent reply	other threads:[~2009-09-02 20:04 UTC|newest]

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-09-02 15:38 Nested VMX support - kernel v1 oritw
2009-09-02 15:38 ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff oritw
2009-09-02 15:38   ` [PATCH 2/6] Nested VMX patch 2 implements vmclear oritw
2009-09-02 15:38     ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst oritw
2009-09-02 15:38       ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite oritw
2009-09-02 15:38         ` [PATCH 5/6] Nested VMX patch 5 implements vmlaunch and vmresume oritw
2009-09-02 21:38           ` Avi Kivity
2009-09-03 14:53             ` Orit Wasserman
2009-09-06  9:29               ` Avi Kivity
2009-09-06 13:38                 ` Orit Wasserman
2009-09-02 20:15         ` [PATCH 2/2] Nested VMX patch 4 implements vmread and vmwrite Avi Kivity
2009-09-03 14:26           ` Orit Wasserman
2009-09-02 20:05       ` Avi Kivity [this message]
2009-09-03 14:25         ` [PATCH 3/6] Nested VMX patch 3 implements vmptrld and vmptrst Orit Wasserman
2009-09-06  9:25           ` Avi Kivity
2009-09-06 13:36             ` Orit Wasserman
2009-09-06 13:52               ` Avi Kivity
2009-09-06 16:55                 ` Orit Wasserman
2009-09-06 19:10                   ` Avi Kivity
2009-09-02 19:38     ` [PATCH 2/6] Nested VMX patch 2 implements vmclear Avi Kivity
2009-09-03 13:54       ` Orit Wasserman
2009-09-02 19:34   ` [PATCH 1/6] Nested VMX patch 1 implements vmon and vmoff Avi Kivity
2009-09-03 12:34     ` Orit Wasserman
2009-09-03 13:39       ` Avi Kivity
2009-09-03 14:54         ` Orit Wasserman
2009-09-02 15:57 ` Nested VMX support - kernel v1 Alexander Graf
2009-09-03  6:01   ` Muli Ben-Yehuda
2009-09-03  7:29     ` Alexander Graf
2009-09-03  9:53       ` Muli Ben-Yehuda
2009-09-06 19:28         ` Anthony Liguori
2009-09-02 21:39 ` Avi Kivity

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4A9ECFF5.60701@redhat.com \
    --to=avi@redhat.com \
    --cc=abelg@il.ibm.com \
    --cc=aliguori@us.ibm.com \
    --cc=benami@il.ibm.com \
    --cc=kvm@vger.kernel.org \
    --cc=mmday@us.ibm.com \
    --cc=muli@il.ibm.com \
    --cc=oritw@il.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.