From: "Yang, Sheng"
Subject: [RFC][PATCH 5/7] KVM: VMX: Enable EPT feature
Date: Fri, 1 Feb 2008 16:24:51 +0800
Message-ID: <200802011624.52097.sheng.yang@intel.com>
To: Avi Kivity
Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
List-Id: kvm.vger.kernel.org

From 5062793a4aae25b8f701decbcf6dead9a4a38348 Mon Sep 17 00:00:00 2001
From: Sheng Yang
Date: Fri, 1 Feb 2008 06:51:01 +0800
Subject: [PATCH] KVM: VMX: Enable EPT feature

Since the EPT entry format differs from that of the ordinary page
tables, this patch sets up the EPT tables separately. With EPT enabled,
the hardware CR3 always points to the guest page table, and changing
CR3 no longer causes a vmexit. Real mode is supported through an
identity-mapped page table.

Signed-off-by: Sheng Yang
---
 arch/x86/kvm/vmx.c | 357 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 352 insertions(+), 5 deletions(-)
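As background, the guest-physical-address decomposition performed by
VMX_GET_EPTE_OFFSET in the diff below can be illustrated with a small
standalone program. This is a sketch for illustration only, not part of
the patch: it assumes a guest address width (gaw) field of 3, i.e. a
four-level table covering a 48-bit GPA, whereas the patch itself takes
gaw from kvm->arch.eptp.fields.gaw (VMX_EPT_DEFAULT_GAW, defined
elsewhere in this series).

/*
 * Standalone illustration (not part of the patch) of the per-level
 * index computation done by VMX_GET_EPTE_OFFSET: each table level
 * consumes EPT_PT_BITS (9) bits of the GPA above the 4K page offset.
 */
#include <stdio.h>
#include <stdint.h>

#define EPT_PT_BITS      9
#define EPT_ENT_PER_PAGE (1 << EPT_PT_BITS)   /* 512 entries per page */
#define EPT_PAGE_SHIFT   12

/* Equivalent to the patch's macro: shift, then mask 9 bits per level. */
static uint64_t epte_offset(uint64_t gpa, int level)
{
	return (gpa >> (EPT_PAGE_SHIFT + EPT_PT_BITS * level)) &
	       (EPT_ENT_PER_PAGE - 1);
}

int main(void)
{
	uint64_t gpa = 0x123456789000ULL;	/* arbitrary example GPA */
	int level;

	/* level 0 is the leaf, as in insert_ept_entry's walk below */
	for (level = 3; level >= 0; level--)
		printf("level %d index: %llu\n", level,
		       (unsigned long long)epte_offset(gpa, level));
	return 0;
}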
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5f3767a..f5b59e7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -40,7 +40,7 @@ module_param(bypass_guest_pf, bool, 0);
 static int enable_vpid = 1;
 module_param(enable_vpid, bool, 0);
 
-static int enable_ept = 0;
+static int enable_ept = 1;
 module_param(enable_ept, bool, 0);
 
 struct vmcs {
@@ -136,6 +136,15 @@ struct vmx_capability {
 		.ar_bytes = GUEST_##seg##_AR_BYTES, 	\
 	}
 
+#define EPT_PT_BITS 9
+#define EPT_ENT_PER_PAGE (1 << EPT_PT_BITS)
+#define EPT_PAGE_SHIFT 12
+/* level 0 is the leaf of the EPT table */
+#define VMX_GET_EPTE_OFFSET(gpa, level) \
+	(((gpa) & ((EPT_ENT_PER_PAGE - 1) << (EPT_PAGE_SHIFT + \
+		EPT_PT_BITS * (level)))) >> (EPT_PAGE_SHIFT + \
+		EPT_PT_BITS * (level)))
+
 static struct kvm_vmx_segment_field {
 	unsigned selector;
 	unsigned base;
@@ -294,6 +303,18 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 		: : "a"(&operand), "c"(ext) : "cc", "memory");
 }
 
+static inline void __invept(int ext, u64 eptp, gpa_t gpa)
+{
+	struct {
+		u64 eptp, gpa;
+	} operand = {eptp, gpa};
+
+	asm volatile (ASM_VMX_INVEPT
+			/* CF==1 or ZF==1 --> rc = -1 */
+			"; ja 1f ; ud2 ; 1:\n"
+			: : "a" (&operand), "c" (ext) : "cc", "memory");
+}
+
 static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -345,6 +366,34 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
 		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 }
 
+static inline void ept_sync_global(void)
+{
+	if (cpu_has_vmx_invept_global())
+		__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+}
+
+static inline void ept_sync_context(struct kvm *kvm)
+{
+	if (vm_need_ept()) {
+		if (cpu_has_vmx_invept_context())
+			__invept(VMX_EPT_EXTENT_CONTEXT,
+					kvm->arch.eptp.entry, 0);
+		else
+			ept_sync_global();
+	}
+}
+
+static inline void ept_sync_individual_addr(struct kvm *kvm, gpa_t gpa)
+{
+	if (vm_need_ept()) {
+		if (cpu_has_vmx_invept_individual_addr())
+			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
+					kvm->arch.eptp.entry, gpa);
+		else
+			ept_sync_context(kvm);
+	}
+}
+
 static unsigned long vmcs_readl(unsigned long field)
 {
 	unsigned long value;
@@ -432,6 +481,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		eb |= 1u << 1;
 	if (vcpu->arch.rmode.active)
 		eb = ~0;
+	if (vm_need_ept())
+		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
 
@@ -1355,8 +1406,64 @@ static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
 	vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
 }
 
+static void ept_new_cr3(struct kvm_vcpu *vcpu)
+{
+	if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+		if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
+			printk(KERN_ERR "EPT: Failed to load pdptrs!\n");
+			return;
+		}
+		vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]);
+		vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]);
+		vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]);
+		vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]);
+	}
+}
+
+static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+
+static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
+					unsigned long cr0,
+					struct kvm_vcpu *vcpu)
+{
+	if (!(cr0 & X86_CR0_PG)) {
+		/* From paging/starting to nonpaging */
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+			     vmcs_config.cpu_based_exec_ctrl |
+			     (CPU_BASED_CR3_LOAD_EXITING |
+			      CPU_BASED_CR3_STORE_EXITING));
+		vcpu->arch.cr0 = cr0;
+		vmx_set_cr4(vcpu, vcpu->arch.cr4);
+		*hw_cr0 |= X86_CR0_PE | X86_CR0_PG;
+		*hw_cr0 &= ~X86_CR0_WP;
+	} else if (!is_paging(vcpu)) {
+		/* From nonpaging to paging */
+		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
+			     vmcs_config.cpu_based_exec_ctrl &
+			     ~(CPU_BASED_CR3_LOAD_EXITING |
+			       CPU_BASED_CR3_STORE_EXITING));
+		vcpu->arch.cr0 = cr0;
+		vmx_set_cr4(vcpu, vcpu->arch.cr4);
+		if (!(vcpu->arch.cr0 & X86_CR0_WP))
+			*hw_cr0 &= ~X86_CR0_WP;
+	}
+}
+
+static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
+					struct kvm_vcpu *vcpu)
+{
+	if (!is_paging(vcpu)) {
+		*hw_cr4 &= ~X86_CR4_PAE;
+		*hw_cr4 |= X86_CR4_PSE;
+	} else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
+		*hw_cr4 &= ~X86_CR4_PAE;
+}
+
 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
+	unsigned long hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) |
+				KVM_VM_CR0_ALWAYS_ON;
+
 	vmx_fpu_deactivate(vcpu);
 
 	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
@@ -1374,9 +1481,11 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	}
 #endif
 
+	if (vm_need_ept())
+		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
+
 	vmcs_writel(CR0_READ_SHADOW, cr0);
-	vmcs_writel(GUEST_CR0,
-		    (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
+	vmcs_writel(GUEST_CR0, hw_cr0);
 	vcpu->arch.cr0 = cr0;
 
 	if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
@@ -1385,6 +1494,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	if (vm_need_ept())
+		ept_new_cr3(vcpu);
+
 	vmx_flush_tlb(vcpu);
 	vmcs_writel(GUEST_CR3, cr3);
 	if (vcpu->arch.cr0 & X86_CR0_PE)
@@ -1393,9 +1505,14 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
+	unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
+		KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+
+	if (vm_need_ept())
+		ept_update_paging_mode_cr4(&hw_cr4, vcpu);
+
 	vmcs_writel(CR4_READ_SHADOW, cr4);
-	vmcs_writel(GUEST_CR4, cr4 | (vcpu->arch.rmode.active ?
-		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
+	vmcs_writel(GUEST_CR4, hw_cr4);
 	vcpu->arch.cr4 = cr4;
 }
 
@@ -1857,6 +1974,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vpid_sync_vcpu_all(vmx);
 
+	ept_sync_context(vmx->vcpu.kvm);
+
 	return 0;
 
 out:
@@ -2016,6 +2135,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
+		/* EPT won't cause page faults */
+		if (vm_need_ept())
+			BUG();
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
@@ -2307,6 +2429,140 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int insert_ept_entry(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+	struct page *page = pfn_to_page(kvm->arch.eptp.fields.asr_mfn);
+	int level, rtn;
+	u64 offset;
+	u64 *table;
+	epte_t epte;
+	struct page *ept_page;
+
+	rtn = 0;
+	/* level 0 is the leaf */
+	mutex_lock(&kvm->arch.ept_mutex);
+	for (level = kvm->arch.eptp.fields.gaw; level > 0; level--) {
+		table = kmap_atomic(page, KM_USER0);
+		offset = VMX_GET_EPTE_OFFSET(gpa, level);
+		epte.entry = 0;
+		if (table[offset] == 0) {
+			ept_page = alloc_page(GFP_KERNEL |
+					__GFP_HIGHMEM | __GFP_ZERO);
+			if (!ept_page) {
+				kunmap_atomic(table, KM_USER0);
+				rtn = -ENOMEM;
+				break;
+			}
+			atomic_inc(&kvm->arch.ept_npages);
+			epte.fields.addr_mfn = page_to_pfn(ept_page);
+			epte.fields.r = epte.fields.w = epte.fields.x = 1;
+			table[offset] = epte.entry;
+		} else
+			epte.entry = (u64)table[offset];
+		kunmap_atomic(table, KM_USER0);
+		page = pfn_to_page(epte.fields.addr_mfn);
+	}
+	if (rtn < 0)
+		goto out;
+	/* dealing with the leaf */
+	table = kmap_atomic(page, KM_USER0);
+	offset = VMX_GET_EPTE_OFFSET(gpa, 0);
+	epte.entry = (u64)table[offset];
+	if (table[offset] == 0) {
+		epte.fields.addr_mfn = hpa >> PAGE_SHIFT;
+		epte.fields.emt = kvm->arch.eptp.fields.etmt;
+		epte.fields.r = epte.fields.w = epte.fields.x = 1;
+		table[offset] = epte.entry;
+	} else {
+		printk(KERN_INFO "EPT: GPA has already been mapped. "
+				"GPA: 0x%lx, HPA: 0x%lx\n",
+				(long unsigned int)gpa, (long unsigned int)hpa);
+		rtn = 1;
+	}
+	kunmap_atomic(table, KM_USER0);
+out:
+	mutex_unlock(&kvm->arch.ept_mutex);
+	return rtn;
+}
+
+static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	u64 exit_qualification;
+	enum emulation_result er;
+	gpa_t gpa;
+	hpa_t hpa;
+	unsigned long hva;
+	struct page *pages[1];
+	int npages, gla_validity;
+	int r;
+
+	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+	/*
+	 * 1. GPA exceeds GAW.
+	 * 2. RWX violation.
+	 */
+	if (exit_qualification & (1 << 6)) {
+		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
+		return -ENOTSUPP;
+	}
+	gla_validity = (exit_qualification >> 7) & 0x3;
+	if (gla_validity != 0x3 && gla_validity != 0x1) {
+		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
+		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+			(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+		printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+			(long unsigned int)exit_qualification);
+		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
+		kvm_run->hw.hardware_exit_reason = 0;
+		return -ENOTSUPP;
+	}
+
+	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+	hva = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (!kvm_is_error_hva(hva)) {
+		npages = get_user_pages(current, current->mm, hva, 1, 1, 0,
+					pages, NULL);
+		if (npages != 1) {
+			printk(KERN_ERR
+				"EPT: Error when dealing with hva 0x%lx\n",
+				hva);
+			return -ENOTSUPP;
+		}
+		atomic_inc(&vcpu->kvm->arch.guest_npages);
+		hpa = page_to_phys(pages[0]);
+		r = insert_ept_entry(vcpu->kvm, gpa & PAGE_MASK, hpa);
+		if (r != 0) {
+			/* More than one vcpu may fault on the same GPA at
+			 * the same time, so release the extra reference */
+			kvm_release_page_clean(pages[0]);
+			atomic_dec(&vcpu->kvm->arch.guest_npages);
+		}
+		if (r < 0) {
+			printk(KERN_ERR "EPT: Not enough memory!\n");
+			return -ENOMEM;
+		}
+		return 1;
+	} else {
+		/* must be MMIO */
+		er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
+
+		if (er == EMULATE_FAIL) {
+			printk(KERN_ERR
+				"EPT: Failed to handle EPT violation vmexit! er is %d\n",
+				er);
+			printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
+				(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
+				(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+			printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
+				(long unsigned int)exit_qualification);
+			return -ENOTSUPP;
+		} else if (er == EMULATE_DO_MMIO)
+			return 0;
	}
+	return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2329,6 +2585,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]         = handle_apic_access,
 	[EXIT_REASON_WBINVD]              = handle_wbinvd,
+	[EXIT_REASON_EPT_VIOLATION]       = handle_ept_violation,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -2597,6 +2854,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 #endif
 	      );
 
+	/* Accessing CR3 doesn't cause a VMExit in paging mode, so we
+	 * need to sync with the guest's real CR3.
+	 */
+	if (vm_need_ept() && is_paging(vcpu)) {
+		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+		ept_new_cr3(vcpu);
+	}
+
 	vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
@@ -2625,6 +2889,59 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
 	}
 }
 
+static int ept_teardown(struct kvm *kvm)
+{
+	const int entries_per_page = PAGE_SIZE / sizeof(u64);
+	int level, i;
+	hpa_t path[VMX_EPT_MAX_GAW];
+	epte_t *table;
+	eptp_t eptp;
+	struct page *page, *ept_page;
+
+	mutex_lock(&kvm->arch.ept_mutex);
+	eptp = kvm->arch.eptp;
+	level = eptp.fields.gaw;
+	path[level] = eptp.fields.asr_mfn;
+	while (level <= eptp.fields.gaw) {
+		ept_page = pfn_to_page(path[level]);
+		table = kmap_atomic(ept_page, KM_USER0);
+		for (i = 0; i < entries_per_page; i++) {
+			if (table[i].entry == 0)
+				continue;
+			if (level == 0) {
+				page = pfn_to_page(table[i].fields.addr_mfn);
+				kvm_release_page_clean(page);
+				atomic_dec(&kvm->arch.guest_npages);
+				table[i].entry = 0;
+			} else {
+				level--;
+				path[level] = table[i].fields.addr_mfn;
+				table[i].entry = 0;
+				break;
+			}
+		}
+		kunmap_atomic(table, KM_USER0);
+		if (i != entries_per_page)
+			continue;
+		__free_page(ept_page);
+		atomic_dec(&kvm->arch.ept_npages);
+		level++;
+	}
+	kvm->arch.eptp.entry = 0;
+	mutex_unlock(&kvm->arch.ept_mutex);
+
+	if (atomic_read(&kvm->arch.ept_npages) ||
+			atomic_read(&kvm->arch.guest_npages))
+		printk(KERN_ERR "EPT: Failed to tear down EPT table! "
+				"%d ept pages and %d guest pages remain\n",
+				atomic_read(&kvm->arch.ept_npages),
+				atomic_read(&kvm->arch.guest_npages));
+	else
+		printk(KERN_INFO "EPT: EPT table torn down successfully\n");
+
+	return 0;
+}
+
 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2633,6 +2950,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	if (vmx->vpid != 0)
 		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
+	if (vcpu->vcpu_id == 0 && vm_need_ept())
+		ept_teardown(vcpu->kvm);
 	vmx_free_vmcs(vcpu);
 	kfree(vmx->host_msrs);
 	kfree(vmx->guest_msrs);
@@ -2640,6 +2959,26 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
+static int ept_eptp_init(struct kvm *kvm)
+{
+	struct page *root_page;
+
+	root_page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
+	if (!root_page)
+		return -ENOMEM;
+
+	mutex_init(&kvm->arch.ept_mutex);
+
+	/* TODO: write the value read from the MSR */
+	kvm->arch.eptp.fields.etmt = VMX_EPT_DEFAULT_MT;
+	kvm->arch.eptp.fields.gaw = VMX_EPT_DEFAULT_GAW;
+	kvm->arch.eptp.fields.asr_mfn = page_to_pfn(root_page);
+	atomic_set(&kvm->arch.guest_npages, 0);
+	atomic_set(&kvm->arch.ept_npages, 1);
+
+	return 0;
+}
+
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
@@ -2650,6 +2989,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		return ERR_PTR(-ENOMEM);
 
 	allocate_vpid(vmx);
+	if (id == 0 && vm_need_ept())
+		if (ept_eptp_init(kvm) < 0)
+			return ERR_PTR(-ENOMEM);
 
 	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
 	if (err)
@@ -2802,9 +3144,14 @@ static int __init vmx_init(void)
 	if (r)
 		goto out1;
 
+	if (cpu_has_vmx_ept())
+		bypass_guest_pf = 0;
+
 	if (bypass_guest_pf)
 		kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
 
+	ept_sync_global();
+
 	return 0;
 
 out1:
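For reference, the exit-qualification tests in handle_ept_violation
above can be read back as a small decoder. The following is an
illustrative sketch, not kernel code: the bit positions (bit 6 for a
GPA beyond the guest address width, bits 7:8 for the guest-linear-
address validity code, of which the handler accepts only 0x1 and 0x3)
are taken from the handler itself; consult the Intel SDM for the
authoritative encoding.

/*
 * Illustrative decoder (not part of the patch) for the
 * exit-qualification bits tested by handle_ept_violation above.
 */
#include <stdio.h>
#include <stdint.h>

static int ept_violation_handleable(uint64_t exit_qualification)
{
	int gla_validity;

	if (exit_qualification & (1ULL << 6))
		return 0;			/* GPA exceeds GAW */
	gla_validity = (exit_qualification >> 7) & 0x3;
	/* the handler only proceeds for validity codes 0x1 and 0x3 */
	return gla_validity == 0x1 || gla_validity == 0x3;
}

int main(void)
{
	/* e.g. a read violation with a valid guest linear address */
	uint64_t q = (0x3ULL << 7) | (1ULL << 0);

	printf("handleable: %d\n", ept_violation_handleable(q));
	return 0;
}

Note also that this patch flips the default of the enable_ept module
parameter (top of the diff) to 1; since it remains a module_param, it
can still be cleared at module load time to fall back to shadow paging.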
-- 
debian.1.5.3.7.1-dirty