From mboxrd@z Thu Jan 1 00:00:00 1970 From: Christian Ehrhardt Subject: [PATCH 4/5] Architecture independence layer - v1 - split_generic_x86_x86.c Date: Fri, 24 Aug 2007 13:53:18 +0200 Message-ID: <46CEC6AE.2060508@linux.vnet.ibm.com> Mime-Version: 1.0 Content-Type: text/plain; charset="iso-8859-1" Content-Transfer-Encoding: quoted-printable To: "kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org" Return-path: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Sender: kvm-devel-bounces-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org Errors-To: kvm-devel-bounces-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org List-Id: kvm.vger.kernel.org from Christian Ehrhardt Contains the insertion of the x86 arch code to x86.[ch] and some minor = changes adopting this x86.c arch implementations to the new interface. Signed-off-by: Christian Ehrhardt --- x86.c | 1851 = ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ x86.h | 63 ++ 2 files changed, 1914 insertions(+) diff --git a/drivers/kvm/x86.c b/drivers/kvm/x86.c index 5c72fee..8660414 100644 --- a/drivers/kvm/x86.c +++ b/drivers/kvm/x86.c @@ -49,3 +49,1854 @@ MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); = +struct kvm_x86_ops *kvm_x86_ops; +struct kmem_cache *kvm_vcpu_cache; +EXPORT_SYMBOL_GPL(kvm_vcpu_cache); + +#define MAX_IO_MSRS 256 + +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) +#define EFER_RESERVED_BITS 0xfffffffffffff2fe + +#ifdef CONFIG_X86_64 +// LDT or TSS descriptor in the GDT. 16 bytes. 
+struct segment_descriptor_64 { + struct segment_descriptor s; + u32 base_higher; + u32 pad_zero; +}; + +#endif + +unsigned long segment_base(u16 selector) +{ + struct descriptor_table gdt; + struct segment_descriptor *d; + unsigned long table_base; + typedef unsigned long ul; + unsigned long v; + + if (selector =3D=3D 0) + return 0; + + asm ("sgdt %0" : "=3Dm"(gdt)); + table_base =3D gdt.base; + + if (selector & 4) { /* from ldt */ + u16 ldt_selector; + + asm ("sldt %0" : "=3Dg"(ldt_selector)); + table_base =3D segment_base(ldt_selector); + } + d =3D (struct segment_descriptor *)(table_base + (selector & ~7)); + v =3D d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24); +#ifdef CONFIG_X86_64 + if (d->system =3D=3D 0 + && (d->type =3D=3D 2 || d->type =3D=3D 9 || d->type =3D=3D 11)) + v |=3D ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 3= 2; +#endif + return v; +} +EXPORT_SYMBOL_GPL(segment_base); + +void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded =3D 1; + fx_save(&vcpu->host_fx_image); + fx_restore(&vcpu->guest_fx_image); +} +EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); + +void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) +{ + if (!vcpu->guest_fpu_loaded) + return; + + vcpu->guest_fpu_loaded =3D 0; + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); +} +EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); + +static void ack_flush(void *_completed) +{ + atomic_t *completed =3D _completed; + + atomic_inc(completed); +} + +void kvm_flush_remote_tlbs(struct kvm *kvm) +{ + int i, cpu, needed; + cpumask_t cpus; + struct kvm_vcpu *vcpu; + atomic_t completed; + + atomic_set(&completed, 0); + cpus_clear(cpus); + needed =3D 0; + for (i =3D 0; i < KVM_MAX_VCPUS; ++i) { + vcpu =3D kvm->vcpus[i]; + if (!vcpu) + continue; + if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests)) + continue; + cpu =3D vcpu->cpu; + if (cpu !=3D -1 && cpu !=3D raw_smp_processor_id()) + if 
(!cpu_isset(cpu, cpus)) { + cpu_set(cpu, cpus); + ++needed; + } + } + + /* + * We really want smp_call_function_mask() here. But that's not + * available, so ipi all cpus in parallel and wait for them + * to complete. + */ + for (cpu =3D first_cpu(cpus); cpu !=3D NR_CPUS; cpu =3D next_cpu(cpu, = cpus)) + smp_call_function_single(cpu, ack_flush, &completed, 1, 0); + while (atomic_read(&completed) !=3D needed) { + cpu_relax(); + barrier(); + } +} + +int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) +{ + struct page *page; + int r; + + mutex_init(&vcpu->mutex); + vcpu->cpu =3D -1; + vcpu->mmu.root_hpa =3D INVALID_PAGE; + vcpu->kvm =3D kvm; + vcpu->vcpu_id =3D id; + + page =3D alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r =3D -ENOMEM; + goto fail; + } + vcpu->run =3D page_address(page); + + page =3D alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) { + r =3D -ENOMEM; + goto fail_free_run; + } + vcpu->pio_data =3D page_address(page); + + r =3D kvm_mmu_create(vcpu); + if (r < 0) + goto fail_free_pio_data; + + return 0; + +fail_free_pio_data: + free_page((unsigned long)vcpu->pio_data); +fail_free_run: + free_page((unsigned long)vcpu->run); +fail: + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(kvm_vcpu_init); + +void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvm_mmu_destroy(vcpu); + free_page((unsigned long)vcpu->pio_data); + free_page((unsigned long)vcpu->run); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); + +void kvm_arch_inject_gp(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->inject_gp(vcpu, 0); +} + +/* + * Load the pae pdptrs. Return true if they are all valid. 
+ */ +static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + gfn_t pdpt_gfn =3D cr3 >> PAGE_SHIFT; + unsigned offset =3D ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; + int i; + u64 *pdpt; + int ret; + struct page *page; + u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)]; + + mutex_lock(&vcpu->kvm->lock); + page =3D gfn_to_page(vcpu->kvm, pdpt_gfn); + if (!page) { + ret =3D 0; + goto out; + } + + pdpt =3D kmap_atomic(page, KM_USER0); + memcpy(pdpte, pdpt+offset, sizeof(pdpte)); + kunmap_atomic(pdpt, KM_USER0); + + for (i =3D 0; i < ARRAY_SIZE(pdpte); ++i) { + if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) { + ret =3D 0; + goto out; + } + } + ret =3D 1; + + memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs)); +out: + mutex_unlock(&vcpu->kvm->lock); + + return ret; +} + +void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +{ + if (cr0 & CR0_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", + cr0, vcpu->cr0); + kvm_arch_inject_gp(vcpu); + return; + } + + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { + printk(KERN_DEBUG "set_cr0: #GP, CD =3D=3D 0 && NW =3D=3D 1\n"); + kvm_arch_inject_gp(vcpu); + return; + } + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { + printk(KERN_DEBUG "set_cr0: #GP, set PG flag " + "and a clear PE flag\n"); + kvm_arch_inject_gp(vcpu); + return; + } + + if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { +#ifdef CONFIG_X86_64 + if ((vcpu->shadow_efer & EFER_LME)) { + int cs_db, cs_l; + + if (!is_pae(vcpu)) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while PAE is disabled\n"); + kvm_arch_inject_gp(vcpu); + return; + } + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + if (cs_l) { + printk(KERN_DEBUG "set_cr0: #GP, start paging " + "in long mode while CS.L =3D=3D 1\n"); + kvm_arch_inject_gp(vcpu); + return; + + } + } else +#endif + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) { + printk(KERN_DEBUG "set_cr0: #GP, pdptrs " + "reserved bits\n"); + kvm_arch_inject_gp(vcpu); + 
return; + } + + } + + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->cr0 =3D cr0; + + mutex_lock(&vcpu->kvm->lock); + kvm_mmu_reset_context(vcpu); + mutex_unlock(&vcpu->kvm->lock); + return; +} +EXPORT_SYMBOL_GPL(set_cr0); + +void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +{ + set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f)); +} +EXPORT_SYMBOL_GPL(lmsw); + +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + if (cr4 & CR4_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); + kvm_arch_inject_gp(vcpu); + return; + } + + if (is_long_mode(vcpu)) { + if (!(cr4 & X86_CR4_PAE)) { + printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " + "in long mode\n"); + kvm_arch_inject_gp(vcpu); + return; + } + } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE) + && !load_pdptrs(vcpu, vcpu->cr3)) { + printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); + kvm_arch_inject_gp(vcpu); + return; + } + + if (cr4 & X86_CR4_VMXE) { + printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); + kvm_arch_inject_gp(vcpu); + return; + } + kvm_x86_ops->set_cr4(vcpu, cr4); + mutex_lock(&vcpu->kvm->lock); + kvm_mmu_reset_context(vcpu); + mutex_unlock(&vcpu->kvm->lock); +} +EXPORT_SYMBOL_GPL(set_cr4); + +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + if (is_long_mode(vcpu)) { + if (cr3 & CR3_L_MODE_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); + kvm_arch_inject_gp(vcpu); + return; + } + } else { + if (is_pae(vcpu)) { + if (cr3 & CR3_PAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + kvm_arch_inject_gp(vcpu); + return; + } + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { + printk(KERN_DEBUG "set_cr3: #GP, pdptrs " + "reserved bits\n"); + kvm_arch_inject_gp(vcpu); + return; + } + } else { + if (cr3 & CR3_NONPAE_RESERVED_BITS) { + printk(KERN_DEBUG + "set_cr3: #GP, reserved bits\n"); + kvm_arch_inject_gp(vcpu); + return; + } + } + } + + mutex_lock(&vcpu->kvm->lock); + /* + * Does the new cr3 
value map to physical memory? (Note, we + * catch an invalid cr3 even in real-mode, because it would + * cause trouble later on when we turn on paging anyway.) + * + * A real CPU would silently accept an invalid cr3 and would + * attempt to use it - with largely undefined (and often hard + * to debug) behavior on the guest side. + */ + if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) + kvm_arch_inject_gp(vcpu); + else { + vcpu->cr3 =3D cr3; + vcpu->mmu.new_cr3(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); +} +EXPORT_SYMBOL_GPL(set_cr3); + +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +{ + if (cr8 & CR8_RESERVED_BITS) { + printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); + kvm_arch_inject_gp(vcpu); + return; + } + vcpu->cr8 =3D cr8; +} +EXPORT_SYMBOL_GPL(set_cr8); + +void fx_init(struct kvm_vcpu *vcpu) +{ + unsigned after_mxcsr_mask; + + /* Initialize guest FPU by resetting ours and saving into guest's */ + preempt_disable(); + fx_save(&vcpu->host_fx_image); + fpu_init(); + fx_save(&vcpu->guest_fx_image); + fx_restore(&vcpu->host_fx_image); + preempt_enable(); + + after_mxcsr_mask =3D offsetof(struct i387_fxsave_struct, st_space); + vcpu->guest_fx_image.mxcsr =3D 0x1f80; + memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask, + 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); +} +EXPORT_SYMBOL_GPL(fx_init); + +static gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + + for (i =3D 0; i < kvm->naliases; ++i) { + alias =3D &kvm->aliases[i]; + if (gfn >=3D alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; +} + +static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + int i; + + for (i =3D 0; i < kvm->nmemslots; ++i) { + struct kvm_memory_slot *memslot =3D &kvm->memslots[i]; + + if (gfn >=3D memslot->base_gfn + && gfn < memslot->base_gfn + memslot->npages) + return memslot; + } + 
return NULL; +} + +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) +{ + gfn =3D unalias_gfn(kvm, gfn); + return __gfn_to_memslot(kvm, gfn); +} + +struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + gfn =3D unalias_gfn(kvm, gfn); + slot =3D __gfn_to_memslot(kvm, gfn); + if (!slot) + return NULL; + return slot->phys_mem[gfn - slot->base_gfn]; +} +EXPORT_SYMBOL_GPL(gfn_to_page); + +/* WARNING: Does not work on aliased pages. */ +void mark_page_dirty(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *memslot; + + memslot =3D __gfn_to_memslot(kvm, gfn); + if (memslot && memslot->dirty_bitmap) { + unsigned long rel_gfn =3D gfn - memslot->base_gfn; + + /* avoid RMW */ + if (!test_bit(rel_gfn, memslot->dirty_bitmap)) + set_bit(rel_gfn, memslot->dirty_bitmap); + } +} + +int emulator_read_std(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + void *data =3D val; + + while (bytes) { + gpa_t gpa =3D vcpu->mmu.gva_to_gpa(vcpu, addr); + unsigned offset =3D addr & (PAGE_SIZE-1); + unsigned tocopy =3D min(bytes, (unsigned)PAGE_SIZE - offset); + unsigned long pfn; + struct page *page; + void *page_virt; + + if (gpa =3D=3D UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + pfn =3D gpa >> PAGE_SHIFT; + page =3D gfn_to_page(vcpu->kvm, pfn); + if (!page) + return X86EMUL_UNHANDLEABLE; + page_virt =3D kmap_atomic(page, KM_USER0); + + memcpy(data, page_virt + offset, tocopy); + + kunmap_atomic(page_virt, KM_USER0); + + bytes -=3D tocopy; + data +=3D tocopy; + addr +=3D tocopy; + } + + return X86EMUL_CONTINUE; +} +EXPORT_SYMBOL_GPL(emulator_read_std); + +static int emulator_write_std(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes); + return X86EMUL_UNHANDLEABLE; +} + +static int emulator_read_emulated(unsigned long addr, + void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ 
+ struct kvm_io_device *mmio_dev; + gpa_t gpa; + + if (vcpu->mmio_read_completed) { + memcpy(val, vcpu->mmio_data, bytes); + vcpu->mmio_read_completed =3D 0; + return X86EMUL_CONTINUE; + } else if (emulator_read_std(addr, val, bytes, vcpu) + =3D=3D X86EMUL_CONTINUE) + return X86EMUL_CONTINUE; + + gpa =3D vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa =3D=3D UNMAPPED_GVA) + return X86EMUL_PROPAGATE_FAULT; + + /* + * Is this MMIO handled locally? + */ + mmio_dev =3D vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_read(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; + } + + vcpu->mmio_needed =3D 1; + vcpu->mmio_phys_addr =3D gpa; + vcpu->mmio_size =3D bytes; + vcpu->mmio_is_write =3D 0; + + return X86EMUL_UNHANDLEABLE; +} + +static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) +{ + struct page *page; + void *virt; + + if (((gpa + bytes - 1) >> PAGE_SHIFT) !=3D (gpa >> PAGE_SHIFT)) + return 0; + page =3D gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + if (!page) + return 0; + mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); + virt =3D kmap_atomic(page, KM_USER0); + kvm_mmu_pte_write(vcpu, gpa, val, bytes); + memcpy(virt + offset_in_page(gpa), val, bytes); + kunmap_atomic(virt, KM_USER0); + return 1; +} + +static int emulator_write_emulated_onepage(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + struct kvm_io_device *mmio_dev; + gpa_t gpa =3D vcpu->mmu.gva_to_gpa(vcpu, addr); + + if (gpa =3D=3D UNMAPPED_GVA) { + kvm_x86_ops->inject_page_fault(vcpu, addr, 2); + return X86EMUL_PROPAGATE_FAULT; + } + + if (emulator_write_phys(vcpu, gpa, val, bytes)) + return X86EMUL_CONTINUE; + + /* + * Is this MMIO handled locally? 
+ */ + mmio_dev =3D vcpu_find_mmio_dev(vcpu, gpa); + if (mmio_dev) { + kvm_iodevice_write(mmio_dev, gpa, bytes, val); + return X86EMUL_CONTINUE; + } + + vcpu->mmio_needed =3D 1; + vcpu->mmio_phys_addr =3D gpa; + vcpu->mmio_size =3D bytes; + vcpu->mmio_is_write =3D 1; + memcpy(vcpu->mmio_data, val, bytes); + + return X86EMUL_CONTINUE; +} + +int emulator_write_emulated(unsigned long addr, + const void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + /* Crossing a page boundary? */ + if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { + int rc, now; + + now =3D -addr & ~PAGE_MASK; + rc =3D emulator_write_emulated_onepage(addr, val, now, vcpu); + if (rc !=3D X86EMUL_CONTINUE) + return rc; + addr +=3D now; + val +=3D now; + bytes -=3D now; + } + return emulator_write_emulated_onepage(addr, val, bytes, vcpu); +} +EXPORT_SYMBOL_GPL(emulator_write_emulated); + +static int emulator_cmpxchg_emulated(unsigned long addr, + const void *old, + const void *new, + unsigned int bytes, + struct kvm_vcpu *vcpu) +{ + static int reported; + + if (!reported) { + reported =3D 1; + printk(KERN_WARNING "kvm: emulating exchange as write\n"); + } + return emulator_write_emulated(addr, new, bytes, vcpu); +} + +static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + return kvm_x86_ops->get_segment_base(vcpu, seg); +} + +int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +{ + return X86EMUL_CONTINUE; +} + +int emulate_clts(struct kvm_vcpu *vcpu) +{ + unsigned long cr0; + + cr0 =3D vcpu->cr0 & ~X86_CR0_TS; + kvm_x86_ops->set_cr0(vcpu, cr0); + return X86EMUL_CONTINUE; +} + +int emulator_get_dr(struct x86_emulate_ctxt* ctxt, int dr, unsigned = long *dest) +{ + struct kvm_vcpu *vcpu =3D ctxt->vcpu; + + switch (dr) { + case 0 ... 
3: + *dest =3D kvm_x86_ops->get_dr(vcpu, dr); + return X86EMUL_CONTINUE; + default: + pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); + return X86EMUL_UNHANDLEABLE; + } +} + +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned = long value) +{ + unsigned long mask =3D (ctxt->mode =3D=3D X86EMUL_MODE_PROT64) ? ~0ULL= : ~0U; + int exception; + + kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); + if (exception) { + /* FIXME: better handling */ + return X86EMUL_UNHANDLEABLE; + } + return X86EMUL_CONTINUE; +} + +static void report_emulation_failure(struct x86_emulate_ctxt *ctxt) +{ + static int reported; + u8 opcodes[4]; + unsigned long rip =3D ctxt->vcpu->rip; + unsigned long rip_linear; + + rip_linear =3D rip + get_segment_base(ctxt->vcpu, VCPU_SREG_CS); + + if (reported) + return; + + emulator_read_std(rip_linear, (void *)opcodes, 4, ctxt->vcpu); + + printk(KERN_ERR "emulation failed but !mmio_needed?" + " rip %lx %02x %02x %02x %02x\n", + rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); + reported =3D 1; +} + +struct x86_emulate_ops emulate_ops =3D { + .read_std =3D emulator_read_std, + .write_std =3D emulator_write_std, + .read_emulated =3D emulator_read_emulated, + .write_emulated =3D emulator_write_emulated, + .cmpxchg_emulated =3D emulator_cmpxchg_emulated, +}; + +int emulate_instruction(struct kvm_vcpu *vcpu, + struct kvm_run *run, + unsigned long cr2, + u16 error_code) +{ + struct x86_emulate_ctxt emulate_ctxt; + int r; + int cs_db, cs_l; + + vcpu->mmio_fault_cr2 =3D cr2; + kvm_x86_ops->cache_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + emulate_ctxt.vcpu =3D vcpu; + emulate_ctxt.eflags =3D kvm_x86_ops->get_rflags(vcpu); + emulate_ctxt.cr2 =3D cr2; + emulate_ctxt.mode =3D (emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + + if (emulate_ctxt.mode =3D=3D X86EMUL_MODE_PROT64) { + emulate_ctxt.cs_base =3D 0; + emulate_ctxt.ds_base =3D 0; + emulate_ctxt.es_base =3D 0; + emulate_ctxt.ss_base =3D 0; + } else { + emulate_ctxt.cs_base =3D get_segment_base(vcpu, VCPU_SREG_CS); + emulate_ctxt.ds_base =3D get_segment_base(vcpu, VCPU_SREG_DS); + emulate_ctxt.es_base =3D get_segment_base(vcpu, VCPU_SREG_ES); + emulate_ctxt.ss_base =3D get_segment_base(vcpu, VCPU_SREG_SS); + } + + emulate_ctxt.gs_base =3D get_segment_base(vcpu, VCPU_SREG_GS); + emulate_ctxt.fs_base =3D get_segment_base(vcpu, VCPU_SREG_FS); + + vcpu->mmio_is_write =3D 0; + vcpu->pio.string =3D 0; + r =3D x86_emulate_memop(&emulate_ctxt, &emulate_ops); + if (vcpu->pio.string) + return EMULATE_DO_MMIO; + + if ((r || vcpu->mmio_is_write) && run) { + run->exit_reason =3D KVM_EXIT_MMIO; + run->mmio.phys_addr =3D vcpu->mmio_phys_addr; + memcpy(run->mmio.data, vcpu->mmio_data, 8); + run->mmio.len =3D vcpu->mmio_size; + run->mmio.is_write =3D vcpu->mmio_is_write; + } + + if (r) { + if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + return EMULATE_DONE; + if (!vcpu->mmio_needed) { + report_emulation_failure(&emulate_ctxt); + return EMULATE_FAIL; + } + return EMULATE_DO_MMIO; + } + + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->set_rflags(vcpu, emulate_ctxt.eflags); + + if (vcpu->mmio_is_write) { + vcpu->mmio_needed =3D 0; + return EMULATE_DO_MMIO; + } + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(emulate_instruction); + +int kvm_emulate_halt(struct kvm_vcpu *vcpu) +{ + if (vcpu->irq_summary) + return 1; + + vcpu->run->exit_reason =3D KVM_EXIT_HLT; + ++vcpu->stat.halt_exits; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_emulate_halt); + +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long nr, a0, a1, a2, a3, a4, a5, ret; + + kvm_x86_ops->cache_regs(vcpu); + ret =3D -KVM_EINVAL; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + nr =3D vcpu->regs[VCPU_REGS_RAX]; + a0 =3D 
vcpu->regs[VCPU_REGS_RDI]; + a1 =3D vcpu->regs[VCPU_REGS_RSI]; + a2 =3D vcpu->regs[VCPU_REGS_RDX]; + a3 =3D vcpu->regs[VCPU_REGS_RCX]; + a4 =3D vcpu->regs[VCPU_REGS_R8]; + a5 =3D vcpu->regs[VCPU_REGS_R9]; + } else +#endif + { + nr =3D vcpu->regs[VCPU_REGS_RBX] & -1u; + a0 =3D vcpu->regs[VCPU_REGS_RAX] & -1u; + a1 =3D vcpu->regs[VCPU_REGS_RCX] & -1u; + a2 =3D vcpu->regs[VCPU_REGS_RDX] & -1u; + a3 =3D vcpu->regs[VCPU_REGS_RSI] & -1u; + a4 =3D vcpu->regs[VCPU_REGS_RDI] & -1u; + a5 =3D vcpu->regs[VCPU_REGS_RBP] & -1u; + } + switch (nr) { + default: + run->hypercall.nr =3D nr; + run->hypercall.args[0] =3D a0; + run->hypercall.args[1] =3D a1; + run->hypercall.args[2] =3D a2; + run->hypercall.args[3] =3D a3; + run->hypercall.args[4] =3D a4; + run->hypercall.args[5] =3D a5; + run->hypercall.ret =3D ret; + run->hypercall.longmode =3D is_long_mode(vcpu); + kvm_x86_ops->decache_regs(vcpu); + return 0; + } + vcpu->regs[VCPU_REGS_RAX] =3D ret; + kvm_x86_ops->decache_regs(vcpu); + return 1; +} +EXPORT_SYMBOL_GPL(kvm_hypercall); + +static u64 mk_cr_64(u64 curr_cr, u32 new_val) +{ + return (curr_cr & ~((1ULL << 32) - 1)) | new_val; +} + +void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt =3D { limit, base }; + + kvm_x86_ops->set_gdt(vcpu, &dt); +} + +void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) +{ + struct descriptor_table dt =3D { limit, base }; + + kvm_x86_ops->set_idt(vcpu, &dt); +} + +void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, + unsigned long *rflags) +{ + lmsw(vcpu, msw); + *rflags =3D kvm_x86_ops->get_rflags(vcpu); +} + +unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) +{ + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + switch (cr) { + case 0: + return vcpu->cr0; + case 2: + return vcpu->cr2; + case 3: + return vcpu->cr3; + case 4: + return vcpu->cr4; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + return 0; + } +} + +void 
realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, + unsigned long *rflags) +{ + switch (cr) { + case 0: + set_cr0(vcpu, mk_cr_64(vcpu->cr0, val)); + *rflags =3D kvm_x86_ops->get_rflags(vcpu); + break; + case 2: + vcpu->cr2 =3D val; + break; + case 3: + set_cr3(vcpu, val); + break; + case 4: + set_cr4(vcpu, mk_cr_64(vcpu->cr4, val)); + break; + default: + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + } +} + +/* + * Register the para guest with the host: + */ +static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) +{ + struct kvm_vcpu_para_state *para_state; + hpa_t para_state_hpa, hypercall_hpa; + struct page *para_state_page; + unsigned char *hypercall; + gpa_t hypercall_gpa; + + printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); + printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); + + /* + * Needs to be page aligned: + */ + if (para_state_gpa !=3D PAGE_ALIGN(para_state_gpa)) + goto err_gp; + + para_state_hpa =3D gpa_to_hpa(vcpu, para_state_gpa); + printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa); + if (is_error_hpa(para_state_hpa)) + goto err_gp; + + mark_page_dirty(vcpu->kvm, para_state_gpa >> PAGE_SHIFT); + para_state_page =3D pfn_to_page(para_state_hpa >> PAGE_SHIFT); + para_state =3D kmap(para_state_page); + + printk(KERN_DEBUG ".... guest version: %d\n", = para_state->guest_version); + printk(KERN_DEBUG ".... size: %d\n", para_state->size); + + para_state->host_version =3D KVM_PARA_API_VERSION; + /* + * We cannot support guests that try to register themselves + * with a newer API version than the host supports: + */ + if (para_state->guest_version > KVM_PARA_API_VERSION) { + para_state->ret =3D -KVM_EINVAL; + goto err_kunmap_skip; + } + + hypercall_gpa =3D para_state->hypercall_gpa; + hypercall_hpa =3D gpa_to_hpa(vcpu, hypercall_gpa); + printk(KERN_DEBUG ".... 
hypercall_hpa: %08Lx\n", hypercall_hpa); + if (is_error_hpa(hypercall_hpa)) { + para_state->ret =3D -KVM_EINVAL; + goto err_kunmap_skip; + } + + printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); + vcpu->para_state_page =3D para_state_page; + vcpu->para_state_gpa =3D para_state_gpa; + vcpu->hypercall_gpa =3D hypercall_gpa; + + mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); + hypercall =3D kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), + KM_USER1) + (hypercall_hpa & ~PAGE_MASK); + kvm_x86_ops->patch_hypercall(vcpu, hypercall); + kunmap_atomic(hypercall, KM_USER1); + + para_state->ret =3D 0; +err_kunmap_skip: + kunmap(para_state_page); + return 0; +err_gp: + return 1; +} + +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data; + + switch (msr) { + case 0xc0010010: /* SYSCFG */ + case 0xc0010015: /* HWCR */ + case MSR_IA32_PLATFORM_ID: + case MSR_IA32_P5_MC_ADDR: + case MSR_IA32_P5_MC_TYPE: + case MSR_IA32_MC0_CTL: + case MSR_IA32_MCG_STATUS: + case MSR_IA32_MCG_CAP: + case MSR_IA32_MC0_MISC: + case MSR_IA32_MC0_MISC+4: + case MSR_IA32_MC0_MISC+8: + case MSR_IA32_MC0_MISC+12: + case MSR_IA32_MC0_MISC+16: + case MSR_IA32_UCODE_REV: + case MSR_IA32_PERF_STATUS: + case MSR_IA32_EBL_CR_POWERON: + /* MTRR registers */ + case 0xfe: + case 0x200 ... 0x2ff: + data =3D 0; + break; + case 0xcd: /* fsb frequency */ + data =3D 3; + break; + case MSR_IA32_APICBASE: + data =3D vcpu->apic_base; + break; + case MSR_IA32_MISC_ENABLE: + data =3D vcpu->ia32_misc_enable_msr; + break; +#ifdef CONFIG_X86_64 + case MSR_EFER: + data =3D vcpu->shadow_efer; + break; +#endif + default: + pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata =3D data; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_get_msr_common); + +/* + * Reads an msr value (of 'msr_index') into 'pdata'. + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
+ */ +int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); +} + +#ifdef CONFIG_X86_64 + +static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +{ + if (efer & EFER_RESERVED_BITS) { + printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", + efer); + kvm_arch_inject_gp(vcpu); + return; + } + + if (is_paging(vcpu) + && (vcpu->shadow_efer & EFER_LME) !=3D (efer & EFER_LME)) { + printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); + kvm_arch_inject_gp(vcpu); + return; + } + + kvm_x86_ops->set_efer(vcpu, efer); + + efer &=3D ~EFER_LMA; + efer |=3D vcpu->shadow_efer & EFER_LMA; + + vcpu->shadow_efer =3D efer; +} + +#endif + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + switch (msr) { +#ifdef CONFIG_X86_64 + case MSR_EFER: + set_efer(vcpu, data); + break; +#endif + case MSR_IA32_MC0_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_MCG_STATUS: + pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", + __FUNCTION__, data); + break; + case MSR_IA32_UCODE_REV: + case MSR_IA32_UCODE_WRITE: + case 0x200 ... 0x2ff: /* MTRRs */ + break; + case MSR_IA32_APICBASE: + vcpu->apic_base =3D data; + break; + case MSR_IA32_MISC_ENABLE: + vcpu->ia32_misc_enable_msr =3D data; + break; + /* + * This is the 'probe whether the host is KVM' logic: + */ + case MSR_KVM_API_MAGIC: + return vcpu_register_para(vcpu, data); + + default: + pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_msr_common); + +/* + * Writes msr value into the appropriate "register". + * Returns 0 on success, non-0 otherwise. + * Assumes vcpu_load() was already called. 
+ */ +int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +{ + return kvm_x86_ops->set_msr(vcpu, msr_index, data); +} + +void kvm_resched(struct kvm_vcpu *vcpu) +{ + if (!need_resched()) + return; + cond_resched(); +} +EXPORT_SYMBOL_GPL(kvm_resched); + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + int i; + u32 function; + struct kvm_cpuid_entry *e, *best; + + kvm_x86_ops->cache_regs(vcpu); + function =3D vcpu->regs[VCPU_REGS_RAX]; + vcpu->regs[VCPU_REGS_RAX] =3D 0; + vcpu->regs[VCPU_REGS_RBX] =3D 0; + vcpu->regs[VCPU_REGS_RCX] =3D 0; + vcpu->regs[VCPU_REGS_RDX] =3D 0; + best =3D NULL; + for (i =3D 0; i < vcpu->cpuid_nent; ++i) { + e =3D &vcpu->cpuid_entries[i]; + if (e->function =3D=3D function) { + best =3D e; + break; + } + /* + * Both basic or both extended? + */ + if (((e->function ^ function) & 0x80000000) =3D=3D 0) + if (!best || e->function > best->function) + best =3D e; + } + if (best) { + vcpu->regs[VCPU_REGS_RAX] =3D best->eax; + vcpu->regs[VCPU_REGS_RBX] =3D best->ebx; + vcpu->regs[VCPU_REGS_RCX] =3D best->ecx; + vcpu->regs[VCPU_REGS_RDX] =3D best->edx; + } + kvm_x86_ops->decache_regs(vcpu); + kvm_x86_ops->skip_emulated_instruction(vcpu); +} +EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); + +int kvm_arch_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + sigset_t sigsaved; + + vcpu_load(vcpu); + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + /* re-sync apic's tpr */ + vcpu->cr8 =3D kvm_run->cr8; + + if (vcpu->pio.cur_count) { + r =3D complete_pio(vcpu); + if (r) + goto out; + } + + if (vcpu->mmio_needed) { + memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); + vcpu->mmio_read_completed =3D 1; + vcpu->mmio_needed =3D 0; + r =3D emulate_instruction(vcpu, kvm_run, + vcpu->mmio_fault_cr2, 0); + if (r =3D=3D EMULATE_DO_MMIO) { + /* + * Read-modify-write. Back to userspace. 
+ */ + r =3D 0; + goto out; + } + } + + if (kvm_run->exit_reason =3D=3D KVM_EXIT_HYPERCALL) { + kvm_x86_ops->cache_regs(vcpu); + vcpu->regs[VCPU_REGS_RAX] =3D kvm_run->hypercall.ret; + kvm_x86_ops->decache_regs(vcpu); + } + + r =3D kvm_x86_ops->run(vcpu, kvm_run); + +out: + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + vcpu_put(vcpu); + return r; +} + +static void get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + return kvm_x86_ops->get_segment(vcpu, var, seg); +} + +int kvm_arch_vcpu_get_regs(struct kvm_vcpu *vcpu, + struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + kvm_x86_ops->cache_regs(vcpu); + + regs->rax =3D vcpu->regs[VCPU_REGS_RAX]; + regs->rbx =3D vcpu->regs[VCPU_REGS_RBX]; + regs->rcx =3D vcpu->regs[VCPU_REGS_RCX]; + regs->rdx =3D vcpu->regs[VCPU_REGS_RDX]; + regs->rsi =3D vcpu->regs[VCPU_REGS_RSI]; + regs->rdi =3D vcpu->regs[VCPU_REGS_RDI]; + regs->rsp =3D vcpu->regs[VCPU_REGS_RSP]; + regs->rbp =3D vcpu->regs[VCPU_REGS_RBP]; +#ifdef CONFIG_X86_64 + regs->r8 =3D vcpu->regs[VCPU_REGS_R8]; + regs->r9 =3D vcpu->regs[VCPU_REGS_R9]; + regs->r10 =3D vcpu->regs[VCPU_REGS_R10]; + regs->r11 =3D vcpu->regs[VCPU_REGS_R11]; + regs->r12 =3D vcpu->regs[VCPU_REGS_R12]; + regs->r13 =3D vcpu->regs[VCPU_REGS_R13]; + regs->r14 =3D vcpu->regs[VCPU_REGS_R14]; + regs->r15 =3D vcpu->regs[VCPU_REGS_R15]; +#endif + + regs->rip =3D vcpu->rip; + regs->rflags =3D kvm_x86_ops->get_rflags(vcpu); + + /* + * Don't leak debug flags in case they were set for guest debugging + */ + if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep) + regs->rflags &=3D ~(X86_EFLAGS_TF | X86_EFLAGS_RF); + + vcpu_put(vcpu); + + return 0; +} + +int kvm_arch_vcpu_set_regs(struct kvm_vcpu *vcpu, + struct kvm_regs *regs) +{ + vcpu_load(vcpu); + + vcpu->regs[VCPU_REGS_RAX] =3D regs->rax; + vcpu->regs[VCPU_REGS_RBX] =3D regs->rbx; + vcpu->regs[VCPU_REGS_RCX] =3D regs->rcx; + vcpu->regs[VCPU_REGS_RDX] =3D regs->rdx; + vcpu->regs[VCPU_REGS_RSI] =3D 
regs->rsi; + vcpu->regs[VCPU_REGS_RDI] =3D regs->rdi; + vcpu->regs[VCPU_REGS_RSP] =3D regs->rsp; + vcpu->regs[VCPU_REGS_RBP] =3D regs->rbp; +#ifdef CONFIG_X86_64 + vcpu->regs[VCPU_REGS_R8] =3D regs->r8; + vcpu->regs[VCPU_REGS_R9] =3D regs->r9; + vcpu->regs[VCPU_REGS_R10] =3D regs->r10; + vcpu->regs[VCPU_REGS_R11] =3D regs->r11; + vcpu->regs[VCPU_REGS_R12] =3D regs->r12; + vcpu->regs[VCPU_REGS_R13] =3D regs->r13; + vcpu->regs[VCPU_REGS_R14] =3D regs->r14; + vcpu->regs[VCPU_REGS_R15] =3D regs->r15; +#endif + + vcpu->rip =3D regs->rip; + kvm_x86_ops->set_rflags(vcpu, regs->rflags); + + kvm_x86_ops->decache_regs(vcpu); + + vcpu_put(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct descriptor_table dt; + + vcpu_load(vcpu); + + get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + + get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + + kvm_x86_ops->get_idt(vcpu, &dt); + sregs->idt.limit =3D dt.limit; + sregs->idt.base =3D dt.base; + kvm_x86_ops->get_gdt(vcpu, &dt); + sregs->gdt.limit =3D dt.limit; + sregs->gdt.base =3D dt.base; + + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + sregs->cr0 =3D vcpu->cr0; + sregs->cr2 =3D vcpu->cr2; + sregs->cr3 =3D vcpu->cr3; + sregs->cr4 =3D vcpu->cr4; + sregs->cr8 =3D vcpu->cr8; + sregs->efer =3D vcpu->shadow_efer; + sregs->apic_base =3D vcpu->apic_base; + + memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, + sizeof sregs->interrupt_bitmap); + + vcpu_put(vcpu); + + return 0; +} + +static void set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) +{ + return kvm_x86_ops->set_segment(vcpu, var, seg); +} + +static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct 
kvm_sregs *sregs)
+{
+	int mmu_reset_needed =3D 0;	/* set when cr0/cr3/cr4/efer change */
+	int i;
+	struct descriptor_table dt;	/* reused for IDT, then GDT */
+
+	vcpu_load(vcpu);
+
+	dt.limit =3D sregs->idt.limit;
+	dt.base =3D sregs->idt.base;
+	kvm_x86_ops->set_idt(vcpu, &dt);
+	dt.limit =3D sregs->gdt.limit;
+	dt.base =3D sregs->gdt.base;
+	kvm_x86_ops->set_gdt(vcpu, &dt);
+
+	vcpu->cr2 =3D sregs->cr2;
+	mmu_reset_needed |=3D vcpu->cr3 !=3D sregs->cr3;
+	vcpu->cr3 =3D sregs->cr3;
+
+	vcpu->cr8 =3D sregs->cr8;
+	/* The EFER comparison runs on every build; set_efer only on 64-bit. */
+	mmu_reset_needed |=3D vcpu->shadow_efer !=3D sregs->efer;
+#ifdef CONFIG_X86_64
+	kvm_x86_ops->set_efer(vcpu, sregs->efer);
+#endif
+	vcpu->apic_base =3D sregs->apic_base;
+	/* Refresh vcpu->cr4 before it is compared with the new value below. */
+	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
+
+	mmu_reset_needed |=3D vcpu->cr0 !=3D sregs->cr0;
+	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+
+	mmu_reset_needed |=3D vcpu->cr4 !=3D sregs->cr4;
+	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+	if (!is_long_mode(vcpu) && is_pae(vcpu))
+		load_pdptrs(vcpu, vcpu->cr3);
+
+	if (mmu_reset_needed)
+		kvm_mmu_reset_context(vcpu);
+	/* Copy the pending bitmap; rebuild irq_summary (one bit per non-zero word). */
+	memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
+	       sizeof vcpu->irq_pending);
+	vcpu->irq_summary =3D 0;
+	for (i =3D 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
+		if (vcpu->irq_pending[i])
+			__set_bit(i, &vcpu->irq_summary);
+
+	set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
+	set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
+	set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
+	set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
+	set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
+	set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
+
+	set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
+	set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+/*
+ * The kvm_arch_*() functions below are one-line forwarders to the
+ * matching kvm_x86_ops hook.
+ */
+void kvm_arch_cache_regs(struct kvm_vcpu *vcpu){
+	kvm_x86_ops->cache_regs(vcpu);
+}
+
+void kvm_arch_decache_regs(struct kvm_vcpu *vcpu){
+	kvm_x86_ops->decache_regs(vcpu);
+}
+
+void kvm_arch_vcpu_decache(struct kvm_vcpu *vcpu){
+	kvm_x86_ops->vcpu_decache(vcpu);
+}
+
+struct kvm_vcpu* kvm_arch_vcpu_create(struct kvm *kvm, unsigned id){
+	return
kvm_x86_ops->vcpu_create(kvm, id);
+}
+/* Forward to kvm_x86_ops->vcpu_free(). */
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu){
+	kvm_x86_ops->vcpu_free(vcpu);
+}
+/* Forward to kvm_x86_ops->skip_emulated_instruction(). */
+void kvm_arch_skip_emulated_instruction(struct kvm_vcpu *vcpu){
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+
+/*
+ * List of msr numbers which we expose to userspace through KVM_GET_MSRS
+ * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
+ *
+ * This list is modified at module load time to reflect the
+ * capabilities of the host cpu: kvm_init_msr_list() compacts it in place
+ * and records the number of surviving entries in num_msrs_to_save.
+ */
+static u32 msrs_to_save[] =3D {
+	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
+	MSR_K6_STAR,
+#ifdef CONFIG_X86_64
+	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
+#endif
+	MSR_IA32_TIME_STAMP_COUNTER,
+};
+/* Number of valid leading entries in msrs_to_save[]. */
+static unsigned num_msrs_to_save;
+/* Reported by KVM_GET_MSR_INDEX_LIST after the msrs_to_save[] entries. */
+static u32 emulated_msrs[] =3D {
+	MSR_IA32_MISC_ENABLE,
+};
+/*
+ * Drop every msrs_to_save[] entry for which rdmsr_safe() fails on this
+ * host, moving the remaining entries to the front of the array.
+ */
+static __init void kvm_init_msr_list(void)
+{
+	u32 dummy[2];	/* value read is discarded; only the status matters */
+	unsigned i, j;	/* i: read cursor, j: write cursor */
+
+	for (i =3D j =3D 0; i < ARRAY_SIZE(msrs_to_save); i++) {
+		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
+			continue;
+		if (j < i)
+			msrs_to_save[j] =3D msrs_to_save[i];
+		j++;
+	}
+	num_msrs_to_save =3D j;
+}
+
+/*
+ * Adapt set_msr() to msr_io()'s calling convention
+ * (value passed by pointer instead of by value).
+ */
+static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+	return kvm_set_msr(vcpu, index, *data);
+}
+
+/*
+ * Read or write a bunch of msrs. All parameters are kernel addresses.
+ *
+ * do_msr() is applied to entries[0..msrs->nmsrs) in order and the loop
+ * stops at the first entry for which it returns non-zero.
+ *
+ * @return number of msrs processed successfully (index of the first
+ *         failing entry, or msrs->nmsrs when all succeeded).
+ */
+static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
+		    struct kvm_msr_entry *entries,
+		    int (*do_msr)(struct kvm_vcpu *vcpu,
+				  unsigned index, u64 *data))
+{
+	int i;
+
+	vcpu_load(vcpu);
+
+	for (i =3D 0; i < msrs->nmsrs; ++i)
+		if (do_msr(vcpu, entries[i].index, &entries[i].data))
+			break;
+
+	vcpu_put(vcpu);
+
+	return i;
+}
+
+/*
+ * Read or write a bunch of msrs. Parameters are user addresses.
+ *
+ * @return number of msrs set successfully.
+ */
+static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
+		  int (*do_msr)(struct kvm_vcpu *vcpu,
+				unsigned index, u64 *data),
+		  int writeback)
+{
+	struct kvm_msrs msrs;		/* kernel copy of the user header */
+	struct kvm_msr_entry *entries;	/* kernel copy of the entry array */
+	int r, n;
+	unsigned size;
+	/* Each step presets r to the errno returned if that step fails. */
+	r =3D -EFAULT;
+	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
+		goto out;
+
+	r =3D -E2BIG;
+	if (msrs.nmsrs >=3D MAX_IO_MSRS)
+		goto out;
+
+	r =3D -ENOMEM;
+	size =3D sizeof(struct kvm_msr_entry) * msrs.nmsrs;
+	entries =3D vmalloc(size);
+	if (!entries)
+		goto out;
+
+	r =3D -EFAULT;
+	if (copy_from_user(entries, user_msrs->entries, size))
+		goto out_free;
+
+	r =3D n =3D __msr_io(vcpu, &msrs, entries, do_msr);
+	if (r < 0)
+		goto out_free;
+	/* When 'writeback' is set, the whole entry array is copied back. */
+	r =3D -EFAULT;
+	if (writeback && copy_to_user(user_msrs->entries, entries, size))
+		goto out_free;
+
+	r =3D n;
+
+out_free:
+	vfree(entries);
+out:
+	return r;
+}
+/* Forward to kvm_x86_ops->set_guest_debug() and return its result. */
+int kvm_arch_vcpu_debug_guest(struct kvm_vcpu *vcpu,
+			      struct kvm_debug_guest *dbg)
+{
+	int r;
+	r =3D kvm_x86_ops->set_guest_debug(vcpu, dbg);
+	return r;
+}
+/* Forward to kvm_x86_ops->vcpu_load(). */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	kvm_x86_ops->vcpu_load(vcpu, cpu);
+}
+/* Forward to kvm_x86_ops->vcpu_put(). */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->vcpu_put(vcpu);
+}
+/*
+ * Clear bit 20 of EDX in the guest's CPUID function 0x80000001 entry when
+ * the host's EFER MSR does not have EFER_NX set, and log that the guest
+ * NX capability was removed.
+ */
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+	u64 efer;
+	int i;
+	struct kvm_cpuid_entry *e, *entry;
+
+	rdmsrl(MSR_EFER, efer);
+	entry =3D NULL;
+	for (i =3D 0; i < vcpu->cpuid_nent; ++i) {
+		e =3D &vcpu->cpuid_entries[i];
+		if (e->function =3D=3D 0x80000001) {
+			entry =3D e;
+			break;
+		}
+	}
+	if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
+		entry->edx &=3D ~(1 << 20);
+		printk(KERN_INFO "kvm: guest NX capability removed\n");
+	}
+}
+/*
+ * Replace the vcpu's CPUID table with cpuid->nent entries copied from the
+ * user buffer 'entries', then apply cpuid_fix_nx_cap().
+ *
+ * Returns 0 on success, -E2BIG if nent exceeds KVM_MAX_CPUID_ENTRIES,
+ * -EFAULT if the user copy fails.
+ */
+static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
+				    struct kvm_cpuid *cpuid,
+				    struct kvm_cpuid_entry __user *entries)
+{
+	int r;
+
+	r =3D -E2BIG;
+	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+		goto out;
+	r =3D -EFAULT;
+	if (copy_from_user(&vcpu->cpuid_entries, entries,
+
cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+		goto out;
+	vcpu->cpuid_nent =3D cpuid->nent;
+	cpuid_fix_nx_cap(vcpu);
+	return 0;
+
+out:
+	return r;
+}
+
+/*
+ * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
+ * we have asm/x86/processor.h
+ *
+ * The get/set FPU functions overlay this layout on vcpu->guest_fx_image.
+ */
+struct fxsave {
+	u16	cwd;
+	u16	swd;
+	u16	twd;
+	u16	fop;
+	u64	rip;
+	u64	rdp;
+	u32	mxcsr;
+	u32	mxcsr_mask;
+	u32	st_space[32];	/* 8*16 bytes for each FP-reg =3D 128 bytes */
+#ifdef CONFIG_X86_64
+	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg =3D 256 bytes */
+#else
+	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg =3D 128 bytes */
+#endif
+};
+/*
+ * Copy the guest FPU state from vcpu->guest_fx_image into *fpu.
+ * Always returns 0.
+ *
+ * NOTE(review): fxsave->mxcsr and mxcsr_mask are not copied here; confirm
+ * whether struct kvm_fpu is expected to carry them.
+ */
+int kvm_arch_vcpu_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	struct fxsave *fxsave =3D (struct fxsave *)&vcpu->guest_fx_image;
+
+	vcpu_load(vcpu);
+	/* 128 =3D=3D sizeof fxsave->st_space (32 * u32). */
+	memcpy(fpu->fpr, fxsave->st_space, 128);
+	fpu->fcw =3D fxsave->cwd;
+	fpu->fsw =3D fxsave->swd;
+	fpu->ftwx =3D fxsave->twd;
+	fpu->last_opcode =3D fxsave->fop;
+	fpu->last_ip =3D fxsave->rip;
+	fpu->last_dp =3D fxsave->rdp;
+	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+/*
+ * Inverse of kvm_arch_vcpu_get_fpu(): load vcpu->guest_fx_image from *fpu.
+ * The same fields are transferred; mxcsr/mxcsr_mask are left untouched.
+ * Always returns 0.
+ */
+int kvm_arch_vcpu_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	struct fxsave *fxsave =3D (struct fxsave *)&vcpu->guest_fx_image;
+
+	vcpu_load(vcpu);
+
+	memcpy(fxsave->st_space, fpu->fpr, 128);
+	fxsave->cwd =3D fpu->fcw;
+	fxsave->swd =3D fpu->fsw;
+	fxsave->twd =3D fpu->ftwx;
+	fxsave->fop =3D fpu->last_opcode;
+	fxsave->rip =3D fpu->last_ip;
+	fxsave->rdp =3D fpu->last_dp;
+	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
+
+	vcpu_put(vcpu);
+
+	return 0;
+}
+/*
+ * x86-specific vcpu ioctls: KVM_GET_SREGS, KVM_SET_SREGS, KVM_GET_MSRS,
+ * KVM_SET_MSRS and KVM_SET_CPUID.  Any other ioctl number yields -EINVAL.
+ * The vcpu is taken from filp->private_data; 'arg' is a user pointer.
+ */
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu =3D filp->private_data;
+	void __user *argp =3D (void __user *)arg;
+	int r =3D -EINVAL;
+
+	switch (ioctl) {
+	case KVM_GET_SREGS: {
+		struct kvm_sregs kvm_sregs;
+		/* Zero first so unwritten bytes are not copied to userspace. */
+		memset(&kvm_sregs, 0, sizeof kvm_sregs);
+		r =3D kvm_vcpu_ioctl_get_sregs(vcpu,
&kvm_sregs);
+		if (r)
+			goto out;
+		r =3D -EFAULT;
+		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
+			goto out;
+		r =3D 0;
+		break;
+	}
+	case KVM_SET_SREGS: {
+		struct kvm_sregs kvm_sregs;
+
+		r =3D -EFAULT;
+		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
+			goto out;
+		r =3D kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
+		if (r)
+			goto out;
+		r =3D 0;
+		break;
+	}
+	case KVM_GET_MSRS:
+		/* writeback =3D 1: the values read are copied back to userspace */
+		r =3D msr_io(vcpu, argp, kvm_get_msr, 1);
+		break;
+	case KVM_SET_MSRS:
+		r =3D msr_io(vcpu, argp, do_set_msr, 0);
+		break;
+	case KVM_SET_CPUID: {
+		struct kvm_cpuid __user *cpuid_arg =3D argp;
+		struct kvm_cpuid cpuid;
+
+		r =3D -EFAULT;
+		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+			goto out;
+		r =3D kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
+		if (r)
+			goto out;
+		break;
+	}
+	default:
+		;
+	}
+out:
+	return r;
+}
+
+/*
+ * x86-specific VM ioctls.  None are implemented here, so every request
+ * returns -EINVAL.
+ */
+long kvm_arch_vm_ioctl(struct file *filp,
+		       unsigned int ioctl, unsigned long arg)
+{
+	int r =3D -EINVAL;
+
+	switch (ioctl) {
+	default:
+		;
+	}
+	return r;
+}
+
+/*
+ * x86-specific /dev/kvm ioctls.  Only KVM_GET_MSR_INDEX_LIST is handled;
+ * any other ioctl number yields -EINVAL.
+ *
+ * KVM_GET_MSR_INDEX_LIST: userspace passes a struct kvm_msr_list whose
+ * nmsrs field is the capacity of its indices[] array.  nmsrs is always
+ * overwritten with the total number of indices available
+ * (num_msrs_to_save + ARRAY_SIZE(emulated_msrs)).  If the capacity is
+ * smaller than that total, -E2BIG is returned and nothing else is
+ * written; otherwise msrs_to_save[] followed by emulated_msrs[] is copied
+ * into indices[].  Returns -EFAULT if any user copy fails.
+ */
+long kvm_arch_dev_ioctl(struct file *filp,
+			unsigned int ioctl, unsigned long arg)
+{
+	void __user *argp =3D (void __user *)arg;
+	long r =3D -EINVAL;
+
+	switch (ioctl) {
+	case KVM_GET_MSR_INDEX_LIST: {
+		struct kvm_msr_list __user *user_msr_list =3D argp;
+		struct kvm_msr_list msr_list;
+		unsigned n;
+
+		r =3D -EFAULT;
+		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
+			goto out;
+		n =3D msr_list.nmsrs;
+		msr_list.nmsrs =3D num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
+			goto out;
+		/*
+		 * Both lists are written below, so the user buffer must
+		 * hold the combined count, not just num_msrs_to_save.
+		 */
+		r =3D -E2BIG;
+		if (n < msr_list.nmsrs)
+			goto out;
+		r =3D -EFAULT;
+		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
+				 num_msrs_to_save * sizeof(u32)))
+			goto out;
+		/*
+		 * Pointer arithmetic on indices already advances in units
+		 * of one index, so the offset is an element count and must
+		 * not be multiplied by sizeof(u32).
+		 */
+		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
+				 &emulated_msrs,
+				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+			goto out;
+		r =3D 0;
+		break;
+	}
+	default:
+		;
+	}
+out:
+	return r;
+}
+
+void
kvm_arch_hardware_enable(void *junk) +{ + kvm_x86_ops->hardware_enable(NULL); +} + +void kvm_arch_hardware_disable(void *junk) +{ + kvm_x86_ops->hardware_disable(NULL); +} + +int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size, + struct module *module) +{ + int r; + int cpu; + + if (kvm_x86_ops) { + printk(KERN_ERR "kvm: already loaded the other module\n"); + return -EEXIST; + } + + if (!ops->cpu_has_kvm_support()) { + printk(KERN_ERR "kvm: no hardware support\n"); + return -EOPNOTSUPP; + } + if (ops->disabled_by_bios()) { + printk(KERN_ERR "kvm: disabled by bios\n"); + return -EOPNOTSUPP; + } + + kvm_x86_ops =3D ops; + + r =3D kvm_x86_ops->hardware_setup(); + if (r < 0) + goto out; + + for_each_online_cpu(cpu) { + smp_call_function_single(cpu, + kvm_x86_ops->check_processor_compatibility, + &r, 0, 1); + if (r < 0) + goto out_free_0; + } + + r =3D kvm_init_generic(vcpu_size, module); + return r; + +out_free_0: + kvm_x86_ops->hardware_unsetup(); +out: + kvm_x86_ops =3D NULL; + return r; +} +EXPORT_SYMBOL_GPL(kvm_init_x86); + +void kvm_exit_x86(void) +{ + kvm_exit_generic(); + kvm_x86_ops->hardware_unsetup(); + kvm_x86_ops =3D NULL; +} +EXPORT_SYMBOL_GPL(kvm_exit_x86); + +__init int kvm_arch_init(void) +{ + int r; + + r =3D kvm_mmu_module_init(); + if (r) + goto out4; + + kvm_init_debug(); + kvm_init_msr_list(); + +out4: + return r; +} + +__exit void kvm_arch_exit(void) +{ + kvm_exit_debug(); + kvm_mmu_module_exit(); +} diff --git a/drivers/kvm/x86.h b/drivers/kvm/x86.h index 021385e..6a78512 100644 --- a/drivers/kvm/x86.h +++ b/drivers/kvm/x86.h @@ -6,4 +6,67 @@ * the COPYING file in the top-level directory. 
*/ =
+struct kvm_x86_ops {	/* vendor backend hooks, reached through the kvm_x86_ops pointer */
+	int (*cpu_has_kvm_support)(void);          /* __init */
+	int (*disabled_by_bios)(void);             /* __init */
+	void (*hardware_enable)(void *dummy);      /* __init */
+	void (*hardware_disable)(void *dummy);
+	void (*check_processor_compatibility)(void *rtn);	/* rtn: receives the result; kvm_init_x86() passes an int * */
+	int (*hardware_setup)(void);               /* __init */
+	void (*hardware_unsetup)(void);            /* __exit */
+
+	/* Create, but do not attach this VCPU */
+	struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
+	void (*vcpu_free)(struct kvm_vcpu *vcpu);
+	/* vcpu_load / vcpu_put back kvm_arch_vcpu_load() / kvm_arch_vcpu_put() in x86.c. */
+	void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+	void (*vcpu_put)(struct kvm_vcpu *vcpu);
+	void (*vcpu_decache)(struct kvm_vcpu *vcpu);
+	/* Accessors for guest debug, MSR, segment, control-register and descriptor-table state. */
+	int (*set_guest_debug)(struct kvm_vcpu *vcpu,
+			       struct kvm_debug_guest *dbg);
+	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
+	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+	u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
+	void (*get_segment)(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg);
+	void (*set_segment)(struct kvm_vcpu *vcpu,
+			    struct kvm_segment *var, int seg);
+	void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
+	void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);	/* x86.c calls this before reading vcpu->cr4 */
+	void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
+	void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
+	void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
+	void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
+	void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
+	unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
+	void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
+		       int *exception);
+	void (*cache_regs)(struct kvm_vcpu *vcpu);	/* x86.c calls this before reading vcpu->regs[] */
+	void (*decache_regs)(struct kvm_vcpu *vcpu);	/* x86.c calls this after writing vcpu->regs[] */
+	unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
+	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+	/* TLB maintenance and fault injection hooks. */
+	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t addr);
+	void (*tlb_flush)(struct kvm_vcpu *vcpu);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+				  unsigned long addr, u32 err_code);
+
+	void (*inject_gp)(struct kvm_vcpu *vcpu, unsigned err_code);
+	/* run: invoked by kvm_arch_vcpu_run() in x86.c, which returns its result. */
+	int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
+	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
+				unsigned char *hypercall_addr);
+};
+/* Active backend; installed by kvm_init_x86(), NULL when none is registered. */
+extern struct kvm_x86_ops *kvm_x86_ops;
+/* Backend registration entry points, defined in x86.c. */
+int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
+		 struct module *module);
+void kvm_exit_x86(void);
+
 #endif -- = Gr=FCsse / regards, = Christian Ehrhardt IBM Linux Technology Center, Open Virtualization +49 7031/16-3385 Ehrhardt-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org Ehrhardt-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org IBM Deutschland Entwicklung GmbH Vorsitzender des Aufsichtsrats: Johann Weihen = Gesch=E4ftsf=FChrung: Herbert Kircher = Sitz der Gesellschaft: B=F6blingen Registergericht: Amtsgericht Stuttgart, HRB 243294 ------------------------------------------------------------------------- This SF.net email is sponsored by: Splunk Inc. Still grepping through log files to find problems? Stop. Now Search log events and configuration files using AJAX and a browser. Download your FREE copy of Splunk now >> http://get.splunk.com/