From: Avi Kivity
To: Andre Przywara
Cc: kvm@vger.kernel.org, Amit Shah, Christoph Egger
Subject: Re: [PATCH] add sysenter/syscall emulation for 32bit compat mode
Date: Wed, 17 Jun 2009 10:24:22 +0300
Message-ID: <4A389A26.2090800@redhat.com>
In-Reply-To: <1245158713-15054-1-git-send-email-andre.przywara@amd.com>
References: <1245158713-15054-1-git-send-email-andre.przywara@amd.com>

On 06/16/2009 04:25 PM, Andre Przywara wrote:
> sysenter/sysexit are not supported in AMD's 32bit compat mode, whereas
> syscall is not supported in Intel's 32bit compat mode. To allow cross
> vendor migration we emulate the missing instructions by setting up the
> processor state accordingly.
> The sysenter code was originally sketched by Amit Shah; it was completed,
> debugged, extended with syscall and made to work by Christoph Egger, and
> polished up by Andre Przywara.
> Please note that sysret does not need to be emulated, because it will be
> executed in 64bit mode, and returning to 32bit compat mode works on Intel.
>
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6025e5b..9066fb3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2635,11 +2635,38 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
>          /* Reject the instructions other than VMCALL/VMMCALL when
>           * try to emulate invalid opcode */
>          c = &vcpu->arch.emulate_ctxt.decode;
> -        if ((emulation_type & EMULTYPE_TRAP_UD) &&
> -            (!(c->twobyte && c->b == 0x01 &&
> -               (c->modrm_reg == 0 || c->modrm_reg == 3) &&
> -               c->modrm_mod == 3 && c->modrm_rm == 1)))
> -                return EMULATE_FAIL;
> +
> +        if (emulation_type & EMULTYPE_TRAP_UD) {
> +                if (!c->twobyte)
> +                        return EMULATE_FAIL;
> +                switch (c->b) {
> +                case 0x01: /* VMMCALL */
> +                        if (c->modrm_mod != 3)
> +                                return EMULATE_FAIL;
> +                        if (c->modrm_rm != 1)
> +                                return EMULATE_FAIL;
> +                        break;
> +                case 0x34: /* sysenter */
> +                case 0x35: /* sysexit */
> +                        if (c->modrm_mod != 0)
> +                                return EMULATE_FAIL;
> +                        if (c->modrm_rm != 0)
> +                                return EMULATE_FAIL;
> +                        break;
> +                case 0x05: /* syscall */
> +                        r = 0;
> +                        if (c->modrm_mod != 0)
> +                                return EMULATE_FAIL;
> +                        if (c->modrm_rm != 0)
> +                                return EMULATE_FAIL;
> +                        break;
> +                default:
> +                        return EMULATE_FAIL;
> +                }
> +
> +                if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
> +                        return EMULATE_FAIL;
> +        }
>
>          ++vcpu->stat.insn_emulation;
>          if (r) {
>

This would be better encoded in the opcode table.  No need to do it for
this patch though.
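To illustrate (a rough, untested sketch; the TrapUD flag name and bit
position are invented here and would need a free bit among the existing
decode flags): mark the affected instructions in the decode tables and
test the decoded flags instead of open-coding opcode bytes:

/* sketch: instruction may legitimately be emulated on a #UD intercept */
#define TrapUD  (1<<16)         /* invented; pick a non-colliding bit */

static u32 twobyte_table[256] = {
        /* ... existing entries ... */
        [0x01] = ImplicitOps | ModRM | TrapUD,  /* Grp7, incl. VMMCALL */
        [0x05] = ImplicitOps | TrapUD,          /* syscall */
        [0x34] = ImplicitOps | TrapUD,          /* sysenter */
        [0x35] = ImplicitOps | TrapUD,          /* sysexit */
        /* ... */
};

The check in emulate_instruction() then collapses to a test of the
decoded bits:

        if ((emulation_type & EMULTYPE_TRAP_UD) && !(c->d & TrapUD))
                return EMULATE_FAIL;

A group instruction like 0f 01 would still need its modrm checks (or
flags on the group table entries), but the common case becomes a single
table lookup.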
> diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
> index 22c765d..6c65462 100644
> --- a/arch/x86/kvm/x86_emulate.c
> +++ b/arch/x86/kvm/x86_emulate.c
> @@ -32,6 +32,8 @@
>  #include
>  #include
>
> +#include "mmu.h"
> +

Still unneeded, I presume.

> @@ -320,8 +324,11 @@ static u32 group2_table[] = {
>  };
>
>  /* EFLAGS bit definitions. */
> +#define EFLG_VM (1<<17)
> +#define EFLG_RF (1<<16)
>  #define EFLG_OF (1<<11)
>  #define EFLG_DF (1<<10)
> +#define EFLG_IF (1<<9)
>  #define EFLG_SF (1<<7)
>  #define EFLG_ZF (1<<6)
>  #define EFLG_AF (1<<4)
> @@ -1390,6 +1397,207 @@ void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
>          ctxt->interruptibility = mask;
>  }
>
> +#define EMUL_SYSENTER 1
> +#define EMUL_SYSEXIT 2
> +#define EMUL_SYSCALL 3
> +
> +static int
> +emulate_syscalls(struct x86_emulate_ctxt *ctxt, int ins)
> +{
> +        struct decode_cache *c = &ctxt->decode;
> +        struct kvm_segment cs, ss;
> +        unsigned long cr0;
> +        u64 msr_data;
> +        int usermode;
> +
> +        /* inject #UD if LOCK prefix is used */
> +        if (c->lock_prefix)
> +                return -1;

We should encode legality of the lock prefix in the opcode tables.
Again unrelated to this patch.

> +
> +        /* inject #GP if in real mode or paging is disabled */
> +        cr0 = ctxt->vcpu->arch.cr0;
> +        if (ctxt->mode == X86EMUL_MODE_REAL || !(cr0 & X86_CR0_PE)) {
> +                if (ins == EMUL_SYSENTER || ins == EMUL_SYSEXIT)
> +                        kvm_inject_gp(ctxt->vcpu, 0);
> +                return -1;
> +        }
> +
> +        /* XXX sysenter/sysexit have not been tested in 64bit mode.
> +         * Therefore, we inject an #UD.
> +         */
> +        if (ins == EMUL_SYSENTER && ctxt->mode == X86EMUL_MODE_PROT64)
> +                return -1;

We should return EMULATE_FAIL rather than confusing the poor guest.

> +
> +        /* sysexit must be called from CPL 0 */
> +        if (ins == EMUL_SYSEXIT && kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
> +                kvm_inject_gp(ctxt->vcpu, 0);
> +                return -1;
> +        }
> +
> +        /* setup common descriptor content */
> +        memset(&cs, 0, sizeof(struct kvm_segment));
> +        memset(&ss, 0, sizeof(struct kvm_segment));
> +        kvm_x86_ops->get_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
> +
> +        cs.l = 0;               /* will be adjusted later */
> +        cs.base = 0;            /* flat segment */
> +        cs.g = 1;               /* 4kb granularity */
> +        cs.limit = 0xfffff;     /* 4GB limit */
> +        cs.type = 0x0b;         /* Read, Execute, Accessed */
> +        cs.s = 1;
> +        cs.dpl = 0;             /* will be adjusted later */
> +        cs.present = 1;
> +        cs.db = 1;
> +
> +        ss.unusable = 0;
> +        ss.base = 0;            /* flat segment */
> +        ss.limit = 0xfffff;     /* 4GB limit */
> +        ss.g = 1;               /* 4kb granularity */
> +        ss.s = 1;
> +        ss.type = 0x03;         /* Read/Write, Accessed */
> +        ss.db = 1;              /* 32bit stack segment */
> +        ss.dpl = 0;
> +        ss.present = 1;
> +
> +        if (ins == EMUL_SYSEXIT) {
> +                if ((c->rex_prefix & 0x8) != 0x0)
> +                        usermode = X86EMUL_MODE_PROT64;
> +                else
> +                        usermode = X86EMUL_MODE_PROT32;
> +        } else {
> +                usermode = ctxt->mode;
> +        }
> +
> +        /* now fixup the segment descriptor for each specific instruction */
> +        switch (ins) {
> +        case EMUL_SYSCALL:
> +                kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
> +                msr_data >>= 32;
> +                cs.selector = (u16)(msr_data & 0xfffc);
> +                ss.selector = (u16)(msr_data + 8);
> +                if (is_long_mode(ctxt->vcpu)) {
> +                        cs.db = 0;
> +                        cs.l = 1;
> +                        if (usermode == X86EMUL_MODE_PROT64) {
> +                                /* Intel cares about granularity (g bit),
> +                                 * so we don't set the effective limit.
> +                                 */
> +                                cs.g = 1;
> +                                cs.limit = 0xffffffff;
> +                        }
> +                }
> +                break;
> +        case EMUL_SYSENTER:
> +                kvm_x86_ops->get_msr(ctxt->vcpu,
> +                                     MSR_IA32_SYSENTER_CS, &msr_data);
> +                switch (usermode) {
> +                case X86EMUL_MODE_PROT32:
> +                        if ((msr_data & 0xfffc) == 0x0) {
> +                                kvm_inject_gp(ctxt->vcpu, 0);
> +                                return -1;
> +                        }
> +                        break;
> +                case X86EMUL_MODE_PROT64:
> +                        if (msr_data == 0x0) {
> +                                kvm_inject_gp(ctxt->vcpu, 0);
> +                                return -1;
> +                        }
> +                        break;
> +                }
> +                ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
> +                cs.selector = (u16)msr_data;
> +                cs.selector &= ~SELECTOR_RPL_MASK;
> +                ss.selector = cs.selector + 8;
> +                ss.selector &= ~SELECTOR_RPL_MASK;
> +                if (usermode == X86EMUL_MODE_PROT64
> +                    || is_long_mode(ctxt->vcpu)) {
> +                        cs.db = 0;
> +                        cs.l = 1;
> +                        cs.limit = 0xffffffff;
> +                        ss.limit = 0xffffffff;
> +                }
> +                break;
> +        case EMUL_SYSEXIT:
> +                /* We don't care about cs.g/ss.g bits
> +                 * (= 4kb granularity) so we have to set the effective
> +                 * limit here or we get a #GP in the guest, otherwise.
> +                 */
> +                cs.limit = 0xffffffff;
> +                ss.limit = 0xffffffff;
> +                cs.dpl = 3;
> +                ss.dpl = 3;
> +                kvm_x86_ops->get_msr(ctxt->vcpu,
> +                                     MSR_IA32_SYSENTER_CS, &msr_data);
> +                switch (usermode) {
> +                case X86EMUL_MODE_PROT32:
> +                        cs.selector = (u16)(msr_data + 16);
> +                        if ((msr_data & 0xfffc) == 0x0) {
> +                                kvm_inject_gp(ctxt->vcpu, 0);
> +                                return -1;
> +                        }
> +                        ss.selector = (u16)(msr_data + 24);
> +                        break;
> +                case X86EMUL_MODE_PROT64:
> +                        cs.selector = (u16)(msr_data + 32);
> +                        if (msr_data == 0x0) {
> +                                kvm_inject_gp(ctxt->vcpu, 0);
> +                                return -1;
> +                        }
> +                        ss.selector = cs.selector + 8;
> +                        cs.db = 0;
> +                        cs.l = 1;
> +                        break;
> +                }
> +                cs.selector |= SELECTOR_RPL_MASK;
> +                ss.selector |= SELECTOR_RPL_MASK;
> +                break;
> +        }
> +
> +        kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
> +        kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
> +
> +        switch (ins) {
> +        case EMUL_SYSCALL:
> +                c->regs[VCPU_REGS_RCX] = c->eip;
> +                if (is_long_mode(ctxt->vcpu)) {
> +                        c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
> +
> +                        kvm_x86_ops->get_msr(ctxt->vcpu,
> +                                             usermode == X86EMUL_MODE_PROT64 ?
> +                                             MSR_LSTAR : MSR_CSTAR,
> +                                             &msr_data);
> +                        c->eip = msr_data;
> +
> +                        kvm_x86_ops->get_msr(ctxt->vcpu,
> +                                             MSR_SYSCALL_MASK, &msr_data);
> +                        ctxt->eflags &= ~(msr_data | EFLG_RF);
> +                } else {
> +                        /* legacy mode */
> +                        kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
> +                        c->eip = (u32)msr_data;
> +
> +                        ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
> +                }
> +                break;
> +        case EMUL_SYSENTER:
> +                kvm_x86_ops->get_msr(ctxt->vcpu,
> +                                     MSR_IA32_SYSENTER_EIP, &msr_data);
> +                c->eip = msr_data;
> +
> +                kvm_x86_ops->get_msr(ctxt->vcpu,
> +                                     MSR_IA32_SYSENTER_ESP, &msr_data);
> +                c->regs[VCPU_REGS_RSP] = msr_data;
> +                break;
> +
> +        case EMUL_SYSEXIT:
> +                c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
> +                c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
> +                break;
> +        }
> +
> +        return 0;
> +}

This function is way too big.  Please split it up into one function per
instruction, and use a helper function for the common code.
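Something along these lines (a sketch only, untested; the helper name is
invented, and the instruction-specific bodies are elided):

/* common descriptor setup shared by syscall/sysenter/sysexit */
static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
                                    struct kvm_segment *cs,
                                    struct kvm_segment *ss)
{
        memset(cs, 0, sizeof(struct kvm_segment));
        memset(ss, 0, sizeof(struct kvm_segment));
        kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);

        cs->l = 0;              /* will be adjusted later */
        cs->base = 0;           /* flat segment */
        cs->g = 1;              /* 4kb granularity */
        cs->limit = 0xfffff;    /* 4GB limit */
        cs->type = 0x0b;        /* Read, Execute, Accessed */
        cs->s = 1;
        cs->dpl = 0;            /* will be adjusted later */
        cs->present = 1;
        cs->db = 1;

        ss->unusable = 0;
        ss->base = 0;           /* flat segment */
        ss->limit = 0xfffff;    /* 4GB limit */
        ss->g = 1;              /* 4kb granularity */
        ss->s = 1;
        ss->type = 0x03;        /* Read/Write, Accessed */
        ss->db = 1;             /* 32bit stack segment */
        ss->dpl = 0;
        ss->present = 1;
}

static int emulate_syscall(struct x86_emulate_ctxt *ctxt)
{
        struct kvm_segment cs, ss;

        /* only the syscall-specific checks from above go here */
        setup_syscalls_segments(ctxt, &cs, &ss);
        /* MSR_STAR/MSR_LSTAR/MSR_CSTAR handling, eip/eflags update */
        return 0;
}

/* likewise emulate_sysenter() and emulate_sysexit(); each does only
 * its own checks and MSR handling.
 */

The EMUL_* constants and the ins parameter then go away, and the caller
can dispatch directly on the opcode.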
--
Do not meddle in the internals of kernels, for they are subtle and quick
to panic.