From: Avi Kivity <avi@redhat.com>
To: Andre Przywara <andre.przywara@amd.com>
Cc: kvm@vger.kernel.org, Amit Shah <amit.shah@redhat.com>,
Christoph Egger <christoph.egger@amd.com>
Subject: Re: [PATCH] add sysenter/syscall emulation for 32bit compat mode
Date: Wed, 17 Jun 2009 10:24:22 +0300 [thread overview]
Message-ID: <4A389A26.2090800@redhat.com> (raw)
In-Reply-To: <1245158713-15054-1-git-send-email-andre.przywara@amd.com>
On 06/16/2009 04:25 PM, Andre Przywara wrote:
> sysenter/sysexit are not supported on AMD's 32bit compat mode, whereas
> syscall is not supported on Intel's 32bit compat mode. To allow cross
> vendor migration we emulate the missing instructions by setting up the
> processor state accordingly.
> The sysenter code was originally sketched by Amit Shah, it was completed,
> debugged, syscall added and made-to-work by Christoph Egger and polished
> up by Andre Przywara.
> Please note that sysret does not need to be emulated, because it will be
> executed in 64bit mode and returning to 32bit compat mode works on Intel.
>
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 6025e5b..9066fb3 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2635,11 +2635,38 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
> /* Reject the instructions other than VMCALL/VMMCALL when
> * try to emulate invalid opcode */
> c = &vcpu->arch.emulate_ctxt.decode;
> - if ((emulation_type & EMULTYPE_TRAP_UD) &&
> - (!(c->twobyte && c->b == 0x01 &&
> - (c->modrm_reg == 0 || c->modrm_reg == 3) &&
> - c->modrm_mod == 3 && c->modrm_rm == 1)))
> - return EMULATE_FAIL;
> +
> + if (emulation_type & EMULTYPE_TRAP_UD) {
> + if (!c->twobyte)
> + return EMULATE_FAIL;
> + switch (c->b) {
> + case 0x01: /* VMMCALL */
> + if (c->modrm_mod != 3)
> + return EMULATE_FAIL;
> + if (c->modrm_rm != 1)
> + return EMULATE_FAIL;
> + break;
> + case 0x34: /* sysenter */
> + case 0x35: /* sysexit */
> + if (c->modrm_mod != 0)
> + return EMULATE_FAIL;
> + if (c->modrm_rm != 0)
> + return EMULATE_FAIL;
> + break;
> + case 0x05: /* syscall */
> + r = 0;
> + if (c->modrm_mod != 0)
> + return EMULATE_FAIL;
> + if (c->modrm_rm != 0)
> + return EMULATE_FAIL;
> + break;
> + default:
> + return EMULATE_FAIL;
> + }
> +
> + if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
> + return EMULATE_FAIL;
> + }
>
> ++vcpu->stat.insn_emulation;
> if (r) {
>
This would be better encoded in the opcode table. No need to do it for
this patch though.
> diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
> index 22c765d..6c65462 100644
> --- a/arch/x86/kvm/x86_emulate.c
> +++ b/arch/x86/kvm/x86_emulate.c
> @@ -32,6 +32,8 @@
> #include <linux/module.h>
> #include <asm/kvm_x86_emulate.h>
>
> +#include "mmu.h"
> +
>
Still unneeded I presume.
> @@ -320,8 +324,11 @@ static u32 group2_table[] = {
> };
>
> /* EFLAGS bit definitions. */
> +#define EFLG_VM (1<<17)
> +#define EFLG_RF (1<<16)
> #define EFLG_OF (1<<11)
> #define EFLG_DF (1<<10)
> +#define EFLG_IF (1<<9)
> #define EFLG_SF (1<<7)
> #define EFLG_ZF (1<<6)
> #define EFLG_AF (1<<4)
> @@ -1390,6 +1397,207 @@ void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
> ctxt->interruptibility = mask;
> }
>
> +#define EMUL_SYSENTER 1
> +#define EMUL_SYSEXIT 2
> +#define EMUL_SYSCALL 3
> +
> +static int
> +emulate_syscalls(struct x86_emulate_ctxt *ctxt, int ins)
> +{
> + struct decode_cache *c = &ctxt->decode;
> + struct kvm_segment cs, ss;
> + unsigned long cr0;
> + u64 msr_data;
> + int usermode;
> +
> + /* inject #UD if LOCK prefix is used */
> + if (c->lock_prefix)
> + return -1;
>
We should encode legality of the lock prefix in the opcode tables.
Again unrelated to this patch.
> +
> + /* inject #GP if in real mode or paging is disabled */
> + cr0 = ctxt->vcpu->arch.cr0;
> + if (ctxt->mode == X86EMUL_MODE_REAL || !(cr0 & X86_CR0_PE)) {
> + if (ins == EMUL_SYSENTER || ins == EMUL_SYSEXIT)
> + kvm_inject_gp(ctxt->vcpu, 0);
> + return -1;
> + }
> +
> + /* XXX sysenter/sysexit have not been tested in 64bit mode.
> + * Therefore, we inject an #UD.
> + */
> + if (ins == EMUL_SYSENTER && ctxt->mode == X86EMUL_MODE_PROT64)
> + return -1;
>
We should return EMULATE_FAIL rather than confusing the poor guest.
> +
> + /* sysexit must be called from CPL 0 */
> + if (ins == EMUL_SYSEXIT&& kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
> + kvm_inject_gp(ctxt->vcpu, 0);
> + return -1;
> + }
> +
> + /* setup common descriptor content */
> + memset(&cs, 0, sizeof(struct kvm_segment));
> + memset(&ss, 0, sizeof(struct kvm_segment));
> + kvm_x86_ops->get_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
> +
> + cs.l = 0; /* will be adjusted later */
> + cs.base = 0; /* flat segment */
> + cs.g = 1; /* 4kb granularity */
> + cs.limit = 0xfffff; /* 4GB limit */
> + cs.type = 0x0b; /* Read, Execute, Accessed */
> + cs.s = 1;
> + cs.dpl = 0; /* will be adjusted later */
> + cs.present = 1;
> + cs.db = 1;
> +
> + ss.unusable = 0;
> + ss.base = 0; /* flat segment */
> + ss.limit = 0xfffff; /* 4GB limit */
> + ss.g = 1; /* 4kb granularity */
> + ss.s = 1;
> + ss.type = 0x03; /* Read/Write, Accessed */
> + ss.db = 1; /* 32bit stack segment */
> + ss.dpl = 0;
> + ss.present = 1;
> +
> + if (ins == EMUL_SYSEXIT) {
> + if ((c->rex_prefix & 0x8) != 0x0)
> + usermode = X86EMUL_MODE_PROT64;
> + else
> + usermode = X86EMUL_MODE_PROT32;
> + } else {
> + usermode = ctxt->mode;
> + }
> +
> + /* now fixup the segment descriptor for each specific instruction */
> + switch (ins) {
> + case EMUL_SYSCALL:
> + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
> + msr_data >>= 32;
> + cs.selector = (u16)(msr_data & 0xfffc);
> + ss.selector = (u16)(msr_data + 8);
> + if (is_long_mode(ctxt->vcpu)) {
> + cs.db = 0;
> + cs.l = 1;
> + if (usermode == X86EMUL_MODE_PROT64) {
> + /* Intel cares about granularity (g bit),
> + * so we don't set the effective limit.
> + */
> + cs.g = 1;
> + cs.limit = 0xffffffff;
> + }
> + }
> + break;
> + case EMUL_SYSENTER:
> + kvm_x86_ops->get_msr(ctxt->vcpu,
> + MSR_IA32_SYSENTER_CS, &msr_data);
> + switch (usermode) {
> + case X86EMUL_MODE_PROT32:
> + if ((msr_data & 0xfffc) == 0x0) {
> + kvm_inject_gp(ctxt->vcpu, 0);
> + return -1;
> + }
> + break;
> + case X86EMUL_MODE_PROT64:
> + if (msr_data == 0x0) {
> + kvm_inject_gp(ctxt->vcpu, 0);
> + return -1;
> + }
> + break;
> + }
> + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
> + cs.selector = (u16)msr_data;
> + cs.selector &= ~SELECTOR_RPL_MASK;
> + ss.selector = cs.selector + 8;
> + ss.selector &= ~SELECTOR_RPL_MASK;
> + if (usermode == X86EMUL_MODE_PROT64
> + || is_long_mode(ctxt->vcpu)) {
> + cs.db = 0;
> + cs.l = 1;
> + cs.limit = 0xffffffff;
> + ss.limit = 0xffffffff;
> + }
> + break;
> + case EMUL_SYSEXIT:
> + /* We don't care about cs.g/ss.g bits
> + * (= 4kb granularity) so we have to set the effective
> + * limit here or we get a #GP in the guest, otherwise.
> + */
> + cs.limit = 0xffffffff;
> + ss.limit = 0xffffffff;
> + cs.dpl = 3;
> + ss.dpl = 3;
> + kvm_x86_ops->get_msr(ctxt->vcpu,
> + MSR_IA32_SYSENTER_CS, &msr_data);
> + switch (usermode) {
> + case X86EMUL_MODE_PROT32:
> + cs.selector = (u16)(msr_data + 16);
> + if ((msr_data & 0xfffc) == 0x0) {
> + kvm_inject_gp(ctxt->vcpu, 0);
> + return -1;
> + }
> + ss.selector = (u16)(msr_data + 24);
> + break;
> + case X86EMUL_MODE_PROT64:
> + cs.selector = (u16)(msr_data + 32);
> + if (msr_data == 0x0) {
> + kvm_inject_gp(ctxt->vcpu, 0);
> + return -1;
> + }
> + ss.selector = cs.selector + 8;
> + cs.db = 0;
> + cs.l = 1;
> + break;
> + }
> + cs.selector |= SELECTOR_RPL_MASK;
> + ss.selector |= SELECTOR_RPL_MASK;
> + break;
> + }
> +
> + kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
> + kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
> +
> + switch (ins) {
> + case EMUL_SYSCALL:
> + c->regs[VCPU_REGS_RCX] = c->eip;
> + if (is_long_mode(ctxt->vcpu)) {
> + c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
> +
> + kvm_x86_ops->get_msr(ctxt->vcpu,
> + usermode == X86EMUL_MODE_PROT64 ?
> + MSR_LSTAR : MSR_CSTAR,
> + &msr_data);
> + c->eip = msr_data;
> +
> + kvm_x86_ops->get_msr(ctxt->vcpu,
> + MSR_SYSCALL_MASK, &msr_data);
> + ctxt->eflags &= ~(msr_data | EFLG_RF);
> + } else {
> + /* legacy mode */
> + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
> + c->eip = (u32)msr_data;
> +
> + ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
> + }
> + break;
> + case EMUL_SYSENTER:
> + kvm_x86_ops->get_msr(ctxt->vcpu,
> + MSR_IA32_SYSENTER_EIP, &msr_data);
> + c->eip = msr_data;
> +
> + kvm_x86_ops->get_msr(ctxt->vcpu,
> + MSR_IA32_SYSENTER_ESP, &msr_data);
> + c->regs[VCPU_REGS_RSP] = msr_data;
> + break;
> +
> + case EMUL_SYSEXIT:
> + c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
> + c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
> + break;
> + }
> +
> + return 0;
> +}
>
This function is way too big. Please split it up into one function per
instruction, and use a helper function for common code.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
prev parent reply other threads:[~2009-06-17 7:24 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2009-06-16 13:25 [PATCH] add sysenter/syscall emulation for 32bit compat mode Andre Przywara
2009-06-17 5:02 ` Amit Shah
2009-06-17 7:51 ` Andre Przywara
2009-06-17 7:24 ` Avi Kivity [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=4A389A26.2090800@redhat.com \
--to=avi@redhat.com \
--cc=amit.shah@redhat.com \
--cc=andre.przywara@amd.com \
--cc=christoph.egger@amd.com \
--cc=kvm@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox