* [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation
2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
@ 2016-12-13 11:26 ` Jan Beulich
2016-12-13 15:50 ` Andrew Cooper
2016-12-13 11:27 ` [PATCH v4 2/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
` (3 subsequent siblings)
4 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:26 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper
... instead of custom handling. Note that we can't use generic
emulation, as the emulator's far branch support is rather rudimentary
at this point in time.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Drop a redundant conditional and the then pointless __addr_ok()
check. Deliver exceptions when getting back X86EMUL_EXCEPTION
(addressing a prior re-basing oversight). Retain zero error code
for failed insn fetches from original code.
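
For reference (illustration only, not part of the patch): the decode-then-query
pattern emulate_gate_op() switches to looks roughly like the sketch below.
classify_far_branch() is a hypothetical helper; the fetch hook stands for a
gate_op_read()-style insn_fetch implementation, and x86_decode_insn() plus the
x86_insn_*() accessors are the interfaces used / introduced below.

    /*
     * Hypothetical helper: decode the current instruction through a fetch
     * hook, then query the resulting state instead of hand-rolling a
     * prefix/ModRM decoder.  Assumes the usual x86_emulate.h and xen/err.h
     * declarations are in scope.
     */
    static int classify_far_branch(
        struct x86_emulate_ctxt *ctxt,
        int (*fetch)(enum x86_segment, unsigned long, void *, unsigned int,
                     struct x86_emulate_ctxt *),
        bool *is_jump, unsigned int *len)
    {
        struct x86_emulate_state *state = x86_decode_insn(ctxt, fetch);

        if ( IS_ERR_OR_NULL(state) )
            return -X86EMUL_UNHANDLEABLE;

        switch ( ctxt->opcode )
        {
            unsigned int modrm_reg;

        case 0xea: /* jmp <sel>:<off> */
            *is_jump = true;
            break;
        case 0x9a: /* call <sel>:<off> */
            *is_jump = false;
            break;
        case 0xff: /* grp5: /3 = call far indirect, /5 = jmp far indirect */
            if ( x86_insn_modrm(state, NULL, &modrm_reg) >= 3 ||
                 ((modrm_reg & 7) != 3 && (modrm_reg & 7) != 5) )
            {
                x86_emulate_free_state(state);
                return -X86EMUL_UNHANDLEABLE;
            }
            *is_jump = (modrm_reg & 7) == 5;
            break;
        default:
            x86_emulate_free_state(state);
            return -X86EMUL_UNHANDLEABLE;
        }

        *len = x86_insn_length(state, ctxt);
        x86_emulate_free_state(state);
        return 0;
    }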
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -28,6 +28,7 @@
#include <xen/init.h>
#include <xen/sched.h>
#include <xen/lib.h>
+#include <xen/err.h>
#include <xen/errno.h>
#include <xen/mm.h>
#include <xen/console.h>
@@ -3279,13 +3280,94 @@ static inline int check_stack_limit(unsi
(!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit));
}
+struct gate_op_ctxt {
+ struct x86_emulate_ctxt ctxt;
+ struct {
+ unsigned long base, limit;
+ } cs;
+ bool insn_fetch;
+};
+
+static int gate_op_read(
+ enum x86_segment seg,
+ unsigned long offset,
+ void *p_data,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ const struct gate_op_ctxt *goc =
+ container_of(ctxt, struct gate_op_ctxt, ctxt);
+ unsigned int rc = bytes, sel = 0;
+ unsigned long addr = offset, limit = 0;
+
+ switch ( seg )
+ {
+ case x86_seg_cs:
+ addr += goc->cs.base;
+ limit = goc->cs.limit;
+ break;
+ case x86_seg_ds:
+ sel = read_sreg(ds);
+ break;
+ case x86_seg_es:
+ sel = read_sreg(es);
+ break;
+ case x86_seg_fs:
+ sel = read_sreg(fs);
+ break;
+ case x86_seg_gs:
+ sel = read_sreg(gs);
+ break;
+ case x86_seg_ss:
+ sel = ctxt->regs->ss;
+ break;
+ default:
+ return X86EMUL_UNHANDLEABLE;
+ }
+ if ( sel )
+ {
+ unsigned int ar;
+
+ ASSERT(!goc->insn_fetch);
+ if ( !read_descriptor(sel, current, &addr, &limit, &ar, 0) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+ return X86EMUL_UNHANDLEABLE;
+ addr += offset;
+ }
+ else if ( seg != x86_seg_cs )
+ return X86EMUL_UNHANDLEABLE;
+
+ /* We don't mean to emulate any branches. */
+ if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+ return X86EMUL_UNHANDLEABLE;
+
+ addr = (uint32_t)addr;
+
+ if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) )
+ {
+ /*
+ * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
+ * cpu_has_nx, but we'd then need a "fetch" variant of
+ * __copy_from_user() respecting NX, SMEP, and protection keys.
+ */
+ x86_emul_pagefault(0, addr + bytes - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ return X86EMUL_OKAY;
+}
+
static void emulate_gate_op(struct cpu_user_regs *regs)
{
struct vcpu *v = current;
- unsigned int sel, ar, dpl, nparm, opnd_sel;
- unsigned int op_default, op_bytes, ad_default, ad_bytes;
- unsigned long off, eip, opnd_off, base, limit;
- int jump;
+ unsigned int sel, ar, dpl, nparm, insn_len;
+ struct gate_op_ctxt ctxt = { .ctxt.regs = regs, .insn_fetch = true };
+ struct x86_emulate_state *state;
+ unsigned long off, base, limit;
+ uint16_t opnd_sel = 0;
+ int jump = -1, rc = X86EMUL_OKAY;
/* Check whether this fault is due to the use of a call gate. */
if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) ||
@@ -3307,7 +3389,8 @@ static void emulate_gate_op(struct cpu_u
* Decode instruction (and perhaps operand) to determine RPL,
* whether this is a jump or a call, and the call return offset.
*/
- if ( !read_descriptor(regs->cs, v, &base, &limit, &ar, 0) ||
+ if ( !read_descriptor(regs->cs, v, &ctxt.cs.base, &ctxt.cs.limit,
+ &ar, 0) ||
!(ar & _SEGMENT_S) ||
!(ar & _SEGMENT_P) ||
!(ar & _SEGMENT_CODE) )
@@ -3316,179 +3399,73 @@ static void emulate_gate_op(struct cpu_u
return;
}
- op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2;
- ad_default = ad_bytes = op_default;
- opnd_sel = opnd_off = 0;
- jump = -1;
- for ( eip = regs->eip; eip - regs->_eip < 10; )
+ ctxt.ctxt.addr_size = ar & _SEGMENT_DB ? 32 : 16;
+ /* Leave zero in ctxt.ctxt.sp_size, as it's not needed for decoding. */
+ state = x86_decode_insn(&ctxt.ctxt, gate_op_read);
+ ctxt.insn_fetch = false;
+ if ( IS_ERR_OR_NULL(state) )
{
- switch ( insn_fetch(u8, base, eip, limit) )
+ if ( PTR_ERR(state) == -X86EMUL_EXCEPTION )
{
- case 0x66: /* operand-size override */
- op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
- continue;
- case 0x67: /* address-size override */
- ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
- continue;
- case 0x2e: /* CS override */
- opnd_sel = regs->cs;
- ASSERT(opnd_sel);
- continue;
- case 0x3e: /* DS override */
- opnd_sel = read_sreg(ds);
- if ( !opnd_sel )
- opnd_sel = dpl;
- continue;
- case 0x26: /* ES override */
- opnd_sel = read_sreg(es);
- if ( !opnd_sel )
- opnd_sel = dpl;
- continue;
- case 0x64: /* FS override */
- opnd_sel = read_sreg(fs);
- if ( !opnd_sel )
- opnd_sel = dpl;
- continue;
- case 0x65: /* GS override */
- opnd_sel = read_sreg(gs);
- if ( !opnd_sel )
- opnd_sel = dpl;
- continue;
- case 0x36: /* SS override */
- opnd_sel = regs->ss;
- if ( !opnd_sel )
- opnd_sel = dpl;
- continue;
- case 0xea:
- ++jump;
- /* FALLTHROUGH */
- case 0x9a:
- ++jump;
- opnd_sel = regs->cs;
- opnd_off = eip;
- ad_bytes = ad_default;
- eip += op_bytes + 2;
+ ASSERT(ctxt.ctxt.event_pending);
+ pv_inject_event(&ctxt.ctxt.event);
+ }
+ else
+ {
+ ASSERT(!ctxt.ctxt.event_pending);
+ do_guest_trap(TRAP_gp_fault, regs);
+ }
+ return;
+ }
+
+ switch ( ctxt.ctxt.opcode )
+ {
+ unsigned int modrm_345;
+
+ case 0xea:
+ ++jump;
+ /* fall through */
+ case 0x9a:
+ ++jump;
+ opnd_sel = x86_insn_immediate(state, 1);
+ break;
+ case 0xff:
+ if ( x86_insn_modrm(state, NULL, &modrm_345) >= 3 )
break;
- case 0xff:
- {
- unsigned int modrm;
+ switch ( modrm_345 & 7 )
+ {
+ enum x86_segment seg;
- switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 )
- {
- case 0x28: case 0x68: case 0xa8:
- ++jump;
- /* FALLTHROUGH */
- case 0x18: case 0x58: case 0x98:
- ++jump;
- if ( ad_bytes != 2 )
- {
- if ( (modrm & 7) == 4 )
- {
- unsigned int sib;
- sib = insn_fetch(u8, base, eip, limit);
-
- modrm = (modrm & ~7) | (sib & 7);
- if ( ((sib >>= 3) & 7) != 4 )
- opnd_off = *(unsigned long *)
- decode_register(sib & 7, regs, 0);
- opnd_off <<= sib >> 3;
- }
- if ( (modrm & 7) != 5 || (modrm & 0xc0) )
- opnd_off += *(unsigned long *)
- decode_register(modrm & 7, regs, 0);
- else
- modrm |= 0x87;
- if ( !opnd_sel )
- {
- switch ( modrm & 7 )
- {
- default:
- opnd_sel = read_sreg(ds);
- break;
- case 4: case 5:
- opnd_sel = regs->ss;
- break;
- }
- }
- }
- else
- {
- switch ( modrm & 7 )
- {
- case 0: case 1: case 7:
- opnd_off = regs->ebx;
- break;
- case 6:
- if ( !(modrm & 0xc0) )
- modrm |= 0x80;
- else
- case 2: case 3:
- {
- opnd_off = regs->ebp;
- if ( !opnd_sel )
- opnd_sel = regs->ss;
- }
- break;
- }
- if ( !opnd_sel )
- opnd_sel = read_sreg(ds);
- switch ( modrm & 7 )
- {
- case 0: case 2: case 4:
- opnd_off += regs->esi;
- break;
- case 1: case 3: case 5:
- opnd_off += regs->edi;
- break;
- }
- }
- switch ( modrm & 0xc0 )
- {
- case 0x40:
- opnd_off += insn_fetch(s8, base, eip, limit);
- break;
- case 0x80:
- if ( ad_bytes > 2 )
- opnd_off += insn_fetch(s32, base, eip, limit);
- else
- opnd_off += insn_fetch(s16, base, eip, limit);
- break;
- }
- if ( ad_bytes == 4 )
- opnd_off = (unsigned int)opnd_off;
- else if ( ad_bytes == 2 )
- opnd_off = (unsigned short)opnd_off;
- break;
- }
- }
+ case 5:
+ ++jump;
+ /* fall through */
+ case 3:
+ ++jump;
+ base = x86_insn_operand_ea(state, &seg);
+ rc = gate_op_read(seg,
+ base + (x86_insn_opsize(state) >> 3),
+ &opnd_sel, sizeof(opnd_sel), &ctxt.ctxt);
break;
}
break;
}
- if ( jump < 0 )
- {
- fail:
- do_guest_trap(TRAP_gp_fault, regs);
- skip:
- return;
- }
+ insn_len = x86_insn_length(state, &ctxt.ctxt);
+ x86_emulate_free_state(state);
- if ( (opnd_sel != regs->cs &&
- !read_descriptor(opnd_sel, v, &base, &limit, &ar, 0)) ||
- !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) )
+ if ( rc == X86EMUL_EXCEPTION )
{
- do_guest_trap(TRAP_gp_fault, regs);
+ ASSERT(ctxt.ctxt.event_pending);
+ pv_inject_event(&ctxt.ctxt.event);
return;
}
- opnd_off += op_bytes;
-#define ad_default ad_bytes
- opnd_sel = insn_fetch(u16, base, opnd_off, limit);
-#undef ad_default
- if ( (opnd_sel & ~3) != regs->error_code || dpl < (opnd_sel & 3) )
+ ASSERT(!ctxt.ctxt.event_pending);
+
+ if ( rc != X86EMUL_OKAY ||
+ jump < 0 ||
+ (opnd_sel & ~3) != regs->error_code ||
+ dpl < (opnd_sel & 3) )
{
do_guest_trap(TRAP_gp_fault, regs);
return;
@@ -3629,7 +3606,7 @@ static void emulate_gate_op(struct cpu_u
}
}
push(regs->cs);
- push(eip);
+ push(regs->eip + insn_len);
#undef push
regs->esp = esp;
regs->ss = ss;
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -5532,6 +5532,14 @@ void x86_emulate_free_state(struct x86_e
}
#endif
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state)
+{
+ check_state(state);
+
+ return state->op_bytes << 3;
+}
+
int
x86_insn_modrm(const struct x86_emulate_state *state,
unsigned int *rm, unsigned int *reg)
@@ -5549,6 +5557,33 @@ x86_insn_modrm(const struct x86_emulate_
return state->modrm_mod;
}
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+ enum x86_segment *seg)
+{
+ *seg = state->ea.type == OP_MEM ? state->ea.mem.seg : x86_seg_none;
+
+ check_state(state);
+
+ return state->ea.mem.off;
+}
+
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state, unsigned int nr)
+{
+ check_state(state);
+
+ switch ( nr )
+ {
+ case 0:
+ return state->imm1;
+ case 1:
+ return state->imm2;
+ }
+
+ return 0;
+}
+
unsigned int
x86_insn_length(const struct x86_emulate_state *state,
const struct x86_emulate_ctxt *ctxt)
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -628,9 +628,17 @@ x86_decode_insn(
void *p_data, unsigned int bytes,
struct x86_emulate_ctxt *ctxt));
+unsigned int
+x86_insn_opsize(const struct x86_emulate_state *state);
int
x86_insn_modrm(const struct x86_emulate_state *state,
unsigned int *rm, unsigned int *reg);
+unsigned long
+x86_insn_operand_ea(const struct x86_emulate_state *state,
+ enum x86_segment *seg);
+unsigned long
+x86_insn_immediate(const struct x86_emulate_state *state,
+ unsigned int nr);
unsigned int
x86_insn_length(const struct x86_emulate_state *state,
const struct x86_emulate_ctxt *ctxt);
* [PATCH v4 3/5] x86emul: make write and cmpxchg hooks optional
2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
2016-12-13 11:26 ` [PATCH v4 1/5] x86/32on64: use generic instruction decoding for call gate emulation Jan Beulich
2016-12-13 11:27 ` [PATCH v4 2/5] x86emul: generalize exception handling for rep_* hooks Jan Beulich
@ 2016-12-13 11:28 ` Jan Beulich
2016-12-13 15:52 ` Andrew Cooper
2016-12-13 11:28 ` [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling Jan Beulich
2016-12-13 11:29 ` [PATCH v4 5/5] x86/PV: prefer pv_inject_hw_exception() Jan Beulich
4 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:28 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper
While the read and fetch hooks are basically unavoidable, write and
cmpxchg aren't really needed by that many insns.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Don't open-code fail_if(). Add ASSERT()s for fetch and read hooks.
Move a write hook check closer to its use, and add ASSERT()s and
comments where this is not desirable. Re-base.
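
For reference (illustration only, not part of the patch): with write and
cmpxchg now optional, a decode/inspect-only user can register just the two
mandatory hooks and leave the rest NULL. my_fetch() is a stand-in for whatever
insn_fetch implementation the caller supplies; x86emul_unhandleable_rw is the
pre-existing always-failing read helper.

    /* Sketch, assuming the usual x86_emulate.h declarations are in scope. */
    int my_fetch(enum x86_segment seg, unsigned long offset, void *p_data,
                 unsigned int bytes, struct x86_emulate_ctxt *ctxt);

    static const struct x86_emulate_ops inspect_only_ops = {
        .insn_fetch = my_fetch,                /* mandatory - ASSERT()ed in x86_decode() */
        .read       = x86emul_unhandleable_rw, /* mandatory - ASSERT()ed in x86_emulate() */
        /*
         * .write and .cmpxchg may simply be left NULL now: insns needing
         * them make x86_emulate() bail with X86EMUL_UNHANDLEABLE via
         * fail_if() instead of calling through a PTR_POISON pointer.
         */
    };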
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1520,6 +1520,7 @@ protmode_load_seg(
{
uint32_t new_desc_b = desc.b | a_flag;
+ fail_if(!ops->cmpxchg);
switch ( (rc = ops->cmpxchg(sel_seg, (sel & 0xfff8) + 4, &desc.b,
&new_desc_b, sizeof(desc.b), ctxt)) )
{
@@ -2017,6 +2018,8 @@ x86_decode(
bool pc_rel = false;
int rc = X86EMUL_OKAY;
+ ASSERT(ops->insn_fetch);
+
memset(state, 0, sizeof(*state));
ea.type = OP_NONE;
ea.mem.seg = x86_seg_ds;
@@ -2494,6 +2497,8 @@ x86_emulate(
struct x86_emulate_stub stub = {};
DECLARE_ALIGNED(mmval_t, mmval);
+ ASSERT(ops->read);
+
rc = x86_decode(&state, ctxt, ops);
if ( rc != X86EMUL_OKAY )
return rc;
@@ -2658,13 +2663,18 @@ x86_emulate(
}
else if ( !(d & Mov) ) /* optimisation - avoid slow emulated read */
{
+ fail_if(lock_prefix ? !ops->cmpxchg : !ops->write);
if ( (rc = read_ulong(dst.mem.seg, dst.mem.off,
&dst.val, dst.bytes, ctxt, ops)) )
goto done;
dst.orig_val = dst.val;
}
- else /* Lock prefix is allowed only on RMW instructions. */
+ else
+ {
+ /* Lock prefix is allowed only on RMW instructions. */
generate_exception_if(lock_prefix, EXC_UD);
+ fail_if(!ops->write);
+ }
break;
}
@@ -2813,7 +2823,9 @@ x86_emulate(
unsigned long regs[] = {
_regs.eax, _regs.ecx, _regs.edx, _regs.ebx,
_regs.esp, _regs.ebp, _regs.esi, _regs.edi };
+
generate_exception_if(mode_64bit(), EXC_UD);
+ fail_if(!ops->write);
for ( i = 0; i < 8; i++ )
if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(op_bytes),
&regs[i], op_bytes, ctxt)) != 0 )
@@ -3111,7 +3123,7 @@ x86_emulate(
case 0x9a: /* call (far, absolute) */
ASSERT(!mode_64bit());
far_call:
- fail_if(ops->read_segment == NULL);
+ fail_if(!ops->read_segment || !ops->write);
if ( (rc = ops->read_segment(x86_seg_cs, &sreg, ctxt)) ||
(rc = load_seg(x86_seg_cs, imm2, 0, &cs, ctxt, ops)) ||
@@ -3350,6 +3362,7 @@ x86_emulate(
dst.type = OP_REG;
dst.bytes = (mode_64bit() && (op_bytes == 4)) ? 8 : op_bytes;
dst.reg = (unsigned long *)&_regs.ebp;
+ fail_if(!ops->write);
if ( (rc = ops->write(x86_seg_ss, sp_pre_dec(dst.bytes),
&_regs.ebp, dst.bytes, ctxt)) )
goto done;
@@ -4446,6 +4459,7 @@ x86_emulate(
else if ( rc != X86EMUL_UNHANDLEABLE )
goto done;
}
+ fail_if(limit && !ops->write);
while ( limit )
{
rc = ops->write(ea.mem.seg, base, &zero, sizeof(zero), ctxt);
@@ -4466,7 +4480,7 @@ x86_emulate(
case 1: /* sidt */
generate_exception_if(ea.type != OP_MEM, EXC_UD);
generate_exception_if(umip_active(ctxt, ops), EXC_GP, 0);
- fail_if(ops->read_segment == NULL);
+ fail_if(!ops->read_segment || !ops->write);
if ( (rc = ops->read_segment(seg, &sreg, ctxt)) )
goto done;
if ( mode_64bit() )
@@ -4503,12 +4517,18 @@ x86_emulate(
break;
case 4: /* smsw */
generate_exception_if(umip_active(ctxt, ops), EXC_GP, 0);
- ea.bytes = (ea.type == OP_MEM) ? 2 : op_bytes;
+ if ( ea.type == OP_MEM )
+ {
+ fail_if(!ops->write);
+ d |= Mov; /* force writeback */
+ ea.bytes = 2;
+ }
+ else
+ ea.bytes = op_bytes;
dst = ea;
fail_if(ops->read_cr == NULL);
if ( (rc = ops->read_cr(0, &dst.val, ctxt)) )
goto done;
- d |= Mov; /* force writeback */
break;
case 6: /* lmsw */
fail_if(ops->read_cr == NULL);
@@ -4710,6 +4730,8 @@ x86_emulate(
if ( !(b & 1) )
rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
ea.bytes, ctxt);
+ else
+ fail_if(!ops->write); /* Check before running the stub. */
/* convert memory operand to (%rAX) */
rex_prefix &= ~REX_B;
vex.b = 1;
@@ -4724,8 +4746,11 @@ x86_emulate(
put_fpu(&fic);
put_stub(stub);
if ( !rc && (b & 1) && (ea.type == OP_MEM) )
+ {
+ ASSERT(ops->write); /* See the fail_if() above. */
rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
ea.bytes, ctxt);
+ }
if ( rc )
goto done;
dst.type = OP_NONE;
@@ -4994,6 +5019,8 @@ x86_emulate(
if ( b == 0x6f )
rc = ops->read(ea.mem.seg, ea.mem.off+0, mmvalp,
ea.bytes, ctxt);
+ else
+ fail_if(!ops->write); /* Check before running the stub. */
}
if ( ea.type == OP_MEM || b == 0x7e )
{
@@ -5015,8 +5042,11 @@ x86_emulate(
put_fpu(&fic);
put_stub(stub);
if ( !rc && (b != 0x6f) && (ea.type == OP_MEM) )
+ {
+ ASSERT(ops->write); /* See the fail_if() above. */
rc = ops->write(ea.mem.seg, ea.mem.off, mmvalp,
ea.bytes, ctxt);
+ }
if ( rc )
goto done;
dst.type = OP_NONE;
@@ -5264,6 +5294,7 @@ x86_emulate(
generate_exception_if((modrm_reg & 7) != 1, EXC_UD);
generate_exception_if(ea.type != OP_MEM, EXC_UD);
+ fail_if(!ops->cmpxchg);
if ( op_bytes == 8 )
host_and_vcpu_must_have(cx16);
op_bytes *= 2;
@@ -5396,12 +5427,18 @@ x86_emulate(
!ctxt->force_writeback )
/* nothing to do */;
else if ( lock_prefix )
+ {
+ fail_if(!ops->cmpxchg);
rc = ops->cmpxchg(
dst.mem.seg, dst.mem.off, &dst.orig_val,
&dst.val, dst.bytes, ctxt);
+ }
else
+ {
+ fail_if(!ops->write);
rc = ops->write(
dst.mem.seg, dst.mem.off, &dst.val, dst.bytes, ctxt);
+ }
if ( rc != 0 )
goto done;
default:
@@ -5485,8 +5522,6 @@ x86_decode_insn(
const struct x86_emulate_ops ops = {
.insn_fetch = insn_fetch,
.read = x86emul_unhandleable_rw,
- .write = PTR_POISON,
- .cmpxchg = PTR_POISON,
};
int rc = x86_decode(state, ctxt, &ops);
* [PATCH v4 4/5] x86/PV: use generic emulator for privileged instruction handling
2016-12-13 11:08 [PATCH v4 0/5] x86: extend use of generic insn emulator for PV Jan Beulich
` (2 preceding siblings ...)
2016-12-13 11:28 ` [PATCH v4 3/5] x86emul: make write and cmpxchg hooks optional Jan Beulich
@ 2016-12-13 11:28 ` Jan Beulich
2016-12-13 16:53 ` Andrew Cooper
2016-12-13 11:29 ` [PATCH v4 5/5] x86/PV: prefer pv_inject_hw_exception() Jan Beulich
4 siblings, 1 reply; 14+ messages in thread
From: Jan Beulich @ 2016-12-13 11:28 UTC (permalink / raw)
To: xen-devel; +Cc: Andrew Cooper
There's a new emulator return code being added to allow bypassing
certain operations (see the code comment).
Another small tweak to the emulator is to the single-iteration handling
of INS and OUTS: since we don't want to handle any other memory access
instructions, we want these to be handled by the rep_ins() / rep_outs()
hooks here too.
And then the long-mode related EFER bits now get hidden from 32-bit
guests. This should have been that way from the beginning, but becomes a
requirement now as the emulator's in_longmode() needs this to reflect the
guest's view.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Rename priv_op_to_linear() to pv_emul_virt_to_linear() and change
its return type. Refuse system segments and set just one of L and
DB in 64-bit mode case of priv_op_read_segment(). Have
pv_emul_cpuid() return X86EMUL_EXCEPTION in the CPUID faulting
case. Add EFER behavioral change to description.
v3: Re-base. Do away with the special case pointer checks on the ->read
and ->write methods in OUTS and INS handling. Clear EFER.LM* bits
for 32-bit guests (to avoid the emulator's in_longmode() returning
a wrong result). Introduce and use ->validate() hook. Make
formatting more consistent.
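
For reference (illustration only, not part of the patch): the new X86EMUL_DONE
return value lets a hook report that the access has been carried out entirely
on its own, so the emulator skips its usual handling of the result.
priv_op_read_io() / priv_op_write_io() below use it when the io_emul stub has
already executed the port access and updated the guest's GPRs. A hypothetical
read_io hook using it could look like this; demo_port_handled_directly() and
demo_port_read() are made-up stand-ins.

    /* Hypothetical stand-ins, not real Xen interfaces. */
    bool demo_port_handled_directly(unsigned int port, unsigned int bytes,
                                    struct cpu_user_regs *regs);
    unsigned long demo_port_read(unsigned int port, unsigned int bytes);

    static int demo_read_io(unsigned int port, unsigned int bytes,
                            unsigned long *val, struct x86_emulate_ctxt *ctxt)
    {
        /*
         * Fast path: the access was already performed behind the emulator's
         * back (register update included), so tell x86_emulate() not to
         * commit *val on top of that.
         */
        if ( demo_port_handled_directly(port, bytes, ctxt->regs) )
            return X86EMUL_DONE;

        /* Normal path: hand the value back for the emulator to commit. */
        *val = demo_port_read(port, bytes);
        return X86EMUL_OKAY;
    }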
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -697,16 +697,13 @@ static inline void do_guest_trap(unsigne
pv_inject_event(&event);
}
-static void instruction_done(
- struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
{
regs->eip = eip;
regs->eflags &= ~X86_EFLAGS_RF;
- if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+ if ( regs->eflags & X86_EFLAGS_TF )
{
- current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
- if ( regs->eflags & X86_EFLAGS_TF )
- current->arch.debugreg[6] |= DR_STEP;
+ current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
do_guest_trap(TRAP_debug, regs);
}
}
@@ -1336,7 +1333,7 @@ static int emulate_invalid_rdtscp(struct
return 0;
eip += sizeof(opcode);
pv_soft_rdtsc(v, regs, 1);
- instruction_done(regs, eip, 0);
+ instruction_done(regs, eip);
return EXCRET_fault_fixed;
}
@@ -1378,7 +1375,7 @@ static int emulate_forced_invalid_op(str
pv_cpuid(regs);
- instruction_done(regs, eip, 0);
+ instruction_done(regs, eip);
trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
@@ -2023,6 +2020,155 @@ static int read_gate_descriptor(unsigned
return 1;
}
+static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
+ unsigned int bytes, unsigned long limit,
+ enum x86_segment seg,
+ struct x86_emulate_ctxt *ctxt,
+ unsigned long *addr)
+{
+ int rc = X86EMUL_OKAY;
+
+ *addr = base + offset;
+
+ if ( ctxt->addr_size < 8 )
+ {
+ if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+ rc = X86EMUL_EXCEPTION;
+ *addr = (uint32_t)*addr;
+ }
+ else if ( !__addr_ok(*addr) )
+ rc = X86EMUL_EXCEPTION;
+
+ if ( unlikely(rc == X86EMUL_EXCEPTION) )
+ x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+ : TRAP_stack_error,
+ 0, ctxt);
+
+ return rc;
+}
+
+struct priv_op_ctxt {
+ struct x86_emulate_ctxt ctxt;
+ struct {
+ unsigned long base, limit;
+ } cs;
+ char *io_emul_stub;
+ unsigned int bpmatch;
+ unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+ unsigned long offset,
+ void *p_data,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ const struct priv_op_ctxt *poc =
+ container_of(ctxt, struct priv_op_ctxt, ctxt);
+ unsigned int rc;
+ unsigned long addr = poc->cs.base + offset;
+
+ ASSERT(seg == x86_seg_cs);
+
+ /* We don't mean to emulate any branches. */
+ if ( !bytes )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+ x86_seg_cs, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+ {
+ x86_emul_pagefault(cpu_has_nx ? PFEC_insn_fetch : 0,
+ addr + bytes - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+ struct segment_register *reg,
+ struct x86_emulate_ctxt *ctxt)
+{
+ if ( ctxt->addr_size < 8 )
+ {
+ unsigned long limit;
+ unsigned int sel, ar;
+
+ switch ( seg )
+ {
+ case x86_seg_cs: sel = ctxt->regs->cs; break;
+ case x86_seg_ds: sel = read_sreg(ds); break;
+ case x86_seg_es: sel = read_sreg(es); break;
+ case x86_seg_fs: sel = read_sreg(fs); break;
+ case x86_seg_gs: sel = read_sreg(gs); break;
+ case x86_seg_ss: sel = ctxt->regs->ss; break;
+ case x86_seg_tr:
+ /* Check if this is an attempt to access to I/O bitmap. */
+ if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+ return X86EMUL_DONE;
+ /* fall through */
+ default: return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+ return X86EMUL_UNHANDLEABLE;
+
+ reg->limit = limit;
+ reg->attr.bytes = ar >> 8;
+ }
+ else
+ {
+ switch ( seg )
+ {
+ default:
+ if ( !is_x86_user_segment(seg) )
+ return X86EMUL_UNHANDLEABLE;
+ reg->base = 0;
+ break;
+ case x86_seg_fs:
+ reg->base = rdfsbase();
+ break;
+ case x86_seg_gs:
+ reg->base = rdgsbase();
+ break;
+ }
+
+ reg->limit = ~0U;
+
+ reg->attr.bytes = 0;
+ reg->attr.fields.type = _SEGMENT_WR >> 8;
+ if ( seg == x86_seg_cs )
+ {
+ reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+ reg->attr.fields.l = 1;
+ }
+ else
+ reg->attr.fields.db = 1;
+ reg->attr.fields.s = 1;
+ reg->attr.fields.dpl = 3;
+ reg->attr.fields.p = 1;
+ reg->attr.fields.g = 1;
+ }
+
+ /*
+ * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+ * Also do this for consistency for non-conforming code segments.
+ */
+ if ( (seg == x86_seg_ss ||
+ (seg == x86_seg_cs &&
+ !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+ guest_kernel_mode(current, ctxt->regs) )
+ reg->attr.fields.dpl = 0;
+
+ return X86EMUL_OKAY;
+}
+
/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
{
@@ -2269,6 +2415,238 @@ unsigned long guest_to_host_gpr_switch(u
void (*pv_post_outb_hook)(unsigned int port, u8 value);
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+ unsigned int port, unsigned int bytes)
+{
+ if ( !ctxt->io_emul_stub )
+ ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+ (this_cpu(stubs.addr) &
+ ~PAGE_MASK) +
+ STUB_BUF_SIZE / 2;
+
+ /* movq $host_to_guest_gpr_switch,%rcx */
+ ctxt->io_emul_stub[0] = 0x48;
+ ctxt->io_emul_stub[1] = 0xb9;
+ *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+ /* callq *%rcx */
+ ctxt->io_emul_stub[10] = 0xff;
+ ctxt->io_emul_stub[11] = 0xd1;
+ /* data16 or nop */
+ ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+ /* <io-access opcode> */
+ ctxt->io_emul_stub[13] = opcode;
+ /* imm8 or nop */
+ ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+ /* ret (jumps to guest_to_host_gpr_switch) */
+ ctxt->io_emul_stub[15] = 0xc3;
+ BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+ if ( ioemul_handle_quirk )
+ ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+ /* Handy function-typed pointer to the stub. */
+ return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+ unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+
+ /* INS must not come here. */
+ ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+ if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+ mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ return X86EMUL_DONE;
+ }
+
+ *val = guest_io_read(port, bytes, currd);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+ unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+
+ /* OUTS must not come here. */
+ ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+ if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+ mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ if ( (bytes == 1) && pv_post_outb_hook )
+ pv_post_outb_hook(port, val);
+ return X86EMUL_DONE;
+ }
+
+ guest_io_write(port, bytes, val, currd);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+ enum x86_segment seg, unsigned long offset,
+ unsigned int bytes_per_rep, unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+ unsigned long goal = *reps;
+ struct segment_register sreg;
+ int rc;
+
+ ASSERT(seg == x86_seg_es);
+
+ *reps = 0;
+
+ if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( !sreg.attr.fields.p )
+ return X86EMUL_UNHANDLEABLE;
+ if ( !sreg.attr.fields.s ||
+ (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+ !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+ {
+ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+ while ( *reps < goal )
+ {
+ unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+ unsigned long addr;
+
+ rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+ sreg.limit, x86_seg_es, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+ {
+ x86_emul_pagefault(PFEC_write_access,
+ addr + bytes_per_rep - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ ++*reps;
+
+ if ( poc->bpmatch || hypercall_preempt_check() )
+ break;
+
+ /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+ if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+ offset -= bytes_per_rep;
+ else
+ offset += bytes_per_rep;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+ uint16_t port,
+ unsigned int bytes_per_rep, unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+ unsigned long goal = *reps;
+ struct segment_register sreg;
+ int rc;
+
+ *reps = 0;
+
+ if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = priv_op_read_segment(seg, &sreg, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( !sreg.attr.fields.p )
+ return X86EMUL_UNHANDLEABLE;
+ if ( !sreg.attr.fields.s ||
+ ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+ !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+ {
+ x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+ : TRAP_stack_error,
+ 0, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+ while ( *reps < goal )
+ {
+ unsigned int data = 0;
+ unsigned long addr;
+
+ rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+ sreg.limit, seg, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+ {
+ x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ guest_io_write(port, bytes_per_rep, data, currd);
+
+ ++*reps;
+
+ if ( poc->bpmatch || hypercall_preempt_check() )
+ break;
+
+ /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+ if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+ offset -= bytes_per_rep;
+ else
+ offset += bytes_per_rep;
+ }
+
+ return X86EMUL_OKAY;
+}
+
static int priv_op_read_cr(unsigned int reg, unsigned long *val,
struct x86_emulate_ctxt *ctxt)
{
@@ -2409,6 +2787,7 @@ static inline bool is_cpufreq_controller
static int priv_op_read_msr(unsigned int reg, uint64_t *val,
struct x86_emulate_ctxt *ctxt)
{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
const struct vcpu *curr = current;
const struct domain *currd = curr->domain;
bool vpmu_msr = false;
@@ -2436,6 +2815,28 @@ static int priv_op_read_msr(unsigned int
*val = curr->arch.pv_vcpu.gs_base_user;
return X86EMUL_OKAY;
+ /*
+ * In order to fully retain original behavior, defer calling
+ * pv_soft_rdtsc() until after emulation. This may want/need to be
+ * reconsidered.
+ */
+ case MSR_IA32_TSC:
+ poc->tsc |= TSC_BASE;
+ goto normal;
+
+ case MSR_TSC_AUX:
+ poc->tsc |= TSC_AUX;
+ if ( cpu_has_rdtscp )
+ goto normal;
+ *val = 0;
+ return X86EMUL_OKAY;
+
+ case MSR_EFER:
+ *val = read_efer();
+ if ( is_pv_32bit_domain(currd) )
+ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+ return X86EMUL_OKAY;
+
case MSR_K7_FID_VID_CTL:
case MSR_K7_FID_VID_STATUS:
case MSR_K8_PSTATE_LIMIT:
@@ -2539,7 +2940,6 @@ static int priv_op_read_msr(unsigned int
if ( rc )
return X86EMUL_OKAY;
/* fall through */
- case MSR_EFER:
normal:
/* Everyone can read the MSR space. */
/* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
@@ -2761,11 +3161,41 @@ static int priv_op_write_msr(unsigned in
return X86EMUL_UNHANDLEABLE;
}
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+ /* Ignore the instruction if unprivileged. */
+ if ( !cache_flush_permitted(current->domain) )
+ /*
+ * Non-physdev domain attempted WBINVD; ignore for now since
+ * newer linux uses this in some start-of-day timing loops.
+ */
+ ;
+ else
+ wbinvd();
+
+ return X86EMUL_OKAY;
+}
+
int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
unsigned int *edx, struct x86_emulate_ctxt *ctxt)
{
struct cpu_user_regs regs = *ctxt->regs;
+ /*
+ * x86_emulate uses this function to query CPU features for its own
+ * internal use. Make sure we're actually emulating CPUID before checking
+ * for emulated CPUID faulting.
+ */
+ if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
+ {
+ const struct vcpu *curr = current;
+
+ /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
+ if ( curr->arch.cpuid_faulting &&
+ !guest_kernel_mode(curr, ctxt->regs) )
+ return X86EMUL_EXCEPTION;
+ }
+
regs._eax = *eax;
regs._ecx = *ecx;
@@ -2779,497 +3209,153 @@ int pv_emul_cpuid(unsigned int *eax, uns
return X86EMUL_OKAY;
}
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit) \
-({ unsigned long _rc, _ptr = (base) + (eip); \
- type _x; \
- if ( ad_default < 8 ) \
- _ptr = (unsigned int)_ptr; \
- if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
- goto fail; \
- if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
- { \
- pv_inject_page_fault(0, _ptr + sizeof(_x) - _rc); \
- goto skip; \
- } \
- (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_validate(const struct x86_emulate_state *state,
+ struct x86_emulate_ctxt *ctxt)
{
- struct vcpu *v = current;
- struct domain *currd = v->domain;
- unsigned long *reg, eip = regs->eip;
- u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
- enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
- int rc;
- unsigned int port, i, data_sel, ar, data, bpmatch = 0;
- unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
- ? regs->reg \
- : ad_bytes == 4 \
- ? (u32)regs->reg \
- : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
- ? regs->reg = (val) \
- : ad_bytes == 4 \
- ? (*(u32 *)&regs->reg = (val)) \
- : (*(u16 *)&regs->reg = (val)))
- unsigned long code_base, code_limit;
- char *io_emul_stub = NULL;
- void (*io_emul)(struct cpu_user_regs *);
- uint64_t val;
-
- if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
- goto fail;
- op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
- ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
- if ( !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- !(ar & _SEGMENT_CODE) )
- goto fail;
-
- /* emulating only opcodes not allowing SS to be default */
- data_sel = read_sreg(ds);
-
- /* Legacy prefixes. */
- for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
+ switch ( ctxt->opcode )
{
- switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
- {
- case 0x66: /* operand-size override */
- opsize_prefix = 1;
- op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
- continue;
- case 0x67: /* address-size override */
- ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
- continue;
- case 0x2e: /* CS override */
- data_sel = regs->cs;
- continue;
- case 0x3e: /* DS override */
- data_sel = read_sreg(ds);
- continue;
- case 0x26: /* ES override */
- data_sel = read_sreg(es);
- continue;
- case 0x64: /* FS override */
- data_sel = read_sreg(fs);
- lm_ovr = lm_seg_fs;
- continue;
- case 0x65: /* GS override */
- data_sel = read_sreg(gs);
- lm_ovr = lm_seg_gs;
- continue;
- case 0x36: /* SS override */
- data_sel = regs->ss;
- continue;
- case 0xf0: /* LOCK */
- lock = 1;
- continue;
- case 0xf2: /* REPNE/REPNZ */
- case 0xf3: /* REP/REPE/REPZ */
- rep_prefix = 1;
- continue;
- default:
- if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
- {
- rex = opcode;
- continue;
- }
- break;
- }
- break;
- }
-
- /* REX prefix. */
- if ( rex & 8 ) /* REX.W */
- op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
- modrm_reg = (rex & 4) << 1; /* REX.R */
- /* REX.X does not need to be decoded. */
- modrm_rm = (rex & 1) << 3; /* REX.B */
-
- if ( opcode == 0x0f )
- goto twobyte_opcode;
-
- if ( lock )
- goto fail;
-
- /* Input/Output String instructions. */
- if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
- {
- unsigned long data_base, data_limit;
-
- if ( rep_prefix && (rd_ad(ecx) == 0) )
- goto done;
-
- if ( !(opcode & 2) )
- {
- data_sel = read_sreg(es);
- lm_ovr = lm_seg_none;
- }
-
- if ( !(ar & _SEGMENT_L) )
- {
- if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
- &ar, 0) )
- goto fail;
- if ( !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- (opcode & 2 ?
- (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
- (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
- goto fail;
- }
- else
- {
- switch ( lm_ovr )
- {
- default:
- data_base = 0UL;
- break;
- case lm_seg_fs:
- data_base = rdfsbase();
- break;
- case lm_seg_gs:
- data_base = rdgsbase();
- break;
- }
- data_limit = ~0UL;
- ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
- }
-
- port = (u16)regs->edx;
+ case 0x6c ... 0x6f: /* ins / outs */
+ case 0xe4 ... 0xe7: /* in / out (immediate port) */
+ case 0xec ... 0xef: /* in / out (port in %dx) */
+ case X86EMUL_OPC(0x0f, 0x06): /* clts */
+ case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+ case X86EMUL_OPC(0x0f, 0x20) ...
+ X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+ case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+ case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+ case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+ case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+ return X86EMUL_OKAY;
- continue_io_string:
- switch ( opcode )
- {
- case 0x6c: /* INSB */
- op_bytes = 1;
- case 0x6d: /* INSW/INSL */
- if ( (data_limit < (op_bytes - 1)) ||
- (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
- !guest_io_okay(port, op_bytes, v, regs) )
- goto fail;
- data = guest_io_read(port, op_bytes, currd);
- if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
- &data, op_bytes)) != 0 )
- {
- pv_inject_page_fault(PFEC_write_access,
- data_base + rd_ad(edi) + op_bytes - rc);
- return EXCRET_fault_fixed;
- }
- wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
- ? -op_bytes : op_bytes));
+ case 0xfa: case 0xfb: /* cli / sti */
+ if ( !iopl_ok(current, ctxt->regs) )
break;
-
- case 0x6e: /* OUTSB */
- op_bytes = 1;
- case 0x6f: /* OUTSW/OUTSL */
- if ( (data_limit < (op_bytes - 1)) ||
- (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
- !guest_io_okay(port, op_bytes, v, regs) )
- goto fail;
- if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
- op_bytes)) != 0 )
- {
- pv_inject_page_fault(0, data_base + rd_ad(esi)
- + op_bytes - rc);
- return EXCRET_fault_fixed;
- }
- guest_io_write(port, op_bytes, data, currd);
- wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
- ? -op_bytes : op_bytes));
- break;
- }
-
- bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-
- if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
- {
- if ( !bpmatch && !hypercall_preempt_check() )
- goto continue_io_string;
- eip = regs->eip;
- }
-
- goto done;
- }
-
- /*
- * Very likely to be an I/O instruction (IN/OUT).
- * Build an stub to execute the instruction with full guest GPR
- * context. This is needed for some systems which (ab)use IN/OUT
- * to communicate with BIOS code in system-management mode.
- */
- io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
- (this_cpu(stubs.addr) & ~PAGE_MASK) +
- STUB_BUF_SIZE / 2;
- /* movq $host_to_guest_gpr_switch,%rcx */
- io_emul_stub[0] = 0x48;
- io_emul_stub[1] = 0xb9;
- *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
- /* callq *%rcx */
- io_emul_stub[10] = 0xff;
- io_emul_stub[11] = 0xd1;
- /* data16 or nop */
- io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
- /* <io-access opcode> */
- io_emul_stub[13] = opcode;
- /* imm8 or nop */
- io_emul_stub[14] = 0x90;
- /* ret (jumps to guest_to_host_gpr_switch) */
- io_emul_stub[15] = 0xc3;
- BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
- /* Handy function-typed pointer to the stub. */
- io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-
- if ( ioemul_handle_quirk )
- ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
-
- /* I/O Port and Interrupt Flag instructions. */
- switch ( opcode )
- {
- case 0xe4: /* IN imm8,%al */
- op_bytes = 1;
- case 0xe5: /* IN imm8,%eax */
- port = insn_fetch(u8, code_base, eip, code_limit);
- io_emul_stub[14] = port; /* imm8 */
- exec_in:
- if ( !guest_io_okay(port, op_bytes, v, regs) )
- goto fail;
- if ( admin_io_okay(port, op_bytes, currd) )
- {
- mark_regs_dirty(regs);
- io_emul(regs);
- }
- else
- {
- if ( op_bytes == 4 )
- regs->eax = 0;
- else
- regs->eax &= ~((1 << (op_bytes * 8)) - 1);
- regs->eax |= guest_io_read(port, op_bytes, currd);
- }
- bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
- goto done;
-
- case 0xec: /* IN %dx,%al */
- op_bytes = 1;
- case 0xed: /* IN %dx,%eax */
- port = (u16)regs->edx;
- goto exec_in;
-
- case 0xe6: /* OUT %al,imm8 */
- op_bytes = 1;
- case 0xe7: /* OUT %eax,imm8 */
- port = insn_fetch(u8, code_base, eip, code_limit);
- io_emul_stub[14] = port; /* imm8 */
- exec_out:
- if ( !guest_io_okay(port, op_bytes, v, regs) )
- goto fail;
- if ( admin_io_okay(port, op_bytes, currd) )
- {
- mark_regs_dirty(regs);
- io_emul(regs);
- if ( (op_bytes == 1) && pv_post_outb_hook )
- pv_post_outb_hook(port, regs->eax);
- }
- else
- {
- guest_io_write(port, op_bytes, regs->eax, currd);
- }
- bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
- goto done;
-
- case 0xee: /* OUT %al,%dx */
- op_bytes = 1;
- case 0xef: /* OUT %eax,%dx */
- port = (u16)regs->edx;
- goto exec_out;
-
- case 0xfa: /* CLI */
- case 0xfb: /* STI */
- if ( !iopl_ok(v, regs) )
- goto fail;
/*
* This is just too dangerous to allow, in my opinion. Consider if the
* caller then tries to reenable interrupts using POPF: we can't trap
* that and we'll end up with hard-to-debug lockups. Fast & loose will
* do for us. :-)
+ vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
*/
- /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
- goto done;
- }
+ return X86EMUL_DONE;
- /* No decode of this single-byte opcode. */
- goto fail;
+ case X86EMUL_OPC(0x0f, 0x01):
+ {
+ unsigned int modrm_rm, modrm_reg;
- twobyte_opcode:
- /*
- * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
- * and CPUID (0xa2), are executable only from guest kernel mode
- * (virtual ring 0).
- */
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( !guest_kernel_mode(v, regs) &&
- (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
- goto fail;
-
- if ( lock && (opcode & ~3) != 0x20 )
- goto fail;
- switch ( opcode )
- {
- case 0x1: /* RDTSCP and XSETBV */
- switch ( insn_fetch(u8, code_base, eip, code_limit) )
- {
- case 0xf9: /* RDTSCP */
- if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
- !guest_kernel_mode(v, regs) )
- goto fail;
- pv_soft_rdtsc(v, regs, 1);
+ if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+ (modrm_rm & 7) != 1 )
break;
- case 0xd1: /* XSETBV */
+ switch ( modrm_reg & 7 )
{
- u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
+ case 2: /* xsetbv */
+ case 7: /* rdtscp */
+ return X86EMUL_OKAY;
+ }
+ break;
+ }
+ }
- if ( lock || rep_prefix || opsize_prefix
- || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
- {
- do_guest_trap(TRAP_invalid_op, regs);
- goto skip;
- }
+ return X86EMUL_UNHANDLEABLE;
+}
- if ( !guest_kernel_mode(v, regs) )
- goto fail;
+static const struct x86_emulate_ops priv_op_ops = {
+ .insn_fetch = priv_op_insn_fetch,
+ .read = x86emul_unhandleable_rw,
+ .validate = priv_op_validate,
+ .read_io = priv_op_read_io,
+ .write_io = priv_op_write_io,
+ .rep_ins = priv_op_rep_ins,
+ .rep_outs = priv_op_rep_outs,
+ .read_segment = priv_op_read_segment,
+ .read_cr = priv_op_read_cr,
+ .write_cr = priv_op_write_cr,
+ .read_dr = priv_op_read_dr,
+ .write_dr = priv_op_write_dr,
+ .read_msr = priv_op_read_msr,
+ .write_msr = priv_op_write_msr,
+ .cpuid = pv_emul_cpuid,
+ .wbinvd = priv_op_wbinvd,
+};
- if ( handle_xsetbv(regs->ecx, new_xfeature) )
- goto fail;
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
+ struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+ int rc;
+ unsigned int eflags, ar;
- break;
- }
- default:
- goto fail;
- }
- break;
+ if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+ &ar, 1) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ !(ar & _SEGMENT_CODE) )
+ return 0;
- case 0x06: /* CLTS */
- (void)do_fpu_taskswitch(0);
- break;
+ /* Mirror virtualized state into EFLAGS. */
+ ASSERT(regs->_eflags & X86_EFLAGS_IF);
+ if ( vcpu_info(curr, evtchn_upcall_mask) )
+ regs->_eflags &= ~X86_EFLAGS_IF;
+ else
+ regs->_eflags |= X86_EFLAGS_IF;
+ ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+ regs->_eflags |= curr->arch.pv_vcpu.iopl;
+ eflags = regs->_eflags;
+
+ ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+ /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+ rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
- case 0x09: /* WBINVD */
- /* Ignore the instruction if unprivileged. */
- if ( !cache_flush_permitted(currd) )
- /* Non-physdev domain attempted WBINVD; ignore for now since
- newer linux uses this in some start-of-day timing loops */
- ;
- else
- wbinvd();
- break;
+ if ( ctxt.io_emul_stub )
+ unmap_domain_page(ctxt.io_emul_stub);
- case 0x20: /* MOV CR?,<reg> */
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( opcode < 0xc0 )
- goto fail;
- modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
- modrm_rm |= (opcode >> 0) & 7;
- if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
- NULL) != X86EMUL_OKAY )
- goto fail;
- break;
-
- case 0x21: /* MOV DR?,<reg> */ {
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( opcode < 0xc0 )
- goto fail;
- modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
- modrm_rm |= (opcode >> 0) & 7;
- if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
- NULL) != X86EMUL_OKAY )
- goto fail;
- break;
- }
+ /*
+ * Un-mirror virtualized state from EFLAGS.
+ * Nothing we allow to be emulated can change TF, IF, or IOPL.
+ */
+ ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IF | X86_EFLAGS_IOPL)));
+ regs->_eflags |= X86_EFLAGS_IF;
+ regs->_eflags &= ~X86_EFLAGS_IOPL;
+
+ /* More strict than x86_emulate_wrapper(). */
+ ASSERT(ctxt.ctxt.event_pending == (rc == X86EMUL_EXCEPTION));
+
+ switch ( rc )
+ {
+ case X86EMUL_OKAY:
+ if ( ctxt.tsc & TSC_BASE )
+ {
+ if ( ctxt.tsc & TSC_AUX )
+ pv_soft_rdtsc(curr, regs, 1);
+ else if ( currd->arch.vtsc )
+ pv_soft_rdtsc(curr, regs, 0);
+ else
+ {
+ uint64_t val = rdtsc();
- case 0x22: /* MOV <reg>,CR? */
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( opcode < 0xc0 )
- goto fail;
- modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
- modrm_rm |= (opcode >> 0) & 7;
- reg = decode_register(modrm_rm, regs, 0);
- switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
- {
- case X86EMUL_OKAY:
- break;
- case X86EMUL_RETRY: /* retry after preemption */
- goto skip;
- default:
- goto fail;
+ regs->eax = (uint32_t)val;
+ regs->edx = (uint32_t)(val >> 32);
+ }
}
- break;
-
- case 0x23: /* MOV <reg>,DR? */
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( opcode < 0xc0 )
- goto fail;
- modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
- modrm_rm |= (opcode >> 0) & 7;
- reg = decode_register(modrm_rm, regs, 0);
- if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
- goto fail;
- break;
- case 0x30: /* WRMSR */
- if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
- NULL) != X86EMUL_OKAY )
- goto fail;
- break;
-
- case 0x31: /* RDTSC */
- if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
- !guest_kernel_mode(v, regs) )
- goto fail;
- if ( currd->arch.vtsc )
- pv_soft_rdtsc(v, regs, 0);
- else
+ if ( ctxt.ctxt.retire.singlestep )
+ ctxt.bpmatch |= DR_STEP;
+ if ( ctxt.bpmatch )
{
- val = rdtsc();
- goto rdmsr_writeback;
+ curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+ if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+ do_guest_trap(TRAP_debug, regs);
}
- break;
-
- case 0x32: /* RDMSR */
- if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
- goto fail;
- rdmsr_writeback:
- regs->eax = (uint32_t)val;
- regs->edx = (uint32_t)(val >> 32);
- break;
-
- case 0xa2: /* CPUID */
- /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
- if ( v->arch.cpuid_faulting && !guest_kernel_mode(v, regs) )
- goto fail;
-
- pv_cpuid(regs);
- break;
+ /* fall through */
+ case X86EMUL_RETRY:
+ return EXCRET_fault_fixed;
- default:
- goto fail;
+ case X86EMUL_EXCEPTION:
+ pv_inject_event(&ctxt.ctxt.event);
+ return EXCRET_fault_fixed;
}
-#undef wr_ad
-#undef rd_ad
-
- done:
- instruction_done(regs, eip, bpmatch);
- skip:
- if ( io_emul_stub )
- unmap_domain_page(io_emul_stub);
- return EXCRET_fault_fixed;
-
- fail:
- if ( io_emul_stub )
- unmap_domain_page(io_emul_stub);
return 0;
}
@@ -3615,7 +3702,7 @@ static void emulate_gate_op(struct cpu_u
sel |= (regs->cs & 3);
regs->cs = sel;
- instruction_done(regs, off, 0);
+ instruction_done(regs, off);
}
void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1185,7 +1185,7 @@ static int ioport_access_check(
fail_if(ops->read_segment == NULL);
if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
- return rc;
+ return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
/* Ensure the TSS has an io-bitmap-offset field. */
generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
@@ -2506,6 +2506,21 @@ x86_emulate(
/* Sync rIP to post decode value. */
_regs.eip = state.eip;
+ if ( ops->validate )
+ {
+#ifndef NDEBUG
+ state.caller = __builtin_return_address(0);
+#endif
+ rc = ops->validate(&state, ctxt);
+#ifndef NDEBUG
+ state.caller = NULL;
+#endif
+ if ( rc == X86EMUL_DONE )
+ goto no_writeback;
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+ }
+
b = ctxt->opcode;
d = state.desc;
#define state (&state)
@@ -2935,13 +2950,28 @@ x86_emulate(
dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
goto done;
- if ( (nr_reps == 1) || !ops->rep_ins ||
- ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
- &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+ /* Try the presumably most efficient approach first. */
+ if ( !ops->rep_ins )
+ nr_reps = 1;
+ rc = X86EMUL_UNHANDLEABLE;
+ if ( nr_reps == 1 && ops->read_io && ops->write )
{
- fail_if(ops->read_io == NULL);
+ rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
+ if ( rc == X86EMUL_OKAY )
+ nr_reps = 0;
+ }
+ if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_ins )
+ rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
+ &nr_reps, ctxt);
+ if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+ {
+ fail_if(!ops->read_io || !ops->write);
if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
goto done;
+ nr_reps = 0;
+ }
+ if ( !nr_reps && rc == X86EMUL_OKAY )
+ {
dst.type = OP_MEM;
nr_reps = 1;
}
@@ -2959,14 +2989,30 @@ x86_emulate(
ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
goto done;
- if ( (nr_reps == 1) || !ops->rep_outs ||
- ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
- &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+ /* Try the presumably most efficient approach first. */
+ if ( !ops->rep_outs )
+ nr_reps = 1;
+ rc = X86EMUL_UNHANDLEABLE;
+ if ( nr_reps == 1 && ops->write_io )
{
- if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
- &dst.val, dst.bytes, ctxt, ops)) != 0 )
+ rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val, dst.bytes,
+ ctxt, ops);
+ if ( rc == X86EMUL_OKAY )
+ nr_reps = 0;
+ }
+ if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_outs )
+ rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
+ &nr_reps, ctxt);
+ if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+ {
+ if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val,
+ dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
goto done;
fail_if(ops->write_io == NULL);
+ nr_reps = 0;
+ }
+ if ( !nr_reps && rc == X86EMUL_OKAY )
+ {
if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
goto done;
nr_reps = 1;
@@ -4039,7 +4085,11 @@ x86_emulate(
rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
}
if ( rc != 0 )
+ {
+ if ( rc == X86EMUL_DONE )
+ goto no_writeback;
goto done;
+ }
break;
}
@@ -5445,9 +5495,7 @@ x86_emulate(
break;
}
- no_writeback:
- /* Commit shadow register state. */
- _regs.eflags &= ~EFLG_RF;
+ no_writeback: /* Commit shadow register state. */
/* Zero the upper 32 bits of %rip if not in 64-bit mode. */
if ( !mode_64bit() )
@@ -5457,7 +5505,15 @@ x86_emulate(
if ( (rc == X86EMUL_OKAY) && (ctxt->regs->eflags & EFLG_TF) )
ctxt->retire.singlestep = true;
- *ctxt->regs = _regs;
+ if ( rc != X86EMUL_DONE )
+ *ctxt->regs = _regs;
+ else
+ {
+ ctxt->regs->eip = _regs.eip;
+ rc = X86EMUL_OKAY;
+ }
+
+ ctxt->regs->eflags &= ~EFLG_RF;
done:
_put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -146,6 +146,14 @@ struct __attribute__((__packed__)) segme
#define X86EMUL_EXCEPTION 2
/* Retry the emulation for some reason. No state modified. */
#define X86EMUL_RETRY 3
+ /*
+ * Operation fully done by one of the hooks:
+ * - validate(): operation completed (except common insn retire logic)
+ * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+ * - read_io() / write_io(): bypass GPR update (non-string insns only)
+ * Undefined behavior when used anywhere else.
+ */
+#define X86EMUL_DONE 4
/* FPU sub-types which may be requested via ->get_fpu(). */
enum x86_emulate_fpu_type {
@@ -156,6 +164,8 @@ enum x86_emulate_fpu_type {
X86EMUL_FPU_ymm /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
};
+struct x86_emulate_state;
+
/*
* These operations represent the instruction emulator's interface to memory,
* I/O ports, privileged state... pretty much everything other than GPRs.
@@ -238,6 +248,14 @@ struct x86_emulate_ops
struct x86_emulate_ctxt *ctxt);
/*
+ * validate: Post-decode, pre-emulate hook to allow caller controlled
+ * filtering.
+ */
+ int (*validate)(
+ const struct x86_emulate_state *state,
+ struct x86_emulate_ctxt *ctxt);
+
+ /*
* rep_ins: Emulate INS: <src_port> -> <dst_seg:dst_offset>.
* @bytes_per_rep: [IN ] Bytes transferred per repetition.
* @reps: [IN ] Maximum repetitions to be emulated.
[-- Attachment #2: x86-PV-priv-op-generic-emul.patch --]
[-- Type: text/plain, Size: 47717 bytes --]
x86/PV: use generic emulator for privileged instruction handling
A new emulator return code, X86EMUL_DONE, is being added to allow
bypassing certain operations (see the code comment accompanying its
definition).
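
To make the intended use concrete, here is a condensed sketch of a
validate() hook completing an instruction entirely on its own (modelled
on the CLI/STI handling added to priv_op_validate() further down; the
standalone function name is only for illustration):

  /*
   * A validate() hook that fully handles CLI / STI itself: it returns
   * the new code so that x86_emulate() skips the normal emulation path
   * and only runs the common instruction retire logic.
   */
  static int example_validate(const struct x86_emulate_state *state,
                              struct x86_emulate_ctxt *ctxt)
  {
      switch ( ctxt->opcode )
      {
      case 0xfa: case 0xfb: /* cli / sti */
          if ( !iopl_ok(current, ctxt->regs) )
              break;
          /* The interrupt flag change itself is deliberately dropped. */
          return X86EMUL_DONE;
      }

      return X86EMUL_UNHANDLEABLE;
  }
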
Another small tweak to the emulator concerns the single-iteration
handling of INS and OUTS: since we don't want to handle any other
memory access instructions, we want these to be handled by the
rep_ins() / rep_outs() hooks here too.
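
Condensed from the INS hunk further down (simplified; error handling
and the writeback setup are omitted), the reworked dispatch order is
roughly:

  /*
   * A lone iteration is attempted via read_io() + write() first, and
   * rep_ins() picks up whatever the plain hooks can't handle. The PV
   * caller below registers no ->write() hook, so even single-iteration
   * INS/OUTS reach rep_ins() / rep_outs().
   */
  rc = X86EMUL_UNHANDLEABLE;
  if ( nr_reps == 1 && ops->read_io && ops->write )
      rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
  if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_ins )
      rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
                        &nr_reps, ctxt);
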
And then the long-mode related EFER bits now get hidden from 32-bit
guests. This should have been that way from the beginning, but becomes
a requirement now, as the emulator's in_longmode() needs the value it
reads to reflect the guest's view.
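
For reference, in_longmode() determines the guest's mode roughly along
these lines (a simplified sketch, not the emulator's exact code, and
the function name is purely illustrative):

  /*
   * EFER is obtained through the read_msr() hook, so that hook has to
   * report the guest's view: a 32-bit PV guest must not see the host's
   * LMA/LME bits.
   */
  static bool guest_in_long_mode(struct x86_emulate_ctxt *ctxt,
                                 const struct x86_emulate_ops *ops)
  {
      uint64_t efer = 0;

      if ( !ops->read_msr ||
           ops->read_msr(MSR_EFER, &efer, ctxt) != X86EMUL_OKAY )
          return false;

      return efer & EFER_LMA;
  }
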
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Rename priv_op_to_linear() to pv_emul_virt_to_linear() and change
its return type. Refuse system segments and set just one of L and
DB in 64-bit mode case of priv_op_read_segment(). Have
pv_emul_cpuid() return X86EMUL_EXCEPTION in the CPUID faulting
case. Add EFER behavioral change to description.
v3: Re-base. Do away with the special case pointer checks on the ->read
and ->write methods in OUTS and INS handling. Clear EFER.LM* bits
for 32-bit guests (to avoid the emulator's in_longmode() returning
a wrong result). Introduce and use ->validate() hook. Make
formatting more consistent.
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -697,16 +697,13 @@ static inline void do_guest_trap(unsigne
pv_inject_event(&event);
}
-static void instruction_done(
- struct cpu_user_regs *regs, unsigned long eip, unsigned int bpmatch)
+static void instruction_done(struct cpu_user_regs *regs, unsigned long eip)
{
regs->eip = eip;
regs->eflags &= ~X86_EFLAGS_RF;
- if ( bpmatch || (regs->eflags & X86_EFLAGS_TF) )
+ if ( regs->eflags & X86_EFLAGS_TF )
{
- current->arch.debugreg[6] |= bpmatch | DR_STATUS_RESERVED_ONE;
- if ( regs->eflags & X86_EFLAGS_TF )
- current->arch.debugreg[6] |= DR_STEP;
+ current->arch.debugreg[6] |= DR_STEP | DR_STATUS_RESERVED_ONE;
do_guest_trap(TRAP_debug, regs);
}
}
@@ -1336,7 +1333,7 @@ static int emulate_invalid_rdtscp(struct
return 0;
eip += sizeof(opcode);
pv_soft_rdtsc(v, regs, 1);
- instruction_done(regs, eip, 0);
+ instruction_done(regs, eip);
return EXCRET_fault_fixed;
}
@@ -1378,7 +1375,7 @@ static int emulate_forced_invalid_op(str
pv_cpuid(regs);
- instruction_done(regs, eip, 0);
+ instruction_done(regs, eip);
trace_trap_one_addr(TRC_PV_FORCED_INVALID_OP, regs->eip);
@@ -2023,6 +2020,155 @@ static int read_gate_descriptor(unsigned
return 1;
}
+static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
+ unsigned int bytes, unsigned long limit,
+ enum x86_segment seg,
+ struct x86_emulate_ctxt *ctxt,
+ unsigned long *addr)
+{
+ int rc = X86EMUL_OKAY;
+
+ *addr = base + offset;
+
+ if ( ctxt->addr_size < 8 )
+ {
+ if ( limit < bytes - 1 || offset > limit - bytes + 1 )
+ rc = X86EMUL_EXCEPTION;
+ *addr = (uint32_t)*addr;
+ }
+ else if ( !__addr_ok(*addr) )
+ rc = X86EMUL_EXCEPTION;
+
+ if ( unlikely(rc == X86EMUL_EXCEPTION) )
+ x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+ : TRAP_stack_error,
+ 0, ctxt);
+
+ return rc;
+}
+
+struct priv_op_ctxt {
+ struct x86_emulate_ctxt ctxt;
+ struct {
+ unsigned long base, limit;
+ } cs;
+ char *io_emul_stub;
+ unsigned int bpmatch;
+ unsigned int tsc;
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+ unsigned long offset,
+ void *p_data,
+ unsigned int bytes,
+ struct x86_emulate_ctxt *ctxt)
+{
+ const struct priv_op_ctxt *poc =
+ container_of(ctxt, struct priv_op_ctxt, ctxt);
+ unsigned int rc;
+ unsigned long addr = poc->cs.base + offset;
+
+ ASSERT(seg == x86_seg_cs);
+
+ /* We don't mean to emulate any branches. */
+ if ( !bytes )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+ x86_seg_cs, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+ {
+ x86_emul_pagefault(cpu_has_nx ? PFEC_insn_fetch : 0,
+ addr + bytes - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_read_segment(enum x86_segment seg,
+ struct segment_register *reg,
+ struct x86_emulate_ctxt *ctxt)
+{
+ if ( ctxt->addr_size < 8 )
+ {
+ unsigned long limit;
+ unsigned int sel, ar;
+
+ switch ( seg )
+ {
+ case x86_seg_cs: sel = ctxt->regs->cs; break;
+ case x86_seg_ds: sel = read_sreg(ds); break;
+ case x86_seg_es: sel = read_sreg(es); break;
+ case x86_seg_fs: sel = read_sreg(fs); break;
+ case x86_seg_gs: sel = read_sreg(gs); break;
+ case x86_seg_ss: sel = ctxt->regs->ss; break;
+ case x86_seg_tr:
+ /* Check if this is an attempt to access to I/O bitmap. */
+ if ( (ctxt->opcode & ~0xb) == 0xe4 || (ctxt->opcode & ~3) == 0x6c )
+ return X86EMUL_DONE;
+ /* fall through */
+ default: return X86EMUL_UNHANDLEABLE;
+ }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+ return X86EMUL_UNHANDLEABLE;
+
+ reg->limit = limit;
+ reg->attr.bytes = ar >> 8;
+ }
+ else
+ {
+ switch ( seg )
+ {
+ default:
+ if ( !is_x86_user_segment(seg) )
+ return X86EMUL_UNHANDLEABLE;
+ reg->base = 0;
+ break;
+ case x86_seg_fs:
+ reg->base = rdfsbase();
+ break;
+ case x86_seg_gs:
+ reg->base = rdgsbase();
+ break;
+ }
+
+ reg->limit = ~0U;
+
+ reg->attr.bytes = 0;
+ reg->attr.fields.type = _SEGMENT_WR >> 8;
+ if ( seg == x86_seg_cs )
+ {
+ reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+ reg->attr.fields.l = 1;
+ }
+ else
+ reg->attr.fields.db = 1;
+ reg->attr.fields.s = 1;
+ reg->attr.fields.dpl = 3;
+ reg->attr.fields.p = 1;
+ reg->attr.fields.g = 1;
+ }
+
+ /*
+ * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+ * Also do this for consistency for non-conforming code segments.
+ */
+ if ( (seg == x86_seg_ss ||
+ (seg == x86_seg_cs &&
+ !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+ guest_kernel_mode(current, ctxt->regs) )
+ reg->attr.fields.dpl = 0;
+
+ return X86EMUL_OKAY;
+}
+
/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
{
@@ -2269,6 +2415,238 @@ unsigned long guest_to_host_gpr_switch(u
void (*pv_post_outb_hook)(unsigned int port, u8 value);
+typedef void io_emul_stub_t(struct cpu_user_regs *);
+
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+ unsigned int port, unsigned int bytes)
+{
+ if ( !ctxt->io_emul_stub )
+ ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+ (this_cpu(stubs.addr) &
+ ~PAGE_MASK) +
+ STUB_BUF_SIZE / 2;
+
+ /* movq $host_to_guest_gpr_switch,%rcx */
+ ctxt->io_emul_stub[0] = 0x48;
+ ctxt->io_emul_stub[1] = 0xb9;
+ *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
+ /* callq *%rcx */
+ ctxt->io_emul_stub[10] = 0xff;
+ ctxt->io_emul_stub[11] = 0xd1;
+ /* data16 or nop */
+ ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
+ /* <io-access opcode> */
+ ctxt->io_emul_stub[13] = opcode;
+ /* imm8 or nop */
+ ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
+ /* ret (jumps to guest_to_host_gpr_switch) */
+ ctxt->io_emul_stub[15] = 0xc3;
+ BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
+
+ if ( ioemul_handle_quirk )
+ ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+ /* Handy function-typed pointer to the stub. */
+ return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+ unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+
+ /* INS must not come here. */
+ ASSERT((ctxt->opcode & ~9) == 0xe4);
+
+ if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+ mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ return X86EMUL_DONE;
+ }
+
+ *val = guest_io_read(port, bytes, currd);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+ unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+
+ /* OUTS must not come here. */
+ ASSERT((ctxt->opcode & ~9) == 0xe6);
+
+ if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+ if ( admin_io_okay(port, bytes, currd) )
+ {
+ io_emul_stub_t *io_emul =
+ io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+ mark_regs_dirty(ctxt->regs);
+ io_emul(ctxt->regs);
+ if ( (bytes == 1) && pv_post_outb_hook )
+ pv_post_outb_hook(port, val);
+ return X86EMUL_DONE;
+ }
+
+ guest_io_write(port, bytes, val, currd);
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_ins(uint16_t port,
+ enum x86_segment seg, unsigned long offset,
+ unsigned int bytes_per_rep, unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+ unsigned long goal = *reps;
+ struct segment_register sreg;
+ int rc;
+
+ ASSERT(seg == x86_seg_es);
+
+ *reps = 0;
+
+ if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( !sreg.attr.fields.p )
+ return X86EMUL_UNHANDLEABLE;
+ if ( !sreg.attr.fields.s ||
+ (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+ !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
+ {
+ x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+ while ( *reps < goal )
+ {
+ unsigned int data = guest_io_read(port, bytes_per_rep, currd);
+ unsigned long addr;
+
+ rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+ sreg.limit, x86_seg_es, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
+ {
+ x86_emul_pagefault(PFEC_write_access,
+ addr + bytes_per_rep - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ ++*reps;
+
+ if ( poc->bpmatch || hypercall_preempt_check() )
+ break;
+
+ /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+ if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+ offset -= bytes_per_rep;
+ else
+ offset += bytes_per_rep;
+ }
+
+ return X86EMUL_OKAY;
+}
+
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+ uint16_t port,
+ unsigned int bytes_per_rep, unsigned long *reps,
+ struct x86_emulate_ctxt *ctxt)
+{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+ struct vcpu *curr = current;
+ struct domain *currd = current->domain;
+ unsigned long goal = *reps;
+ struct segment_register sreg;
+ int rc;
+
+ *reps = 0;
+
+ if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+ return X86EMUL_UNHANDLEABLE;
+
+ rc = priv_op_read_segment(seg, &sreg, ctxt);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( !sreg.attr.fields.p )
+ return X86EMUL_UNHANDLEABLE;
+ if ( !sreg.attr.fields.s ||
+ ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+ !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
+ {
+ x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+ : TRAP_stack_error,
+ 0, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+ while ( *reps < goal )
+ {
+ unsigned int data = 0;
+ unsigned long addr;
+
+ rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+ sreg.limit, seg, ctxt, &addr);
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+
+ if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
+ {
+ x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+ return X86EMUL_EXCEPTION;
+ }
+
+ guest_io_write(port, bytes_per_rep, data, currd);
+
+ ++*reps;
+
+ if ( poc->bpmatch || hypercall_preempt_check() )
+ break;
+
+ /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+ if ( unlikely(ctxt->regs->_eflags & X86_EFLAGS_DF) )
+ offset -= bytes_per_rep;
+ else
+ offset += bytes_per_rep;
+ }
+
+ return X86EMUL_OKAY;
+}
+
static int priv_op_read_cr(unsigned int reg, unsigned long *val,
struct x86_emulate_ctxt *ctxt)
{
@@ -2409,6 +2787,7 @@ static inline bool is_cpufreq_controller
static int priv_op_read_msr(unsigned int reg, uint64_t *val,
struct x86_emulate_ctxt *ctxt)
{
+ struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
const struct vcpu *curr = current;
const struct domain *currd = curr->domain;
bool vpmu_msr = false;
@@ -2436,6 +2815,28 @@ static int priv_op_read_msr(unsigned int
*val = curr->arch.pv_vcpu.gs_base_user;
return X86EMUL_OKAY;
+ /*
+ * In order to fully retain original behavior, defer calling
+ * pv_soft_rdtsc() until after emulation. This may want/need to be
+ * reconsidered.
+ */
+ case MSR_IA32_TSC:
+ poc->tsc |= TSC_BASE;
+ goto normal;
+
+ case MSR_TSC_AUX:
+ poc->tsc |= TSC_AUX;
+ if ( cpu_has_rdtscp )
+ goto normal;
+ *val = 0;
+ return X86EMUL_OKAY;
+
+ case MSR_EFER:
+ *val = read_efer();
+ if ( is_pv_32bit_domain(currd) )
+ *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+ return X86EMUL_OKAY;
+
case MSR_K7_FID_VID_CTL:
case MSR_K7_FID_VID_STATUS:
case MSR_K8_PSTATE_LIMIT:
@@ -2539,7 +2940,6 @@ static int priv_op_read_msr(unsigned int
if ( rc )
return X86EMUL_OKAY;
/* fall through */
- case MSR_EFER:
normal:
/* Everyone can read the MSR space. */
/* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
@@ -2761,11 +3161,41 @@ static int priv_op_write_msr(unsigned in
return X86EMUL_UNHANDLEABLE;
}
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+ /* Ignore the instruction if unprivileged. */
+ if ( !cache_flush_permitted(current->domain) )
+ /*
+ * Non-physdev domain attempted WBINVD; ignore for now since
+ * newer linux uses this in some start-of-day timing loops.
+ */
+ ;
+ else
+ wbinvd();
+
+ return X86EMUL_OKAY;
+}
+
int pv_emul_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx,
unsigned int *edx, struct x86_emulate_ctxt *ctxt)
{
struct cpu_user_regs regs = *ctxt->regs;
+ /*
+ * x86_emulate uses this function to query CPU features for its own
+ * internal use. Make sure we're actually emulating CPUID before checking
+ * for emulated CPUID faulting.
+ */
+ if ( ctxt->opcode == X86EMUL_OPC(0x0f, 0xa2) )
+ {
+ const struct vcpu *curr = current;
+
+ /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
+ if ( curr->arch.cpuid_faulting &&
+ !guest_kernel_mode(curr, ctxt->regs) )
+ return X86EMUL_EXCEPTION;
+ }
+
regs._eax = *eax;
regs._ecx = *ecx;
@@ -2779,497 +3209,153 @@ int pv_emul_cpuid(unsigned int *eax, uns
return X86EMUL_OKAY;
}
-/* Instruction fetch with error handling. */
-#define insn_fetch(type, base, eip, limit) \
-({ unsigned long _rc, _ptr = (base) + (eip); \
- type _x; \
- if ( ad_default < 8 ) \
- _ptr = (unsigned int)_ptr; \
- if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \
- goto fail; \
- if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \
- { \
- pv_inject_page_fault(0, _ptr + sizeof(_x) - _rc); \
- goto skip; \
- } \
- (eip) += sizeof(_x); _x; })
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
+static int priv_op_validate(const struct x86_emulate_state *state,
+ struct x86_emulate_ctxt *ctxt)
{
- struct vcpu *v = current;
- struct domain *currd = v->domain;
- unsigned long *reg, eip = regs->eip;
- u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0;
- enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none;
- int rc;
- unsigned int port, i, data_sel, ar, data, bpmatch = 0;
- unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0;
-#define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \
- ? regs->reg \
- : ad_bytes == 4 \
- ? (u32)regs->reg \
- : (u16)regs->reg)
-#define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \
-                         ? regs->reg = (val) \
-                         : ad_bytes == 4 \
-                           ? (*(u32 *)&regs->reg = (val)) \
-                           : (*(u16 *)&regs->reg = (val)))
- unsigned long code_base, code_limit;
- char *io_emul_stub = NULL;
- void (*io_emul)(struct cpu_user_regs *);
- uint64_t val;
-
- if ( !read_descriptor(regs->cs, v, &code_base, &code_limit, &ar, 1) )
- goto fail;
- op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2;
- ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default;
- if ( !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- !(ar & _SEGMENT_CODE) )
- goto fail;
-
- /* emulating only opcodes not allowing SS to be default */
- data_sel = read_sreg(ds);
-
- /* Legacy prefixes. */
- for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) )
+ switch ( ctxt->opcode )
{
- switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) )
- {
- case 0x66: /* operand-size override */
- opsize_prefix = 1;
- op_bytes = op_default ^ 6; /* switch between 2/4 bytes */
- continue;
- case 0x67: /* address-size override */
- ad_bytes = ad_default != 4 ? 4 : 2; /* switch to 2/4 bytes */
- continue;
- case 0x2e: /* CS override */
- data_sel = regs->cs;
- continue;
- case 0x3e: /* DS override */
- data_sel = read_sreg(ds);
- continue;
- case 0x26: /* ES override */
- data_sel = read_sreg(es);
- continue;
- case 0x64: /* FS override */
- data_sel = read_sreg(fs);
- lm_ovr = lm_seg_fs;
- continue;
- case 0x65: /* GS override */
- data_sel = read_sreg(gs);
- lm_ovr = lm_seg_gs;
- continue;
- case 0x36: /* SS override */
- data_sel = regs->ss;
- continue;
- case 0xf0: /* LOCK */
- lock = 1;
- continue;
- case 0xf2: /* REPNE/REPNZ */
- case 0xf3: /* REP/REPE/REPZ */
- rep_prefix = 1;
- continue;
- default:
- if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 )
- {
- rex = opcode;
- continue;
- }
- break;
- }
- break;
- }
-
- /* REX prefix. */
- if ( rex & 8 ) /* REX.W */
- op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */
- modrm_reg = (rex & 4) << 1; /* REX.R */
- /* REX.X does not need to be decoded. */
- modrm_rm = (rex & 1) << 3; /* REX.B */
-
- if ( opcode == 0x0f )
- goto twobyte_opcode;
-
- if ( lock )
- goto fail;
-
- /* Input/Output String instructions. */
- if ( (opcode >= 0x6c) && (opcode <= 0x6f) )
- {
- unsigned long data_base, data_limit;
-
- if ( rep_prefix && (rd_ad(ecx) == 0) )
- goto done;
-
- if ( !(opcode & 2) )
- {
- data_sel = read_sreg(es);
- lm_ovr = lm_seg_none;
- }
-
- if ( !(ar & _SEGMENT_L) )
- {
- if ( !read_descriptor(data_sel, v, &data_base, &data_limit,
- &ar, 0) )
- goto fail;
- if ( !(ar & _SEGMENT_S) ||
- !(ar & _SEGMENT_P) ||
- (opcode & 2 ?
- (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) :
- (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) )
- goto fail;
- }
- else
- {
- switch ( lm_ovr )
- {
- default:
- data_base = 0UL;
- break;
- case lm_seg_fs:
- data_base = rdfsbase();
- break;
- case lm_seg_gs:
- data_base = rdgsbase();
- break;
- }
- data_limit = ~0UL;
- ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P;
- }
-
- port = (u16)regs->edx;
+ case 0x6c ... 0x6f: /* ins / outs */
+ case 0xe4 ... 0xe7: /* in / out (immediate port) */
+ case 0xec ... 0xef: /* in / out (port in %dx) */
+ case X86EMUL_OPC(0x0f, 0x06): /* clts */
+ case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+ case X86EMUL_OPC(0x0f, 0x20) ...
+ X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+ case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+ case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+ case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+ case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+ return X86EMUL_OKAY;
- continue_io_string:
- switch ( opcode )
- {
- case 0x6c: /* INSB */
- op_bytes = 1;
- case 0x6d: /* INSW/INSL */
- if ( (data_limit < (op_bytes - 1)) ||
- (rd_ad(edi) > (data_limit - (op_bytes - 1))) ||
- !guest_io_okay(port, op_bytes, v, regs) )
- goto fail;
- data = guest_io_read(port, op_bytes, currd);
- if ( (rc = copy_to_user((void *)data_base + rd_ad(edi),
- &data, op_bytes)) != 0 )
- {
- pv_inject_page_fault(PFEC_write_access,
- data_base + rd_ad(edi) + op_bytes - rc);
- return EXCRET_fault_fixed;
- }
- wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF)
- ? -op_bytes : op_bytes));
+ case 0xfa: case 0xfb: /* cli / sti */
+ if ( !iopl_ok(current, ctxt->regs) )
break;
-
- case 0x6e: /* OUTSB */
- op_bytes = 1;
- case 0x6f: /* OUTSW/OUTSL */
- if ( (data_limit < (op_bytes - 1)) ||
- (rd_ad(esi) > (data_limit - (op_bytes - 1))) ||
- !guest_io_okay(port, op_bytes, v, regs) )
- goto fail;
- if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi),
- op_bytes)) != 0 )
- {
- pv_inject_page_fault(0, data_base + rd_ad(esi)
- + op_bytes - rc);
- return EXCRET_fault_fixed;
- }
- guest_io_write(port, op_bytes, data, currd);
- wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF)
- ? -op_bytes : op_bytes));
- break;
- }
-
- bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
-
- if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) )
- {
- if ( !bpmatch && !hypercall_preempt_check() )
- goto continue_io_string;
- eip = regs->eip;
- }
-
- goto done;
- }
-
- /*
- * Very likely to be an I/O instruction (IN/OUT).
- * Build an stub to execute the instruction with full guest GPR
- * context. This is needed for some systems which (ab)use IN/OUT
- * to communicate with BIOS code in system-management mode.
- */
- io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
- (this_cpu(stubs.addr) & ~PAGE_MASK) +
- STUB_BUF_SIZE / 2;
- /* movq $host_to_guest_gpr_switch,%rcx */
- io_emul_stub[0] = 0x48;
- io_emul_stub[1] = 0xb9;
- *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
- /* callq *%rcx */
- io_emul_stub[10] = 0xff;
- io_emul_stub[11] = 0xd1;
- /* data16 or nop */
- io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66;
- /* <io-access opcode> */
- io_emul_stub[13] = opcode;
- /* imm8 or nop */
- io_emul_stub[14] = 0x90;
- /* ret (jumps to guest_to_host_gpr_switch) */
- io_emul_stub[15] = 0xc3;
- BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
- /* Handy function-typed pointer to the stub. */
- io_emul = (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-
- if ( ioemul_handle_quirk )
- ioemul_handle_quirk(opcode, &io_emul_stub[12], regs);
-
- /* I/O Port and Interrupt Flag instructions. */
- switch ( opcode )
- {
- case 0xe4: /* IN imm8,%al */
- op_bytes = 1;
- case 0xe5: /* IN imm8,%eax */
- port = insn_fetch(u8, code_base, eip, code_limit);
- io_emul_stub[14] = port; /* imm8 */
- exec_in:
- if ( !guest_io_okay(port, op_bytes, v, regs) )
- goto fail;
- if ( admin_io_okay(port, op_bytes, currd) )
- {
- mark_regs_dirty(regs);
- io_emul(regs);
- }
- else
- {
- if ( op_bytes == 4 )
- regs->eax = 0;
- else
- regs->eax &= ~((1 << (op_bytes * 8)) - 1);
- regs->eax |= guest_io_read(port, op_bytes, currd);
- }
- bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
- goto done;
-
- case 0xec: /* IN %dx,%al */
- op_bytes = 1;
- case 0xed: /* IN %dx,%eax */
- port = (u16)regs->edx;
- goto exec_in;
-
- case 0xe6: /* OUT %al,imm8 */
- op_bytes = 1;
- case 0xe7: /* OUT %eax,imm8 */
- port = insn_fetch(u8, code_base, eip, code_limit);
- io_emul_stub[14] = port; /* imm8 */
- exec_out:
- if ( !guest_io_okay(port, op_bytes, v, regs) )
- goto fail;
- if ( admin_io_okay(port, op_bytes, currd) )
- {
- mark_regs_dirty(regs);
- io_emul(regs);
- if ( (op_bytes == 1) && pv_post_outb_hook )
- pv_post_outb_hook(port, regs->eax);
- }
- else
- {
- guest_io_write(port, op_bytes, regs->eax, currd);
- }
- bpmatch = check_guest_io_breakpoint(v, port, op_bytes);
- goto done;
-
- case 0xee: /* OUT %al,%dx */
- op_bytes = 1;
- case 0xef: /* OUT %eax,%dx */
- port = (u16)regs->edx;
- goto exec_out;
-
- case 0xfa: /* CLI */
- case 0xfb: /* STI */
- if ( !iopl_ok(v, regs) )
- goto fail;
/*
* This is just too dangerous to allow, in my opinion. Consider if the
* caller then tries to reenable interrupts using POPF: we can't trap
* that and we'll end up with hard-to-debug lockups. Fast & loose will
* do for us. :-)
+ vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
*/
- /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/
- goto done;
- }
+ return X86EMUL_DONE;
- /* No decode of this single-byte opcode. */
- goto fail;
+ case X86EMUL_OPC(0x0f, 0x01):
+ {
+ unsigned int modrm_rm, modrm_reg;
- twobyte_opcode:
- /*
- * All 2 and 3 byte opcodes, except RDTSC (0x31), RDTSCP (0x1,0xF9),
- * and CPUID (0xa2), are executable only from guest kernel mode
- * (virtual ring 0).
- */
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( !guest_kernel_mode(v, regs) &&
- (opcode != 0x1) && (opcode != 0x31) && (opcode != 0xa2) )
- goto fail;
-
- if ( lock && (opcode & ~3) != 0x20 )
- goto fail;
- switch ( opcode )
- {
- case 0x1: /* RDTSCP and XSETBV */
- switch ( insn_fetch(u8, code_base, eip, code_limit) )
- {
- case 0xf9: /* RDTSCP */
- if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
- !guest_kernel_mode(v, regs) )
- goto fail;
- pv_soft_rdtsc(v, regs, 1);
+ if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+ (modrm_rm & 7) != 1 )
break;
- case 0xd1: /* XSETBV */
+ switch ( modrm_reg & 7 )
{
- u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32);
+ case 2: /* xsetbv */
+ case 7: /* rdtscp */
+ return X86EMUL_OKAY;
+ }
+ break;
+ }
+ }
- if ( lock || rep_prefix || opsize_prefix
- || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) )
- {
- do_guest_trap(TRAP_invalid_op, regs);
- goto skip;
- }
+ return X86EMUL_UNHANDLEABLE;
+}
- if ( !guest_kernel_mode(v, regs) )
- goto fail;
+static const struct x86_emulate_ops priv_op_ops = {
+ .insn_fetch = priv_op_insn_fetch,
+ .read = x86emul_unhandleable_rw,
+ .validate = priv_op_validate,
+ .read_io = priv_op_read_io,
+ .write_io = priv_op_write_io,
+ .rep_ins = priv_op_rep_ins,
+ .rep_outs = priv_op_rep_outs,
+ .read_segment = priv_op_read_segment,
+ .read_cr = priv_op_read_cr,
+ .write_cr = priv_op_write_cr,
+ .read_dr = priv_op_read_dr,
+ .write_dr = priv_op_write_dr,
+ .read_msr = priv_op_read_msr,
+ .write_msr = priv_op_write_msr,
+ .cpuid = pv_emul_cpuid,
+ .wbinvd = priv_op_wbinvd,
+};
- if ( handle_xsetbv(regs->ecx, new_xfeature) )
- goto fail;
+static int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+ struct vcpu *curr = current;
+ struct domain *currd = curr->domain;
+ struct priv_op_ctxt ctxt = { .ctxt.regs = regs };
+ int rc;
+ unsigned int eflags, ar;
- break;
- }
- default:
- goto fail;
- }
- break;
+ if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+ &ar, 1) ||
+ !(ar & _SEGMENT_S) ||
+ !(ar & _SEGMENT_P) ||
+ !(ar & _SEGMENT_CODE) )
+ return 0;
- case 0x06: /* CLTS */
- (void)do_fpu_taskswitch(0);
- break;
+ /* Mirror virtualized state into EFLAGS. */
+ ASSERT(regs->_eflags & X86_EFLAGS_IF);
+ if ( vcpu_info(curr, evtchn_upcall_mask) )
+ regs->_eflags &= ~X86_EFLAGS_IF;
+ else
+ regs->_eflags |= X86_EFLAGS_IF;
+ ASSERT(!(regs->_eflags & X86_EFLAGS_IOPL));
+ regs->_eflags |= curr->arch.pv_vcpu.iopl;
+ eflags = regs->_eflags;
+
+ ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+ /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+ rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
- case 0x09: /* WBINVD */
- /* Ignore the instruction if unprivileged. */
- if ( !cache_flush_permitted(currd) )
- /* Non-physdev domain attempted WBINVD; ignore for now since
- newer linux uses this in some start-of-day timing loops */
- ;
- else
- wbinvd();
- break;
+ if ( ctxt.io_emul_stub )
+ unmap_domain_page(ctxt.io_emul_stub);
- case 0x20: /* MOV CR?,<reg> */
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( opcode < 0xc0 )
- goto fail;
- modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
- modrm_rm |= (opcode >> 0) & 7;
- if ( priv_op_read_cr(modrm_reg, decode_register(modrm_rm, regs, 0),
- NULL) != X86EMUL_OKAY )
- goto fail;
- break;
-
- case 0x21: /* MOV DR?,<reg> */ {
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( opcode < 0xc0 )
- goto fail;
- modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
- modrm_rm |= (opcode >> 0) & 7;
- if ( priv_op_read_dr(modrm_reg, decode_register(modrm_rm, regs, 0),
- NULL) != X86EMUL_OKAY )
- goto fail;
- break;
- }
+ /*
+ * Un-mirror virtualized state from EFLAGS.
+ * Nothing we allow to be emulated can change TF, IF, or IOPL.
+ */
+ ASSERT(!((regs->_eflags ^ eflags) & (X86_EFLAGS_IF | X86_EFLAGS_IOPL)));
+ regs->_eflags |= X86_EFLAGS_IF;
+ regs->_eflags &= ~X86_EFLAGS_IOPL;
+
+ /* More strict than x86_emulate_wrapper(). */
+ ASSERT(ctxt.ctxt.event_pending == (rc == X86EMUL_EXCEPTION));
+
+ switch ( rc )
+ {
+ case X86EMUL_OKAY:
+ if ( ctxt.tsc & TSC_BASE )
+ {
+ if ( ctxt.tsc & TSC_AUX )
+ pv_soft_rdtsc(curr, regs, 1);
+ else if ( currd->arch.vtsc )
+ pv_soft_rdtsc(curr, regs, 0);
+ else
+ {
+ uint64_t val = rdtsc();
- case 0x22: /* MOV <reg>,CR? */
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( opcode < 0xc0 )
- goto fail;
- modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
- modrm_rm |= (opcode >> 0) & 7;
- reg = decode_register(modrm_rm, regs, 0);
- switch ( priv_op_write_cr(modrm_reg, *reg, NULL) )
- {
- case X86EMUL_OKAY:
- break;
- case X86EMUL_RETRY: /* retry after preemption */
- goto skip;
- default:
- goto fail;
+ regs->eax = (uint32_t)val;
+ regs->edx = (uint32_t)(val >> 32);
+ }
}
- break;
-
- case 0x23: /* MOV <reg>,DR? */
- opcode = insn_fetch(u8, code_base, eip, code_limit);
- if ( opcode < 0xc0 )
- goto fail;
- modrm_reg += ((opcode >> 3) & 7) + (lock << 3);
- modrm_rm |= (opcode >> 0) & 7;
- reg = decode_register(modrm_rm, regs, 0);
- if ( priv_op_write_dr(modrm_reg, *reg, NULL) != X86EMUL_OKAY )
- goto fail;
- break;
- case 0x30: /* WRMSR */
- if ( priv_op_write_msr(regs->_ecx, (regs->rdx << 32) | regs->_eax,
- NULL) != X86EMUL_OKAY )
- goto fail;
- break;
-
- case 0x31: /* RDTSC */
- if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) &&
- !guest_kernel_mode(v, regs) )
- goto fail;
- if ( currd->arch.vtsc )
- pv_soft_rdtsc(v, regs, 0);
- else
+ if ( ctxt.ctxt.retire.singlestep )
+ ctxt.bpmatch |= DR_STEP;
+ if ( ctxt.bpmatch )
{
- val = rdtsc();
- goto rdmsr_writeback;
+ curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+ if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+ do_guest_trap(TRAP_debug, regs);
}
- break;
-
- case 0x32: /* RDMSR */
- if ( priv_op_read_msr(regs->_ecx, &val, NULL) != X86EMUL_OKAY )
- goto fail;
- rdmsr_writeback:
- regs->eax = (uint32_t)val;
- regs->edx = (uint32_t)(val >> 32);
- break;
-
- case 0xa2: /* CPUID */
- /* If cpuid faulting is enabled and CPL>0 leave the #GP untouched. */
- if ( v->arch.cpuid_faulting && !guest_kernel_mode(v, regs) )
- goto fail;
-
- pv_cpuid(regs);
- break;
+ /* fall through */
+ case X86EMUL_RETRY:
+ return EXCRET_fault_fixed;
- default:
- goto fail;
+ case X86EMUL_EXCEPTION:
+ pv_inject_event(&ctxt.ctxt.event);
+ return EXCRET_fault_fixed;
}
-#undef wr_ad
-#undef rd_ad
-
- done:
- instruction_done(regs, eip, bpmatch);
- skip:
- if ( io_emul_stub )
- unmap_domain_page(io_emul_stub);
- return EXCRET_fault_fixed;
-
- fail:
- if ( io_emul_stub )
- unmap_domain_page(io_emul_stub);
return 0;
}
@@ -3615,7 +3702,7 @@ static void emulate_gate_op(struct cpu_u
sel |= (regs->cs & 3);
regs->cs = sel;
- instruction_done(regs, off, 0);
+ instruction_done(regs, off);
}
void do_general_protection(struct cpu_user_regs *regs)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1185,7 +1185,7 @@ static int ioport_access_check(
fail_if(ops->read_segment == NULL);
if ( (rc = ops->read_segment(x86_seg_tr, &tr, ctxt)) != 0 )
- return rc;
+ return rc == X86EMUL_DONE ? X86EMUL_OKAY : rc;
/* Ensure the TSS has an io-bitmap-offset field. */
generate_exception_if(tr.attr.fields.type != 0xb, EXC_GP, 0);
@@ -2506,6 +2506,21 @@ x86_emulate(
/* Sync rIP to post decode value. */
_regs.eip = state.eip;
+ if ( ops->validate )
+ {
+#ifndef NDEBUG
+ state.caller = __builtin_return_address(0);
+#endif
+ rc = ops->validate(&state, ctxt);
+#ifndef NDEBUG
+ state.caller = NULL;
+#endif
+ if ( rc == X86EMUL_DONE )
+ goto no_writeback;
+ if ( rc != X86EMUL_OKAY )
+ return rc;
+ }
+
b = ctxt->opcode;
d = state.desc;
#define state (&state)
@@ -2935,13 +2950,28 @@ x86_emulate(
dst.mem.off = truncate_ea_and_reps(_regs.edi, nr_reps, dst.bytes);
if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
goto done;
- if ( (nr_reps == 1) || !ops->rep_ins ||
- ((rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
- &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+ /* Try the presumably most efficient approach first. */
+ if ( !ops->rep_ins )
+ nr_reps = 1;
+ rc = X86EMUL_UNHANDLEABLE;
+ if ( nr_reps == 1 && ops->read_io && ops->write )
{
- fail_if(ops->read_io == NULL);
+ rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
+ if ( rc == X86EMUL_OKAY )
+ nr_reps = 0;
+ }
+ if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_ins )
+ rc = ops->rep_ins(port, dst.mem.seg, dst.mem.off, dst.bytes,
+ &nr_reps, ctxt);
+ if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+ {
+ fail_if(!ops->read_io || !ops->write);
if ( (rc = ops->read_io(port, dst.bytes, &dst.val, ctxt)) != 0 )
goto done;
+ nr_reps = 0;
+ }
+ if ( !nr_reps && rc == X86EMUL_OKAY )
+ {
dst.type = OP_MEM;
nr_reps = 1;
}
@@ -2959,14 +2989,30 @@ x86_emulate(
ea.mem.off = truncate_ea_and_reps(_regs.esi, nr_reps, dst.bytes);
if ( (rc = ioport_access_check(port, dst.bytes, ctxt, ops)) != 0 )
goto done;
- if ( (nr_reps == 1) || !ops->rep_outs ||
- ((rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
- &nr_reps, ctxt)) == X86EMUL_UNHANDLEABLE) )
+ /* Try the presumably most efficient approach first. */
+ if ( !ops->rep_outs )
+ nr_reps = 1;
+ rc = X86EMUL_UNHANDLEABLE;
+ if ( nr_reps == 1 && ops->write_io )
{
- if ( (rc = read_ulong(ea.mem.seg, truncate_ea(_regs.esi),
- &dst.val, dst.bytes, ctxt, ops)) != 0 )
+ rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val, dst.bytes,
+ ctxt, ops);
+ if ( rc == X86EMUL_OKAY )
+ nr_reps = 0;
+ }
+ if ( (nr_reps > 1 || rc == X86EMUL_UNHANDLEABLE) && ops->rep_outs )
+ rc = ops->rep_outs(ea.mem.seg, ea.mem.off, port, dst.bytes,
+ &nr_reps, ctxt);
+ if ( nr_reps >= 1 && rc == X86EMUL_UNHANDLEABLE )
+ {
+ if ( (rc = read_ulong(ea.mem.seg, ea.mem.off, &dst.val,
+ dst.bytes, ctxt, ops)) != X86EMUL_OKAY )
goto done;
fail_if(ops->write_io == NULL);
+ nr_reps = 0;
+ }
+ if ( !nr_reps && rc == X86EMUL_OKAY )
+ {
if ( (rc = ops->write_io(port, dst.bytes, dst.val, ctxt)) != 0 )
goto done;
nr_reps = 1;
@@ -4039,7 +4085,11 @@ x86_emulate(
rc = ops->read_io(port, dst.bytes, &dst.val, ctxt);
}
if ( rc != 0 )
+ {
+ if ( rc == X86EMUL_DONE )
+ goto no_writeback;
goto done;
+ }
break;
}
@@ -5445,9 +5495,7 @@ x86_emulate(
break;
}
- no_writeback:
- /* Commit shadow register state. */
- _regs.eflags &= ~EFLG_RF;
+ no_writeback: /* Commit shadow register state. */
/* Zero the upper 32 bits of %rip if not in 64-bit mode. */
if ( !mode_64bit() )
@@ -5457,7 +5505,15 @@ x86_emulate(
if ( (rc == X86EMUL_OKAY) && (ctxt->regs->eflags & EFLG_TF) )
ctxt->retire.singlestep = true;
- *ctxt->regs = _regs;
+ if ( rc != X86EMUL_DONE )
+ *ctxt->regs = _regs;
+ else
+ {
+ ctxt->regs->eip = _regs.eip;
+ rc = X86EMUL_OKAY;
+ }
+
+ ctxt->regs->eflags &= ~EFLG_RF;
done:
_put_fpu();
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -146,6 +146,14 @@ struct __attribute__((__packed__)) segme
#define X86EMUL_EXCEPTION 2
/* Retry the emulation for some reason. No state modified. */
#define X86EMUL_RETRY 3
+ /*
+ * Operation fully done by one of the hooks:
+ * - validate(): operation completed (except common insn retire logic)
+ * - read_segment(x86_seg_tr, ...): bypass I/O bitmap access
+ * - read_io() / write_io(): bypass GPR update (non-string insns only)
+ * Undefined behavior when used anywhere else.
+ */
+#define X86EMUL_DONE 4
/* FPU sub-types which may be requested via ->get_fpu(). */
enum x86_emulate_fpu_type {
@@ -156,6 +164,8 @@ enum x86_emulate_fpu_type {
X86EMUL_FPU_ymm /* AVX/XOP instruction set (%ymm0-%ymm7/15) */
};
+struct x86_emulate_state;
+
/*
* These operations represent the instruction emulator's interface to memory,
* I/O ports, privileged state... pretty much everything other than GPRs.
@@ -238,6 +248,14 @@ struct x86_emulate_ops
struct x86_emulate_ctxt *ctxt);
/*
+ * validate: Post-decode, pre-emulate hook to allow caller controlled
+ * filtering.
+ */
+ int (*validate)(
+ const struct x86_emulate_state *state,
+ struct x86_emulate_ctxt *ctxt);
+
+ /*
* rep_ins: Emulate INS: <src_port> -> <dst_seg:dst_offset>.
* @bytes_per_rep: [IN ] Bytes transferred per repetition.
* @reps: [IN ] Maximum repetitions to be emulated.