From mboxrd@z Thu Jan 1 00:00:00 1970 From: Boris Ostrovsky Subject: Re: [PATCH 4/6] x86/emulate: Support for emulating software event injection Date: Wed, 24 Sep 2014 09:01:34 -0400 Message-ID: <5422C0AE.3090404@oracle.com> References: <1411484611-31027-1-git-send-email-andrew.cooper3@citrix.com> <1411484611-31027-5-git-send-email-andrew.cooper3@citrix.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii"; Format="flowed" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <1411484611-31027-5-git-send-email-andrew.cooper3@citrix.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Andrew Cooper , Xen-devel Cc: Aravind Gopalakrishnan , Suravee Suthikulpanit , Jan Beulich List-Id: xen-devel@lists.xenproject.org On 09/23/2014 11:03 AM, Andrew Cooper wrote: > AMD SVM requires all software events to have their injection emulated if > hardware lacks NextRIP support. In addition, `icebp` (opcode 0xf1) injection > requires emulation in all cases, even with hardware NextRIP support. > > Emulating full control transfers is overkill for our needs. All that matters > is that guest userspace can't bypass the descriptor DPL check. Any guest OS > which would incur other faults as part of injection is going to end up with a > double fault instead, and won't be in a position to care that the faulting eip > is wrong. 
> > Reported-by: Andrei LUTAS > Signed-off-by: Andrew Cooper > Signed-off-by: Jan Beulich > CC: Boris Ostrovsky > CC: Suravee Suthikulpanit > CC: Aravind Gopalakrishnan > --- > xen/arch/x86/hvm/emulate.c | 8 +++ > xen/arch/x86/hvm/svm/svm.c | 57 +++++++++++++-- > xen/arch/x86/mm.c | 2 + > xen/arch/x86/mm/shadow/common.c | 1 + > xen/arch/x86/x86_emulate/x86_emulate.c | 122 ++++++++++++++++++++++++++++++-- > xen/arch/x86/x86_emulate/x86_emulate.h | 10 +++ > 6 files changed, 191 insertions(+), 9 deletions(-) > > diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c > index 7ee146b..463ccfb 100644 > --- a/xen/arch/x86/hvm/emulate.c > +++ b/xen/arch/x86/hvm/emulate.c > @@ -21,6 +21,7 @@ > #include > #include > #include > +#include > > static void hvmtrace_io_assist(int is_mmio, ioreq_t *p) > { > @@ -1328,6 +1329,13 @@ static int _hvm_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt, > vio->mmio_retrying = vio->mmio_retry; > vio->mmio_retry = 0; > > + if ( cpu_has_vmx ) > + hvmemul_ctxt->ctxt.swint_emulate = x86_swint_emulate_none; > + else if ( cpu_has_svm_nrips ) > + hvmemul_ctxt->ctxt.swint_emulate = x86_swint_emulate_icebp; > + else > + hvmemul_ctxt->ctxt.swint_emulate = x86_swint_emulate_all; > + > rc = x86_emulate(&hvmemul_ctxt->ctxt, ops); > > if ( rc == X86EMUL_OKAY && vio->mmio_retry ) > diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c > index de982fd..b6beefc 100644 > --- a/xen/arch/x86/hvm/svm/svm.c > +++ b/xen/arch/x86/hvm/svm/svm.c > @@ -1177,11 +1177,12 @@ static void svm_inject_trap(struct hvm_trap *trap) > struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb; > eventinj_t event = vmcb->eventinj; > struct hvm_trap _trap = *trap; > + const struct cpu_user_regs *regs = guest_cpu_user_regs(); > > switch ( _trap.vector ) > { > case TRAP_debug: > - if ( guest_cpu_user_regs()->eflags & X86_EFLAGS_TF ) > + if ( regs->eflags & X86_EFLAGS_TF ) > { > __restore_debug_registers(vmcb, curr); > vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 
0x4000); > @@ -1209,10 +1210,58 @@ static void svm_inject_trap(struct hvm_trap *trap) > > event.bytes = 0; > event.fields.v = 1; > - event.fields.type = X86_EVENTTYPE_HW_EXCEPTION; > event.fields.vector = _trap.vector; > - event.fields.ev = (_trap.error_code != HVM_DELIVER_NO_ERROR_CODE); > - event.fields.errorcode = _trap.error_code; > + > + /* Refer to AMD Vol 2: System Programming, 15.20 Event Injection. */ > + switch ( _trap.type ) > + { > + case X86_EVENTTYPE_SW_INTERRUPT: /* int $n */ > + /* > + * Injection type 4 (software interrupt) is only supported with > + * NextRIP support. Without NextRIP, the emulator will have performed > + * DPL and presence checks for us. > + */ > + if ( cpu_has_svm_nrips ) > + { > + vmcb->nextrip = regs->eip + _trap.insn_len; > + event.fields.type = X86_EVENTTYPE_SW_INTERRUPT; > + } > + else > + event.fields.type = X86_EVENTTYPE_HW_EXCEPTION; > + break; > + > + case X86_EVENTTYPE_PRI_SW_EXCEPTION: /* icebp */ > + /* > + * icebp's injection must always be emulated. Software injection help > + * in x86_emulate has moved eip forward, but NextRIP (if used) still > + * needs setting or execution will resume from 0. > + */ Can you tell me where eip is updated? I don't see any difference between how, for example, int3 and icebp are emulated. -boris > + if ( cpu_has_svm_nrips ) > + vmcb->nextrip = regs->eip; > + event.fields.type = X86_EVENTTYPE_HW_EXCEPTION; > + break; > + > + case X86_EVENTTYPE_SW_EXCEPTION: /* int3, into */ > + /* > + * The AMD manual states that .type=3 (HW exception), .vector=3 or 4, > + * will perform DPL checks. Experimentally, DPL and presence checks > + * are indeed performed, even without NextRIP support. > + * > + * However without NextRIP support, the event injection still needs > + * fully emulating to get the correct eip in the trap frame, yet get > + * the correct faulting eip should a fault occur. 
> + */ > + if ( cpu_has_svm_nrips ) > + vmcb->nextrip = regs->eip + _trap.insn_len; > + event.fields.type = X86_EVENTTYPE_HW_EXCEPTION; > + break; > + > + default: > + event.fields.type = X86_EVENTTYPE_HW_EXCEPTION; > + event.fields.ev = (_trap.error_code != HVM_DELIVER_NO_ERROR_CODE); > + event.fields.errorcode = _trap.error_code; > + break; > + } > > vmcb->eventinj = event; > > diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c > index 5b3f06f..bfe9f05 100644 > --- a/xen/arch/x86/mm.c > +++ b/xen/arch/x86/mm.c > @@ -5096,6 +5096,7 @@ int ptwr_do_page_fault(struct vcpu *v, unsigned long addr, > ptwr_ctxt.ctxt.force_writeback = 0; > ptwr_ctxt.ctxt.addr_size = ptwr_ctxt.ctxt.sp_size = > is_pv_32on64_domain(d) ? 32 : BITS_PER_LONG; > + ptwr_ctxt.ctxt.swint_emulate = x86_swint_emulate_none; > ptwr_ctxt.cr2 = addr; > ptwr_ctxt.pte = pte; > > @@ -5172,6 +5173,7 @@ int mmio_ro_do_page_fault(struct vcpu *v, unsigned long addr, > .ctxt.regs = regs, > .ctxt.addr_size = addr_size, > .ctxt.sp_size = addr_size, > + .ctxt.swint_emulate = x86_swint_emulate_none, > .cr2 = addr > }; > int rc; > diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c > index 9115a78..a5eed28 100644 > --- a/xen/arch/x86/mm/shadow/common.c > +++ b/xen/arch/x86/mm/shadow/common.c > @@ -366,6 +366,7 @@ const struct x86_emulate_ops *shadow_init_emulation( > > sh_ctxt->ctxt.regs = regs; > sh_ctxt->ctxt.force_writeback = 0; > + sh_ctxt->ctxt.swint_emulate = x86_swint_emulate_none; > > if ( is_pv_vcpu(v) ) > { > diff --git a/xen/arch/x86/x86_emulate/x86_emulate.c b/xen/arch/x86/x86_emulate/x86_emulate.c > index e06aa60..ffca65a 100644 > --- a/xen/arch/x86/x86_emulate/x86_emulate.c > +++ b/xen/arch/x86/x86_emulate/x86_emulate.c > @@ -403,6 +403,11 @@ typedef union { > #define EXC_PF 14 > #define EXC_MF 16 > > +/* Segment selector error code bits. 
*/ > +#define ECODE_EXT (1 << 0) > +#define ECODE_IDT (1 << 1) > +#define ECODE_TI (1 << 2) > + > /* > * Instruction emulation: > * Most instructions are emulated directly via a fragment of inline assembly > @@ -1318,6 +1323,115 @@ decode_segment(uint8_t modrm_reg) > return decode_segment_failed; > } > > +/* Inject a software interrupt/exception, emulating if needed. */ > +static int inject_swint(enum x86_swint_type type, > + uint8_t vector, uint8_t insn_len, > + struct x86_emulate_ctxt *ctxt, > + const struct x86_emulate_ops *ops) > +{ > + int rc, error_code, fault_type = EXC_GP; > + > + fail_if(ops->inject_sw_interrupt == NULL); > + fail_if(ops->inject_hw_exception == NULL); > + > + /* > + * Without hardware support, injecting software interrupts/exceptions is > + * problematic. > + * > + * All software methods of generating exceptions (other than BOUND) yield > + * traps, so eip in the exception frame needs to point after the > + * instruction, not at it. > + * > + * However, if injecting it as a hardware exception causes a fault during > + * delivery, our adjustment of eip will cause the fault to be reported > + * after the faulting instruction, not pointing to it. > + * > + * Therefore, eip can only safely be wound forwards if we are certain that > + * injecting an equivalent hardware exception won't fault, which means > + * emulating everything the processor would do on a control transfer. > + * > + * However, emulation of complete control transfers is very complicated. > + * All we care about is that guest userspace cannot avoid the descriptor > + * DPL check by using the Xen emulator, and successfully invoke DPL=0 > + * descriptors. > + * > + * Any OS which would further fault during injection is going to receive a > + * double fault anyway, and won't be in a position to care that the > + * faulting eip is incorrect. 
> + */ > + > + if ( (ctxt->swint_emulate == x86_swint_emulate_all) || > + ((ctxt->swint_emulate == x86_swint_emulate_icebp) && > + (type == x86_swint_icebp)) ) > + { > + if ( !in_realmode(ctxt, ops) ) > + { > + unsigned int idte_size = (ctxt->addr_size == 64) ? 16 : 8; > + unsigned int idte_offset = vector * idte_size; > + struct segment_register idtr; > + uint32_t idte_ctl; > + > + /* icebp sets the External Event bit despite being an instruction. */ > + error_code = (vector << 3) | ECODE_IDT | > + (type == x86_swint_icebp ? ECODE_EXT : 0); > + > + /* > + * TODO - this does not cover the v8086 mode with CR4.VME case > + * correctly, but falls on the safe side from the point of view of > + * a 32bit OS. Someone with many TUITs can see about reading the > + * TSS Software Interrupt Redirection bitmap. > + */ > + if ( (ctxt->regs->eflags & EFLG_VM) && > + ((ctxt->regs->eflags & EFLG_IOPL) != EFLG_IOPL) ) > + goto raise_exn; > + > + fail_if(ops->read_segment == NULL); > + fail_if(ops->read == NULL); > + if ( (rc = ops->read_segment(x86_seg_idtr, &idtr, ctxt)) ) > + goto done; > + > + if ( (idte_offset + idte_size - 1) > idtr.limit ) > + goto raise_exn; > + > + /* > + * Should strictly speaking read all 8/16 bytes of an entry, > + * but we currently only care about the dpl and present bits. > + */ > + ops->read(x86_seg_none, idtr.base + idte_offset + 4, > + &idte_ctl, sizeof(idte_ctl), ctxt); > + > + /* Is this entry present? */ > + if ( !(idte_ctl & (1u << 15)) ) > + { > + fault_type = EXC_NP; > + goto raise_exn; > + } > + > + /* icebp counts as a hardware event, and bypasses the dpl check. 
*/ > + if ( type != x86_swint_icebp ) > + { > + struct segment_register ss; > + > + if ( (rc = ops->read_segment(x86_seg_ss, &ss, ctxt)) ) > + goto done; > + > + if ( ss.attr.fields.dpl > ((idte_ctl >> 13) & 3) ) > + goto raise_exn; > + } > + } > + > + ctxt->regs->eip += insn_len; > + } > + > + rc = ops->inject_sw_interrupt(type, vector, insn_len, ctxt); > + > + done: > + return rc; > + > + raise_exn: > + return ops->inject_hw_exception(fault_type, error_code, ctxt); > +} > + > int > x86_emulate( > struct x86_emulate_ctxt *ctxt, > @@ -2637,11 +2751,9 @@ x86_emulate( > src.val = insn_fetch_type(uint8_t); > swint_type = x86_swint_int; > swint: > - fail_if(!in_realmode(ctxt, ops)); /* XSA-106 */ > - fail_if(ops->inject_sw_interrupt == NULL); > - rc = ops->inject_sw_interrupt(swint_type, src.val, > - _regs.eip - ctxt->regs->eip, > - ctxt) ? : X86EMUL_EXCEPTION; > + rc = inject_swint(swint_type, src.val, > + _regs.eip - ctxt->regs->eip, > + ctxt, ops) ? : X86EMUL_EXCEPTION; > goto done; > > case 0xce: /* into */ > diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h > index b336e17..b059341 100644 > --- a/xen/arch/x86/x86_emulate/x86_emulate.h > +++ b/xen/arch/x86/x86_emulate/x86_emulate.h > @@ -59,6 +59,13 @@ enum x86_swint_type { > x86_swint_int, /* 0xcd $n */ > }; > > +/* How much help is required with software event injection? */ > +enum x86_swint_emulation { > + x86_swint_emulate_none, /* Hardware supports all software injection properly */ > + x86_swint_emulate_icebp,/* Help needed with `icebp` (0xf1) */ > + x86_swint_emulate_all, /* Help needed with all software events */ > +}; > + > /* > * Attribute for segment selector. This is a copy of bit 40:47 & 52:55 of the > * segment descriptor. It happens to match the format of an AMD SVM VMCB. > @@ -388,6 +395,9 @@ struct x86_emulate_ctxt > /* Set this if writes may have side effects. */ > uint8_t force_writeback; > > + /* Software event injection support. 
*/ > + enum x86_swint_emulation swint_emulate; > + > /* Retirement state, set by the emulator (valid only on X86EMUL_OKAY). */ > union { > struct {