All of lore.kernel.org
 help / color / mirror / Atom feed
From: Wei Liu <wei.liu2@citrix.com>
To: Xen-devel <xen-devel@lists.xenproject.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>,
	Wei Liu <wei.liu2@citrix.com>, Jan Beulich <JBeulich@suse.com>
Subject: Re: [PATCH for-next v3 01/22] x86/traps: move privilege instruction emulation code
Date: Thu, 18 May 2017 18:28:32 +0100	[thread overview]
Message-ID: <20170518172832.afqcp65eztdaurdb@citrix.com> (raw)
In-Reply-To: <20170518171004.27204-2-wei.liu2@citrix.com>

I forgot to move gpr_switch.S. Here is an updated version.

---8<---
From 58df816b937dc7a3598de01f053a6030e631057e Mon Sep 17 00:00:00 2001
From: Wei Liu <wei.liu2@citrix.com>
Date: Thu, 18 May 2017 16:18:56 +0100
Subject: [PATCH] x86/traps: move privilege instruction emulation code

Move relevant code to pv/emulate.c. Export emulate_privileged_op in
pv/traps.h.

Note that read_descriptor is duplicated in emulate.c. The duplication
will be gone once all emulation code is moved.

Also move gpr_switch.S to pv/ because the code in that file is only
used by privileged instruction emulation.

No functional change.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
 xen/arch/x86/pv/Makefile                 |    2 +
 xen/arch/x86/pv/emulate.c                | 1470 ++++++++++++++++++++++++++++++
 xen/arch/x86/{x86_64 => pv}/gpr_switch.S |    0
 xen/arch/x86/traps.c                     | 1358 +--------------------------
 xen/arch/x86/x86_64/Makefile             |    1 -
 xen/include/asm-x86/pv/traps.h           |   48 +
 6 files changed, 1522 insertions(+), 1357 deletions(-)
 create mode 100644 xen/arch/x86/pv/emulate.c
 rename xen/arch/x86/{x86_64 => pv}/gpr_switch.S (100%)
 create mode 100644 xen/include/asm-x86/pv/traps.h

diff --git a/xen/arch/x86/pv/Makefile b/xen/arch/x86/pv/Makefile
index 489a9f59cb..f272f607d4 100644
--- a/xen/arch/x86/pv/Makefile
+++ b/xen/arch/x86/pv/Makefile
@@ -3,3 +3,5 @@ obj-y += traps.o
 
 obj-bin-y += dom0_build.init.o
 obj-y += domain.o
+obj-y += emulate.o
+obj-bin-y += gpr_switch.o
diff --git a/xen/arch/x86/pv/emulate.c b/xen/arch/x86/pv/emulate.c
new file mode 100644
index 0000000000..fb0d066a3b
--- /dev/null
+++ b/xen/arch/x86/pv/emulate.c
@@ -0,0 +1,1470 @@
+/******************************************************************************
+ * arch/x86/pv/emulate.c
+ *
+ * PV emulation code
+ *
+ * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/errno.h>
+#include <xen/event.h>
+#include <xen/guest_access.h>
+#include <xen/iocap.h>
+#include <xen/spinlock.h>
+#include <xen/trace.h>
+
+#include <asm/apic.h>
+#include <asm/debugreg.h>
+#include <asm/hpet.h>
+#include <asm/hypercall.h>
+#include <asm/mc146818rtc.h>
+#include <asm/p2m.h>
+#include <asm/pv/traps.h>
+#include <asm/shared.h>
+#include <asm/traps.h>
+#include <asm/x86_emulate.h>
+
+#include <xsm/xsm.h>
+
+#include "../x86_64/mmconfig.h"
+
+/******************
+ * Helper functions
+ */
+/* Decode the descriptor for 'sel'; returns 0 if it cannot be read, else 1. */
+static int read_descriptor(unsigned int sel,
+                           const struct vcpu *v,
+                           unsigned long *base,
+                           unsigned long *limit,
+                           unsigned int *ar,
+                           bool_t insn_fetch)
+{
+    struct desc_struct desc;
+
+    if ( sel < 4)   /* selectors 0-3: use an all-zero descriptor */
+        desc.b = desc.a = 0;
+    else if ( __get_user(desc,
+                         (const struct desc_struct *)(!(sel & 4)
+                                                      ? GDT_VIRT_START(v)
+                                                      : LDT_VIRT_START(v))
+                         + (sel >> 3)) )   /* bit 2 picks GDT (0) or LDT (1) */
+        return 0;
+    if ( !insn_fetch )   /* _SEGMENT_L is only honoured for insn fetches */
+        desc.b &= ~_SEGMENT_L;
+
+    *ar = desc.b & 0x00f0ff00;   /* same mask as the LAR cross-check below */
+    if ( !(desc.b & _SEGMENT_L) )
+    {
+        *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) +
+                 (desc.b & 0xff000000));
+        *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000);
+        if ( desc.b & _SEGMENT_G )   /* _SEGMENT_G: limit is in 4k units */
+            *limit = ((*limit + 1) << 12) - 1;
+#ifndef NDEBUG
+        if ( sel > 3 )   /* debug builds: compare with the CPU's LAR/LSL */
+        {
+            unsigned int a, l;
+            unsigned char valid;
+
+            asm volatile (
+                "larl %2,%0 ; setz %1"
+                : "=r" (a), "=qm" (valid) : "rm" (sel));
+            BUG_ON(valid && ((a & 0x00f0ff00) != *ar));
+            asm volatile (
+                "lsll %2,%0 ; setz %1"
+                : "=r" (l), "=qm" (valid) : "rm" (sel));
+            BUG_ON(valid && (l != *limit));
+        }
+#endif
+    }
+    else   /* _SEGMENT_L set: zero base, maximal limit */
+    {
+        *base = 0UL;
+        *limit = ~0UL;
+    }
+
+    return 1;
+}
+
+/***********************
+ * I/O emulation support
+ */
+
+struct priv_op_ctxt {
+    struct x86_emulate_ctxt ctxt;   /* embedded; recovered via container_of() */
+    struct {
+        unsigned long base, limit;
+    } cs;
+    char *io_emul_stub;     /* stub mapping, set up by io_emul_stub_setup() */
+    unsigned int bpmatch;   /* result of check_guest_io_breakpoint() */
+    unsigned int tsc;       /* TSC_* flags ORed in by priv_op_read_msr() */
+#define TSC_BASE 1
+#define TSC_AUX 2
+};
+
+/* I/O emulation support. Helper routines for, and type of, the stack stub. */
+void host_to_guest_gpr_switch(struct cpu_user_regs *);
+unsigned long guest_to_host_gpr_switch(unsigned long);
+
+void (*pv_post_outb_hook)(unsigned int port, u8 value);   /* optional; run after direct 1-byte OUTs */
+
+typedef void io_emul_stub_t(struct cpu_user_regs *);   /* as returned by io_emul_stub_setup() */
+/* Write the 16-byte I/O stub for 'opcode' and return a pointer to call it. */
+static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
+                                          unsigned int port, unsigned int bytes)
+{
+    if ( !ctxt->io_emul_stub )   /* map the stub buffer once per context */
+        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
+                                             (this_cpu(stubs.addr) &
+                                              ~PAGE_MASK) +
+                                             STUB_BUF_SIZE / 2;
+
+    /* movq $host_to_guest_gpr_switch,%rcx */
+    ctxt->io_emul_stub[0] = 0x48;
+    ctxt->io_emul_stub[1] = 0xb9;
+    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;   /* bytes 2-9 */
+    /* callq *%rcx */
+    ctxt->io_emul_stub[10] = 0xff;
+    ctxt->io_emul_stub[11] = 0xd1;
+    /* data16 or nop */
+    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;   /* 0x66 only for 2-byte accesses */
+    /* <io-access opcode> */
+    ctxt->io_emul_stub[13] = opcode;
+    /* imm8 or nop */
+    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;   /* opcode bit 3 clear: port is an immediate */
+    /* ret (jumps to guest_to_host_gpr_switch) */
+    ctxt->io_emul_stub[15] = 0xc3;
+    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);   /* the 16 bytes above must fit */
+
+    if ( ioemul_handle_quirk )   /* optional hook, handed the stub from byte 12 on */
+        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
+
+    /* Handy function-typed pointer to the stub. */
+    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
+}
+
+
+/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
+static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
+{
+    unsigned int cpl = guest_kernel_mode(v, regs) ?
+        (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;   /* kernel: 0 or 1; user: 3 */
+
+    ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);   /* no bits outside the IOPL field */
+
+    return IOPL(cpl) <= v->arch.pv_vcpu.iopl;   /* true when the shadowed IOPL covers cpl */
+}
+
+/* Has the guest requested sufficient permission for this I/O access? */
+static int guest_io_okay(
+    unsigned int port, unsigned int bytes,
+    struct vcpu *v, struct cpu_user_regs *regs)
+{
+    /* If in user mode, switch to kernel mode just to read I/O bitmap. */
+    int user_mode = !(v->arch.flags & TF_kernel_mode);
+#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
+
+    if ( iopl_ok(v, regs) )   /* IOPL alone is sufficient */
+        return 1;
+
+    if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )   /* else consult the I/O bitmap */
+    {
+        union { uint8_t bytes[2]; uint16_t mask; } x;
+
+        /*
+         * Grab permission bytes from guest space. Inaccessible bytes are
+         * read as 0xff (no access allowed).
+         */
+        TOGGLE_MODE();
+        switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
+                                          port>>3, 2) )   /* 2 bytes, starting at the byte holding 'port' */
+        {
+        default: x.bytes[0] = ~0;
+            /* fallthrough */
+        case 1:  x.bytes[1] = ~0;
+            /* fallthrough */
+        case 0:  break;
+        }
+        TOGGLE_MODE();   /* switch back if we toggled above */
+
+        if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )   /* every bit for the accessed ports is clear */
+            return 1;
+    }
+
+    return 0;
+}
+
+/* Has the administrator granted sufficient permission for this I/O access? */
+static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
+                            const struct domain *d)
+{
+    /*
+     * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
+     * We never permit direct access to that register.
+     */
+    if ( (port == 0xcf8) && (bytes == 4) )
+        return 0;
+
+    /* We also never permit direct access to the RTC/CMOS registers. */
+    if ( ((port & ~1) == RTC_PORT(0)) )   /* bit 0 of 'port' is ignored in the comparison */
+        return 0;
+
+    return ioports_access_permitted(d, port, port + bytes - 1);   /* inclusive range [port, port+bytes-1] */
+}
+/* May 'currd' access PCI config space at 'start', given its latched CF8? */
+static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
+                         unsigned int size, uint32_t *write)
+{
+    uint32_t machine_bdf;
+
+    if ( !is_hardware_domain(currd) )   /* only the hardware domain passes */
+        return 0;
+
+    if ( !CF8_ENABLED(currd->arch.pci_cf8) )   /* no further checks in this case */
+        return 1;
+
+    machine_bdf = CF8_BDF(currd->arch.pci_cf8);
+    if ( write )   /* non-NULL 'write' marks a write access */
+    {
+        const unsigned long *ro_map = pci_get_ro_map(0);
+
+        if ( ro_map && test_bit(machine_bdf, ro_map) )   /* device is in the read-only map */
+            return 0;
+    }
+    start |= CF8_ADDR_LO(currd->arch.pci_cf8);   /* fold in the register offset from CF8 */
+    /* AMD extended configuration space access? */
+    if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
+         boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+         boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
+    {
+        uint64_t msr_val;
+
+        if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )   /* deny if the MSR cannot be read */
+            return 0;
+        if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
+            start |= CF8_ADDR_HI(currd->arch.pci_cf8);
+    }
+
+    return !write ?   /* reads: XSM check; writes: write-intercept check */
+           xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
+                                     start, start + size - 1, 0) == 0 :
+           pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
+}
+/* Read 'bytes' bytes from I/O port 'port' on behalf of domain 'currd'. */
+uint32_t guest_io_read(unsigned int port, unsigned int bytes,
+                       struct domain *currd)
+{
+    uint32_t data = 0;
+    unsigned int shift = 0;
+
+    if ( admin_io_okay(port, bytes, currd) )   /* permitted: access the port directly */
+    {
+        switch ( bytes )
+        {
+        case 1: return inb(port);
+        case 2: return inw(port);
+        case 4: return inl(port);
+        }
+    }
+
+    while ( bytes != 0 )   /* otherwise emulate the access piece by piece */
+    {
+        unsigned int size = 1;
+        uint32_t sub_data = ~0;   /* ports not handled below read as all ones */
+
+        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+        {
+            sub_data = pv_pit_handler(port, 0, 0);
+        }
+        else if ( port == RTC_PORT(0) )
+        {
+            sub_data = currd->arch.cmos_idx;   /* index last stored by guest_io_write() */
+        }
+        else if ( (port == RTC_PORT(1)) &&
+                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
+        {
+            unsigned long flags;
+
+            spin_lock_irqsave(&rtc_lock, flags);
+            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));   /* select the index, then read */
+            sub_data = inb(RTC_PORT(1));
+            spin_unlock_irqrestore(&rtc_lock, flags);
+        }
+        else if ( (port == 0xcf8) && (bytes == 4) )
+        {
+            size = 4;
+            sub_data = currd->arch.pci_cf8;   /* per-domain CF8 value, not the hardware one */
+        }
+        else if ( (port & 0xfffc) == 0xcfc )   /* ports 0xcfc-0xcff */
+        {
+            size = min(bytes, 4 - (port & 3));
+            if ( size == 3 )   /* only 1-, 2- and 4-byte pieces are issued */
+                size = 2;
+            if ( pci_cfg_ok(currd, port & 3, size, NULL) )
+                sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
+        }
+
+        if ( size == 4 )   /* a 4-byte piece is the whole result */
+            return sub_data;
+
+        data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;   /* merge this piece */
+        shift += size * 8;
+        port += size;
+        bytes -= size;
+    }
+
+    return data;
+}
+/* Bitmask of debugreg[0..3] ranges overlapping ports [port, port + len). */
+static unsigned int check_guest_io_breakpoint(struct vcpu *v,
+    unsigned int port, unsigned int len)
+{
+    unsigned int width, i, match = 0;
+    unsigned long start;
+
+    if ( !(v->arch.debugreg[5]) ||
+         !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
+        return 0;   /* needs non-zero debugreg[5] and CR4.DE in the guest's CR4 */
+
+    for ( i = 0; i < 4; i++ )
+    {
+        if ( !(v->arch.debugreg[5] &
+               (3 << (i * DR_ENABLE_SIZE))) )
+            continue;   /* neither enable bit set for slot i */
+
+        start = v->arch.debugreg[i];
+        width = 0;   /* stays 0 if no length case below matches */
+
+        switch ( (v->arch.debugreg[7] >>
+                  (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
+        {
+        case DR_LEN_1: width = 1; break;
+        case DR_LEN_2: width = 2; break;
+        case DR_LEN_4: width = 4; break;
+        case DR_LEN_8: width = 8; break;
+        }
+
+        if ( (start < (port + len)) && ((start + width) > port) )   /* ranges overlap */
+            match |= 1 << i;
+    }
+
+    return match;
+}
+/* Handle an IN of 'bytes' bytes from 'port' for the current vcpu. */
+static int priv_op_read_io(unsigned int port, unsigned int bytes,
+                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* INS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe4);   /* i.e. opcode 0xe4, 0xe5, 0xec or 0xed */
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )   /* permitted: run the real insn via the stub */
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        return X86EMUL_DONE;   /* '*val' is not written on this path */
+    }
+
+    *val = guest_io_read(port, bytes, currd);   /* otherwise use the emulated read */
+
+    return X86EMUL_OKAY;
+}
+/* Write 'bytes' bytes of 'data' to I/O port 'port' on behalf of 'currd'. */
+void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
+                    struct domain *currd)
+{
+    if ( admin_io_okay(port, bytes, currd) )   /* permitted: access the port directly */
+    {
+        switch ( bytes ) {
+        case 1:
+            outb((uint8_t)data, port);
+            if ( pv_post_outb_hook )   /* optional hook, 1-byte writes only */
+                pv_post_outb_hook(port, (uint8_t)data);
+            break;
+        case 2:
+            outw((uint16_t)data, port);
+            break;
+        case 4:
+            outl(data, port);
+            break;
+        }
+        return;
+    }
+
+    while ( bytes != 0 )   /* otherwise emulate the access piece by piece */
+    {
+        unsigned int size = 1;
+
+        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
+        {
+            pv_pit_handler(port, (uint8_t)data, 1);
+        }
+        else if ( port == RTC_PORT(0) )
+        {
+            currd->arch.cmos_idx = data;   /* only record the index; no hardware access */
+        }
+        else if ( (port == RTC_PORT(1)) &&
+                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
+        {
+            unsigned long flags;
+
+            if ( pv_rtc_handler )   /* optional hook, called before the hardware write */
+                pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
+            spin_lock_irqsave(&rtc_lock, flags);
+            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));   /* select the index, then write */
+            outb(data, RTC_PORT(1));
+            spin_unlock_irqrestore(&rtc_lock, flags);
+        }
+        else if ( (port == 0xcf8) && (bytes == 4) )
+        {
+            size = 4;
+            currd->arch.pci_cf8 = data;   /* latch the per-domain CF8 value */
+        }
+        else if ( (port & 0xfffc) == 0xcfc )   /* ports 0xcfc-0xcff */
+        {
+            size = min(bytes, 4 - (port & 3));
+            if ( size == 3 )   /* only 1-, 2- and 4-byte pieces are issued */
+                size = 2;
+            if ( pci_cfg_ok(currd, port & 3, size, &data) )
+                pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
+        }
+
+        if ( size == 4 )   /* a 4-byte piece completes the access */
+            return;
+
+        port += size;
+        bytes -= size;
+        data >>= size * 8;   /* drop the bytes just consumed */
+    }
+}
+/* Handle an OUT of 'bytes' bytes of 'val' to 'port' for the current vcpu. */
+static int priv_op_write_io(unsigned int port, unsigned int bytes,
+                            unsigned long val, struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+
+    /* OUTS must not come here. */
+    ASSERT((ctxt->opcode & ~9) == 0xe6);   /* i.e. opcode 0xe6, 0xe7, 0xee or 0xef */
+
+    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
+
+    if ( admin_io_okay(port, bytes, currd) )   /* permitted: run the real insn via the stub */
+    {
+        io_emul_stub_t *io_emul =
+            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
+
+        mark_regs_dirty(ctxt->regs);
+        io_emul(ctxt->regs);
+        if ( (bytes == 1) && pv_post_outb_hook )   /* same hook as in guest_io_write() */
+            pv_post_outb_hook(port, val);
+        return X86EMUL_DONE;
+    }
+
+    guest_io_write(port, bytes, val, currd);   /* otherwise use the emulated write */
+
+    return X86EMUL_OKAY;
+}
+/* Fill in 'reg' with base, limit and attributes of segment 'seg'. */
+static int priv_op_read_segment(enum x86_segment seg,
+                                struct segment_register *reg,
+                                struct x86_emulate_ctxt *ctxt)
+{
+    /* Check if this is an attempt to access the I/O bitmap. */
+    if ( seg == x86_seg_tr )
+    {
+        switch ( ctxt->opcode )
+        {
+        case 0x6c ... 0x6f: /* ins / outs */
+        case 0xe4 ... 0xe7: /* in / out (immediate port) */
+        case 0xec ... 0xef: /* in / out (port in %dx) */
+            /* Defer the check to priv_op_{read,write}_io(). */
+            return X86EMUL_DONE;   /* 'reg' is left unwritten here */
+        }
+    }
+
+    if ( ctxt->addr_size < 64 )   /* look the selector up via read_descriptor() */
+    {
+        unsigned long limit;
+        unsigned int sel, ar;
+
+        switch ( seg )
+        {
+        case x86_seg_cs: sel = ctxt->regs->cs; break;
+        case x86_seg_ds: sel = read_sreg(ds);  break;
+        case x86_seg_es: sel = read_sreg(es);  break;
+        case x86_seg_fs: sel = read_sreg(fs);  break;
+        case x86_seg_gs: sel = read_sreg(gs);  break;
+        case x86_seg_ss: sel = ctxt->regs->ss; break;
+        default: return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
+            return X86EMUL_UNHANDLEABLE;
+
+        reg->limit = limit;
+        reg->attr.bytes = ar >> 8;   /* 'ar' is masked with 0x00f0ff00 by read_descriptor() */
+    }
+    else   /* 64-bit address size: synthesise the segment state */
+    {
+        switch ( seg )
+        {
+        default:
+            if ( !is_x86_user_segment(seg) )
+                return X86EMUL_UNHANDLEABLE;
+            reg->base = 0;
+            break;
+        case x86_seg_fs:
+            reg->base = rdfsbase();
+            break;
+        case x86_seg_gs:
+            reg->base = rdgsbase();
+            break;
+        }
+
+        reg->limit = ~0U;
+
+        reg->attr.bytes = 0;   /* start from zero, then set individual fields */
+        reg->attr.fields.type = _SEGMENT_WR >> 8;
+        if ( seg == x86_seg_cs )
+        {
+            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
+            reg->attr.fields.l = 1;
+        }
+        else
+            reg->attr.fields.db = 1;
+        reg->attr.fields.s   = 1;
+        reg->attr.fields.dpl = 3;   /* may be replaced by 0 just below */
+        reg->attr.fields.p   = 1;
+        reg->attr.fields.g   = 1;
+    }
+
+    /*
+     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
+     * Also do this for consistency for non-conforming code segments.
+     */
+    if ( (seg == x86_seg_ss ||
+          (seg == x86_seg_cs &&
+           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
+         guest_kernel_mode(current, ctxt->regs) )   /* only while in guest kernel mode */
+        reg->attr.fields.dpl = 0;
+
+    return X86EMUL_OKAY;
+}
+/* Set *addr = base + offset; queue #GP/#SS if the access is out of bounds. */
+static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
+                                  unsigned int bytes, unsigned long limit,
+                                  enum x86_segment seg,
+                                  struct x86_emulate_ctxt *ctxt,
+                                  unsigned long *addr)
+{
+    int rc = X86EMUL_OKAY;
+
+    *addr = base + offset;
+
+    if ( ctxt->addr_size < 64 )
+    {
+        if ( limit < bytes - 1 || offset > limit - bytes + 1 )   /* access would run past 'limit' */
+            rc = X86EMUL_EXCEPTION;
+        *addr = (uint32_t)*addr;   /* truncate the address to 32 bits */
+    }
+    else if ( !__addr_ok(*addr) )   /* 64-bit: 'limit' is not consulted */
+        rc = X86EMUL_EXCEPTION;
+
+    if ( unlikely(rc == X86EMUL_EXCEPTION) )
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);   /* stack segment gets TRAP_stack_error */
+
+    return rc;
+}
+/* Handle (REP) INS: copy up to *reps items from 'port' to guest memory. */
+static int priv_op_rep_ins(uint16_t port,
+                           enum x86_segment seg, unsigned long offset,
+                           unsigned int bytes_per_rep, unsigned long *reps,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;   /* requested iteration count */
+    struct segment_register sreg;
+    int rc;
+
+    ASSERT(seg == x86_seg_es);
+
+    *reps = 0;   /* from here on: number of iterations completed */
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
+         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )   /* need a non-code segment with _SEGMENT_WR */
+    {
+        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = guest_io_read(port, bytes_per_rep, currd);   /* port is read before the address check */
+        unsigned long addr;
+
+        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+                                    sreg.limit, x86_seg_es, ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )   /* rc presumably counts uncopied bytes */
+        {
+            x86_emul_pagefault(PFEC_write_access,
+                               addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )   /* stop early: breakpoint hit or preemption due */
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )   /* DF set: walk downwards */
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+/* Handle (REP) OUTS: copy up to *reps items from guest memory to 'port'. */
+static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
+                            uint16_t port,
+                            unsigned int bytes_per_rep, unsigned long *reps,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    struct vcpu *curr = current;
+    struct domain *currd = current->domain;
+    unsigned long goal = *reps;   /* requested iteration count */
+    struct segment_register sreg;
+    int rc;
+
+    *reps = 0;   /* from here on: number of iterations completed */
+
+    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = priv_op_read_segment(seg, &sreg, ctxt);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( !sreg.attr.fields.p )
+        return X86EMUL_UNHANDLEABLE;
+    if ( !sreg.attr.fields.s ||
+         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
+          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )   /* code segments need _SEGMENT_WR here */
+    {
+        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
+                                                : TRAP_stack_error,
+                              0, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
+
+    while ( *reps < goal )
+    {
+        unsigned int data = 0;   /* zeroed so bytes beyond bytes_per_rep stay 0 */
+        unsigned long addr;
+
+        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
+                                    sreg.limit, seg, ctxt, &addr);
+        if ( rc != X86EMUL_OKAY )
+            return rc;
+
+        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )   /* rc presumably counts uncopied bytes */
+        {
+            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
+            return X86EMUL_EXCEPTION;
+        }
+
+        guest_io_write(port, bytes_per_rep, data, currd);
+
+        ++*reps;
+
+        if ( poc->bpmatch || hypercall_preempt_check() )   /* stop early: breakpoint hit or preemption due */
+            break;
+
+        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
+        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )   /* DF set: walk downwards */
+            offset -= bytes_per_rep;
+        else
+            offset += bytes_per_rep;
+    }
+
+    return X86EMUL_OKAY;
+}
+/* Read control register 'reg' (0, 2, 3 or 4) as seen by the current vcpu. */
+static int priv_op_read_cr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    const struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Read CR0 */
+        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];   /* real CR0 minus TS, ORed with the vcpu's ctrlreg[0] */
+        return X86EMUL_OKAY;
+
+    case 2: /* Read CR2 */
+    case 4: /* Read CR4 */
+        *val = curr->arch.pv_vcpu.ctrlreg[reg];   /* the vcpu's stored value, not the hardware register */
+        return X86EMUL_OKAY;
+
+    case 3: /* Read CR3 */
+    {
+        const struct domain *currd = curr->domain;
+        unsigned long mfn;
+
+        if ( !is_pv_32bit_domain(currd) )
+        {
+            mfn = pagetable_get_pfn(curr->arch.guest_table);
+            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        else   /* 32-bit PV: take the frame from the first L4 entry */
+        {
+            l4_pgentry_t *pl4e =
+                map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
+
+            mfn = l4e_get_pfn(*pl4e);
+            unmap_domain_page(pl4e);
+            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
+        }
+        /* PTs should not be shared */
+        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
+        return X86EMUL_OKAY;
+    }
+    }
+
+    return X86EMUL_UNHANDLEABLE;   /* any other register number */
+}
+/* Write 'val' to control register 'reg' (0, 2, 3 or 4) for the current vcpu. */
+static int priv_op_write_cr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+
+    switch ( reg )
+    {
+    case 0: /* Write CR0 */
+        if ( (val ^ read_cr0()) & ~X86_CR0_TS )   /* only TS may differ from the real CR0 */
+        {
+            gdprintk(XENLOG_WARNING,
+                    "Attempt to change unmodifiable CR0 flags\n");
+            break;
+        }
+        do_fpu_taskswitch(!!(val & X86_CR0_TS));
+        return X86EMUL_OKAY;
+
+    case 2: /* Write CR2 */
+        curr->arch.pv_vcpu.ctrlreg[2] = val;
+        arch_set_cr2(curr, val);
+        return X86EMUL_OKAY;
+
+    case 3: /* Write CR3 */
+    {
+        struct domain *currd = curr->domain;
+        unsigned long gfn;
+        struct page_info *page;
+        int rc;
+
+        gfn = !is_pv_32bit_domain(currd)
+              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
+        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
+        if ( !page )
+            break;
+        rc = new_guest_cr3(page_to_mfn(page));
+        put_page(page);   /* drop the reference taken by get_page_from_gfn() */
+
+        switch ( rc )
+        {
+        case 0:
+            return X86EMUL_OKAY;
+        case -ERESTART: /* retry after preemption */
+            return X86EMUL_RETRY;
+        }
+        break;   /* any other error ends up as X86EMUL_UNHANDLEABLE */
+    }
+
+    case 4: /* Write CR4 */
+        curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
+        write_cr4(pv_guest_cr4_to_real_cr4(curr));
+        ctxt_switch_levelling(curr);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;   /* unknown register or a failed case above */
+}
+/* Read debug register 'reg' via do_get_debugreg() into *val. */
+static int priv_op_read_dr(unsigned int reg, unsigned long *val,
+                           struct x86_emulate_ctxt *ctxt)
+{
+    unsigned long res = do_get_debugreg(reg);
+
+    if ( IS_ERR_VALUE(res) )   /* helper signalled an error through its return value */
+        return X86EMUL_UNHANDLEABLE;
+
+    *val = res;
+
+    return X86EMUL_OKAY;
+}
+/* Write 'val' to debug register 'reg' via do_set_debugreg(). */
+static int priv_op_write_dr(unsigned int reg, unsigned long val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    return do_set_debugreg(reg, val) == 0   /* zero from the helper means success */
+           ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
+}
+/* Adjust a MSR_IA32_MISC_ENABLE value before it is returned to the guest. */
+static inline uint64_t guest_misc_enable(uint64_t val)
+{
+    val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
+             MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);   /* always cleared */
+    val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
+           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
+           MSR_IA32_MISC_ENABLE_XTPR_DISABLE;   /* always set */
+    return val;
+}
+/* True if 'd' is the hardware domain and cpufreq is FREQCTL_dom0_kernel. */
+static inline bool is_cpufreq_controller(const struct domain *d)
+{
+    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
+            is_hardware_domain(d));
+}
+
+static int priv_op_read_msr(unsigned int reg, uint64_t *val,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
+    const struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = cpu_has_fsgsbase ? __rdgsbase()
+                                : curr->arch.pv_vcpu.gs_base_kernel;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) )
+            break;
+        *val = curr->arch.pv_vcpu.gs_base_user;
+        return X86EMUL_OKAY;
+
+    /*
+     * In order to fully retain original behavior, defer calling
+     * pv_soft_rdtsc() until after emulation. This may want/need to be
+     * reconsidered.
+     */
+    case MSR_IA32_TSC:
+        poc->tsc |= TSC_BASE;
+        goto normal;
+
+    case MSR_TSC_AUX:
+        poc->tsc |= TSC_AUX;
+        if ( cpu_has_rdtscp )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_EFER:
+        *val = read_efer();
+        if ( is_pv_32bit_domain(currd) )
+            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( unlikely(is_cpufreq_controller(currd)) )
+            goto normal;
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_UCODE_REV:
+        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
+                break;
+            /* As documented in the SDM: Do a CPUID 1 here */
+            cpuid_eax(1);
+        }
+        goto normal;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        *val = guest_misc_enable(*val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[0];
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
+            break;
+        *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_PERF_CAPABILITIES:
+        /* No extra capabilities are supported. */
+        *val = 0;
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_PLATFORM_INFO:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
+            break;
+        *val = 0;
+        if ( this_cpu(cpuid_faulting_enabled) )
+            *val |= MSR_PLATFORM_INFO_CPUID_FAULTING;
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_MISC_FEATURES_ENABLES:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) )
+            break;
+        *val = 0;
+        if ( curr->arch.cpuid_faulting )
+            *val |= MSR_MISC_FEATURES_CPUID_FAULTING;
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+            /* fall through */
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                if ( vpmu_do_rdmsr(reg, val) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( rdmsr_hypervisor_regs(reg, val) )
+            return X86EMUL_OKAY;
+
+        rc = vmce_rdmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+        /* fall through */
+    normal:
+        /* Everyone can read the MSR space. */
+        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
+        if ( rdmsr_safe(reg, *val) )
+            break;
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_write_msr(unsigned int reg, uint64_t val,
+                             struct x86_emulate_ctxt *ctxt)
+{
+    struct vcpu *curr = current;
+    const struct domain *currd = curr->domain;
+    bool vpmu_msr = false;
+
+    switch ( reg )
+    {
+        uint64_t temp;
+        int rc;
+
+    case MSR_FS_BASE:
+        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+            break;
+        wrfsbase(val);
+        curr->arch.pv_vcpu.fs_base = val;
+        return X86EMUL_OKAY;
+
+    case MSR_GS_BASE:
+        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+            break;
+        wrgsbase(val);
+        curr->arch.pv_vcpu.gs_base_kernel = val;
+        return X86EMUL_OKAY;
+
+    case MSR_SHADOW_GS_BASE:
+        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
+            break;
+        wrmsrl(MSR_SHADOW_GS_BASE, val);
+        curr->arch.pv_vcpu.gs_base_user = val;
+        return X86EMUL_OKAY;
+
+    case MSR_K7_FID_VID_STATUS:
+    case MSR_K7_FID_VID_CTL:
+    case MSR_K8_PSTATE_LIMIT:
+    case MSR_K8_PSTATE_CTRL:
+    case MSR_K8_PSTATE_STATUS:
+    case MSR_K8_PSTATE0:
+    case MSR_K8_PSTATE1:
+    case MSR_K8_PSTATE2:
+    case MSR_K8_PSTATE3:
+    case MSR_K8_PSTATE4:
+    case MSR_K8_PSTATE5:
+    case MSR_K8_PSTATE6:
+    case MSR_K8_PSTATE7:
+    case MSR_K8_HWCR:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_NB_CFG:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
+             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_FAM10H_MMIO_CONF_BASE:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
+             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
+            break;
+        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
+             temp != val :
+             ((temp ^ val) &
+              ~(FAM10H_MMIO_CONF_ENABLE |
+                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
+                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
+                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
+                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
+            goto invalid;
+        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_UCODE_REV:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
+            return X86EMUL_OKAY;
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MISC_ENABLE:
+        if ( rdmsr_safe(reg, temp) )
+            break;
+        if ( val != guest_misc_enable(temp) )
+            goto invalid;
+        return X86EMUL_OKAY;
+
+    case MSR_IA32_MPERF:
+    case MSR_IA32_APERF:
+        if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
+             (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_PERF_CTL:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( likely(!is_cpufreq_controller(currd)) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_IA32_THERM_CONTROL:
+    case MSR_IA32_ENERGY_PERF_BIAS:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
+            break;
+        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
+             wrmsr_safe(reg, val) == 0 )
+            return X86EMUL_OKAY;
+        break;
+
+    case MSR_AMD64_DR0_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[0] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
+        return X86EMUL_OKAY;
+
+    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
+        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
+            break;
+        curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
+        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
+            wrmsrl(reg, val);
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_PLATFORM_INFO:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
+            break;
+        return X86EMUL_OKAY;
+
+    case MSR_INTEL_MISC_FEATURES_ENABLES:
+        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+             (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) ||
+             rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) )
+            break;
+        if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) &&
+             !this_cpu(cpuid_faulting_enabled) )
+            break;
+        curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING);
+        return X86EMUL_OKAY;
+
+    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
+    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
+    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
+    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
+        {
+            vpmu_msr = true;
+    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
+    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
+            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
+            {
+                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
+                     !is_hardware_domain(currd) )
+                    return X86EMUL_OKAY;
+
+                if ( vpmu_do_wrmsr(reg, val, 0) )
+                    break;
+                return X86EMUL_OKAY;
+            }
+        }
+        /* fall through */
+    default:
+        if ( wrmsr_hypervisor_regs(reg, val) == 1 )
+            return X86EMUL_OKAY;
+
+        rc = vmce_wrmsr(reg, val);
+        if ( rc < 0 )
+            break;
+        if ( rc )
+            return X86EMUL_OKAY;
+
+        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
+    invalid:
+            gdprintk(XENLOG_WARNING,
+                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
+                     reg, temp, val);
+        return X86EMUL_OKAY;
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
+{
+    /* Ignore the instruction if unprivileged. */
+    if ( !cache_flush_permitted(current->domain) )
+        /*
+         * Non-physdev domain attempted WBINVD; ignore for now since
+         * newer Linux uses this in some start-of-day timing loops.
+         */
+        ;
+    else
+        wbinvd();
+
+    return X86EMUL_OKAY;
+}
+
+int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
+                  struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
+{
+    guest_cpuid(current, leaf, subleaf, res);
+
+    return X86EMUL_OKAY;
+}
+
+static int priv_op_validate(const struct x86_emulate_state *state,
+                            struct x86_emulate_ctxt *ctxt)
+{
+    switch ( ctxt->opcode )
+    {
+    case 0x6c ... 0x6f: /* ins / outs */
+    case 0xe4 ... 0xe7: /* in / out (immediate port) */
+    case 0xec ... 0xef: /* in / out (port in %dx) */
+    case X86EMUL_OPC(0x0f, 0x06): /* clts */
+    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
+    case X86EMUL_OPC(0x0f, 0x20) ...
+         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
+    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
+    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
+    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
+    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
+        return X86EMUL_OKAY;
+
+    case 0xfa: case 0xfb: /* cli / sti */
+        if ( !iopl_ok(current, ctxt->regs) )
+            break;
+        /*
+         * This is just too dangerous to allow, in my opinion. Consider if the
+         * caller then tries to reenable interrupts using POPF: we can't trap
+         * that and we'll end up with hard-to-debug lockups. Fast & loose will
+         * do for us. :-)
+        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
+         */
+        return X86EMUL_DONE;
+
+    case X86EMUL_OPC(0x0f, 0x01):
+    {
+        unsigned int modrm_rm, modrm_reg;
+
+        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
+             (modrm_rm & 7) != 1 )
+            break;
+        switch ( modrm_reg & 7 )
+        {
+        case 2: /* xsetbv */
+        case 7: /* rdtscp */
+            return X86EMUL_OKAY;
+        }
+        break;
+    }
+    }
+
+    return X86EMUL_UNHANDLEABLE;
+}
+
+static int priv_op_insn_fetch(enum x86_segment seg,
+                              unsigned long offset,
+                              void *p_data,
+                              unsigned int bytes,
+                              struct x86_emulate_ctxt *ctxt)
+{
+    const struct priv_op_ctxt *poc =
+        container_of(ctxt, struct priv_op_ctxt, ctxt);
+    unsigned int rc;
+    unsigned long addr = poc->cs.base + offset;
+
+    ASSERT(seg == x86_seg_cs);
+
+    /* We don't mean to emulate any branches. */
+    if ( !bytes )
+        return X86EMUL_UNHANDLEABLE;
+
+    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
+                                x86_seg_cs, ctxt, &addr);
+    if ( rc != X86EMUL_OKAY )
+        return rc;
+
+    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
+    {
+        /*
+         * TODO: As this is always a fetch, it should report PFEC_insn_fetch
+         * when cpu_has_nx, but we'd then need a "fetch" variant of
+         * __copy_from_user() respecting NX, SMEP, and protection keys.
+         */
+        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
+        return X86EMUL_EXCEPTION;
+    }
+
+    return X86EMUL_OKAY;
+}
+
+
+static const struct x86_emulate_ops priv_op_ops = {
+    .insn_fetch          = priv_op_insn_fetch,
+    .read                = x86emul_unhandleable_rw,
+    .validate            = priv_op_validate,
+    .read_io             = priv_op_read_io,
+    .write_io            = priv_op_write_io,
+    .rep_ins             = priv_op_rep_ins,
+    .rep_outs            = priv_op_rep_outs,
+    .read_segment        = priv_op_read_segment,
+    .read_cr             = priv_op_read_cr,
+    .write_cr            = priv_op_write_cr,
+    .read_dr             = priv_op_read_dr,
+    .write_dr            = priv_op_write_dr,
+    .read_msr            = priv_op_read_msr,
+    .write_msr           = priv_op_write_msr,
+    .cpuid               = pv_emul_cpuid,
+    .wbinvd              = priv_op_wbinvd,
+};
+
+int emulate_privileged_op(struct cpu_user_regs *regs)
+{
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
+    struct priv_op_ctxt ctxt = {
+        .ctxt.regs = regs,
+        .ctxt.vendor = currd->arch.cpuid->x86_vendor,
+        .ctxt.lma = !is_pv_32bit_domain(currd),
+    };
+    int rc;
+    unsigned int eflags, ar;
+
+    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
+                          &ar, 1) ||
+         !(ar & _SEGMENT_S) ||
+         !(ar & _SEGMENT_P) ||
+         !(ar & _SEGMENT_CODE) )
+        return 0;
+
+    /* Mirror virtualized state into EFLAGS. */
+    ASSERT(regs->eflags & X86_EFLAGS_IF);
+    if ( vcpu_info(curr, evtchn_upcall_mask) )
+        regs->eflags &= ~X86_EFLAGS_IF;
+    else
+        regs->eflags |= X86_EFLAGS_IF;
+    ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
+    regs->eflags |= curr->arch.pv_vcpu.iopl;
+    eflags = regs->eflags;
+
+    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
+    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
+    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
+
+    if ( ctxt.io_emul_stub )
+        unmap_domain_page(ctxt.io_emul_stub);
+
+    /*
+     * Un-mirror virtualized state from EFLAGS.
+     * Nothing we allow to be emulated can change anything other than the
+     * arithmetic bits, and the resume flag.
+     */
+    ASSERT(!((regs->eflags ^ eflags) &
+             ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
+    regs->eflags |= X86_EFLAGS_IF;
+    regs->eflags &= ~X86_EFLAGS_IOPL;
+
+    switch ( rc )
+    {
+    case X86EMUL_OKAY:
+        if ( ctxt.tsc & TSC_BASE )
+        {
+            if ( ctxt.tsc & TSC_AUX )
+                pv_soft_rdtsc(curr, regs, 1);
+            else if ( currd->arch.vtsc )
+                pv_soft_rdtsc(curr, regs, 0);
+            else
+                msr_split(regs, rdtsc());
+        }
+
+        if ( ctxt.ctxt.retire.singlestep )
+            ctxt.bpmatch |= DR_STEP;
+        if ( ctxt.bpmatch )
+        {
+            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
+            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
+                pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
+        }
+        /* fall through */
+    case X86EMUL_RETRY:
+        return EXCRET_fault_fixed;
+
+    case X86EMUL_EXCEPTION:
+        pv_inject_event(&ctxt.ctxt.event);
+        return EXCRET_fault_fixed;
+    }
+
+    return 0;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/x86_64/gpr_switch.S b/xen/arch/x86/pv/gpr_switch.S
similarity index 100%
rename from xen/arch/x86/x86_64/gpr_switch.S
rename to xen/arch/x86/pv/gpr_switch.S
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index cd8ca20398..cd43e9f44c 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -78,6 +78,8 @@
 #include <asm/cpuid.h>
 #include <xsm/xsm.h>
 
+#include <asm/pv/traps.h>
+
 /*
  * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
  *  fatal:  Xen prints diagnostic message and then hangs.
@@ -705,41 +707,6 @@ static void instruction_done(struct cpu_user_regs *regs, unsigned long rip)
     }
 }
 
-static unsigned int check_guest_io_breakpoint(struct vcpu *v,
-    unsigned int port, unsigned int len)
-{
-    unsigned int width, i, match = 0;
-    unsigned long start;
-
-    if ( !(v->arch.debugreg[5]) ||
-         !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
-        return 0;
-
-    for ( i = 0; i < 4; i++ )
-    {
-        if ( !(v->arch.debugreg[5] &
-               (3 << (i * DR_ENABLE_SIZE))) )
-            continue;
-
-        start = v->arch.debugreg[i];
-        width = 0;
-
-        switch ( (v->arch.debugreg[7] >>
-                  (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
-        {
-        case DR_LEN_1: width = 1; break;
-        case DR_LEN_2: width = 2; break;
-        case DR_LEN_4: width = 4; break;
-        case DR_LEN_8: width = 8; break;
-        }
-
-        if ( (start < (port + len)) && ((start + width) > port) )
-            match |= 1 << i;
-    }
-
-    return match;
-}
-
 /*
  * Called from asm to set up the MCE trapbounce info.
  * Returns 0 if no callback is set up, else 1.
@@ -1733,1327 +1700,6 @@ static int read_gate_descriptor(unsigned int gate_sel,
     return 1;
 }
 
-static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
-                                  unsigned int bytes, unsigned long limit,
-                                  enum x86_segment seg,
-                                  struct x86_emulate_ctxt *ctxt,
-                                  unsigned long *addr)
-{
-    int rc = X86EMUL_OKAY;
-
-    *addr = base + offset;
-
-    if ( ctxt->addr_size < 64 )
-    {
-        if ( limit < bytes - 1 || offset > limit - bytes + 1 )
-            rc = X86EMUL_EXCEPTION;
-        *addr = (uint32_t)*addr;
-    }
-    else if ( !__addr_ok(*addr) )
-        rc = X86EMUL_EXCEPTION;
-
-    if ( unlikely(rc == X86EMUL_EXCEPTION) )
-        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
-                                                : TRAP_stack_error,
-                              0, ctxt);
-
-    return rc;
-}
-
-struct priv_op_ctxt {
-    struct x86_emulate_ctxt ctxt;
-    struct {
-        unsigned long base, limit;
-    } cs;
-    char *io_emul_stub;
-    unsigned int bpmatch;
-    unsigned int tsc;
-#define TSC_BASE 1
-#define TSC_AUX 2
-};
-
-static int priv_op_insn_fetch(enum x86_segment seg,
-                              unsigned long offset,
-                              void *p_data,
-                              unsigned int bytes,
-                              struct x86_emulate_ctxt *ctxt)
-{
-    const struct priv_op_ctxt *poc =
-        container_of(ctxt, struct priv_op_ctxt, ctxt);
-    unsigned int rc;
-    unsigned long addr = poc->cs.base + offset;
-
-    ASSERT(seg == x86_seg_cs);
-
-    /* We don't mean to emulate any branches. */
-    if ( !bytes )
-        return X86EMUL_UNHANDLEABLE;
-
-    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
-                                x86_seg_cs, ctxt, &addr);
-    if ( rc != X86EMUL_OKAY )
-        return rc;
-
-    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
-    {
-        /*
-         * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
-         * cpu_has_nx, but we'd then need a "fetch" variant of
-         * __copy_from_user() respecting NX, SMEP, and protection keys.
-         */
-        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
-        return X86EMUL_EXCEPTION;
-    }
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_read_segment(enum x86_segment seg,
-                                struct segment_register *reg,
-                                struct x86_emulate_ctxt *ctxt)
-{
-    /* Check if this is an attempt to access the I/O bitmap. */
-    if ( seg == x86_seg_tr )
-    {
-        switch ( ctxt->opcode )
-        {
-        case 0x6c ... 0x6f: /* ins / outs */
-        case 0xe4 ... 0xe7: /* in / out (immediate port) */
-        case 0xec ... 0xef: /* in / out (port in %dx) */
-            /* Defer the check to priv_op_{read,write}_io(). */
-            return X86EMUL_DONE;
-        }
-    }
-
-    if ( ctxt->addr_size < 64 )
-    {
-        unsigned long limit;
-        unsigned int sel, ar;
-
-        switch ( seg )
-        {
-        case x86_seg_cs: sel = ctxt->regs->cs; break;
-        case x86_seg_ds: sel = read_sreg(ds);  break;
-        case x86_seg_es: sel = read_sreg(es);  break;
-        case x86_seg_fs: sel = read_sreg(fs);  break;
-        case x86_seg_gs: sel = read_sreg(gs);  break;
-        case x86_seg_ss: sel = ctxt->regs->ss; break;
-        default: return X86EMUL_UNHANDLEABLE;
-        }
-
-        if ( !read_descriptor(sel, current, &reg->base, &limit, &ar, 0) )
-            return X86EMUL_UNHANDLEABLE;
-
-        reg->limit = limit;
-        reg->attr.bytes = ar >> 8;
-    }
-    else
-    {
-        switch ( seg )
-        {
-        default:
-            if ( !is_x86_user_segment(seg) )
-                return X86EMUL_UNHANDLEABLE;
-            reg->base = 0;
-            break;
-        case x86_seg_fs:
-            reg->base = rdfsbase();
-            break;
-        case x86_seg_gs:
-            reg->base = rdgsbase();
-            break;
-        }
-
-        reg->limit = ~0U;
-
-        reg->attr.bytes = 0;
-        reg->attr.fields.type = _SEGMENT_WR >> 8;
-        if ( seg == x86_seg_cs )
-        {
-            reg->attr.fields.type |= _SEGMENT_CODE >> 8;
-            reg->attr.fields.l = 1;
-        }
-        else
-            reg->attr.fields.db = 1;
-        reg->attr.fields.s   = 1;
-        reg->attr.fields.dpl = 3;
-        reg->attr.fields.p   = 1;
-        reg->attr.fields.g   = 1;
-    }
-
-    /*
-     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
-     * Also do this for consistency for non-conforming code segments.
-     */
-    if ( (seg == x86_seg_ss ||
-          (seg == x86_seg_cs &&
-           !(reg->attr.fields.type & (_SEGMENT_EC >> 8)))) &&
-         guest_kernel_mode(current, ctxt->regs) )
-        reg->attr.fields.dpl = 0;
-
-    return X86EMUL_OKAY;
-}
-
-/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
-static bool_t iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
-{
-    unsigned int cpl = guest_kernel_mode(v, regs) ?
-        (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
-
-    ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);
-
-    return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
-}
-
-/* Has the guest requested sufficient permission for this I/O access? */
-static int guest_io_okay(
-    unsigned int port, unsigned int bytes,
-    struct vcpu *v, struct cpu_user_regs *regs)
-{
-    /* If in user mode, switch to kernel mode just to read I/O bitmap. */
-    int user_mode = !(v->arch.flags & TF_kernel_mode);
-#define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v)
-
-    if ( iopl_ok(v, regs) )
-        return 1;
-
-    if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
-    {
-        union { uint8_t bytes[2]; uint16_t mask; } x;
-
-        /*
-         * Grab permission bytes from guest space. Inaccessible bytes are
-         * read as 0xff (no access allowed).
-         */
-        TOGGLE_MODE();
-        switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
-                                          port>>3, 2) )
-        {
-        default: x.bytes[0] = ~0;
-            /* fallthrough */
-        case 1:  x.bytes[1] = ~0;
-            /* fallthrough */
-        case 0:  break;
-        }
-        TOGGLE_MODE();
-
-        if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 )
-            return 1;
-    }
-
-    return 0;
-}
-
-/* Has the administrator granted sufficient permission for this I/O access? */
-static bool_t admin_io_okay(unsigned int port, unsigned int bytes,
-                            const struct domain *d)
-{
-    /*
-     * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
-     * We never permit direct access to that register.
-     */
-    if ( (port == 0xcf8) && (bytes == 4) )
-        return 0;
-
-    /* We also never permit direct access to the RTC/CMOS registers. */
-    if ( ((port & ~1) == RTC_PORT(0)) )
-        return 0;
-
-    return ioports_access_permitted(d, port, port + bytes - 1);
-}
-
-static bool_t pci_cfg_ok(struct domain *currd, unsigned int start,
-                         unsigned int size, uint32_t *write)
-{
-    uint32_t machine_bdf;
-
-    if ( !is_hardware_domain(currd) )
-        return 0;
-
-    if ( !CF8_ENABLED(currd->arch.pci_cf8) )
-        return 1;
-
-    machine_bdf = CF8_BDF(currd->arch.pci_cf8);
-    if ( write )
-    {
-        const unsigned long *ro_map = pci_get_ro_map(0);
-
-        if ( ro_map && test_bit(machine_bdf, ro_map) )
-            return 0;
-    }
-    start |= CF8_ADDR_LO(currd->arch.pci_cf8);
-    /* AMD extended configuration space access? */
-    if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
-         boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-         boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
-    {
-        uint64_t msr_val;
-
-        if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
-            return 0;
-        if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
-            start |= CF8_ADDR_HI(currd->arch.pci_cf8);
-    }
-
-    return !write ?
-           xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
-                                     start, start + size - 1, 0) == 0 :
-           pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
-}
-
-uint32_t guest_io_read(unsigned int port, unsigned int bytes,
-                       struct domain *currd)
-{
-    uint32_t data = 0;
-    unsigned int shift = 0;
-
-    if ( admin_io_okay(port, bytes, currd) )
-    {
-        switch ( bytes )
-        {
-        case 1: return inb(port);
-        case 2: return inw(port);
-        case 4: return inl(port);
-        }
-    }
-
-    while ( bytes != 0 )
-    {
-        unsigned int size = 1;
-        uint32_t sub_data = ~0;
-
-        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
-        {
-            sub_data = pv_pit_handler(port, 0, 0);
-        }
-        else if ( port == RTC_PORT(0) )
-        {
-            sub_data = currd->arch.cmos_idx;
-        }
-        else if ( (port == RTC_PORT(1)) &&
-                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
-        {
-            unsigned long flags;
-
-            spin_lock_irqsave(&rtc_lock, flags);
-            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
-            sub_data = inb(RTC_PORT(1));
-            spin_unlock_irqrestore(&rtc_lock, flags);
-        }
-        else if ( (port == 0xcf8) && (bytes == 4) )
-        {
-            size = 4;
-            sub_data = currd->arch.pci_cf8;
-        }
-        else if ( (port & 0xfffc) == 0xcfc )
-        {
-            size = min(bytes, 4 - (port & 3));
-            if ( size == 3 )
-                size = 2;
-            if ( pci_cfg_ok(currd, port & 3, size, NULL) )
-                sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
-        }
-
-        if ( size == 4 )
-            return sub_data;
-
-        data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
-        shift += size * 8;
-        port += size;
-        bytes -= size;
-    }
-
-    return data;
-}
-
-void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
-                    struct domain *currd)
-{
-    if ( admin_io_okay(port, bytes, currd) )
-    {
-        switch ( bytes ) {
-        case 1:
-            outb((uint8_t)data, port);
-            if ( pv_post_outb_hook )
-                pv_post_outb_hook(port, (uint8_t)data);
-            break;
-        case 2:
-            outw((uint16_t)data, port);
-            break;
-        case 4:
-            outl(data, port);
-            break;
-        }
-        return;
-    }
-
-    while ( bytes != 0 )
-    {
-        unsigned int size = 1;
-
-        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
-        {
-            pv_pit_handler(port, (uint8_t)data, 1);
-        }
-        else if ( port == RTC_PORT(0) )
-        {
-            currd->arch.cmos_idx = data;
-        }
-        else if ( (port == RTC_PORT(1)) &&
-                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
-        {
-            unsigned long flags;
-
-            if ( pv_rtc_handler )
-                pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
-            spin_lock_irqsave(&rtc_lock, flags);
-            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
-            outb(data, RTC_PORT(1));
-            spin_unlock_irqrestore(&rtc_lock, flags);
-        }
-        else if ( (port == 0xcf8) && (bytes == 4) )
-        {
-            size = 4;
-            currd->arch.pci_cf8 = data;
-        }
-        else if ( (port & 0xfffc) == 0xcfc )
-        {
-            size = min(bytes, 4 - (port & 3));
-            if ( size == 3 )
-                size = 2;
-            if ( pci_cfg_ok(currd, port & 3, size, &data) )
-                pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
-        }
-
-        if ( size == 4 )
-            return;
-
-        port += size;
-        bytes -= size;
-        data >>= size * 8;
-    }
-}
-
-/* I/O emulation support. Helper routines for, and type of, the stack stub.*/
-void host_to_guest_gpr_switch(struct cpu_user_regs *);
-unsigned long guest_to_host_gpr_switch(unsigned long);
-
-void (*pv_post_outb_hook)(unsigned int port, u8 value);
-
-typedef void io_emul_stub_t(struct cpu_user_regs *);
-
-static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
-                                          unsigned int port, unsigned int bytes)
-{
-    if ( !ctxt->io_emul_stub )
-        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
-                                             (this_cpu(stubs.addr) &
-                                              ~PAGE_MASK) +
-                                             STUB_BUF_SIZE / 2;
-
-    /* movq $host_to_guest_gpr_switch,%rcx */
-    ctxt->io_emul_stub[0] = 0x48;
-    ctxt->io_emul_stub[1] = 0xb9;
-    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
-    /* callq *%rcx */
-    ctxt->io_emul_stub[10] = 0xff;
-    ctxt->io_emul_stub[11] = 0xd1;
-    /* data16 or nop */
-    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
-    /* <io-access opcode> */
-    ctxt->io_emul_stub[13] = opcode;
-    /* imm8 or nop */
-    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
-    /* ret (jumps to guest_to_host_gpr_switch) */
-    ctxt->io_emul_stub[15] = 0xc3;
-    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
-
-    if ( ioemul_handle_quirk )
-        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
-
-    /* Handy function-typed pointer to the stub. */
-    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
-}
-
-static int priv_op_read_io(unsigned int port, unsigned int bytes,
-                           unsigned long *val, struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    struct vcpu *curr = current;
-    struct domain *currd = current->domain;
-
-    /* INS must not come here. */
-    ASSERT((ctxt->opcode & ~9) == 0xe4);
-
-    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
-        return X86EMUL_UNHANDLEABLE;
-
-    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
-
-    if ( admin_io_okay(port, bytes, currd) )
-    {
-        io_emul_stub_t *io_emul =
-            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
-
-        mark_regs_dirty(ctxt->regs);
-        io_emul(ctxt->regs);
-        return X86EMUL_DONE;
-    }
-
-    *val = guest_io_read(port, bytes, currd);
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_write_io(unsigned int port, unsigned int bytes,
-                            unsigned long val, struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    struct vcpu *curr = current;
-    struct domain *currd = current->domain;
-
-    /* OUTS must not come here. */
-    ASSERT((ctxt->opcode & ~9) == 0xe6);
-
-    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
-        return X86EMUL_UNHANDLEABLE;
-
-    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
-
-    if ( admin_io_okay(port, bytes, currd) )
-    {
-        io_emul_stub_t *io_emul =
-            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
-
-        mark_regs_dirty(ctxt->regs);
-        io_emul(ctxt->regs);
-        if ( (bytes == 1) && pv_post_outb_hook )
-            pv_post_outb_hook(port, val);
-        return X86EMUL_DONE;
-    }
-
-    guest_io_write(port, bytes, val, currd);
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_rep_ins(uint16_t port,
-                           enum x86_segment seg, unsigned long offset,
-                           unsigned int bytes_per_rep, unsigned long *reps,
-                           struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    struct vcpu *curr = current;
-    struct domain *currd = current->domain;
-    unsigned long goal = *reps;
-    struct segment_register sreg;
-    int rc;
-
-    ASSERT(seg == x86_seg_es);
-
-    *reps = 0;
-
-    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
-        return X86EMUL_UNHANDLEABLE;
-
-    rc = priv_op_read_segment(x86_seg_es, &sreg, ctxt);
-    if ( rc != X86EMUL_OKAY )
-        return rc;
-
-    if ( !sreg.attr.fields.p )
-        return X86EMUL_UNHANDLEABLE;
-    if ( !sreg.attr.fields.s ||
-         (sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) ||
-         !(sreg.attr.fields.type & (_SEGMENT_WR >> 8)) )
-    {
-        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
-        return X86EMUL_EXCEPTION;
-    }
-
-    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
-
-    while ( *reps < goal )
-    {
-        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
-        unsigned long addr;
-
-        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
-                                    sreg.limit, x86_seg_es, ctxt, &addr);
-        if ( rc != X86EMUL_OKAY )
-            return rc;
-
-        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
-        {
-            x86_emul_pagefault(PFEC_write_access,
-                               addr + bytes_per_rep - rc, ctxt);
-            return X86EMUL_EXCEPTION;
-        }
-
-        ++*reps;
-
-        if ( poc->bpmatch || hypercall_preempt_check() )
-            break;
-
-        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
-        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
-            offset -= bytes_per_rep;
-        else
-            offset += bytes_per_rep;
-    }
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_rep_outs(enum x86_segment seg, unsigned long offset,
-                            uint16_t port,
-                            unsigned int bytes_per_rep, unsigned long *reps,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    struct vcpu *curr = current;
-    struct domain *currd = current->domain;
-    unsigned long goal = *reps;
-    struct segment_register sreg;
-    int rc;
-
-    *reps = 0;
-
-    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
-        return X86EMUL_UNHANDLEABLE;
-
-    rc = priv_op_read_segment(seg, &sreg, ctxt);
-    if ( rc != X86EMUL_OKAY )
-        return rc;
-
-    if ( !sreg.attr.fields.p )
-        return X86EMUL_UNHANDLEABLE;
-    if ( !sreg.attr.fields.s ||
-         ((sreg.attr.fields.type & (_SEGMENT_CODE >> 8)) &&
-          !(sreg.attr.fields.type & (_SEGMENT_WR >> 8))) )
-    {
-        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
-                                                : TRAP_stack_error,
-                              0, ctxt);
-        return X86EMUL_EXCEPTION;
-    }
-
-    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
-
-    while ( *reps < goal )
-    {
-        unsigned int data = 0;
-        unsigned long addr;
-
-        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
-                                    sreg.limit, seg, ctxt, &addr);
-        if ( rc != X86EMUL_OKAY )
-            return rc;
-
-        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
-        {
-            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
-            return X86EMUL_EXCEPTION;
-        }
-
-        guest_io_write(port, bytes_per_rep, data, currd);
-
-        ++*reps;
-
-        if ( poc->bpmatch || hypercall_preempt_check() )
-            break;
-
-        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
-        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
-            offset -= bytes_per_rep;
-        else
-            offset += bytes_per_rep;
-    }
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_read_cr(unsigned int reg, unsigned long *val,
-                           struct x86_emulate_ctxt *ctxt)
-{
-    const struct vcpu *curr = current;
-
-    switch ( reg )
-    {
-    case 0: /* Read CR0 */
-        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
-        return X86EMUL_OKAY;
-
-    case 2: /* Read CR2 */
-    case 4: /* Read CR4 */
-        *val = curr->arch.pv_vcpu.ctrlreg[reg];
-        return X86EMUL_OKAY;
-
-    case 3: /* Read CR3 */
-    {
-        const struct domain *currd = curr->domain;
-        unsigned long mfn;
-
-        if ( !is_pv_32bit_domain(currd) )
-        {
-            mfn = pagetable_get_pfn(curr->arch.guest_table);
-            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-        }
-        else
-        {
-            l4_pgentry_t *pl4e =
-                map_domain_page(_mfn(pagetable_get_pfn(curr->arch.guest_table)));
-
-            mfn = l4e_get_pfn(*pl4e);
-            unmap_domain_page(pl4e);
-            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn));
-        }
-        /* PTs should not be shared */
-        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
-        return X86EMUL_OKAY;
-    }
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_write_cr(unsigned int reg, unsigned long val,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    struct vcpu *curr = current;
-
-    switch ( reg )
-    {
-    case 0: /* Write CR0 */
-        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
-        {
-            gdprintk(XENLOG_WARNING,
-                    "Attempt to change unmodifiable CR0 flags\n");
-            break;
-        }
-        do_fpu_taskswitch(!!(val & X86_CR0_TS));
-        return X86EMUL_OKAY;
-
-    case 2: /* Write CR2 */
-        curr->arch.pv_vcpu.ctrlreg[2] = val;
-        arch_set_cr2(curr, val);
-        return X86EMUL_OKAY;
-
-    case 3: /* Write CR3 */
-    {
-        struct domain *currd = curr->domain;
-        unsigned long gfn;
-        struct page_info *page;
-        int rc;
-
-        gfn = !is_pv_32bit_domain(currd)
-              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
-        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
-        if ( !page )
-            break;
-        rc = new_guest_cr3(page_to_mfn(page));
-        put_page(page);
-
-        switch ( rc )
-        {
-        case 0:
-            return X86EMUL_OKAY;
-        case -ERESTART: /* retry after preemption */
-            return X86EMUL_RETRY;
-        }
-        break;
-    }
-
-    case 4: /* Write CR4 */
-        curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
-        write_cr4(pv_guest_cr4_to_real_cr4(curr));
-        ctxt_switch_levelling(curr);
-        return X86EMUL_OKAY;
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_read_dr(unsigned int reg, unsigned long *val,
-                           struct x86_emulate_ctxt *ctxt)
-{
-    unsigned long res = do_get_debugreg(reg);
-
-    if ( IS_ERR_VALUE(res) )
-        return X86EMUL_UNHANDLEABLE;
-
-    *val = res;
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_write_dr(unsigned int reg, unsigned long val,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    return do_set_debugreg(reg, val) == 0
-           ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
-}
-
-static inline uint64_t guest_misc_enable(uint64_t val)
-{
-    val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
-             MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
-    val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
-           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
-           MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
-    return val;
-}
-
-static inline bool is_cpufreq_controller(const struct domain *d)
-{
-    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
-            is_hardware_domain(d));
-}
-
-static int priv_op_read_msr(unsigned int reg, uint64_t *val,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
-    const struct vcpu *curr = current;
-    const struct domain *currd = curr->domain;
-    bool vpmu_msr = false;
-
-    switch ( reg )
-    {
-        int rc;
-
-    case MSR_FS_BASE:
-        if ( is_pv_32bit_domain(currd) )
-            break;
-        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
-        return X86EMUL_OKAY;
-
-    case MSR_GS_BASE:
-        if ( is_pv_32bit_domain(currd) )
-            break;
-        *val = cpu_has_fsgsbase ? __rdgsbase()
-                                : curr->arch.pv_vcpu.gs_base_kernel;
-        return X86EMUL_OKAY;
-
-    case MSR_SHADOW_GS_BASE:
-        if ( is_pv_32bit_domain(currd) )
-            break;
-        *val = curr->arch.pv_vcpu.gs_base_user;
-        return X86EMUL_OKAY;
-
-    /*
-     * In order to fully retain original behavior, defer calling
-     * pv_soft_rdtsc() until after emulation. This may want/need to be
-     * reconsidered.
-     */
-    case MSR_IA32_TSC:
-        poc->tsc |= TSC_BASE;
-        goto normal;
-
-    case MSR_TSC_AUX:
-        poc->tsc |= TSC_AUX;
-        if ( cpu_has_rdtscp )
-            goto normal;
-        *val = 0;
-        return X86EMUL_OKAY;
-
-    case MSR_EFER:
-        *val = read_efer();
-        if ( is_pv_32bit_domain(currd) )
-            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
-        return X86EMUL_OKAY;
-
-    case MSR_K7_FID_VID_CTL:
-    case MSR_K7_FID_VID_STATUS:
-    case MSR_K8_PSTATE_LIMIT:
-    case MSR_K8_PSTATE_CTRL:
-    case MSR_K8_PSTATE_STATUS:
-    case MSR_K8_PSTATE0:
-    case MSR_K8_PSTATE1:
-    case MSR_K8_PSTATE2:
-    case MSR_K8_PSTATE3:
-    case MSR_K8_PSTATE4:
-    case MSR_K8_PSTATE5:
-    case MSR_K8_PSTATE6:
-    case MSR_K8_PSTATE7:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-            break;
-        if ( unlikely(is_cpufreq_controller(currd)) )
-            goto normal;
-        *val = 0;
-        return X86EMUL_OKAY;
-
-    case MSR_IA32_UCODE_REV:
-        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
-        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-        {
-            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
-                break;
-            /* As documented in the SDM: Do a CPUID 1 here */
-            cpuid_eax(1);
-        }
-        goto normal;
-
-    case MSR_IA32_MISC_ENABLE:
-        if ( rdmsr_safe(reg, *val) )
-            break;
-        *val = guest_misc_enable(*val);
-        return X86EMUL_OKAY;
-
-    case MSR_AMD64_DR0_ADDRESS_MASK:
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-            break;
-        *val = curr->arch.pv_vcpu.dr_mask[0];
-        return X86EMUL_OKAY;
-
-    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
-            break;
-        *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
-        return X86EMUL_OKAY;
-
-    case MSR_IA32_PERF_CAPABILITIES:
-        /* No extra capabilities are supported. */
-        *val = 0;
-        return X86EMUL_OKAY;
-
-    case MSR_INTEL_PLATFORM_INFO:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-             rdmsr_safe(MSR_INTEL_PLATFORM_INFO, *val) )
-            break;
-        *val = 0;
-        if ( this_cpu(cpuid_faulting_enabled) )
-            *val |= MSR_PLATFORM_INFO_CPUID_FAULTING;
-        return X86EMUL_OKAY;
-
-    case MSR_INTEL_MISC_FEATURES_ENABLES:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-             rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, *val) )
-            break;
-        *val = 0;
-        if ( curr->arch.cpuid_faulting )
-            *val |= MSR_MISC_FEATURES_CPUID_FAULTING;
-        return X86EMUL_OKAY;
-
-    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-        {
-            vpmu_msr = true;
-            /* fall through */
-    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-            {
-                if ( vpmu_do_rdmsr(reg, val) )
-                    break;
-                return X86EMUL_OKAY;
-            }
-        }
-        /* fall through */
-    default:
-        if ( rdmsr_hypervisor_regs(reg, val) )
-            return X86EMUL_OKAY;
-
-        rc = vmce_rdmsr(reg, val);
-        if ( rc < 0 )
-            break;
-        if ( rc )
-            return X86EMUL_OKAY;
-        /* fall through */
-    normal:
-        /* Everyone can read the MSR space. */
-        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
-        if ( rdmsr_safe(reg, *val) )
-            break;
-        return X86EMUL_OKAY;
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-#include "x86_64/mmconfig.h"
-
-static int priv_op_write_msr(unsigned int reg, uint64_t val,
-                             struct x86_emulate_ctxt *ctxt)
-{
-    struct vcpu *curr = current;
-    const struct domain *currd = curr->domain;
-    bool vpmu_msr = false;
-
-    switch ( reg )
-    {
-        uint64_t temp;
-        int rc;
-
-    case MSR_FS_BASE:
-        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
-            break;
-        wrfsbase(val);
-        curr->arch.pv_vcpu.fs_base = val;
-        return X86EMUL_OKAY;
-
-    case MSR_GS_BASE:
-        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
-            break;
-        wrgsbase(val);
-        curr->arch.pv_vcpu.gs_base_kernel = val;
-        return X86EMUL_OKAY;
-
-    case MSR_SHADOW_GS_BASE:
-        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
-            break;
-        wrmsrl(MSR_SHADOW_GS_BASE, val);
-        curr->arch.pv_vcpu.gs_base_user = val;
-        return X86EMUL_OKAY;
-
-    case MSR_K7_FID_VID_STATUS:
-    case MSR_K7_FID_VID_CTL:
-    case MSR_K8_PSTATE_LIMIT:
-    case MSR_K8_PSTATE_CTRL:
-    case MSR_K8_PSTATE_STATUS:
-    case MSR_K8_PSTATE0:
-    case MSR_K8_PSTATE1:
-    case MSR_K8_PSTATE2:
-    case MSR_K8_PSTATE3:
-    case MSR_K8_PSTATE4:
-    case MSR_K8_PSTATE5:
-    case MSR_K8_PSTATE6:
-    case MSR_K8_PSTATE7:
-    case MSR_K8_HWCR:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
-            break;
-        if ( likely(!is_cpufreq_controller(currd)) ||
-             wrmsr_safe(reg, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_AMD64_NB_CFG:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-            break;
-        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
-            return X86EMUL_OKAY;
-        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
-             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
-            goto invalid;
-        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_FAM10H_MMIO_CONF_BASE:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
-             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
-            break;
-        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
-            return X86EMUL_OKAY;
-        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
-            break;
-        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
-             temp != val :
-             ((temp ^ val) &
-              ~(FAM10H_MMIO_CONF_ENABLE |
-                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
-                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
-                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
-                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
-            goto invalid;
-        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_IA32_UCODE_REV:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-            break;
-        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
-            return X86EMUL_OKAY;
-        if ( rdmsr_safe(reg, temp) )
-            break;
-        if ( val )
-            goto invalid;
-        return X86EMUL_OKAY;
-
-    case MSR_IA32_MISC_ENABLE:
-        if ( rdmsr_safe(reg, temp) )
-            break;
-        if ( val != guest_misc_enable(temp) )
-            goto invalid;
-        return X86EMUL_OKAY;
-
-    case MSR_IA32_MPERF:
-    case MSR_IA32_APERF:
-        if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
-             (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
-            break;
-        if ( likely(!is_cpufreq_controller(currd)) ||
-             wrmsr_safe(reg, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_IA32_PERF_CTL:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-            break;
-        if ( likely(!is_cpufreq_controller(currd)) ||
-             wrmsr_safe(reg, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_IA32_THERM_CONTROL:
-    case MSR_IA32_ENERGY_PERF_BIAS:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
-            break;
-        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
-             wrmsr_safe(reg, val) == 0 )
-            return X86EMUL_OKAY;
-        break;
-
-    case MSR_AMD64_DR0_ADDRESS_MASK:
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
-            break;
-        curr->arch.pv_vcpu.dr_mask[0] = val;
-        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
-            wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
-        return X86EMUL_OKAY;
-
-    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
-        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
-            break;
-        curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
-        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
-            wrmsrl(reg, val);
-        return X86EMUL_OKAY;
-
-    case MSR_INTEL_PLATFORM_INFO:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-             val || rdmsr_safe(MSR_INTEL_PLATFORM_INFO, val) )
-            break;
-        return X86EMUL_OKAY;
-
-    case MSR_INTEL_MISC_FEATURES_ENABLES:
-        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-             (val & ~MSR_MISC_FEATURES_CPUID_FAULTING) ||
-             rdmsr_safe(MSR_INTEL_MISC_FEATURES_ENABLES, temp) )
-            break;
-        if ( (val & MSR_MISC_FEATURES_CPUID_FAULTING) &&
-             !this_cpu(cpuid_faulting_enabled) )
-            break;
-        curr->arch.cpuid_faulting = !!(val & MSR_MISC_FEATURES_CPUID_FAULTING);
-        return X86EMUL_OKAY;
-
-    case MSR_P6_PERFCTR(0)...MSR_P6_PERFCTR(7):
-    case MSR_P6_EVNTSEL(0)...MSR_P6_EVNTSEL(3):
-    case MSR_CORE_PERF_FIXED_CTR0...MSR_CORE_PERF_FIXED_CTR2:
-    case MSR_CORE_PERF_FIXED_CTR_CTRL...MSR_CORE_PERF_GLOBAL_OVF_CTRL:
-        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
-        {
-            vpmu_msr = true;
-    case MSR_AMD_FAM15H_EVNTSEL0...MSR_AMD_FAM15H_PERFCTR5:
-    case MSR_K7_EVNTSEL0...MSR_K7_PERFCTR3:
-            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
-            {
-                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
-                     !is_hardware_domain(currd) )
-                    return X86EMUL_OKAY;
-
-                if ( vpmu_do_wrmsr(reg, val, 0) )
-                    break;
-                return X86EMUL_OKAY;
-            }
-        }
-        /* fall through */
-    default:
-        if ( wrmsr_hypervisor_regs(reg, val) == 1 )
-            return X86EMUL_OKAY;
-
-        rc = vmce_wrmsr(reg, val);
-        if ( rc < 0 )
-            break;
-        if ( rc )
-            return X86EMUL_OKAY;
-
-        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
-    invalid:
-            gdprintk(XENLOG_WARNING,
-                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
-                     reg, temp, val);
-        return X86EMUL_OKAY;
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-static int priv_op_wbinvd(struct x86_emulate_ctxt *ctxt)
-{
-    /* Ignore the instruction if unprivileged. */
-    if ( !cache_flush_permitted(current->domain) )
-        /*
-         * Non-physdev domain attempted WBINVD; ignore for now since
-         * newer linux uses this in some start-of-day timing loops.
-         */
-        ;
-    else
-        wbinvd();
-
-    return X86EMUL_OKAY;
-}
-
-int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
-                  struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
-{
-    guest_cpuid(current, leaf, subleaf, res);
-
-    return X86EMUL_OKAY;
-}
-
-static int priv_op_validate(const struct x86_emulate_state *state,
-                            struct x86_emulate_ctxt *ctxt)
-{
-    switch ( ctxt->opcode )
-    {
-    case 0x6c ... 0x6f: /* ins / outs */
-    case 0xe4 ... 0xe7: /* in / out (immediate port) */
-    case 0xec ... 0xef: /* in / out (port in %dx) */
-    case X86EMUL_OPC(0x0f, 0x06): /* clts */
-    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
-    case X86EMUL_OPC(0x0f, 0x20) ...
-         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
-    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
-    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
-    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
-    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
-        return X86EMUL_OKAY;
-
-    case 0xfa: case 0xfb: /* cli / sti */
-        if ( !iopl_ok(current, ctxt->regs) )
-            break;
-        /*
-         * This is just too dangerous to allow, in my opinion. Consider if the
-         * caller then tries to reenable interrupts using POPF: we can't trap
-         * that and we'll end up with hard-to-debug lockups. Fast & loose will
-         * do for us. :-)
-        vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
-         */
-        return X86EMUL_DONE;
-
-    case X86EMUL_OPC(0x0f, 0x01):
-    {
-        unsigned int modrm_rm, modrm_reg;
-
-        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
-             (modrm_rm & 7) != 1 )
-            break;
-        switch ( modrm_reg & 7 )
-        {
-        case 2: /* xsetbv */
-        case 7: /* rdtscp */
-            return X86EMUL_OKAY;
-        }
-        break;
-    }
-    }
-
-    return X86EMUL_UNHANDLEABLE;
-}
-
-static const struct x86_emulate_ops priv_op_ops = {
-    .insn_fetch          = priv_op_insn_fetch,
-    .read                = x86emul_unhandleable_rw,
-    .validate            = priv_op_validate,
-    .read_io             = priv_op_read_io,
-    .write_io            = priv_op_write_io,
-    .rep_ins             = priv_op_rep_ins,
-    .rep_outs            = priv_op_rep_outs,
-    .read_segment        = priv_op_read_segment,
-    .read_cr             = priv_op_read_cr,
-    .write_cr            = priv_op_write_cr,
-    .read_dr             = priv_op_read_dr,
-    .write_dr            = priv_op_write_dr,
-    .read_msr            = priv_op_read_msr,
-    .write_msr           = priv_op_write_msr,
-    .cpuid               = pv_emul_cpuid,
-    .wbinvd              = priv_op_wbinvd,
-};
-
-static int emulate_privileged_op(struct cpu_user_regs *regs)
-{
-    struct vcpu *curr = current;
-    struct domain *currd = curr->domain;
-    struct priv_op_ctxt ctxt = {
-        .ctxt.regs = regs,
-        .ctxt.vendor = currd->arch.cpuid->x86_vendor,
-        .ctxt.lma = !is_pv_32bit_domain(currd),
-    };
-    int rc;
-    unsigned int eflags, ar;
-
-    if ( !read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit,
-                          &ar, 1) ||
-         !(ar & _SEGMENT_S) ||
-         !(ar & _SEGMENT_P) ||
-         !(ar & _SEGMENT_CODE) )
-        return 0;
-
-    /* Mirror virtualized state into EFLAGS. */
-    ASSERT(regs->eflags & X86_EFLAGS_IF);
-    if ( vcpu_info(curr, evtchn_upcall_mask) )
-        regs->eflags &= ~X86_EFLAGS_IF;
-    else
-        regs->eflags |= X86_EFLAGS_IF;
-    ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
-    regs->eflags |= curr->arch.pv_vcpu.iopl;
-    eflags = regs->eflags;
-
-    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
-    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
-    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
-
-    if ( ctxt.io_emul_stub )
-        unmap_domain_page(ctxt.io_emul_stub);
-
-    /*
-     * Un-mirror virtualized state from EFLAGS.
-     * Nothing we allow to be emulated can change anything other than the
-     * arithmetic bits, and the resume flag.
-     */
-    ASSERT(!((regs->eflags ^ eflags) &
-             ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
-    regs->eflags |= X86_EFLAGS_IF;
-    regs->eflags &= ~X86_EFLAGS_IOPL;
-
-    switch ( rc )
-    {
-    case X86EMUL_OKAY:
-        if ( ctxt.tsc & TSC_BASE )
-        {
-            if ( ctxt.tsc & TSC_AUX )
-                pv_soft_rdtsc(curr, regs, 1);
-            else if ( currd->arch.vtsc )
-                pv_soft_rdtsc(curr, regs, 0);
-            else
-                msr_split(regs, rdtsc());
-        }
-
-        if ( ctxt.ctxt.retire.singlestep )
-            ctxt.bpmatch |= DR_STEP;
-        if ( ctxt.bpmatch )
-        {
-            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
-            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
-                pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
-        }
-        /* fall through */
-    case X86EMUL_RETRY:
-        return EXCRET_fault_fixed;
-
-    case X86EMUL_EXCEPTION:
-        pv_inject_event(&ctxt.ctxt.event);
-        return EXCRET_fault_fixed;
-    }
-
-    return 0;
-}
-
 static inline int check_stack_limit(unsigned int ar, unsigned int limit,
                                     unsigned int esp, unsigned int decr)
 {
diff --git a/xen/arch/x86/x86_64/Makefile b/xen/arch/x86/x86_64/Makefile
index d8815e78b0..f336a6ae65 100644
--- a/xen/arch/x86/x86_64/Makefile
+++ b/xen/arch/x86/x86_64/Makefile
@@ -1,7 +1,6 @@
 subdir-y += compat
 
 obj-bin-y += entry.o
-obj-bin-y += gpr_switch.o
 obj-y += traps.o
 obj-$(CONFIG_KEXEC) += machine_kexec.o
 obj-y += pci.o
diff --git a/xen/include/asm-x86/pv/traps.h b/xen/include/asm-x86/pv/traps.h
new file mode 100644
index 0000000000..32c7bac587
--- /dev/null
+++ b/xen/include/asm-x86/pv/traps.h
@@ -0,0 +1,48 @@
+/*
+ * pv/traps.h
+ *
+ * PV guest traps interface definitions
+ *
+ * Copyright (C) 2017 Wei Liu <wei.liu2@citrix.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms and conditions of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __X86_PV_TRAPS_H__
+#define __X86_PV_TRAPS_H__
+
+#ifdef CONFIG_PV
+
+#include <public/xen.h>
+
+int emulate_privileged_op(struct cpu_user_regs *regs);
+
+#else  /* !CONFIG_PV */
+
+#include <xen/errno.h>
+
+static inline int emulate_privileged_op(struct cpu_user_regs *regs) { return 0; }
+
+#endif	/* CONFIG_PV */
+
+#endif	/* __X86_PV_TRAPS_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
-- 
2.11.0


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

  reply	other threads:[~2017-05-18 17:29 UTC|newest]

Thread overview: 65+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-05-18 17:09 [PATCH for-next v3 00/22] x86: refactor trap handling code Wei Liu
2017-05-18 17:09 ` [PATCH for-next v3 01/22] x86/traps: move privilege instruction emulation code Wei Liu
2017-05-18 17:28   ` Wei Liu [this message]
2017-05-29 15:14     ` Jan Beulich
2017-05-30 17:27       ` Wei Liu
2017-05-30 17:30         ` Andrew Cooper
2017-05-31  5:55           ` Jan Beulich
2017-05-31 11:01             ` Wei Liu
2017-05-31 11:05               ` Andrew Cooper
2017-05-31 11:36                 ` Wei Liu
2017-05-31 11:43                 ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 02/22] x86/traps: move gate op " Wei Liu
2017-05-29 15:15   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 03/22] x86/traps: move emulate_invalid_rdtscp Wei Liu
2017-05-29 15:18   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 04/22] x86/traps: move emulate_forced_invalid_op Wei Liu
2017-05-29 15:19   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 05/22] x86/pv: clean up emulate.c Wei Liu
2017-05-29 15:37   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 06/22] x86/traps: move PV hypercall handlers to pv/traps.c Wei Liu
2017-05-29 15:40   ` Jan Beulich
2017-05-30 17:40     ` Andrew Cooper
2017-05-31  5:59       ` Jan Beulich
2017-05-31 11:14         ` Wei Liu
2017-05-31 11:45           ` Jan Beulich
2017-06-02 11:01             ` Wei Liu
2017-06-06  7:36               ` Jan Beulich
2017-06-08 11:30                 ` Andrew Cooper
2017-06-08 14:28                   ` Wei Liu
2017-05-18 17:09 ` [PATCH for-next v3 07/22] x86/traps: move pv_inject_event " Wei Liu
2017-05-29 15:42   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 08/22] x86/traps: move set_guest_{machinecheck, nmi}_trapbounce Wei Liu
2017-05-29 15:43   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 09/22] x86/traps: move {un, }register_guest_nmi_callback Wei Liu
2017-05-18 17:09 ` [PATCH for-next v3 10/22] x86/traps: delcare percpu softirq_trap Wei Liu
2017-05-29 15:49   ` Jan Beulich
2017-05-31 11:35     ` Wei Liu
2017-05-31 11:46       ` Jan Beulich
2017-05-31 11:54         ` Wei Liu
2017-05-18 17:09 ` [PATCH for-next v3 11/22] x86/traps: move guest_has_trap_callback to pv/traps.c Wei Liu
2017-05-29 15:54   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 12/22] x86/traps: move send_guest_trap " Wei Liu
2017-05-29 15:55   ` Jan Beulich
2017-06-05 17:08     ` Wei Liu
2017-06-06  7:37       ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 13/22] x86/traps: move toggle_guest_mode Wei Liu
2017-05-29 16:05   ` Jan Beulich
2017-05-30 17:47     ` Andrew Cooper
2017-05-31  6:00       ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 14/22] x86/traps: move do_iret to pv/traps.c Wei Liu
2017-05-29 16:07   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 15/22] x86/traps: move init_int80_direct_trap Wei Liu
2017-05-29 16:07   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 16/22] x86/traps: move callback_op code Wei Liu
2017-05-29 16:09   ` Jan Beulich
2017-05-18 17:09 ` [PATCH for-next v3 17/22] x86/traps: move hypercall_page_initialise_ring3_kernel Wei Liu
2017-05-29 16:10   ` Jan Beulich
2017-05-18 17:10 ` [PATCH for-next v3 18/22] x86/traps: merge x86_64/compat/traps.c into pv/traps.c Wei Liu
2017-05-29 16:12   ` Jan Beulich
2017-05-18 17:10 ` [PATCH for-next v3 19/22] x86: clean up pv/traps.c Wei Liu
2017-05-29 16:18   ` Jan Beulich
2017-05-18 17:10 ` [PATCH for-next v3 20/22] x86: guest_has_trap_callback should return bool Wei Liu
2017-05-18 17:10 ` [PATCH for-next v3 21/22] x86: fix coding style issues in asm-x86/traps.h Wei Liu
2017-05-18 17:10 ` [PATCH for-next v3 22/22] x86: clean up traps.c Wei Liu
2017-05-29 16:21   ` Jan Beulich

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170518172832.afqcp65eztdaurdb@citrix.com \
    --to=wei.liu2@citrix.com \
    --cc=JBeulich@suse.com \
    --cc=andrew.cooper3@citrix.com \
    --cc=xen-devel@lists.xenproject.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.