All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christoph Egger <Christoph.Egger@amd.com>
To: "xen-devel@lists.xensource.com" <xen-devel@lists.xensource.com>
Cc: "Dong, Eddie" <eddie.dong@intel.com>, Tim Deegan <Tim.Deegan@citrix.com>
Subject: [PATCH 09/13] Nested Virtualization: svm specific implementation
Date: Wed, 1 Sep 2010 17:14:20 +0200	[thread overview]
Message-ID: <201009011714.20987.Christoph.Egger@amd.com> (raw)

[-- Attachment #1: Type: text/plain, Size: 322 bytes --]


Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>

-- 
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

[-- Attachment #2: xen_nh09_svm.diff --]
[-- Type: text/x-diff, Size: 65068 bytes --]

# HG changeset patch
# User cegger
# Date 1283345885 -7200
Implement SVM specific part for Nested Virtualization

diff -r 0fcb32521d57 -r fa992936dba4 xen/arch/x86/hvm/svm/Makefile
--- a/xen/arch/x86/hvm/svm/Makefile
+++ b/xen/arch/x86/hvm/svm/Makefile
@@ -3,5 +3,6 @@ obj-y += emulate.o
 obj-y += entry.o
 obj-y += intr.o
 obj-y += svm.o
+obj-y += svmdebug.o
 obj-y += vmcb.o
 obj-y += vpmu.o
diff -r 0fcb32521d57 -r fa992936dba4 xen/arch/x86/hvm/svm/emulate.c
--- a/xen/arch/x86/hvm/svm/emulate.c
+++ b/xen/arch/x86/hvm/svm/emulate.c
@@ -101,6 +101,11 @@ MAKE_INSTR(HLT,    1, 0xf4);
 MAKE_INSTR(INT3,   1, 0xcc);
 MAKE_INSTR(RDTSC,  2, 0x0f, 0x31);
 MAKE_INSTR(PAUSE,  1, 0x90);
+MAKE_INSTR(VMRUN,  3, 0x0f, 0x01, 0xd8);
+MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
+MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
+MAKE_INSTR(STGI,   3, 0x0f, 0x01, 0xdc);
+MAKE_INSTR(CLGI,   3, 0x0f, 0x01, 0xdd);
 
 static const u8 *opc_bytes[INSTR_MAX_COUNT] = 
 {
@@ -114,6 +119,11 @@ static const u8 *opc_bytes[INSTR_MAX_COU
     [INSTR_INT3]   = OPCODE_INT3,
     [INSTR_RDTSC]  = OPCODE_RDTSC,
     [INSTR_PAUSE]  = OPCODE_PAUSE,
+    [INSTR_VMRUN]  = OPCODE_VMRUN,
+    [INSTR_VMLOAD] = OPCODE_VMLOAD,
+    [INSTR_VMSAVE] = OPCODE_VMSAVE,
+    [INSTR_STGI]   = OPCODE_STGI,
+    [INSTR_CLGI]   = OPCODE_CLGI,
 };
 
 static int fetch(struct vcpu *v, u8 *buf, unsigned long addr, int len)
diff -r 0fcb32521d57 -r fa992936dba4 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -49,6 +49,8 @@
 #include <asm/hvm/svm/vmcb.h>
 #include <asm/hvm/svm/emulate.h>
 #include <asm/hvm/svm/intr.h>
+#include <asm/hvm/svm/svmdebug.h>
+#include <asm/hvm/nestedhvm.h>
 #include <asm/x86_emulate.h>
 #include <public/sched.h>
 #include <asm/hvm/vpt.h>
@@ -108,6 +110,44 @@ static void svm_cpu_down(void)
     write_efer(read_efer() & ~EFER_SVME);
 }
 
+static unsigned long *
+svm_msrbit(unsigned long *msr_bitmap, uint32_t msr)
+{
+    /*
+     * Pick the part of the MSR bitmap that covers this MSR number.
+     * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
+     */
+    if ( msr <= 0x1fff )
+        return msr_bitmap + 0x0000 / BYTES_PER_LONG;
+    if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
+        return msr_bitmap + 0x0800 / BYTES_PER_LONG;
+    if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
+        return msr_bitmap + 0x1000 / BYTES_PER_LONG;
+
+    /* The MSR lies outside all three ranges handled above. */
+    return NULL;
+}
+
+void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable)
+{
+    unsigned long *msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr);
+    uint32_t bit;
+
+    BUG_ON(msr_bit == NULL); /* svm_msrbit() found no range for this MSR */
+    /* Each MSR owns two adjacent bits in its range, indexed by the low
+     * 13 bits of the MSR number; both are set or cleared together. */
+    bit = (msr & 0x1fff) * 2;
+    if ( !enable )
+    {
+        __clear_bit(bit, msr_bit);
+        __clear_bit(bit + 1, msr_bit);
+        return;
+    }
+
+    __set_bit(bit, msr_bit);
+    __set_bit(bit + 1, msr_bit);
+}
+
 static enum handler_return
 long_mode_do_msr_write(unsigned int msr, uint64_t msr_content)
 {
@@ -325,7 +365,7 @@ static int svm_load_vmcb_ctxt(struct vcp
 {
     svm_load_cpu_state(v, ctxt);
     if (svm_vmcb_restore(v, ctxt)) {
-        printk("svm_vmcb restore failed!\n");
+        gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n");
         domain_crash(v->domain);
         return -EINVAL;
     }
@@ -692,8 +732,10 @@ static void svm_ctxt_switch_to(struct vc
 static void svm_do_resume(struct vcpu *v) 
 {
     bool_t debug_state = v->domain->debugger_attached;
-
-    if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
+    bool_t guestmode = nestedhvm_vcpu_in_guestmode(v);
+
+    if ( !guestmode &&
+        unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
     {
         uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
         v->arch.hvm_vcpu.debug_state_latch = debug_state;
@@ -712,11 +754,14 @@ static void svm_do_resume(struct vcpu *v
         hvm_asid_flush_vcpu(v);
     }
 
-    /* Reflect the vlapic's TPR in the hardware vtpr */
-    v->arch.hvm_svm.vmcb->vintr.fields.tpr = 
-        (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
-
-    hvm_do_resume(v);
+    if ( !guestmode )
+    {
+        /* Reflect the vlapic's TPR in the hardware vtpr */
+        v->arch.hvm_svm.vmcb->vintr.fields.tpr = 
+            (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
+
+        hvm_do_resume(v);
+    }
     reset_stack_and_jump(svm_asm_do_resume);
 }
 
@@ -861,6 +906,998 @@ static void svm_init_erratum_383(struct 
     }
 }
 
+/*
+ * Nested SVM
+ */
+static int nsvm_vcpu_destroy(struct vcpu *v);
+
+static int nsvm_vcpu_initialise(struct vcpu *v)
+{
+    void *msrpm;
+    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm;
+    /* Allocate all per-vcpu nested SVM state; returns 0 or -ENOMEM. */
+    ASSERT(hvm->nh_hostsave == NULL);
+    hvm->nh_hostsave = alloc_vmcb();
+    if (hvm->nh_hostsave == NULL)
+        goto err;
+    /* Zeroed buffer that later receives a copy of the guest's MSRPM. */
+    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
+    hvm->nh_cached_msrpm = msrpm;
+    if (msrpm == NULL)
+        goto err;
+    memset(msrpm, 0x0, MSRPM_SIZE);
+    hvm->nh_cached_msrpm_size = MSRPM_SIZE;
+    /* Zeroed buffer for the merged MSRPM (Xen's map OR the cached one). */
+    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
+    hvm->nh_merged_msrpm = msrpm;
+    if (msrpm == NULL)
+        goto err;
+    memset(msrpm, 0x0, MSRPM_SIZE);
+    hvm->nh_merged_msrpm_size = MSRPM_SIZE;
+    /* VMCB-sized buffer for nh_vm; the size is recorded before the check. */
+    hvm->nh_vm = alloc_vmcb();
+    hvm->nh_vmsize = sizeof(struct vmcb_struct);
+    if (hvm->nh_vm == NULL)
+        goto err;
+    /* SVM-specific nested state, zero-filled once allocated. */
+    svm = hvm->nh_arch = xmalloc_bytes(sizeof(struct nestedsvm));
+    if (hvm->nh_arch == NULL)
+        goto err;
+    hvm->nh_arch_size = sizeof(struct nestedsvm);
+    memset(hvm->nh_arch, 0x0, hvm->nh_arch_size);
+
+    svm->ns_tmpvmcb = alloc_vmcb();
+    if (svm->ns_tmpvmcb == NULL)
+        goto err;
+
+    return 0;
+    /* Any failure: nsvm_vcpu_destroy() frees what was allocated so far. */
+err:
+    nsvm_vcpu_destroy(v);
+    return -ENOMEM;
+}
+
+static int nsvm_vcpu_destroy(struct vcpu *v)
+{
+    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
+    /* Release every piece of nested state still allocated; returns 0. */
+    if (hvm->nh_vm != NULL) {
+        free_vmcb(hvm->nh_vm);
+        hvm->nh_vm = NULL;
+    }
+    if (hvm->nh_hostsave != NULL) {
+        free_vmcb(hvm->nh_hostsave);
+        hvm->nh_hostsave = NULL;
+    }
+    if (hvm->nh_cached_msrpm != NULL) {
+        ASSERT(hvm->nh_cached_msrpm_size > 0);
+        free_xenheap_pages(hvm->nh_cached_msrpm,
+                           get_order_from_bytes(hvm->nh_cached_msrpm_size));
+        hvm->nh_cached_msrpm = NULL;
+        hvm->nh_cached_msrpm_size = 0;
+    }
+    if (hvm->nh_merged_msrpm != NULL) {
+        ASSERT(hvm->nh_merged_msrpm_size > 0);
+        free_xenheap_pages(hvm->nh_merged_msrpm,
+                           get_order_from_bytes(hvm->nh_merged_msrpm_size));
+        hvm->nh_merged_msrpm = NULL;
+        hvm->nh_merged_msrpm_size = 0;
+    }
+    if (hvm->nh_arch != NULL) {
+        struct nestedsvm *arch = hvm->nh_arch;
+        /* The temporary VMCB hangs off the arch state: free it first. */
+        if (arch->ns_tmpvmcb != NULL)
+            free_vmcb(arch->ns_tmpvmcb);
+        xfree(arch);
+        hvm->nh_arch = NULL;
+        hvm->nh_arch_size = 0;
+    }
+    return 0;
+}
+
+static int nsvm_vcpu_reset(struct vcpu *v)
+{
+    struct nestedhvm *nh = &vcpu_nestedhvm(v);
+    struct vmcb_struct *nested = nh->nh_vm;
+    /* Reset nh_vmmaxaddr and nh_vm's np_enable and g_pat; returns 0. */
+    nh->nh_vmmaxaddr = 0xfd00000000ULL;
+    nested->np_enable = 0;
+    nested->g_pat = MSR_IA32_CR_PAT_RESET;
+    return 0;
+}
+
+static int nsvm_vcpu_features(struct vcpu *v, uint32_t *eax, uint32_t *ebx,
+    uint32_t *ecx, uint32_t *edx)
+{
+    /* Nothing to do here: SVM features are handled via cpuid by
+     * tools/libxc, so none of the register outputs is written. */
+
+    return 0;
+}
+
+static void nsvm_vmcb_loadsave(struct vmcb_struct *from,
+                               struct vmcb_struct *to)
+{   /* Copy the VMSAVE/VMLOAD-handled fields from one VMCB to another. */
+    to->sysenter_cs = from->sysenter_cs;     /* sysenter_* fields */
+    to->sysenter_esp = from->sysenter_esp;
+    to->sysenter_eip = from->sysenter_eip;
+    to->star = from->star;                   /* star family and sfmask */
+    to->lstar = from->lstar;
+    to->cstar = from->cstar;
+    to->sfmask = from->sfmask;
+    to->kerngsbase = from->kerngsbase;
+    to->fs = from->fs;                       /* fs, gs, tr, ldtr */
+    to->gs = from->gs;
+    to->tr = from->tr;
+    to->ldtr = from->ldtr;
+}
+
+static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len)
+{
+    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
+    struct vmcb_struct *hsave = hvm->nh_hostsave;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+
+    /* Save the running VMCB into nh_hostsave; rip skips inst_len bytes. */
+    memcpy(hsave, vmcb, sizeof(struct vmcb_struct));
+    hsave->rip += inst_len;
+
+    /* Remember the host interrupt flag */
+    hvm->nh_hostflags.fields.rflagsif = (hsave->rflags & X86_EFLAGS_IF) ? 1 : 0;
+
+    /* Paging mode: exactly one arm picks the cr3 to save (returns 0). */
+    if (nestedhvm_paging_mode_hap(v)) {
+        hsave->cr3 = vmcb->cr3;
+        hsave->h_cr3 = vmcb->h_cr3; /* same value the memcpy() stored */
+    } else if (paging_mode_hap(v->domain)) {
+        hsave->cr3 = vmcb->cr3;
+    } else {
+        hsave->cr3 = v->arch.hvm_vcpu.guest_cr[3];
+    }
+
+    /* efer, cr0, cr2 and cr4 come from the per-vcpu guest_* copies. */
+    hsave->efer = v->arch.hvm_vcpu.guest_efer;
+    hsave->cr0 = v->arch.hvm_vcpu.guest_cr[0];
+    hsave->cr2 = v->arch.hvm_vcpu.guest_cr[2];
+    hsave->cr4 = v->arch.hvm_vcpu.guest_cr[4];
+    return 0;
+}
+
+static int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
+    struct vmcb_struct *hsave, *vmcb;
+    int rc;
+    /* Copy nh_hostsave back into the vcpu's VMCB and regs; returns 0. */
+    hsave = hvm->nh_hostsave;
+    vmcb = v->arch.hvm_svm.vmcb;
+
+    /* Must keep register values handled by VMSAVE/VMLOAD */
+    nsvm_vmcb_loadsave(vmcb, hsave);
+    memcpy(vmcb, hsave, sizeof(struct vmcb_struct));
+
+    /* EFER */
+    v->arch.hvm_vcpu.guest_efer = vmcb->efer;
+    rc = hvm_set_efer(vmcb->efer);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);
+
+    /* CR4 */
+    v->arch.hvm_vcpu.guest_cr[4] = vmcb->cr4;
+    rc = hvm_set_cr4(vmcb->cr4);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
+
+    /* CR0 */
+    v->arch.hvm_vcpu.guest_cr[0] = vmcb->cr0 | X86_CR0_PE;
+    vmcb->rflags &= ~X86_EFLAGS_VM;
+    rc = hvm_set_cr0(vmcb->cr0 | X86_CR0_PE);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
+
+    /* CR2 */
+    v->arch.hvm_vcpu.guest_cr[2] = vmcb->cr2;
+    hvm_update_guest_cr(v, 2);
+
+    /* CR3 */
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    } else {
+        /* host shadow paging + guest shadow paging. */
+
+        /* Reset MMU context -- XXX (hostrestore) not yet working */
+        if (!pagetable_is_null(v->arch.guest_table))
+            put_page(pagetable_get_page(v->arch.guest_table));
+        v->arch.guest_table = pagetable_null();
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    }
+    rc = hvm_set_cr3(vmcb->cr3);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    /* Hand the restored rax/rsp/rip/rflags to the caller's regs. */
+    regs->eax = vmcb->rax;
+    regs->esp = vmcb->rsp;
+    regs->eip = vmcb->rip;
+    regs->eflags = vmcb->rflags;
+    vmcb->dr7 = 0; /* disable all breakpoints */
+    vmcb->cpl = 0;
+
+    /* Clear exitintinfo to prevent a fault loop of re-injecting
+     * exceptions forever.
+     */
+    vmcb->exitintinfo.bytes = 0;
+
+    hvm_asid_flush_vcpu(v);
+
+    return 0;
+}
+
+static int
+nsvm_vcpu_vmrun(struct vcpu *v)
+{
+    struct nestedhvm *nh = &vcpu_nestedhvm(v);
+    struct vmcb_struct *nested = nh->nh_vm;
+
+    /* Cache per-VMRUN settings from the nested VMCB (nh_vm); returns 0. */
+
+    /* Both CR3 values are saved for later use. Needed for
+     * Nested-on-Nested and Shadow-on-Shadow paging.
+     */
+    nh->nh_vm_guestcr3 = nested->cr3;
+    nh->nh_vm_hostcr3 = nested->h_cr3;
+
+    /* Set nh_flushp2m on a TLB control request or a changed guest ASID. */
+    nh->nh_flushp2m = (nested->tlb_control != 0) ||
+        (nested->guest_asid != nh->nh_guest_asid);
+    nh->nh_guest_asid = nested->guest_asid;
+
+    /* nested paging for the guest */
+    nh->nh_hap_enabled = !!nested->np_enable;
+
+    /* Remember the V_INTR_MASK in hostflags */
+    nh->nh_hostflags.fields.vintrmask = !!nested->vintr.fields.intr_masking;
+
+    return 0;
+}
+
+static uint64_t
+nsvm_vmcb_exitcode_generic2native(enum nestedhvm_intercepts exitcode)
+{
+    switch (exitcode) { /* generic intercept -> native VMEXIT_* code */
+    case NESTEDHVM_INTERCEPT_INVALID:
+        return VMEXIT_INVALID;
+    case NESTEDHVM_INTERCEPT_SHUTDOWN:
+        return VMEXIT_SHUTDOWN;
+    case NESTEDHVM_INTERCEPT_VMMCALL:
+        return VMEXIT_VMMCALL;
+    case NESTEDHVM_INTERCEPT_INTR:
+        return VMEXIT_INTR;
+    case NESTEDHVM_INTERCEPT_NMI:
+        return VMEXIT_NMI;
+    case NESTEDHVM_INTERCEPT_MCE:
+        return VMEXIT_EXCEPTION_MC;
+    case NESTEDHVM_INTERCEPT_MSR_READ:
+    case NESTEDHVM_INTERCEPT_MSR_WRITE:
+        return VMEXIT_MSR;
+    case NESTEDHVM_INTERCEPT_IOIO:
+        return VMEXIT_IOIO;
+    case NESTEDHVM_INTERCEPT_NPF:
+        return VMEXIT_NPF;
+    case NESTEDHVM_INTERCEPT_PF:
+        return VMEXIT_EXCEPTION_PF;
+    case NESTEDHVM_INTERCEPT_DE:
+        return VMEXIT_EXCEPTION_DE;
+    case NESTEDHVM_INTERCEPT_OF:
+        return VMEXIT_EXCEPTION_OF;
+    case NESTEDHVM_INTERCEPT_BR:
+        return VMEXIT_EXCEPTION_BR;
+    case NESTEDHVM_INTERCEPT_UD:
+        return VMEXIT_EXCEPTION_UD;
+    case NESTEDHVM_INTERCEPT_NM:
+        return VMEXIT_EXCEPTION_NM;
+    case NESTEDHVM_INTERCEPT_DF:
+        return VMEXIT_EXCEPTION_DF;
+    case NESTEDHVM_INTERCEPT_09:
+        return VMEXIT_EXCEPTION_09;
+    case NESTEDHVM_INTERCEPT_XF:
+        return VMEXIT_EXCEPTION_XF;
+    case NESTEDHVM_INTERCEPT_DB:
+        return VMEXIT_EXCEPTION_DB;
+    case NESTEDHVM_INTERCEPT_BP:
+        return VMEXIT_EXCEPTION_BP;
+    case NESTEDHVM_INTERCEPT_TS:
+        return VMEXIT_EXCEPTION_TS;
+    case NESTEDHVM_INTERCEPT_NP:
+        return VMEXIT_EXCEPTION_NP;
+    case NESTEDHVM_INTERCEPT_SS:
+        return VMEXIT_EXCEPTION_SS;
+    case NESTEDHVM_INTERCEPT_GP:
+        return VMEXIT_EXCEPTION_GP;
+    case NESTEDHVM_INTERCEPT_15:
+        return VMEXIT_EXCEPTION_15;
+    case NESTEDHVM_INTERCEPT_MF:
+        return VMEXIT_EXCEPTION_MF;
+    case NESTEDHVM_INTERCEPT_AC:
+        return VMEXIT_EXCEPTION_AC;
+    /* NESTEDHVM_INTERCEPT_LAST has no native code: log it and BUG(). */
+    case NESTEDHVM_INTERCEPT_LAST:
+        gdprintk(XENLOG_ERR, "generic to native exitcode mapping failed %u\n",
+            exitcode);
+        BUG();
+        return NESTEDHVM_INTERCEPT_LAST;
+    }
+    /* Only reached for a value that matched no case label above. */
+    return NESTEDHVM_INTERCEPT_LAST;
+}
+
+static int
+nsvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs, uint64_t exitcode)
+{
+    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
+    struct vmcb_struct *ns_vmcb;
+    /* Store the exit code in nh_vm; forced exits override it. Returns 0. */
+    ns_vmcb = hvm->nh_vm;
+
+    ns_vmcb->exitcode = exitcode;
+    ns_vmcb->eventinj.bytes = 0;
+    /* A forced #VMEXIT takes its exit code and info from nh_forcevmexit. */
+    if (hvm->nh_hostflags.fields.forcevmexit) {
+        enum nestedhvm_intercepts nh_exitcode;
+
+        nh_exitcode = hvm->nh_forcevmexit.exitcode;
+
+        switch (nh_exitcode) {
+        case NESTEDHVM_INTERCEPT_INVALID:
+        case NESTEDHVM_INTERCEPT_SHUTDOWN:
+        case NESTEDHVM_INTERCEPT_VMMCALL:
+            break;
+        case NESTEDHVM_INTERCEPT_INTR:
+        case NESTEDHVM_INTERCEPT_NMI:
+            break;
+        case NESTEDHVM_INTERCEPT_PF:
+            ns_vmcb->cr2 = ns_vmcb->exitinfo2;
+            /* fall through; NOTE(review): cr2 uses the pre-update exitinfo2 */
+        case NESTEDHVM_INTERCEPT_NPF:
+            ns_vmcb->exitinfo1 = hvm->nh_forcevmexit.exitinfo1; /* error code */
+            ns_vmcb->exitinfo2 = hvm->nh_forcevmexit.exitinfo2; /* fault address */
+            break;
+        case NESTEDHVM_INTERCEPT_MCE:
+            break;
+        case NESTEDHVM_INTERCEPT_MSR_READ:
+            ns_vmcb->exitinfo1 = 0;
+            regs->ecx = hvm->nh_forcevmexit.exitinfo1;
+            break;
+        case NESTEDHVM_INTERCEPT_MSR_WRITE:
+            ns_vmcb->exitinfo1 = 1;
+            regs->ecx = hvm->nh_forcevmexit.exitinfo1;
+            regs->eax = (uint32_t)hvm->nh_forcevmexit.exitinfo2;
+            regs->edx = (uint32_t)(hvm->nh_forcevmexit.exitinfo2 >> 32);
+            break;
+        case NESTEDHVM_INTERCEPT_IOIO:
+            /* Not implemented, the path shouldn't run anyway. */
+            BUG();
+            break;
+
+        case NESTEDHVM_INTERCEPT_DE:
+        case NESTEDHVM_INTERCEPT_OF:
+        case NESTEDHVM_INTERCEPT_BR:
+        case NESTEDHVM_INTERCEPT_UD:
+        case NESTEDHVM_INTERCEPT_NM:
+        case NESTEDHVM_INTERCEPT_DF:
+        case NESTEDHVM_INTERCEPT_09:
+        case NESTEDHVM_INTERCEPT_XF:
+            break;
+        case NESTEDHVM_INTERCEPT_DB:
+        case NESTEDHVM_INTERCEPT_BP:
+        case NESTEDHVM_INTERCEPT_TS:
+            break;
+        case NESTEDHVM_INTERCEPT_NP:
+        case NESTEDHVM_INTERCEPT_SS:
+        case NESTEDHVM_INTERCEPT_GP:
+        case NESTEDHVM_INTERCEPT_15:
+        case NESTEDHVM_INTERCEPT_MF:
+        case NESTEDHVM_INTERCEPT_AC:
+            ns_vmcb->exitinfo1 = hvm->nh_forcevmexit.exitinfo1;
+            break;
+
+        case NESTEDHVM_INTERCEPT_LAST:
+            BUG();
+            break;
+        }
+        exitcode = nsvm_vmcb_exitcode_generic2native(nh_exitcode);
+        ns_vmcb->exitcode = exitcode;
+    }
+
+    return 0;
+}
+
+static uint64_t
+nsvm_vmcb_exitcode_native2generic(struct vcpu *v, struct cpu_user_regs *regs,
+    uint64_t exitcode, uint64_t *info1, uint64_t *info2)
+{
+    struct vmcb_struct *ns_vmcb = vcpu_nestedhvm(v).nh_vm;
+    /* Map a native VMEXIT_* code to a generic intercept; fill info1/info2. */
+    *info1 = *info2 = 0;
+    switch (exitcode) {
+    case VMEXIT_INVALID:
+        return NESTEDHVM_INTERCEPT_INVALID;
+    case VMEXIT_SHUTDOWN:
+        return NESTEDHVM_INTERCEPT_SHUTDOWN;
+    case VMEXIT_VMMCALL:
+        return NESTEDHVM_INTERCEPT_VMMCALL;
+    case VMEXIT_INTR:
+        return NESTEDHVM_INTERCEPT_INTR;
+    case VMEXIT_NMI:
+        return NESTEDHVM_INTERCEPT_NMI;
+    case VMEXIT_NPF:
+        *info1 = ns_vmcb->exitinfo1; /* #PF error code */
+        *info2 = ns_vmcb->exitinfo2; /* #PF guest physical address */
+        return NESTEDHVM_INTERCEPT_NPF;
+    case VMEXIT_EXCEPTION_PF:
+        *info1 = ns_vmcb->exitinfo1; /* #PF error code */
+        *info2 = ns_vmcb->exitinfo2; /* #PF virtual address */
+        return NESTEDHVM_INTERCEPT_PF;
+    case VMEXIT_EXCEPTION_MC:
+        return NESTEDHVM_INTERCEPT_MCE;
+    case VMEXIT_MSR:
+        *info1 = regs->ecx;
+        *info2 = ((uint64_t)regs->edx << 32) | regs->eax;
+        return (ns_vmcb->exitinfo1 == 0) ?
+            NESTEDHVM_INTERCEPT_MSR_READ : NESTEDHVM_INTERCEPT_MSR_WRITE;
+    case VMEXIT_IOIO:
+        return NESTEDHVM_INTERCEPT_IOIO;
+
+    case VMEXIT_EXCEPTION_DE:
+        return NESTEDHVM_INTERCEPT_DE;
+    case VMEXIT_EXCEPTION_OF:
+        return NESTEDHVM_INTERCEPT_OF;
+    case VMEXIT_EXCEPTION_BR:
+        return NESTEDHVM_INTERCEPT_BR;
+    case VMEXIT_EXCEPTION_UD:
+        return NESTEDHVM_INTERCEPT_UD;
+    case VMEXIT_EXCEPTION_NM:
+        return NESTEDHVM_INTERCEPT_NM;
+    case VMEXIT_EXCEPTION_DF:
+        return NESTEDHVM_INTERCEPT_DF;
+    case VMEXIT_EXCEPTION_09:
+        return NESTEDHVM_INTERCEPT_09;
+    case VMEXIT_EXCEPTION_XF:
+        return NESTEDHVM_INTERCEPT_XF;
+
+    case VMEXIT_EXCEPTION_DB:
+        *info1 = ns_vmcb->cs.attr.bytes;
+        *info2 = ns_vmcb->rip;
+        return NESTEDHVM_INTERCEPT_DB;
+    case VMEXIT_EXCEPTION_BP:
+        *info1 = ns_vmcb->cs.attr.bytes;
+        *info2 = ns_vmcb->rip;
+        return NESTEDHVM_INTERCEPT_BP;
+
+    case VMEXIT_EXCEPTION_TS:
+        return NESTEDHVM_INTERCEPT_TS;
+
+    case VMEXIT_EXCEPTION_NP:
+        *info1 = ns_vmcb->exitinfo1;
+        return NESTEDHVM_INTERCEPT_NP;
+    case VMEXIT_EXCEPTION_SS:
+        *info1 = ns_vmcb->exitinfo1;
+        return NESTEDHVM_INTERCEPT_SS;
+    case VMEXIT_EXCEPTION_GP:
+        *info1 = ns_vmcb->exitinfo1;
+        return NESTEDHVM_INTERCEPT_GP;
+    case VMEXIT_EXCEPTION_15:
+        *info1 = ns_vmcb->exitinfo1;
+        return NESTEDHVM_INTERCEPT_15;
+    case VMEXIT_EXCEPTION_MF:
+        *info1 = ns_vmcb->exitinfo1;
+        return NESTEDHVM_INTERCEPT_MF;
+    case VMEXIT_EXCEPTION_AC:
+        *info1 = ns_vmcb->exitinfo1;
+        return NESTEDHVM_INTERCEPT_AC;
+    }
+    /* Exit codes with no case above map to NESTEDHVM_INTERCEPT_LAST. */
+    return NESTEDHVM_INTERCEPT_LAST;
+}
+
+static int
+nsvm_vmcb_intercepted_by_guest(struct vcpu *v, uint64_t exitcode)
+{
+    uint64_t exit_bits;
+    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = hvm->nh_arch;
+    /* Return 1 if the cached ns_*_intercepts cover exitcode, else 0. */
+    if (hvm->nh_hostflags.fields.forcevmexit)
+        exitcode = nsvm_vmcb_exitcode_generic2native(hvm->nh_forcevmexit.exitcode);
+
+    switch (exitcode) {
+    case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
+    case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
+        exit_bits = 1ULL << (exitcode - VMEXIT_CR0_READ);
+        if (svm->ns_cr_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
+    case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
+        exit_bits = 1ULL << (exitcode - VMEXIT_DR0_READ);
+        if (svm->ns_dr_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_EXCEPTION_DE ... VMEXIT_EXCEPTION_XF:
+        exit_bits = 1ULL << (exitcode - VMEXIT_EXCEPTION_DE);
+        if (svm->ns_exception_intercepts & exit_bits)
+            break;
+        return 0;
+
+    case VMEXIT_INTR ... VMEXIT_SHUTDOWN:
+        exit_bits = 1ULL << (exitcode - VMEXIT_INTR);
+        if (svm->ns_general1_intercepts & exit_bits)
+            break;
+        return 0;
+    /* NOTE(review): confirm exitcode - VMEXIT_VMRUN is always < 64 below. */
+    /* case VMEXIT_VMRUN ... VMEXIT_MWAIT_CONDITIONAL: */
+    default:
+        exit_bits = 1ULL << (exitcode - VMEXIT_VMRUN);
+        if (svm->ns_general2_intercepts & exit_bits)
+            break;
+        return 0;
+    }
+
+    return 1;
+}
+
+static int nsvm_vmrun_permissionmap(struct vcpu *v)
+{
+    struct nestedhvm *nh = &vcpu_nestedhvm(v);
+    struct arch_svm_struct *arch = &v->arch.hvm_svm;
+    struct vmcb_struct *host_vmcb = arch->vmcb;
+    const unsigned long *guest_map;
+    enum hvm_copy_result rc;
+    unsigned int idx;
+
+    /* Fetch the MSR permission map the nested VMCB points to
+     * (msrpm_base_pa) into the cache buffer. Returns 1 if that copy
+     * fails, 0 once the merged map is installed. */
+    rc = hvm_copy_from_guest_phys(nh->nh_cached_msrpm,
+                                  nh->nh_vm->msrpm_base_pa,
+                                  nh->nh_cached_msrpm_size);
+    if (rc != HVMCOPY_okay) {
+        gdprintk(XENLOG_ERR, "hvm_copy_from_guest_phys msrpm %u\n", rc);
+        return 1;
+    }
+    guest_map = (const unsigned long *)nh->nh_cached_msrpm;
+
+    /* Skip io bitmap merge since hvm_io_bitmap has all bits set but
+     * 0x80 and 0xed.
+     */
+
+    /* Merge word by word: v->arch.hvm_svm.msrpm has type unsigned long,
+     * thus BYTES_PER_LONG. A bit set in either map stays set. */
+    for (idx = 0; idx < MSRPM_SIZE / BYTES_PER_LONG; idx++)
+        nh->nh_merged_msrpm[idx] = arch->msrpm[idx] | guest_map[idx];
+
+    /* Point the vcpu's VMCB at hvm_io_bitmap and at the merged MSR
+     * permission map. */
+    host_vmcb->iopm_base_pa = (uint64_t)virt_to_maddr(hvm_io_bitmap);
+    host_vmcb->msrpm_base_pa = (uint64_t)virt_to_maddr(nh->nh_merged_msrpm);
+
+    return 0;
+}
+
+static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = hvm->nh_arch;
+    struct vmcb_struct *ns_vmcb = hvm->nh_vm;
+    struct vmcb_struct *host_vmcb = v->arch.hvm_svm.vmcb;
+    int rc;
+    /* Load nh_vm's settings into the vcpu's VMCB and regs; 0 on success. */
+    /* Enable nested guest intercepts */
+    svm->ns_cr_intercepts = ns_vmcb->cr_intercepts;
+    svm->ns_dr_intercepts = ns_vmcb->dr_intercepts;
+    svm->ns_exception_intercepts = ns_vmcb->exception_intercepts;
+    svm->ns_general1_intercepts = ns_vmcb->general1_intercepts;
+    svm->ns_general2_intercepts = ns_vmcb->general2_intercepts;
+    /* The cached copies above stay unmerged; the VMCB gets the union. */
+    host_vmcb->cr_intercepts |= ns_vmcb->cr_intercepts;
+    host_vmcb->dr_intercepts |= ns_vmcb->dr_intercepts;
+    host_vmcb->exception_intercepts |= ns_vmcb->exception_intercepts;
+    host_vmcb->general1_intercepts |= ns_vmcb->general1_intercepts;
+    host_vmcb->general2_intercepts |= ns_vmcb->general2_intercepts;
+
+    /* Nested Pause Filter */
+    if (ns_vmcb->general1_intercepts & GENERAL1_INTERCEPT_PAUSE)
+        host_vmcb->pause_filter_count =
+            min(ns_vmcb->pause_filter_count, host_vmcb->pause_filter_count);
+    else
+        host_vmcb->pause_filter_count = SVM_PAUSEFILTER_INIT;
+
+    /* Nested IO permission bitmaps */
+    rc = nsvm_vmrun_permissionmap(v);
+    if (rc)
+        return rc;
+
+    /* TSC offset */
+    hvm_set_guest_tsc(v, host_vmcb->tsc_offset + ns_vmcb->tsc_offset);
+
+    /* ASID */
+    hvm_asid_flush_vcpu(v);
+    /* host_vmcb->guest_asid = ns_vmcb->guest_asid; */
+
+    /* TLB control */
+    host_vmcb->tlb_control |= ns_vmcb->tlb_control;
+
+    /* Virtual Interrupts */
+    host_vmcb->vintr = ns_vmcb->vintr;
+    host_vmcb->vintr.fields.intr_masking = 1;
+
+    /* Shadow Mode */
+    host_vmcb->interrupt_shadow = ns_vmcb->interrupt_shadow;
+
+    /* Exit codes */
+    host_vmcb->exitcode = ns_vmcb->exitcode;
+    host_vmcb->exitinfo1 = ns_vmcb->exitinfo1;
+    host_vmcb->exitinfo2 = ns_vmcb->exitinfo2;
+    host_vmcb->exitintinfo = ns_vmcb->exitintinfo;
+
+    /* Pending Interrupts */
+    host_vmcb->eventinj = ns_vmcb->eventinj;
+
+    /* LBR virtualization */
+    svm->ns_lbr_control = ns_vmcb->lbr_control;
+    host_vmcb->lbr_control.bytes |= ns_vmcb->lbr_control.bytes;
+
+    /* NextRIP */
+    host_vmcb->nextrip = ns_vmcb->nextrip;
+
+    /*
+     * VMCB Save State Area
+     */
+
+    /* Segments */
+    host_vmcb->es = ns_vmcb->es;
+    host_vmcb->cs = ns_vmcb->cs;
+    host_vmcb->ss = ns_vmcb->ss;
+    host_vmcb->ds = ns_vmcb->ds;
+    host_vmcb->gdtr = ns_vmcb->gdtr;
+    host_vmcb->idtr = ns_vmcb->idtr;
+
+    /* CPL */
+    host_vmcb->cpl = ns_vmcb->cpl;
+
+    /* EFER */
+    v->arch.hvm_vcpu.guest_efer = ns_vmcb->efer;
+    rc = hvm_set_efer(ns_vmcb->efer);
+    if (rc != X86EMUL_OKAY)
+	gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);
+
+    /* CR4 */
+    v->arch.hvm_vcpu.guest_cr[4] = ns_vmcb->cr4;
+    rc = hvm_set_cr4(ns_vmcb->cr4);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
+
+    /* CR0 */
+    v->arch.hvm_vcpu.guest_cr[0] = ns_vmcb->cr0;
+    rc = hvm_set_cr0(ns_vmcb->cr0);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
+
+    /* CR2 */
+    v->arch.hvm_vcpu.guest_cr[2] = ns_vmcb->cr2;
+    hvm_update_guest_cr(v, 2);
+
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+        rc = hvm_set_cr3(ns_vmcb->cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        host_vmcb->np_enable = 1;
+        /* Keep h_cr3 as it is. */
+        /* Guest shadow paging: Must intercept pagefaults. */
+        host_vmcb->exception_intercepts |= (1U << TRAP_page_fault);
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+        rc = hvm_set_cr3(ns_vmcb->cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    } else {
+        /* host shadow paging + guest shadow paging. */
+        host_vmcb->np_enable = 0;
+        host_vmcb->h_cr3 = 0x0;
+
+        /* TODO: Once shadow-shadow paging is in place come back to here
+         * and set host_vmcb->cr3 to the shadowed shadow table.
+         */
+    }
+
+    /* DRn */
+    host_vmcb->dr7 = ns_vmcb->dr7;
+    host_vmcb->dr6 = ns_vmcb->dr6;
+
+    /* RFLAGS */
+    host_vmcb->rflags = ns_vmcb->rflags;
+
+    /* RIP */
+    host_vmcb->rip = ns_vmcb->rip;
+
+    /* RSP */
+    host_vmcb->rsp = ns_vmcb->rsp;
+
+    /* RAX */
+    host_vmcb->rax = ns_vmcb->rax;
+
+    /* Keep the host values of the fs, gs, ldtr, tr, kerngsbase,
+     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
+     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
+     */
+
+    /* Page tables */
+    host_vmcb->pdpe0 = ns_vmcb->pdpe0;
+    host_vmcb->pdpe1 = ns_vmcb->pdpe1;
+    host_vmcb->pdpe2 = ns_vmcb->pdpe2;
+    host_vmcb->pdpe3 = ns_vmcb->pdpe3;
+
+    /* PAT */
+    host_vmcb->g_pat = ns_vmcb->g_pat;
+
+    /* Debug Control MSR */
+    host_vmcb->debugctlmsr = ns_vmcb->debugctlmsr;
+
+    /* LBR MSRs */
+    host_vmcb->lastbranchfromip = ns_vmcb->lastbranchfromip;
+    host_vmcb->lastbranchtoip = ns_vmcb->lastbranchtoip;
+    host_vmcb->lastintfromip = ns_vmcb->lastintfromip;
+    host_vmcb->lastinttoip = ns_vmcb->lastinttoip;
+    /* Check both VMCBs; a non-zero result is returned to the caller. */
+    rc = svm_vmcb_isvalid(__func__, ns_vmcb, 1);
+    if (rc) {
+        gdprintk(XENLOG_ERR, "nested vmcb invalid\n");
+        return rc;
+    }
+
+    rc = svm_vmcb_isvalid(__func__, host_vmcb, 1);
+    if (rc) {
+        gdprintk(XENLOG_ERR, "host vmcb invalid\n");
+        return rc;
+    }
+
+    /* Switch guest registers to nested guest */
+    regs->eax = ns_vmcb->rax;
+    regs->eip = ns_vmcb->rip;
+    regs->esp = ns_vmcb->rsp;
+    regs->eflags = ns_vmcb->rflags;
+
+    return 0;
+}
+
+static int nsvm_vmcb_prepare4vmexit(struct vcpu *v)
+{
+    struct nestedhvm *hvm = &vcpu_nestedhvm(v);
+    struct nestedsvm *svm = hvm->nh_arch;
+    struct vmcb_struct *vmcb, *ns_vmcb;
+
+    vmcb = v->arch.hvm_svm.vmcb;
+    ns_vmcb = hvm->nh_vm;
+
+    svm_vmsave(vmcb);
+
+    /* Intercepts */
+    /* Copy cached intercepts since they are the guest's original
+     * intercepts.
+     */
+    ns_vmcb->cr_intercepts = svm->ns_cr_intercepts;
+    ns_vmcb->dr_intercepts = svm->ns_dr_intercepts;
+    ns_vmcb->exception_intercepts = svm->ns_exception_intercepts;
+    ns_vmcb->general1_intercepts = svm->ns_general1_intercepts;
+    ns_vmcb->general2_intercepts = svm->ns_general2_intercepts;
+
+    /* Nested Pause Filter */
+    ns_vmcb->pause_filter_count = vmcb->pause_filter_count;
+
+    /* Nested IO permission bitmap */
+    /* Just keep the iopm_base_pa and msrpm_base_pa values.
+     * The guest must not see the virtualized values.
+     */
+
+    /* TSC offset */
+    ns_vmcb->tsc_offset = vmcb->tsc_offset;
+
+    /* ASID */
+    /* ns_vmcb->guest_asid = vmcb->guest_asid; */
+
+    /* TLB control */
+    ns_vmcb->tlb_control = 0;
+
+    /* Virtual Interrupts */
+    ns_vmcb->vintr = vmcb->vintr;
+    if (!(hvm->nh_hostflags.fields.vintrmask))
+        ns_vmcb->vintr.fields.intr_masking = 0;
+
+    /* Shadow mode */
+    ns_vmcb->interrupt_shadow = vmcb->interrupt_shadow;
+
+    /* Exit codes */
+    ns_vmcb->exitcode = vmcb->exitcode;
+    ns_vmcb->exitinfo1 = vmcb->exitinfo1;
+    ns_vmcb->exitinfo2 = vmcb->exitinfo2;
+    ns_vmcb->exitintinfo = vmcb->exitintinfo;
+
+    /* Interrupts */
+    /* If we emulate a VMRUN/#VMEXIT in the same host #VMEXIT cycle we have
+     * to make sure that we do not lose injected events. So check eventinj
+     * here and copy it to exitintinfo if it is valid.
+     * exitintinfo and eventinj can't be both valid because the case below
+     * only happens on a VMRUN instruction intercept which has no valid
+     * exitintinfo set.
+     */
+    if ( unlikely(vmcb->eventinj.fields.v) &&
+         hvm_event_needs_reinjection(vmcb->eventinj.fields.type,
+                                     vmcb->eventinj.fields.vector) )
+    {
+        ns_vmcb->exitintinfo = vmcb->eventinj;
+    }
+
+    ns_vmcb->eventinj.bytes = 0;
+
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+        ns_vmcb->np_enable = vmcb->np_enable;
+        ns_vmcb->cr3 = vmcb->cr3;
+        /* The vmcb->h_cr3 is the shadowed h_cr3. The original
+         * unshadowed guest h_cr3 is kept in ns_vmcb->h_cr3,
+         * hence we keep the ns_vmcb->h_cr3 value. */
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        ns_vmcb->np_enable = 0;
+        /* Throw h_cr3 away. The guest must not be allowed to set it,
+         * otherwise it could break out (security hole!) */
+        ns_vmcb->h_cr3 = 0x0;
+        /* Stop intercepting #PF (already done above
+         * by restoring cached intercepts). */
+        ns_vmcb->cr3 = vmcb->cr3;
+    } else {
+        /* host shadow paging + guest shadow paging. */
+        ns_vmcb->np_enable = 0;
+        ns_vmcb->h_cr3 = 0x0;
+        /* The vmcb->cr3 is the shadowed cr3. The original
+         * unshadowed guest cr3 is kept in ns_vmcb->cr3,
+         * hence we keep the ns_vmcb->cr3 value. */
+    }
+
+    /* LBR virtualization */
+    ns_vmcb->lbr_control = svm->ns_lbr_control;
+
+    /* NextRIP */
+    ns_vmcb->nextrip = vmcb->nextrip;
+
+    /*
+     * VMCB Save State Area
+     */
+
+    /* Segments */
+    ns_vmcb->es = vmcb->es;
+    ns_vmcb->cs = vmcb->cs;
+    ns_vmcb->ss = vmcb->ss;
+    ns_vmcb->ds = vmcb->ds;
+    ns_vmcb->gdtr = vmcb->gdtr;
+    ns_vmcb->idtr = vmcb->idtr;
+
+    /* CPL */
+    ns_vmcb->cpl = vmcb->cpl;
+
+    /* EFER */
+    ns_vmcb->efer = vmcb->efer;
+
+    /* CRn */
+    ns_vmcb->cr4 = vmcb->cr4;
+    ns_vmcb->cr0 = vmcb->cr0;
+
+    /* DRn */
+    ns_vmcb->dr7 = vmcb->dr7;
+    ns_vmcb->dr6 = vmcb->dr6;
+
+    /* RFLAGS */
+    ns_vmcb->rflags = vmcb->rflags;
+
+    /* RIP */
+    ns_vmcb->rip = vmcb->rip;
+
+    /* RSP */
+    ns_vmcb->rsp = vmcb->rsp;
+
+    /* RAX */
+    ns_vmcb->rax = vmcb->rax;
+
+    /* Keep the nested guest values of the fs, gs, ldtr, tr, kerngsbase,
+     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
+     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
+     */
+
+    /* CR2 */
+    ns_vmcb->cr2 = vmcb->cr2;
+
+    /* Page tables */
+    ns_vmcb->pdpe0 = vmcb->pdpe0;
+    ns_vmcb->pdpe1 = vmcb->pdpe1;
+    ns_vmcb->pdpe2 = vmcb->pdpe2;
+    ns_vmcb->pdpe3 = vmcb->pdpe3;
+
+    /* PAT */
+    ns_vmcb->g_pat = vmcb->g_pat;
+
+    /* Debug Control MSR */
+    ns_vmcb->debugctlmsr = vmcb->debugctlmsr;
+
+    /* LBR MSRs */
+    ns_vmcb->lastbranchfromip = vmcb->lastbranchfromip;
+    ns_vmcb->lastbranchtoip = vmcb->lastbranchtoip;
+    ns_vmcb->lastintfromip = vmcb->lastintfromip;
+    ns_vmcb->lastinttoip = vmcb->lastinttoip;
+
+    return 0;
+}
+
+/* Read an SVM MSR: returns 1 if handled here, 0 if not (content is zeroed). */
+static int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content)
+{
+    struct nestedsvm *nsvm = vcpu_nestedhvm(v).nh_arch;
+
+    /* Every path hands back zero unless a real value is filled in below. */
+    *msr_content = 0;
+
+    if ( msr == MSR_K8_VM_CR )
+        /* VM_CR always reads as zero. */
+        return 1;
+
+    if ( msr == MSR_K8_VM_HSAVE_PA )
+    {
+        *msr_content = nsvm->ns_msr_hsavepa;
+        return 1;
+    }
+
+    return 0;
+}
+
+/* Write an SVM MSR. Returns 1 if handled, 0 if the MSR is not handled here,
+ * -1 if the caller should inject #GP.
+ */
+static int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content)
+{
+    struct nestedhvm *nhvm = &vcpu_nestedhvm(v);
+    struct nestedsvm *nsvm = nhvm->nh_arch;
+
+    if ( msr == MSR_K8_VM_CR )
+        /* Every bit is treated as read-only, so the write is dropped. */
+        return 1;
+
+    if ( msr != MSR_K8_VM_HSAVE_PA )
+        return 0;
+
+    if ( !nestedhvm_vmaddr_isvalid(nhvm, msr_content) )
+    {
+        gdprintk(XENLOG_ERR,
+            "MSR_K8_VM_HSAVE_PA value invalid 0x%"PRIx64"\n", msr_content);
+        return -1;
+    }
+
+    nsvm->ns_msr_hsavepa = msr_content;
+
+    return 1;
+}
+
 static int svm_cpu_up(void)
 {
     uint64_t msr_content;
@@ -955,8 +1992,8 @@ static void svm_do_nested_pgfault(paddr_
         struct {
             uint64_t gpa;
             uint64_t mfn;
-            u32 qualification;
-            u32 p2mt;
+            uint32_t qualification;
+            uint32_t p2mt;
         } _d;
 
         _d.gpa = gpa;
@@ -997,11 +2034,24 @@ static void svm_cpuid_intercept(
 
     hvm_cpuid(input, eax, ebx, ecx, edx);
 
-    if ( input == 0x80000001 )
-    {
+    switch (input) {
+    case 0x80000001:
         /* Fix up VLAPIC details. */
         if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
             __clear_bit(X86_FEATURE_APIC & 31, edx);
+        break;
+    case 0x8000000a:
+        /* We require the host to use nested paging as
+         * hap-on-shadow is not supported.
+         * The tools have no way to check this case and
+         * thus always enable it. So we mask hap unless
+         * we use hap-on-hap. 
+         */
+        if ( cpu_has_svm_npt && !paging_mode_hap(v->domain) )
+            *edx &= ~(1U << SVM_FEATURE_NPT);
+        break;
+    default:
+        break;
     }
 
     HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx);
@@ -1037,6 +2087,7 @@ static void svm_dr_access(struct vcpu *v
 
 static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
 {
+    int ret;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
@@ -1074,9 +2125,6 @@ static int svm_msr_read_intercept(unsign
         *msr_content = 0;
         break;
 
-    case MSR_K8_VM_HSAVE_PA:
-        goto gpf;
-
     case MSR_IA32_DEBUGCTLMSR:
         *msr_content = vmcb->debugctlmsr;
         break;
@@ -1109,6 +2157,11 @@ static int svm_msr_read_intercept(unsign
         break;
 
     default:
+        ret = nsvm_rdmsr(v, msr, msr_content);
+        if ( ret < 0 )
+            goto gpf;
+        else if ( ret )
+            break;
 
         if ( rdmsr_viridian_regs(msr, msr_content) ||
              rdmsr_hypervisor_regs(msr, msr_content) )
@@ -1131,14 +2184,12 @@ static int svm_msr_read_intercept(unsign
 
 static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
 {
+    int ret;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
 
     switch ( msr )
     {
-    case MSR_K8_VM_HSAVE_PA:
-        goto gpf;
-
     case MSR_IA32_SYSENTER_CS:
         v->arch.hvm_svm.guest_sysenter_cs = msr_content;
         break;
@@ -1189,6 +2240,12 @@ static int svm_msr_write_intercept(unsig
         break;
 
     default:
+        ret = nsvm_wrmsr(v, msr, msr_content);
+        if ( ret < 0 )
+            goto gpf;
+        else if ( ret )
+            break;
+
         if ( wrmsr_viridian_regs(msr, msr_content) )
             break;
 
@@ -1277,6 +2334,179 @@ static void svm_vmexit_do_pause(struct c
     do_sched_op_compat(SCHEDOP_yield, 0);
 }
 
+/* Emulate VMRUN: inject #UD when nested HVM is disabled, otherwise hand the
+ * guest VMCB address in vmcbaddr to the generic nested-HVM vmentry code.
+ */
+static void svm_vmexit_do_vmrun(struct cpu_user_regs *regs,
+                                struct vcpu *v, uint64_t vmcbaddr)
+{
+    unsigned int inst_len;
+
+    if ( !nestedhvm_enabled(v->domain) ) {
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    /* Use the vcpu we were handed, as the sibling handlers do. */
+    if ( (inst_len = __get_instruction_length(v, INSTR_VMRUN)) == 0 )
+        return;
+
+    /* On failure nestedhvm_vcpu_vmentry() has already injected an exception
+     * (usually #GP or #UD), so there is nothing left to do here either way. */
+    (void)nestedhvm_vcpu_vmentry(v, regs, vmcbaddr, inst_len);
+}
+
+/* Emulate VMLOAD from the guest VMCB at vmcbaddr; failures inject a fault. */
+static void
+svm_vmexit_do_vmload(struct vmcb_struct *vmcb, struct cpu_user_regs *regs,
+                     struct vcpu *v, uint64_t vmcbaddr)
+{
+    int ret;
+    unsigned int inst_len;
+    struct nestedsvm *svm;
+    struct vmcb_struct *tmp_vmcb;
+
+    if ( !nestedhvm_enabled(v->domain) ) {
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 )
+        return;
+
+    /* Read the nested state only after the enabled check above. */
+    svm = vcpu_nestedhvm(v).nh_arch;
+    tmp_vmcb = svm->ns_tmpvmcb;
+    ret = nestedhvm_vcpu_state_validate(v, vmcbaddr);
+    if (ret) {
+        gdprintk(XENLOG_ERR,
+            "nestedhvm_vcpu_state_validate failed, injecting 0x%x\n", ret);
+        goto inject;
+    }
+
+    if ( hvm_copy_from_guest_phys(tmp_vmcb, vmcbaddr, sizeof(*tmp_vmcb)) ) {
+        ret = TRAP_invalid_op; /* a copy status is not a trap vector */
+        gdprintk(XENLOG_ERR,
+            "hvm_copy_from_guest_phys failed, injecting 0x%x\n", ret);
+        goto inject;
+    }
+
+    nsvm_vmcb_loadsave(tmp_vmcb, vmcb);
+    svm_vmload(vmcb);
+    __update_guest_eip(regs, inst_len);
+    memset(tmp_vmcb, 0x0, sizeof(struct vmcb_struct));
+    return;
+
+ inject:
+    memset(tmp_vmcb, 0x0, sizeof(struct vmcb_struct));
+    hvm_inject_exception(ret, HVM_DELIVER_NO_ERROR_CODE, 0);
+}
+
+/* Emulate VMSAVE into the guest VMCB at vmcbaddr; failures inject a fault. */
+static void
+svm_vmexit_do_vmsave(struct vmcb_struct *vmcb, struct cpu_user_regs *regs,
+                     struct vcpu *v, uint64_t vmcbaddr)
+{
+    int ret;
+    unsigned int inst_len;
+    struct nestedsvm *svm;
+    struct vmcb_struct *tmp_vmcb;
+
+    if ( !nestedhvm_enabled(v->domain) ) {
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 )
+        return;
+
+    /* Read the nested state only after the enabled check above. */
+    svm = vcpu_nestedhvm(v).nh_arch;
+    tmp_vmcb = svm->ns_tmpvmcb;
+    ret = nestedhvm_vcpu_state_validate(v, vmcbaddr);
+    if (ret) {
+        gdprintk(XENLOG_ERR,
+            "nestedhvm_vcpu_state_validate failed, injecting 0x%x\n", ret);
+        goto inject;
+    }
+
+    if ( hvm_copy_from_guest_phys(tmp_vmcb, vmcbaddr, sizeof(*tmp_vmcb)) ) {
+        ret = TRAP_invalid_op; /* a copy status is not a trap vector */
+        gdprintk(XENLOG_ERR,
+            "hvm_copy_from_guest_phys failed, injecting 0x%x\n", ret);
+        goto inject;
+    }
+
+    svm_vmsave(vmcb);
+    nsvm_vmcb_loadsave(vmcb, tmp_vmcb);
+
+    if ( hvm_copy_to_guest_phys(vmcbaddr, tmp_vmcb, sizeof(*tmp_vmcb)) ) {
+        ret = TRAP_invalid_op; /* a copy status is not a trap vector */
+        gdprintk(XENLOG_ERR,
+            "hvm_copy_to_guest_phys failed, injecting 0x%x\n", ret);
+        goto inject;
+    }
+
+    __update_guest_eip(regs, inst_len);
+    memset(tmp_vmcb, 0x0, sizeof(struct vmcb_struct));
+    return;
+
+ inject:
+    memset(tmp_vmcb, 0x0, sizeof(struct vmcb_struct));
+    hvm_inject_exception(ret, HVM_DELIVER_NO_ERROR_CODE, 0);
+}
+
+/* Emulate CLGI. Injects #UD when nested HVM is disabled. */
+static void svm_vmexit_do_clgi(struct cpu_user_regs *regs, struct vcpu *v)
+{
+    struct vmcb_struct *guest_vmcb = v->arch.hvm_svm.vmcb;
+    unsigned int len;
+
+    if ( !nestedhvm_enabled(v->domain) )
+    {
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    len = __get_instruction_length(v, INSTR_CLGI);
+    if ( len == 0 )
+        return;
+
+    /* A non-zero result means nestedsvm_vcpu_clgi() already injected an
+     * exception (usually #GP or #UD); do not advance the guest eip. */
+    if ( nestedsvm_vcpu_clgi(v) != 0 )
+        return;
+
+    /* No interrupts may arrive after CLGI: stop intercepting VINTR and
+     * drop the pending virtual interrupt request. */
+    guest_vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
+    guest_vmcb->vintr.fields.irq = 0;
+    __update_guest_eip(regs, len);
+}
+
+/* Emulate STGI. Injects #UD when nested HVM is disabled. */
+static void svm_vmexit_do_stgi(struct cpu_user_regs *regs, struct vcpu *v)
+{
+    unsigned int len;
+
+    if ( !nestedhvm_enabled(v->domain) )
+    {
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        return;
+    }
+
+    len = __get_instruction_length(v, INSTR_STGI);
+    if ( len == 0 )
+        return;
+
+    /* Advance past the instruction only on success: a non-zero result
+     * means nestedsvm_vcpu_stgi() already injected an exception
+     * (usually #GP or #UD).
+     */
+    if ( nestedsvm_vcpu_stgi(v) == 0 )
+        __update_guest_eip(regs, len);
+}
+
 static void svm_vmexit_ud_intercept(struct cpu_user_regs *regs)
 {
     struct hvm_emulate_ctxt ctxt;
@@ -1424,20 +2654,37 @@ static struct hvm_function_table __read_
     .msr_read_intercept   = svm_msr_read_intercept,
     .msr_write_intercept  = svm_msr_write_intercept,
     .invlpg_intercept     = svm_invlpg_intercept,
-    .set_rdtsc_exiting    = svm_set_rdtsc_exiting
+    .set_rdtsc_exiting    = svm_set_rdtsc_exiting,
+
+    .nestedhvm_vcpu_initialise = nsvm_vcpu_initialise,
+    .nestedhvm_vcpu_destroy = nsvm_vcpu_destroy,
+    .nestedhvm_vcpu_reset = nsvm_vcpu_reset,
+    .nestedhvm_vcpu_features = nsvm_vcpu_features,
+    .nestedhvm_vcpu_hostsave = nsvm_vcpu_hostsave,
+    .nestedhvm_vcpu_hostrestore = nsvm_vcpu_hostrestore,
+    .nestedhvm_vcpu_vmentry = nsvm_vcpu_vmrun,
+    .nestedhvm_vcpu_vmexit = nsvm_vcpu_vmexit,
+    .nestedhvm_vm_exitcode_native2generic = nsvm_vmcb_exitcode_native2generic,
+    .nestedhvm_vm_intercepted_by_guest = nsvm_vmcb_intercepted_by_guest,
+    .nestedhvm_vm_prepare4vmentry = nsvm_vmcb_prepare4vmrun,
+    .nestedhvm_vm_prepare4vmexit = nsvm_vmcb_prepare4vmexit,
 };
 
 asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs)
 {
-    unsigned int exit_reason;
+    uint64_t exit_reason;
     struct vcpu *v = current;
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
     eventinj_t eventinj;
     int inst_len, rc;
+    bool_t vcpu_guestmode = 0;
 
     if ( paging_mode_hap(v->domain) )
         v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] = vmcb->cr3;
 
+    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
+        vcpu_guestmode = 1;
+
     /*
      * Before doing anything else, we need to sync up the VLAPIC's TPR with
      * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows)
@@ -1445,12 +2692,39 @@ asmlinkage void svm_vmexit_handler(struc
      * NB. We need to preserve the low bits of the TPR to make checked builds
      * of Windows work, even though they don't actually do anything.
      */
-    vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI,
-                   ((vmcb->vintr.fields.tpr & 0x0F) << 4) |
-                   (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0x0F));
+    if ( !vcpu_guestmode ) {
+        vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI,
+                       ((vmcb->vintr.fields.tpr & 0x0F) << 4) |
+                       (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0x0F));
+    }
 
     exit_reason = vmcb->exitcode;
 
+    if ( vcpu_guestmode ) {
+        enum nestedhvm_vmexits nsret;
+
+        nsret = nestedhvm_vcpu_vmexit(v, regs, exit_reason);
+        vcpu_nestedhvm(v).nh_hostflags.fields.forcevmexit = 0;
+        switch (nsret) {
+        case NESTEDHVM_VMEXIT_DONE:
+            goto out;
+        case NESTEDHVM_VMEXIT_ERROR:
+            gdprintk(XENLOG_ERR,
+		"nestedhvm_vcpu_vmexit() returned NESTEDHVM_VMEXIT_ERROR\n");
+            goto out;
+        case NESTEDHVM_VMEXIT_HOST:
+        case NESTEDHVM_VMEXIT_CONTINUE:
+            break;
+        case NESTEDHVM_VMEXIT_FATALERROR:
+            gdprintk(XENLOG_ERR, "unexpected nestedhvm error\n");
+            goto exit_and_crash;
+        default:
+            gdprintk(XENLOG_INFO, "nestedhvm_vcpu_vmexit returned %i\n",
+                nsret);
+            goto exit_and_crash;
+        }
+    }
+
     if ( hvm_long_mode_enabled(v) )
         HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason,
                     (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32),
@@ -1462,7 +2736,7 @@ asmlinkage void svm_vmexit_handler(struc
 
     if ( unlikely(exit_reason == VMEXIT_INVALID) )
     {
-        svm_dump_vmcb(__func__, vmcb);
+        svm_vmcb_dump(__func__, vmcb);
         goto exit_and_crash;
     }
 
@@ -1610,6 +2884,10 @@ asmlinkage void svm_vmexit_handler(struc
     case VMEXIT_VMMCALL:
         if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
             break;
+        if ( vcpu_guestmode ) {
+            hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+            break;
+        }
         HVMTRACE_1D(VMMCALL, regs->eax);
         rc = hvm_do_hypercall(regs);
         if ( rc != HVM_HCALL_preempted )
@@ -1642,11 +2920,25 @@ asmlinkage void svm_vmexit_handler(struc
 
     case VMEXIT_MONITOR:
     case VMEXIT_MWAIT:
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        break;
+
     case VMEXIT_VMRUN:
+        svm_vmexit_do_vmrun(regs, v,
+                            regs->eax);
+        break;
     case VMEXIT_VMLOAD:
+        svm_vmexit_do_vmload(vmcb, regs, v, regs->eax);
+        break;
     case VMEXIT_VMSAVE:
+        svm_vmexit_do_vmsave(vmcb, regs, v, regs->eax);
+        break;
     case VMEXIT_STGI:
+        svm_vmexit_do_stgi(regs, v);
+        break;
     case VMEXIT_CLGI:
+        svm_vmexit_do_clgi(regs, v);
+        break;
     case VMEXIT_SKINIT:
         hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
         break;
@@ -1676,7 +2968,7 @@ asmlinkage void svm_vmexit_handler(struc
 
     default:
     exit_and_crash:
-        gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
+        gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%"PRIx64", "
                  "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
                  exit_reason, 
                  (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
@@ -1684,6 +2976,11 @@ asmlinkage void svm_vmexit_handler(struc
         break;
     }
 
+  out:
+    if ( vcpu_guestmode )
+        /* Don't clobber TPR of the nested guest. */
+        return;
+
     /* The exit may have updated the TPR: reflect this in the hardware vtpr */
     vmcb->vintr.fields.tpr = 
         (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
diff -r 0fcb32521d57 -r fa992936dba4 xen/arch/x86/hvm/svm/svmdebug.c
--- /dev/null
+++ b/xen/arch/x86/hvm/svm/svmdebug.c
@@ -0,0 +1,185 @@
+/*
+ * svmdebug.c: debug functions
+ * Copyright (c) 2010, Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/msr-index.h>
+#include <asm/hvm/svm/svmdebug.h>
+
+/* Print one segment register: selector, attributes, limit and base. */
+static void svm_dump_sel(const char *name, svm_segment_register_t *sr)
+{
+    printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", name,
+           sr->sel, sr->attr.bytes, sr->limit, (unsigned long long)sr->base);
+}
+/* Print the fields of vmcb via printk; 'from' names the call site. */
+void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb)
+{
+    printk("Dumping guest's current state at %s...\n", from);
+    printk("Size of VMCB = %d, address = %p\n", 
+            (int) sizeof(struct vmcb_struct), vmcb);
+
+    printk("cr_intercepts = 0x%08x dr_intercepts = 0x%08x "
+           "exception_intercepts = 0x%08x\n", 
+           vmcb->cr_intercepts, vmcb->dr_intercepts, 
+           vmcb->exception_intercepts);
+    printk("general1_intercepts = 0x%08x general2_intercepts = 0x%08x\n", 
+           vmcb->general1_intercepts, vmcb->general2_intercepts);
+    printk("iopm_base_pa = %016llx msrpm_base_pa = 0x%016llx tsc_offset = "
+            "0x%016llx\n", 
+           (unsigned long long) vmcb->iopm_base_pa,
+           (unsigned long long) vmcb->msrpm_base_pa,
+           (unsigned long long) vmcb->tsc_offset);
+    printk("tlb_control = 0x%08x vintr = 0x%016llx interrupt_shadow = "
+            "0x%016llx\n", vmcb->tlb_control,
+           (unsigned long long) vmcb->vintr.bytes,
+           (unsigned long long) vmcb->interrupt_shadow);
+    printk("exitcode = 0x%016llx exitintinfo = 0x%016llx\n", 
+           (unsigned long long) vmcb->exitcode,
+           (unsigned long long) vmcb->exitintinfo.bytes);
+    printk("exitinfo1 = 0x%016llx exitinfo2 = 0x%016llx \n",
+           (unsigned long long) vmcb->exitinfo1,
+           (unsigned long long) vmcb->exitinfo2);
+    printk("np_enable = 0x%016llx guest_asid = 0x%03x\n", 
+           (unsigned long long) vmcb->np_enable, vmcb->guest_asid);
+    printk("cpl = %d efer = 0x%016llx star = 0x%016llx lstar = 0x%016llx\n", 
+           vmcb->cpl, (unsigned long long) vmcb->efer,
+           (unsigned long long) vmcb->star, (unsigned long long) vmcb->lstar);
+    printk("CR0 = 0x%016llx CR2 = 0x%016llx\n",
+           (unsigned long long) vmcb->cr0, (unsigned long long) vmcb->cr2);
+    printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", 
+           (unsigned long long) vmcb->cr3, (unsigned long long) vmcb->cr4);
+    printk("RSP = 0x%016llx  RIP = 0x%016llx\n", 
+           (unsigned long long) vmcb->rsp, (unsigned long long) vmcb->rip);
+    printk("RAX = 0x%016llx  RFLAGS=0x%016llx\n",
+           (unsigned long long) vmcb->rax, (unsigned long long) vmcb->rflags);
+    printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", 
+           (unsigned long long) vmcb->dr6, (unsigned long long) vmcb->dr7);
+    printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n",
+           (unsigned long long) vmcb->cstar, 
+           (unsigned long long) vmcb->sfmask);
+    printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", 
+           (unsigned long long) vmcb->kerngsbase,
+           (unsigned long long) vmcb->g_pat);
+    printk("H_CR3 = 0x%016llx\n", (unsigned long long)vmcb->h_cr3);
+
+    /* Print every segment and descriptor-table register, one per line. */
+    svm_dump_sel("CS", &vmcb->cs);
+    svm_dump_sel("DS", &vmcb->ds);
+    svm_dump_sel("SS", &vmcb->ss);
+    svm_dump_sel("ES", &vmcb->es);
+    svm_dump_sel("FS", &vmcb->fs);
+    svm_dump_sel("GS", &vmcb->gs);
+    svm_dump_sel("GDTR", &vmcb->gdtr);
+    svm_dump_sel("LDTR", &vmcb->ldtr);
+    svm_dump_sel("IDTR", &vmcb->idtr);
+    svm_dump_sel("TR", &vmcb->tr);
+}
+/* Run consistency checks on vmcb. NB: returns 0 when OK, 1 on a failure. */
+bool_t
+svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb,
+                 bool_t verbose)
+{
+    bool_t ret = 0; /* ok */
+    /* PRINTF: if verbose, report and keep checking; else fail immediately. */
+#define PRINTF(...) \
+    if (verbose) { ret = 1; printk("%s: ", from); printk(__VA_ARGS__); \
+    } else return 1;
+
+    if ((vmcb->efer & EFER_SVME) == 0) {
+        PRINTF("EFER: SVME bit not set (0x%"PRIx64")\n", vmcb->efer);
+    }
+
+    if ((vmcb->cr0 & X86_CR0_CD) == 0 && (vmcb->cr0 & X86_CR0_NW) != 0) {
+        PRINTF("CR0: CD bit is zero and NW bit set (0x%"PRIx64")\n",
+                vmcb->cr0);
+    }
+
+    if ((vmcb->cr0 >> 32U) != 0) {
+        PRINTF("CR0: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->cr0);
+    }
+
+    if ((vmcb->cr3 & 0x7) != 0) {
+        PRINTF("CR3: MBZ bits are set (0x%"PRIx64")\n", vmcb->cr3);
+    }
+    if ((vmcb->efer & EFER_LMA) && (vmcb->cr3 & 0xfe) != 0) {
+        PRINTF("CR3: MBZ bits are set (0x%"PRIx64")\n", vmcb->cr3);
+    } /* NOTE(review): mask 0xfe includes CR3 bits 3-4 (PWT/PCD) - confirm */
+
+    if ((vmcb->cr4 >> 11U) != 0) {
+        PRINTF("CR4: bits [63:11] are not zero (0x%"PRIx64")\n",
+                vmcb->cr4);
+    }
+
+    if ((vmcb->dr6 >> 32U) != 0) {
+        PRINTF("DR6: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->dr6);
+    }
+
+    if ((vmcb->dr7 >> 32U) != 0) {
+        PRINTF("DR7: bits [63:32] are not zero (0x%"PRIx64")\n",
+                vmcb->dr7);
+    }
+
+    if ((vmcb->efer >> 15U) != 0) {
+        PRINTF("EFER: bits [63:15] are not zero (0x%"PRIx64")\n",
+                vmcb->efer);
+    }
+
+    if ((vmcb->efer & EFER_LME) != 0 && ((vmcb->cr0 & X86_CR0_PG) != 0)) {
+        if ((vmcb->cr4 & X86_CR4_PAE) == 0) {
+            PRINTF("EFER_LME and CR0.PG are both set and CR4.PAE is zero.\n");
+        }
+        if ((vmcb->cr0 & X86_CR0_PE) == 0) {
+            PRINTF("EFER_LME and CR0.PG are both set and CR0.PE is zero.\n");
+        }
+    }
+
+    if ((vmcb->efer & EFER_LME) != 0
+        && (vmcb->cr0 & X86_CR0_PG) != 0
+        && (vmcb->cr4 & X86_CR4_PAE) != 0
+        && (vmcb->cs.attr.fields.l != 0)
+        && (vmcb->cs.attr.fields.db != 0))
+    {
+        PRINTF("EFER_LME, CR0.PG, CR4.PAE, CS.L and CS.D are all non-zero.\n");
+    }
+
+    if ((vmcb->general2_intercepts & GENERAL2_INTERCEPT_VMRUN) == 0) {
+        PRINTF("GENERAL2_INTERCEPT: VMRUN intercept bit is clear (0x%"PRIx32")\n",
+            vmcb->general2_intercepts);
+    }
+
+    if (vmcb->eventinj.fields.resvd1 != 0) {
+        PRINTF("eventinj: MBZ bits are set (0x%"PRIx64")\n",
+                vmcb->eventinj.bytes);
+    }
+
+#undef PRINTF
+    return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 0fcb32521d57 -r fa992936dba4 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c
+++ b/xen/arch/x86/hvm/svm/vmcb.c
@@ -33,6 +33,7 @@
 #include <asm/hvm/svm/svm.h>
 #include <asm/hvm/svm/intr.h>
 #include <asm/hvm/svm/asid.h>
+#include <asm/hvm/svm/svmdebug.h>
 #include <xen/event.h>
 #include <xen/kernel.h>
 #include <xen/domain_page.h>
@@ -75,37 +76,6 @@ struct host_save_area *alloc_host_save_a
     return hsa;
 }
 
-void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable)
-{
-    unsigned long *msr_bitmap = v->arch.hvm_svm.msrpm;
-    unsigned long *msr_bit = NULL;
-
-    /*
-     * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
-     */
-    if ( msr <= 0x1fff )
-        msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
-    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
-        msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
-    else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
-        msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;
-
-    BUG_ON(msr_bit == NULL);
-
-    msr &= 0x1fff;
-
-    if ( enable )
-    {
-        __set_bit(msr * 2, msr_bit);
-        __set_bit(msr * 2 + 1, msr_bit);
-    }
-    else
-    {
-        __clear_bit(msr * 2, msr_bit);
-        __clear_bit(msr * 2 + 1, msr_bit);
-    }
-}
-
 static int construct_vmcb(struct vcpu *v)
 {
     struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
@@ -250,7 +220,7 @@ static int construct_vmcb(struct vcpu *v
 
     if ( cpu_has_pause_filter )
     {
-        vmcb->pause_filter_count = 3000;
+        vmcb->pause_filter_count = SVM_PAUSEFILTER_INIT;
         vmcb->general1_intercepts |= GENERAL1_INTERCEPT_PAUSE;
     }
 
@@ -298,76 +268,6 @@ void svm_destroy_vmcb(struct vcpu *v)
     arch_svm->vmcb = NULL;
 }
 
-static void svm_dump_sel(char *name, svm_segment_register_t *s)
-{
-    printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", 
-           name, s->sel, s->attr.bytes, s->limit,
-           (unsigned long long)s->base);
-}
-
-void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb)
-{
-    printk("Dumping guest's current state at %s...\n", from);
-    printk("Size of VMCB = %d, address = %p\n", 
-            (int) sizeof(struct vmcb_struct), vmcb);
-
-    printk("cr_intercepts = 0x%08x dr_intercepts = 0x%08x "
-           "exception_intercepts = 0x%08x\n", 
-           vmcb->cr_intercepts, vmcb->dr_intercepts, 
-           vmcb->exception_intercepts);
-    printk("general1_intercepts = 0x%08x general2_intercepts = 0x%08x\n", 
-           vmcb->general1_intercepts, vmcb->general2_intercepts);
-    printk("iopm_base_pa = %016llx msrpm_base_pa = 0x%016llx tsc_offset = "
-            "0x%016llx\n", 
-           (unsigned long long) vmcb->iopm_base_pa,
-           (unsigned long long) vmcb->msrpm_base_pa,
-           (unsigned long long) vmcb->tsc_offset);
-    printk("tlb_control = 0x%08x vintr = 0x%016llx interrupt_shadow = "
-            "0x%016llx\n", vmcb->tlb_control,
-           (unsigned long long) vmcb->vintr.bytes,
-           (unsigned long long) vmcb->interrupt_shadow);
-    printk("exitcode = 0x%016llx exitintinfo = 0x%016llx\n", 
-           (unsigned long long) vmcb->exitcode,
-           (unsigned long long) vmcb->exitintinfo.bytes);
-    printk("exitinfo1 = 0x%016llx exitinfo2 = 0x%016llx \n",
-           (unsigned long long) vmcb->exitinfo1,
-           (unsigned long long) vmcb->exitinfo2);
-    printk("np_enable = 0x%016llx guest_asid = 0x%03x\n", 
-           (unsigned long long) vmcb->np_enable, vmcb->guest_asid);
-    printk("cpl = %d efer = 0x%016llx star = 0x%016llx lstar = 0x%016llx\n", 
-           vmcb->cpl, (unsigned long long) vmcb->efer,
-           (unsigned long long) vmcb->star, (unsigned long long) vmcb->lstar);
-    printk("CR0 = 0x%016llx CR2 = 0x%016llx\n",
-           (unsigned long long) vmcb->cr0, (unsigned long long) vmcb->cr2);
-    printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", 
-           (unsigned long long) vmcb->cr3, (unsigned long long) vmcb->cr4);
-    printk("RSP = 0x%016llx  RIP = 0x%016llx\n", 
-           (unsigned long long) vmcb->rsp, (unsigned long long) vmcb->rip);
-    printk("RAX = 0x%016llx  RFLAGS=0x%016llx\n",
-           (unsigned long long) vmcb->rax, (unsigned long long) vmcb->rflags);
-    printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", 
-           (unsigned long long) vmcb->dr6, (unsigned long long) vmcb->dr7);
-    printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n",
-           (unsigned long long) vmcb->cstar, 
-           (unsigned long long) vmcb->sfmask);
-    printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", 
-           (unsigned long long) vmcb->kerngsbase,
-           (unsigned long long) vmcb->g_pat);
-    printk("H_CR3 = 0x%016llx\n", (unsigned long long)vmcb->h_cr3);
-
-    /* print out all the selectors */
-    svm_dump_sel("CS", &vmcb->cs);
-    svm_dump_sel("DS", &vmcb->ds);
-    svm_dump_sel("SS", &vmcb->ss);
-    svm_dump_sel("ES", &vmcb->es);
-    svm_dump_sel("FS", &vmcb->fs);
-    svm_dump_sel("GS", &vmcb->gs);
-    svm_dump_sel("GDTR", &vmcb->gdtr);
-    svm_dump_sel("LDTR", &vmcb->ldtr);
-    svm_dump_sel("IDTR", &vmcb->idtr);
-    svm_dump_sel("TR", &vmcb->tr);
-}
-
 static void vmcb_dump(unsigned char ch)
 {
     struct domain *d;
@@ -385,7 +285,7 @@ static void vmcb_dump(unsigned char ch)
         for_each_vcpu ( d, v )
         {
             printk("\tVCPU %d\n", v->vcpu_id);
-            svm_dump_vmcb("key_handler", v->arch.hvm_svm.vmcb);
+            svm_vmcb_dump("key_handler", v->arch.hvm_svm.vmcb);
         }
     }
 
diff -r 0fcb32521d57 -r fa992936dba4 xen/include/asm-x86/hvm/svm/emulate.h
--- a/xen/include/asm-x86/hvm/svm/emulate.h
+++ b/xen/include/asm-x86/hvm/svm/emulate.h
@@ -32,6 +32,11 @@ enum instruction_index {
     INSTR_INT3,
     INSTR_RDTSC,
     INSTR_PAUSE,
+    INSTR_VMRUN,
+    INSTR_VMLOAD,
+    INSTR_VMSAVE,
+    INSTR_STGI,
+    INSTR_CLGI,
     INSTR_MAX_COUNT /* Must be last - Number of instructions supported */
 };
 
diff -r 0fcb32521d57 -r fa992936dba4 xen/include/asm-x86/hvm/svm/svm.h
--- a/xen/include/asm-x86/hvm/svm/svm.h
+++ b/xen/include/asm-x86/hvm/svm/svm.h
@@ -29,8 +29,6 @@
 #include <asm/i387.h>
 #include <asm/hvm/vpmu.h>
 
-void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb);
-
 #define SVM_REG_EAX (0) 
 #define SVM_REG_ECX (1) 
 #define SVM_REG_EDX (2) 
@@ -76,4 +74,6 @@ extern u32 svm_feature_flags;
 #define cpu_has_svm_nrips   test_bit(SVM_FEATURE_NRIPS, &svm_feature_flags)
 #define cpu_has_pause_filter  test_bit(SVM_FEATURE_PAUSEF, &svm_feature_flags)
 
+#define SVM_PAUSEFILTER_INIT    3000
+
 #endif /* __ASM_X86_HVM_SVM_H__ */
diff -r 0fcb32521d57 -r fa992936dba4 xen/include/asm-x86/hvm/svm/svmdebug.h
--- /dev/null
+++ b/xen/include/asm-x86/hvm/svm/svmdebug.h
@@ -0,0 +1,30 @@
+/*
+ * svmdebug.h: SVM related debug definitions
+ * Copyright (c) 2010, AMD Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifndef __ASM_X86_HVM_SVM_SVMDEBUG_H__
+#define __ASM_X86_HVM_SVM_SVMDEBUG_H__
+
+#include <asm/types.h>
+#include <asm/hvm/svm/vmcb.h>
+
+void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb);
+bool_t svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb,
+                        bool_t verbose);
+
+#endif /* __ASM_X86_HVM_SVM_SVMDEBUG_H__ */
diff -r 0fcb32521d57 -r fa992936dba4 xen/include/asm-x86/hvm/svm/vmcb.h
--- a/xen/include/asm-x86/hvm/svm/vmcb.h
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h
@@ -474,6 +474,27 @@ struct arch_svm_struct {
     uint64_t guest_sysenter_eip;
 };
 
+struct nestedsvm {
+    uint64_t ns_msr_hsavepa; /* MSR HSAVE_PA value */
+
+    /* Cached real intercepts of the nested guest */
+    uint32_t ns_cr_intercepts;
+    uint32_t ns_dr_intercepts;
+    uint32_t ns_exception_intercepts;
+    uint32_t ns_general1_intercepts;
+    uint32_t ns_general2_intercepts;
+
+    /* Cached real lbr of the nested guest */
+    lbrctrl_t ns_lbr_control;
+
+    /* Permanently allocated vmcb during vcpu lifetime.
+     * Used for VMLOAD/VMSAVE instruction emulation.
+     * Tim doesn't like the idea of xmalloc()ing and free()ing
+     * a temporary vmcb every time, and the Xen stack is small.
+     */
+    struct vmcb_struct *ns_tmpvmcb;
+};
+
 struct vmcb_struct *alloc_vmcb(void);
 struct host_save_area *alloc_host_save_area(void);
 void free_vmcb(struct vmcb_struct *vmcb);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

             reply	other threads:[~2010-09-01 15:14 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-09-01 15:14 Christoph Egger [this message]
  -- strict thread matches above, loose matches on Subject: below --
2010-10-15 13:07 [PATCH 09/13] Nested Virtualization: svm specific implementation Christoph Egger
2010-11-12 18:43 Christoph Egger
2010-11-16 14:54 ` Tim Deegan
2010-12-02 17:44   ` Christoph Egger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=201009011714.20987.Christoph.Egger@amd.com \
    --to=christoph.egger@amd.com \
    --cc=Tim.Deegan@citrix.com \
    --cc=eddie.dong@intel.com \
    --cc=xen-devel@lists.xensource.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.