xen-devel.lists.xenproject.org archive mirror
* [PATCH 14/14] Nested Virtualization: hap-on-hap
@ 2010-08-05 15:05 Christoph Egger
  2010-08-09 13:18 ` Tim Deegan
  0 siblings, 1 reply; 5+ messages in thread
From: Christoph Egger @ 2010-08-05 15:05 UTC (permalink / raw)
  To: xen-devel

[-- Attachment #1: Type: text/plain, Size: 322 bytes --]


Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>

-- 
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632

[-- Attachment #2: xen_nh14_haphap.diff --]
[-- Type: text/x-diff, Size: 51443 bytes --]

# HG changeset patch
# User cegger
# Date 1281006359 -7200
Implement Nested-on-Nested.
This allows the guest to run a nested guest with HAP enabled.

diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1042,12 +1042,56 @@ int hvm_inject_exception(unsigned int tr
     return 0;
 }
 
-bool_t hvm_hap_nested_page_fault(unsigned long gfn)
+bool_t hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs)
 {
     p2m_type_t p2mt;
     mfn_t mfn;
     struct vcpu *v = current;
     struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
+    unsigned long gfn = gpa >> PAGE_SHIFT;
+    int rv;
+
+    /* On Nested Virtualization, walk the guest page table.
+     * If this succeeds, all is fine.
+     * If this fails, inject a nested page fault into the guest.
+     */
+    if ( nestedhvm_enabled(v->domain)
+        && nestedhvm_vcpu_in_guestmode(v)
+        && nestedhvm_paging_mode_hap(v) )
+    {
+        enum nestedhvm_vmexits nsret;
+        struct nestedhvm *hvm = &VCPU_NESTEDHVM(v);
+
+        /* nested guest gpa == guest gva */
+        rv = nestedhvm_hap_nested_page_fault(v, gpa);
+        switch (rv) {
+        case NESTEDHVM_PAGEFAULT_DONE:
+            return 1;
+        case NESTEDHVM_PAGEFAULT_ERROR:
+            return 0;
+        case NESTEDHVM_PAGEFAULT_INJECT:
+            break;
+        }
+
+        /* inject #VMEXIT(NPF) into guest. */
+        hvm->nh_forcevmexit.exitcode = NESTEDHVM_INTERCEPT_NPF;
+        hvm->nh_forcevmexit.exitinfo1 = regs->error_code;
+        hvm->nh_forcevmexit.exitinfo2 = gpa;
+        hvm->nh_hostflags.fields.forcevmexit = 1;
+        nsret = nestedhvm_vcpu_vmexit(v, regs, NESTEDHVM_INTERCEPT_NPF);
+        hvm->nh_hostflags.fields.forcevmexit = 0;
+        switch (nsret) {
+        case NESTEDHVM_VMEXIT_DONE:
+        case NESTEDHVM_VMEXIT_ERROR: /* L1 guest will crash L2 guest */
+            return 1;
+        case NESTEDHVM_VMEXIT_HOST:
+        case NESTEDHVM_VMEXIT_CONTINUE:
+        case NESTEDHVM_VMEXIT_FATALERROR:
+        default:
+            gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret);
+            return 0;
+        }
+    }
 
     mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
 
@@ -1128,6 +1172,15 @@ int hvm_set_efer(uint64_t value)
         return X86EMUL_EXCEPTION;
     }
 
+    if ( nestedhvm_enabled(v->domain) &&
+       ((value & EFER_SVME) == 0 ) &&
+       ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) )
+    {
+        /* Cleared EFER.SVME: Flush all nestedp2m tables */
+        p2m_flush_nestedp2m(v->domain);
+        nestedhvm_vcpu_reset(v);
+    }
+
     value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
     v->arch.hvm_vcpu.guest_efer = value;
     hvm_update_guest_efer(v);
@@ -1278,8 +1331,12 @@ int hvm_set_cr0(unsigned long value)
     v->arch.hvm_vcpu.guest_cr[0] = value;
     hvm_update_guest_cr(v, 0);
 
-    if ( (value ^ old_value) & X86_CR0_PG )
-        paging_update_paging_modes(v);
+    if ( (value ^ old_value) & X86_CR0_PG ) {
+        if ( !nestedhvm_vmentry_emulate(v) && nestedhvm_vcpu_in_guestmode(v) )
+            paging_update_nestedmode(v);
+        else
+            paging_update_paging_modes(v);
+    }
 
     return X86EMUL_OKAY;
 
@@ -1346,8 +1403,12 @@ int hvm_set_cr4(unsigned long value)
     hvm_update_guest_cr(v, 4);
 
     /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
-    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
-        paging_update_paging_modes(v);
+    if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) {
+        if ( !nestedhvm_vmentry_emulate(v) && nestedhvm_vcpu_in_guestmode(v) )
+            paging_update_nestedmode(v);
+        else
+            paging_update_paging_modes(v);
+    }
 
     return X86EMUL_OKAY;
 
@@ -1831,11 +1892,15 @@ static enum hvm_copy_result __hvm_copy(
     void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
 {
     struct vcpu *curr = current;
-    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    struct p2m_domain *p2m;
     unsigned long gfn, mfn;
     p2m_type_t p2mt;
     char *p;
     int count, todo = size;
+    unsigned long cr3 = curr->arch.hvm_vcpu.guest_cr[3];
+    const struct paging_mode *mode = paging_get_mode(curr);
+
+    p2m = p2m_get_p2m(curr);
 
     while ( todo > 0 )
     {
@@ -1843,7 +1908,7 @@ static enum hvm_copy_result __hvm_copy(
 
         if ( flags & HVMCOPY_virt )
         {
-            gfn = paging_gva_to_gfn(curr, addr, &pfec);
+            gfn = paging_p2m_ga_to_gfn(curr, p2m, mode, cr3, addr, &pfec);
             if ( gfn == INVALID_GFN )
             {
                 if ( pfec == PFEC_page_paged )
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/hvm/nestedhvm.c
--- a/xen/arch/x86/hvm/nestedhvm.c
+++ b/xen/arch/x86/hvm/nestedhvm.c
@@ -487,6 +487,7 @@ nestedhvm_vcpu_vmexit(struct vcpu *v, st
 	enum hvm_copy_result hvm_rc;
 
 	hvm->nh_hostflags.fields.vmentry = 1;
+	paging_update_nestedmode(v);
 	if (nestedhvm_vcpu_in_guestmode(v)) {
 		ret = nestedhvm_vmexit(v, regs, exitcode);
 		switch (ret) {
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1580,6 +1580,10 @@ static int nsvm_vmcb_prepare4vmrun(struc
     /* Nested paging mode */
     if (nestedhvm_paging_mode_hap(v)) {
         /* host nested paging + guest nested paging. */
+        host_vmcb->np_enable = 1;
+
+        host_vmcb->h_cr3 =
+            pagetable_get_paddr(p2m_get_pagetable(p2m_get_nestedp2m(v, ns_vmcb->h_cr3)));
 
         /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
         rc = hvm_set_cr3(ns_vmcb->cr3);
@@ -1966,14 +1970,16 @@ struct hvm_function_table * __init start
     return &svm_function_table;
 }
 
-static void svm_do_nested_pgfault(paddr_t gpa)
+static void svm_do_nested_pgfault(struct vcpu *v,
+    struct cpu_user_regs *regs, paddr_t gpa)
 {
+    int ret;
     unsigned long gfn = gpa >> PAGE_SHIFT;
     mfn_t mfn;
     p2m_type_t p2mt;
-    struct p2m_domain *p2m;
-
-    p2m = p2m_get_hostp2m(current->domain);
+    struct p2m_domain *p2m = NULL;
+
+    ret = hvm_hap_nested_page_fault(gpa, regs);
 
     if ( tb_init_done )
     {
@@ -1984,6 +1990,7 @@ static void svm_do_nested_pgfault(paddr_
             uint32_t p2mt;
         } _d;
 
+        p2m = p2m_get_p2m(v);
         _d.gpa = gpa;
         _d.qualification = 0;
         _d.mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &_d.p2mt));
@@ -1991,14 +1998,16 @@ static void svm_do_nested_pgfault(paddr_
         __trace_var(TRC_HVM_NPF, 0, sizeof(_d), (unsigned char *)&_d);
     }
 
-    if ( hvm_hap_nested_page_fault(gfn) )
+    if ( ret )
         return;
 
+    if ( p2m == NULL )
+        p2m = p2m_get_p2m(v);
     /* Everything else is an error. */
     mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
     gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
              gpa, mfn_x(mfn), p2mt);
-    domain_crash(current->domain);
+    domain_crash(v->domain);
 }
 
 static void svm_fpu_dirty_intercept(void)
@@ -2924,7 +2933,7 @@ asmlinkage void svm_vmexit_handler(struc
     case VMEXIT_NPF:
         perfc_incra(svmexits, VMEXIT_NPF_PERFC);
         regs->error_code = vmcb->exitinfo1;
-        svm_do_nested_pgfault(vmcb->exitinfo2);
+        svm_do_nested_pgfault(v, regs, vmcb->exitinfo2);
         break;
 
     case VMEXIT_IRET:
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2120,7 +2120,7 @@ static void ept_handle_violation(unsigne
     }
 
     if ( (qualification & EPT_GLA_VALID) &&
-         hvm_hap_nested_page_fault(gfn) )
+         hvm_hap_nested_page_fault(gpa, guest_cpu_user_regs()) )
         return;
 
     /* Everything else is an error. */
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile
+++ b/xen/arch/x86/mm/hap/Makefile
@@ -3,6 +3,7 @@ obj-y += guest_walk_2level.o
 obj-y += guest_walk_3level.o
 obj-y += guest_walk_4level.o
 obj-y += p2m-ept.o
+obj-y += nested_hap.o
 
 guest_levels  = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
 guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/mm/hap/guest_walk.c
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -29,6 +29,9 @@
 #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
 #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
 
+#define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
+#define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
+
 #if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
 
 #include <asm/guest_pt.h>
@@ -38,15 +41,23 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
     struct vcpu *v, unsigned long gva, uint32_t *pfec)
 {
     unsigned long cr3;
+    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
+
+    cr3 = v->arch.hvm_vcpu.guest_cr[3];
+    return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec);
+}
+
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
     uint32_t missing;
     mfn_t top_mfn;
     void *top_map;
     p2m_type_t p2mt;
     walk_t gw;
-    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
 
     /* Get the top-level table's MFN */
-    cr3 = v->arch.hvm_vcpu.guest_cr[3];
     top_mfn = gfn_to_mfn_unshare(p2m, cr3 >> PAGE_SHIFT, &p2mt, 0);
     if ( p2m_is_paging(p2mt) )
     {
@@ -72,7 +83,7 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    missing = guest_walk_tables(v, p2m, gva, &gw, pfec[0], top_mfn, top_map);
+    missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map);
     unmap_domain_page(top_map);
 
     /* Interpret the answer */
@@ -119,6 +130,15 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
     return INVALID_GFN;
 }
 
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
+    gdprintk(XENLOG_ERR,
+             "Guest paging level is greater than host paging level!\n");
+    domain_crash(v->domain);
+    return INVALID_GFN;
+}
 #endif
 
 
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -336,6 +336,30 @@ static void hap_free_p2m_page(struct p2m
     hap_unlock(d);
 }
 
+#define nestedp2m_alloc_p2m_page hap_alloc_p2m_page
+
+/* We must use hap_free() or flushing the nested p2m tables fails
+ * with "freeing ptp fails due to insufficient pages".
+ * XXX This triggers a bug in p2m that causes a crash in
+ * xen/common/page_alloc.c:1201 on L1 guest shutdown/destroy.
+ */
+static void
+nestedp2m_free_p2m_page(struct p2m_domain *p2m, struct page_info *pg)
+{
+    struct domain *d = p2m->domain;
+    hap_lock(d);
+    ASSERT(page_get_owner(pg) == d);
+    /* Should have just the one ref we gave it in alloc_p2m_page() */
+    BUG_ON((pg->count_info & PGC_count_mask) != 1);
+    pg->count_info = 0;
+    page_set_owner(pg, NULL);
+    hap_free(d, page_to_mfn(pg));
+    d->arch.paging.hap.total_pages++;
+    d->arch.paging.hap.p2m_pages--;
+    ASSERT(d->arch.paging.hap.p2m_pages >= 0);
+    hap_unlock(d);
+}
+
 /* Return the size of the pool, rounded up to the nearest MB */
 static unsigned int
 hap_get_allocation(struct domain *d)
@@ -567,6 +591,7 @@ void hap_domain_init(struct domain *d)
 int hap_enable(struct domain *d, u32 mode)
 {
     unsigned int old_pages;
+    uint8_t i;
     int rv = 0;
     uint32_t oldmode;
 
@@ -606,6 +631,13 @@ int hap_enable(struct domain *d, u32 mod
             goto out;
     }
 
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        rv = p2m_alloc_table(d->arch.nested_p2m[i],
+            nestedp2m_alloc_p2m_page, nestedp2m_free_p2m_page);
+        if ( rv != 0 )
+           goto out;
+    }
+
  out:
     if (rv)
         d->arch.paging.mode = oldmode;
@@ -615,6 +647,13 @@ int hap_enable(struct domain *d, u32 mod
 
 void hap_final_teardown(struct domain *d)
 {
+    uint8_t i;
+
+    /* Destroy nestedp2m's first */
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m_teardown(d->arch.nested_p2m[i]);
+    }
+
     if ( d->arch.paging.hap.total_pages != 0 )
         hap_teardown(d);
 
@@ -638,7 +677,7 @@ void hap_teardown(struct domain *d)
         /* release the monitor table held by each vcpu */
         for_each_vcpu ( d, v )
         {
-            if ( v->arch.paging.mode && paging_mode_external(d) )
+            if ( paging_get_hostmode(v) && paging_mode_external(d) )
             {
                 mfn = pagetable_get_mfn(v->arch.monitor_table);
                 if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
@@ -706,6 +745,7 @@ static const struct paging_mode hap_pagi
 void hap_vcpu_init(struct vcpu *v)
 {
     v->arch.paging.mode = &hap_paging_real_mode;
+    v->arch.paging.nestedmode = &hap_paging_real_mode;
 }
 
 /************************************************/
@@ -732,6 +772,15 @@ static int hap_page_fault(struct vcpu *v
  */
 static int hap_invlpg(struct vcpu *v, unsigned long va)
 {
+    if (nestedhvm_enabled(v->domain)) {
+        /* Emulate INVLPGA:
+         * Must perform the flush right now, otherwise another vcpu
+         * may use the stale nested p2m before the next VMRUN emulation.
+         */
+        p2m_flush(v, VCPU_NESTEDHVM(v).nh_p2m);
+        return 0;
+    }
+
     HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n",
               v->domain->domain_id, v->vcpu_id);
     domain_crash(v->domain);
@@ -744,17 +793,22 @@ static void hap_update_cr3(struct vcpu *
     hvm_update_guest_cr(v, 3);
 }
 
+const struct paging_mode *
+hap_paging_get_mode(struct vcpu *v)
+{
+    return !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
+        hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
+        hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
+                                   &hap_paging_protected_mode;
+}
+
 static void hap_update_paging_modes(struct vcpu *v)
 {
     struct domain *d = v->domain;
 
     hap_lock(d);
 
-    v->arch.paging.mode =
-        !hvm_paging_enabled(v)   ? &hap_paging_real_mode :
-        hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
-        hvm_pae_enabled(v)       ? &hap_paging_pae_mode  :
-                                   &hap_paging_protected_mode;
+    v->arch.paging.mode = hap_paging_get_mode(v);
 
     if ( pagetable_is_null(v->arch.monitor_table) )
     {
@@ -842,11 +896,20 @@ static unsigned long hap_gva_to_gfn_real
     return ((paddr_t)gva >> PAGE_SHIFT);
 }
 
+static unsigned long hap_p2m_ga_to_gfn_real_mode(
+    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec)
+{
+    return (ga >> PAGE_SHIFT);
+}
+
+
 /* Entry points into this mode of the hap code. */
 static const struct paging_mode hap_paging_real_mode = {
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_real_mode,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_real_mode,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -857,6 +920,7 @@ static const struct paging_mode hap_pagi
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_2_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_2_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -867,6 +931,7 @@ static const struct paging_mode hap_pagi
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_3_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_3_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
@@ -877,6 +942,7 @@ static const struct paging_mode hap_pagi
     .page_fault             = hap_page_fault,
     .invlpg                 = hap_invlpg,
     .gva_to_gfn             = hap_gva_to_gfn_4_levels,
+    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_4_levels,
     .update_cr3             = hap_update_cr3,
     .update_paging_modes    = hap_update_paging_modes,
     .write_p2m_entry        = hap_write_p2m_entry,
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/mm/hap/nested_hap.c
--- /dev/null
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -0,0 +1,421 @@
+/******************************************************************************
+ * arch/x86/mm/hap/nested_hap.c
+ *
+ * Code for Nested Virtualization
+ * Copyright (c) 2010 Advanced Micro Devices
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/support.h>
+
+#include <asm/hvm/nestedhvm.h>
+
+#include "private.h"
+
+/* ALGORITHM for NESTED PAGE FAULT
+ * 
+ * NOTATION
+ * Levels: L0, L1, L2
+ * Guests: L1 guest, L2 guest
+ * Hypervisor: L0 hypervisor
+ * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA
+ *
+ * On L0, when #NPF happens, the handler function should do:
+ * hap_page_fault(GPA)
+ * {
+ *    1. If #NPF is from L1 guest, then we crash the guest VM (same as old 
+ *       code)
+ *    2. If #NPF is from L2 guest, then we continue from (3)
+ *    3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space.
+ *    4. Walk the h_cr3 page table
+ *    5.    - if not present, then we inject #NPF back to L1 guest and 
+ *            re-launch L1 guest (L1 guest will either treat this #NPF as MMIO,
+ *            or fix its p2m table for L2 guest)
+ * 6.    - if present, then we will get a new translated value L1-GPA
+ *            (points to L1 machine memory)
+ *    7.        * Use L1-GPA to walk L0 P2M table
+ *    8.            - if not present, then crash the guest (should not happen)
+ *    9.            - if present, then we get a new translated value MPA 
+ *                    (points to real machine memory)
+ *   10.                * Finally, use L2-GPA and MPA to walk nested_p2m 
+ *                        and fix the bits.
+ * }
+ * 
+ */
+
+
+/********************************************/
+/*        NESTED VIRT P2M FUNCTIONS         */
+/********************************************/
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+static void
+nested_write_p2m_entry(struct p2m_domain *p2m,
+    l1_pgentry_t *p, l1_pgentry_t new)
+{
+    struct domain *d = p2m->domain;
+    uint32_t old_flags;
+
+    hap_lock(d);
+
+    old_flags = l1e_get_flags(*p);
+    safe_write_pte(p, new);
+    
+    hap_unlock(d);
+}
+
+static int 
+nestedp2m_next_level(struct p2m_domain *p2m, struct page_info **table_pg, 
+                     void **table, unsigned long *gfn_remainder, 
+                     unsigned long gfn, uint32_t shift, uint32_t max, 
+                     unsigned long type)
+{
+    l1_pgentry_t *l1_entry;
+    l1_pgentry_t *p2m_entry;
+    l1_pgentry_t new_entry;
+    void *next;
+    int i;
+
+    ASSERT(p2m);
+    ASSERT(p2m->alloc_page);
+
+    if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn, shift, max)) )
+        return 0;
+
+    if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
+    {
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, type);
+        if ( pg == NULL )
+            return 0;
+
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR | _PAGE_USER);
+
+        switch ( type ) {
+        case PGT_l3_page_table:
+            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
+            break;
+        case PGT_l2_page_table:
+#if CONFIG_PAGING_LEVELS == 3
+            /* for PAE mode, PDPE only has PCD/PWT/P bits available */
+            new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
+#endif
+            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
+            break;
+        case PGT_l1_page_table:
+            nested_write_p2m_entry(p2m, p2m_entry, new_entry);
+            break;
+        default:
+            BUG();
+            break;
+        }
+    }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));
+
+    /* split single large page into 4KB page in P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg;
+
+        pg = p2m_alloc_ptp(p2m, PGT_l1_page_table);
+        if ( pg == NULL )
+            return 0;
+
+        /* New splintered mappings inherit the flags of the old superpage, 
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = __map_domain_page(pg);
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            nested_write_p2m_entry(p2m, l1_entry+i, new_entry);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        nested_write_p2m_entry(p2m, p2m_entry, new_entry);
+    }
+
+    *table_pg = l1e_get_page(*p2m_entry);
+    next = __map_domain_page(*table_pg);
+    unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+int 
+nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
+                    unsigned int page_order, p2m_type_t p2mt);
+
+int 
+nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
+                    unsigned int page_order, p2m_type_t p2mt)
+{
+    struct page_info *table_pg;
+    void *table;
+    unsigned long gfn_remainder = gfn;
+    l1_pgentry_t *p2m_entry;
+    l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
+    int rv = 0;
+    
+    ASSERT(p2m);
+    ASSERT(p2m->alloc_page);
+
+    /* address of nested paging table */
+    table_pg = pagetable_get_page(p2m_get_pagetable(p2m));
+    table = __map_domain_page(table_pg);
+
+#if CONFIG_PAGING_LEVELS >= 4
+    if ( !nestedp2m_next_level(p2m, &table_pg, &table, 
+                               &gfn_remainder, gfn, 
+                               L4_PAGETABLE_SHIFT - PAGE_SHIFT,
+                               L4_PAGETABLE_ENTRIES, PGT_l3_page_table) )
+        goto out;
+#endif
+
+    if ( !nestedp2m_next_level(p2m, &table_pg, &table, &gfn_remainder, 
+                               gfn, L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+                               ((CONFIG_PAGING_LEVELS == 3)
+                                ? (paging_mode_hap(p2m->domain) ? 4 : 8)
+                                : L3_PAGETABLE_ENTRIES),
+                               PGT_l2_page_table) )
+        goto out;
+
+    if ( page_order == 0 )
+    {
+        if ( !nestedp2m_next_level(p2m, &table_pg, &table, 
+                                   &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) ) {
+            entry_content = l1e_from_pfn(mfn_x(mfn), 
+                                         p2m_type_to_flags(p2mt));
+        } else {
+            entry_content = l1e_empty();
+        }
+        
+        /* level 1 entry */
+        nested_write_p2m_entry(p2m, p2m_entry, entry_content);
+    }
+    else 
+    {
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        /* FIXME: Deal with 4k replaced by 2MB pages */
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            domain_crash(p2m->domain);
+            goto out;
+        }
+        
+        if ( mfn_valid(mfn) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                p2m_type_to_flags(p2mt) | _PAGE_PSE);
+        else {
+            l2e_content = l2e_empty();
+	}
+        
+        entry_content.l1 = l2e_content.l2;
+        nested_write_p2m_entry(p2m, p2m_entry, entry_content);
+    }
+
+    /* Track the highest gfn for which we have ever had a valid mapping */
+    if ( mfn_valid(mfn)
+         && (gfn + (1UL << page_order) - 1 > p2m->max_mapped_pfn) )
+        p2m->max_mapped_pfn = gfn + (1UL << page_order) - 1;
+
+    /* Success */
+    rv = 1;
+
+out:
+    unmap_domain_page(table);
+    return rv;
+}
+
+/********************************************/
+/*          NESTED VIRT FUNCTIONS           */
+/********************************************/
+static void
+nestedhap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa)
+{
+    int rv;
+    ASSERT(p2m);
+    ASSERT(p2m->alloc_page);
+    ASSERT(p2m->set_entry);
+
+    rv = p2m->set_entry(p2m, L2_gpa >> PAGE_SHIFT,
+                         page_to_mfn(maddr_to_page(L0_gpa)),
+                         0 /*4K*/, p2m_ram_rw);
+    if (rv == 0) {
+        gdprintk(XENLOG_ERR,
+		"failed to set entry for 0x%"PRIx64" -> 0x%"PRIx64"\n",
+		L2_gpa, L0_gpa);
+        BUG();
+    }
+}
+
+/* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the
+ * walk is successful, the translated value is returned in L0_gpa. The return 
+ * value tells the upper level what to do.
+ */
+static int
+nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa)
+{
+    mfn_t mfn;
+    p2m_type_t p2mt;
+
+    /* we use gfn_to_mfn_query() function to walk L0 P2M table */
+    mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt);
+
+    if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) )
+        return NESTEDHVM_PAGEFAULT_ERROR;
+
+    if ( !mfn_valid(mfn) )
+        return NESTEDHVM_PAGEFAULT_ERROR;
+
+    *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/* This function uses L2_gpa to walk the P2M page table in L1. If the 
+ * walk is successful, the translated value is returned in
+ * L1_gpa. The return value tells the caller what to do next.
+ */
+static int
+nestedhap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m,
+                       paddr_t L2_gpa, paddr_t *L1_gpa)
+{
+    uint32_t pfec;
+    unsigned long nested_cr3, gfn;
+    const struct paging_mode *mode = paging_get_hostmode(v);
+    
+    nested_cr3 = VCPU_NESTEDHVM(v).nh_vmcb_hcr3;
+
+    /* walk the guest table */
+    gfn = paging_p2m_ga_to_gfn(v, p2m, mode, nested_cr3, L2_gpa, &pfec);
+
+    if ( gfn == INVALID_GFN ) 
+        return NESTEDHVM_PAGEFAULT_INJECT;
+
+    *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/*
+ * The following function, nestedhvm_hap_nested_page_fault(), performs
+ * steps (3)--(10) of the algorithm above.
+ *
+ * Returns NESTEDHVM_PAGEFAULT_DONE on success, NESTEDHVM_PAGEFAULT_INJECT
+ * if the fault must be forwarded to the L1 guest, or NESTEDHVM_PAGEFAULT_ERROR.
+ */
+int
+nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
+{
+    int rv;
+    paddr_t L1_gpa, L0_gpa;
+    struct domain *d = v->domain;
+    struct p2m_domain *p2m, *nested_p2m;
+
+    p2m = p2m_get_hostp2m(d); /* L0 p2m */
+    nested_p2m = p2m_get_nestedp2m(v, VCPU_NESTEDHVM(v).nh_vmcb_hcr3);
+
+    /* Walk the L1 P2M table. Note we have to pass p2m and not
+     * nested_p2m here, otherwise the walk fails forever. */
+    rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
+
+    /* let the caller handle these two cases */
+    switch (rv) {
+    case NESTEDHVM_PAGEFAULT_INJECT:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_ERROR:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_DONE:
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    /* ==> we have to walk L0 P2M */
+    rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
+
+    /* let the upper-level caller handle these two cases */
+    switch (rv) {
+    case NESTEDHVM_PAGEFAULT_INJECT:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_ERROR:
+        return rv;
+    case NESTEDHVM_PAGEFAULT_DONE:
+        break;
+    default:
+        BUG();
+        break;
+    }
+
+    /* fix p2m_get_pagetable(nested_p2m) */
+    nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa);
+
+    return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/********************************************/
+/*     NESTED VIRT INITIALIZATION FUNCS     */
+/********************************************/
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/mm/hap/private.h
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -30,4 +30,14 @@ unsigned long hap_gva_to_gfn_3_levels(st
 unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
                                      uint32_t *pfec);
 
+unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
+    struct p2m_domain *p2m, unsigned long cr3,
+    paddr_t ga, uint32_t *pfec);
+
 #endif /* __HAP_PRIVATE_H__ */
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
 #include <public/mem_event.h>
 #include <asm/mem_sharing.h>
 #include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
 
 /* Debugging and auditing of the P2M code? */
 #define P2M_AUDIT     0
@@ -72,7 +73,7 @@ boolean_param("hap_1gb", opt_hap_1gb);
 #define SUPERPAGE_PAGES (1UL << 9)
 #define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)
 
-static unsigned long p2m_type_to_flags(p2m_type_t t) 
+unsigned long p2m_type_to_flags(p2m_type_t t) 
 {
     unsigned long flags;
 #ifdef __x86_64__
@@ -116,9 +117,9 @@ static void audit_p2m(struct p2m_domain 
 // Find the next level's P2M entry, checking for out-of-range gfn's...
 // Returns NULL on error.
 //
-static l1_pgentry_t *
+l1_pgentry_t *
 p2m_find_entry(void *table, unsigned long *gfn_remainder,
-                   unsigned long gfn, u32 shift, u32 max)
+                   unsigned long gfn, uint32_t shift, uint32_t max)
 {
     u32 index;
 
@@ -1719,6 +1720,7 @@ static void p2m_initialise(struct domain
     INIT_PAGE_LIST_HEAD(&p2m->pod.single);
 
     p2m->domain = d;
+    p2m->cr3 = 0;
     p2m->set_entry = p2m_set_entry;
     p2m->get_entry = p2m_gfn_to_mfn;
     p2m->get_entry_current = p2m_gfn_to_mfn_current;
@@ -1730,6 +1732,28 @@ static void p2m_initialise(struct domain
     return;
 }
 
+extern int
+nestedp2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
+                    unsigned int page_order, p2m_type_t p2mt);
+ 
+static int
+p2m_init_nestedp2m(struct domain *d)
+{
+    uint8_t i;
+    struct p2m_domain *p2m;
+
+    spin_lock_init(&d->arch.nested_p2m_lock);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
+        if (p2m == NULL)
+            return -ENOMEM;
+        p2m_initialise(d, p2m);
+        p2m->set_entry = nestedp2m_set_entry;
+    }
+
+    return 0;
+}
+
 int p2m_init(struct domain *d)
 {
     struct p2m_domain *p2m;
@@ -1739,7 +1763,11 @@ int p2m_init(struct domain *d)
         return -ENOMEM;
     p2m_initialise(d, p2m);
 
-    return 0;
+    /* Must initialise nestedp2m unconditionally
+     * since nestedhvm_enabled(d) returns false here.
+     * (p2m_init runs too early for HVM_PARAM_* options)
+     */
+    return p2m_init_nestedp2m(d);
 }
 
 void p2m_change_entry_type_global(struct p2m_domain *p2m,
@@ -1836,6 +1864,9 @@ int p2m_alloc_table(struct p2m_domain *p
                         p2m_invalid) )
         goto error;
 
+    if (p2m_is_nestedp2m(p2m))
+        goto nesteddone;
+
     /* Copy all existing mappings from the page list and m2p */
     spin_lock(&p2m->domain->page_alloc_lock);
     page_list_for_each(page, &p2m->domain->page_list)
@@ -1857,6 +1888,7 @@ int p2m_alloc_table(struct p2m_domain *p
     }
     spin_unlock(&p2m->domain->page_alloc_lock);
 
+ nesteddone:
     P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
     p2m_unlock(p2m);
     return 0;
@@ -1881,6 +1913,9 @@ void p2m_teardown(struct p2m_domain *p2m
     mfn_t mfn;
 #endif
 
+    if (p2m == NULL)
+        return;
+
     p2m_lock(p2m);
 
 #ifdef __x86_64__
@@ -1899,11 +1934,26 @@ void p2m_teardown(struct p2m_domain *p2m
     p2m_unlock(p2m);
 }
 
+static void p2m_teardown_nestedp2m(struct domain *d)
+{
+    uint8_t i;
+
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        xfree(d->arch.nested_p2m[i]);
+        d->arch.nested_p2m[i] = NULL;
+    }
+}
+
 void p2m_final_teardown(struct domain *d)
 {
     /* Iterate over all p2m tables per domain */
     xfree(d->arch.p2m);
     d->arch.p2m = NULL;
+
+    /* We must tear them down unconditionally because
+     * we initialise them unconditionally.
+     */
+    p2m_teardown_nestedp2m(d);
 }
 
 #if P2M_AUDIT
@@ -2821,6 +2871,159 @@ void p2m_mem_paging_resume(struct p2m_do
 }
 #endif /* __x86_64__ */
 
+static struct p2m_domain *
+p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
+{
+    int i, lru_index = -1;
+    struct p2m_domain *lrup2m, *tmp;
+
+    if (p2m == NULL) {
+        lru_index = MAX_NESTEDP2M - 1;
+        lrup2m = d->arch.nested_p2m[lru_index];
+    } else {
+        lrup2m = p2m;
+        for (i = 0; i < MAX_NESTEDP2M; i++) {
+            if (d->arch.nested_p2m[i] == p2m) {
+                lru_index = i;
+                break;
+            }
+        }
+    }
+
+    ASSERT(lru_index >= 0);
+    if (lru_index == 0) {
+        return lrup2m;
+    }
+
+    /* move the others down the array "list" */
+    for (i = lru_index - 1; i >= 0; i--) {
+        tmp = d->arch.nested_p2m[i];
+        d->arch.nested_p2m[i+1] = tmp;        
+    }
+
+    /* make the entry the first one */
+    d->arch.nested_p2m[0] = lrup2m;
+
+    return lrup2m;
+}
+
+static int 
+p2m_flush_locked(struct p2m_domain *p2m)
+{
+    struct page_info * (*alloc)(struct p2m_domain *);
+    void (*free)(struct p2m_domain *, struct page_info *);
+
+    alloc = p2m->alloc_page;
+    free = p2m->free_page;
+
+    p2m_teardown(p2m);
+    p2m_initialise(p2m->domain, p2m);
+    p2m->set_entry = nestedp2m_set_entry;
+    BUG_ON(p2m_alloc_table(p2m, alloc, free) != 0);
+
+    ASSERT(p2m);
+    ASSERT(p2m->alloc_page);
+    return 0;
+}
+
+void
+p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
+{
+    struct domain *d = p2m->domain;
+
+    ASSERT(v->domain == d);
+    VCPU_NESTEDHVM(v).nh_p2m = NULL;
+    spin_lock(&d->arch.nested_p2m_lock);
+    BUG_ON(p2m_flush_locked(p2m) != 0);
+    hvm_asid_flush_vcpu(v);
+    spin_unlock(&d->arch.nested_p2m_lock);
+}
+
+void
+p2m_flush_nestedp2m(struct domain *d)
+{
+    int i;
+
+    spin_lock(&d->arch.nested_p2m_lock);
+    for (i = 0; i < MAX_NESTEDP2M; i++)
+        BUG_ON(p2m_flush_locked(d->arch.nested_p2m[i]) != 0);
+    spin_unlock(&d->arch.nested_p2m_lock);
+}
+
+struct p2m_domain *
+p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
+{
+    struct nestedhvm *hvm = &VCPU_NESTEDHVM(v);
+    struct domain *d;
+    struct p2m_domain *p2m;
+    int i, rv;
+
+    if (cr3 == 0)
+        cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+    if (hvm->nh_flushp2m)
+        hvm->nh_p2m = NULL;
+
+    d = v->domain;
+    spin_lock(&d->arch.nested_p2m_lock);
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m = d->arch.nested_p2m[i];
+        if (p2m->cr3 == cr3 && p2m == hvm->nh_p2m) {
+            p2m_getlru_nestedp2m(d, p2m);
+            if (hvm->nh_flushp2m) {
+               hvm_asid_flush_vcpu(v);
+               BUG_ON(p2m_flush_locked(p2m) != 0);
+            }
+            p2m->cr3 = cr3;
+            spin_unlock(&d->arch.nested_p2m_lock);
+            return p2m;
+        }
+        if (p2m->cr3 == 0) { /* found unused p2m table */
+            p2m_getlru_nestedp2m(d, p2m);
+            hvm->nh_p2m = p2m;
+            p2m->cr3 = cr3;
+            spin_unlock(&d->arch.nested_p2m_lock);
+            return p2m;
+        }
+    }
+
+    /* All p2m's are or were in use. We know the least recently used one.
+     * Destroy and re-initialize it.
+     */
+    for (i = 0; i < MAX_NESTEDP2M; i++) {
+        p2m = p2m_getlru_nestedp2m(d, NULL);
+        rv = p2m_flush_locked(p2m);
+        if (rv == 0)
+            break;
+    }
+    hvm_asid_flush_vcpu(v);
+    hvm->nh_p2m = p2m;
+    p2m->cr3 = cr3;
+    spin_unlock(&d->arch.nested_p2m_lock);
+
+    return p2m;
+}
+
+struct p2m_domain *
+p2m_get_p2m(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+
+    if (!nestedhvm_enabled(d))
+        return p2m_get_hostp2m(d);
+
+    if (nestedhvm_vmentry_emulate(v))
+        return p2m_get_hostp2m(d);
+
+    if (!nestedhvm_paging_mode_hap(v))
+        return p2m_get_hostp2m(d);
+
+    if (nestedhvm_vcpu_in_guestmode(v))
+        return p2m_get_nestedp2m(v, VCPU_NESTEDHVM(v).nh_vmcb_hcr3);
+   
+    return p2m_get_hostp2m(d);
+}
+
 /*
  * Local variables:
  * mode: C
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -26,6 +26,7 @@
 #include <asm/p2m.h>
 #include <asm/hap.h>
 #include <asm/guest_access.h>
+#include <asm/hvm/nestedhvm.h>
 #include <xen/numa.h>
 #include <xsm/xsm.h>
 
@@ -805,21 +806,52 @@ void paging_dump_vcpu_info(struct vcpu *
         printk("    paging assistance: ");
         if ( paging_mode_shadow(v->domain) )
         {
-            if ( v->arch.paging.mode )
+            if ( paging_get_hostmode(v) )
                 printk("shadowed %u-on-%u\n",
-                       v->arch.paging.mode->guest_levels,
-                       v->arch.paging.mode->shadow.shadow_levels);
+                       paging_get_hostmode(v)->guest_levels,
+                       paging_get_hostmode(v)->shadow.shadow_levels);
             else
                 printk("not shadowed\n");
         }
-        else if ( paging_mode_hap(v->domain) && v->arch.paging.mode )
+        else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) )
             printk("hap, %u levels\n",
-                   v->arch.paging.mode->guest_levels);
+                   paging_get_hostmode(v)->guest_levels);
         else
             printk("none\n");
     }
 }
 
+const struct paging_mode *paging_get_mode(struct vcpu *v)
+{
+    struct domain *d = v->domain;
+
+    if (!nestedhvm_enabled(d))
+        return paging_get_hostmode(v);
+
+    if (nestedhvm_vmentry_emulate(v))
+        return paging_get_hostmode(v);
+
+    if (!nestedhvm_paging_mode_hap(v))
+        return paging_get_hostmode(v);
+
+    if (nestedhvm_vcpu_in_guestmode(v))
+        return paging_get_nestedmode(v);
+
+    return paging_get_hostmode(v);
+}
+
+extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+
+void paging_update_nestedmode(struct vcpu *v)
+{
+    ASSERT(nestedhvm_enabled(v->domain));
+    if (nestedhvm_paging_mode_hap(v))
+        /* nested-on-nested */
+        v->arch.paging.nestedmode = hap_paging_get_mode(v);
+    else
+        /* TODO: shadow-on-shadow */
+        v->arch.paging.nestedmode = NULL;
+}
 
 /*
  * Local variables:
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -207,6 +207,8 @@ struct paging_domain {
 struct paging_vcpu {
     /* Pointers to mode-specific entry points. */
     const struct paging_mode *mode;
+    /* Nested Virtualization: paging mode of nested guest */
+    const struct paging_mode *nestedmode;
     /* HVM guest: last emulate was to a pagetable */
     unsigned int last_write_was_pt:1;
     /* HVM guest: last write emulation succeeds */
@@ -222,6 +224,7 @@ struct paging_vcpu {
 #define MAX_CPUID_INPUT 40
 typedef xen_domctl_cpuid_t cpuid_input_t;
 
+#define MAX_NESTEDP2M 10
 struct p2m_domain;
 struct time_scale {
     int shift;
@@ -255,6 +258,10 @@ struct arch_domain
     struct paging_domain paging;
     struct p2m_domain *p2m;
 
+    /* nestedhvm: translate l2 guest physical to host physical */
+    struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
+    spinlock_t nested_p2m_lock;
+
     /* NB. protected by d->event_lock and by irq_desc[irq].lock */
     int *irq_pirq;
     int *pirq_irq;
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -371,7 +371,7 @@ static inline void hvm_set_info_guest(st
 
 int hvm_debug_op(struct vcpu *v, int32_t op);
 
-bool_t hvm_hap_nested_page_fault(unsigned long gfn);
+bool_t hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs);
 
 #define hvm_msr_tsc_aux(v) ({                                               \
     struct domain *__d = (v)->domain;                                       \
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -173,6 +173,7 @@ struct p2m_domain {
     pagetable_t        phys_table;
 
     struct domain     *domain;   /* back pointer to domain */
+    uint64_t           cr3;      /* to identify this p2m for re-use */
 
     /* Pages used to construct the p2m */
     struct page_list_head pages;
@@ -229,8 +230,26 @@ struct p2m_domain {
 /* get host p2m table */
 #define p2m_get_hostp2m(d)      ((d)->arch.p2m)
 
+/* Get p2m table (re)usable for specified cr3.
+ * Automatically destroys and re-initializes a p2m if none found.
+ * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used.
+ */
+struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3);
+
+/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m().
+ * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m().
+ */
+struct p2m_domain *p2m_get_p2m(struct vcpu *v);
+
+#define p2m_is_nestedp2m(p2m)   ((p2m) != p2m_get_hostp2m((p2m->domain)))
+
 #define p2m_get_pagetable(p2m)  ((p2m)->phys_table)
 
+/* Flushes specified p2m table */
+void p2m_flush(struct vcpu *v, struct p2m_domain *p2m);
+/* Flushes all nested p2m tables */
+void p2m_flush_nestedp2m(struct domain *d);
+
 /*
  * The P2M lock.  This protects all updates to the p2m table.
  * Updates are expected to be safe against concurrent reads,
@@ -372,6 +391,9 @@ static inline unsigned long mfn_to_gfn(s
 /* Init the datastructures for later use by the p2m code */
 int p2m_init(struct domain *d);
 
+/* PTE flags for various types of p2m entry */
+unsigned long p2m_type_to_flags(p2m_type_t t);
+
 /* Allocate a new p2m table for a domain. 
  *
  * The alloc_page and free_page functions will be used to get memory to
@@ -382,6 +404,13 @@ int p2m_alloc_table(struct p2m_domain *p
                struct page_info * (*alloc_page)(struct p2m_domain *p2m),
                void (*free_page)(struct p2m_domain *p2m, struct page_info *pg));
 
+/* Find the next level's P2M entry, checking for out-of-range gfn's...
+ * Returns NULL on error.
+ */
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+               unsigned long gfn, uint32_t shift, uint32_t max);
+
 /* Return all the p2m resources to Xen. */
 void p2m_teardown(struct p2m_domain *p2m);
 void p2m_final_teardown(struct domain *d);
diff -r b0e3fea1e01c -r 4cae90a3ea9c xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -110,6 +110,10 @@ struct paging_mode {
     int           (*invlpg                )(struct vcpu *v, unsigned long va);
     unsigned long (*gva_to_gfn            )(struct vcpu *v, unsigned long va,
                                             uint32_t *pfec);
+    unsigned long (*p2m_ga_to_gfn         )(struct vcpu *v,
+                                            struct p2m_domain *p2m,
+                                            unsigned long cr3,
+                                            paddr_t ga, uint32_t *pfec);
     void          (*update_cr3            )(struct vcpu *v, int do_locking);
     void          (*update_paging_modes   )(struct vcpu *v);
     void          (*write_p2m_entry       )(struct vcpu *v, unsigned long gfn,
@@ -219,6 +223,10 @@ void paging_final_teardown(struct domain
  * creation. */
 int paging_enable(struct domain *d, u32 mode);
 
+#define paging_get_hostmode(v)		((v)->arch.paging.mode)
+#define paging_get_nestedmode(v)	((v)->arch.paging.nestedmode)
+const struct paging_mode *paging_get_mode(struct vcpu *v);
+void paging_update_nestedmode(struct vcpu *v);
 
 /* Page fault handler
  * Called from pagefault handler in Xen, and from the HVM trap handlers
@@ -233,7 +241,7 @@ static inline int
 paging_fault(unsigned long va, struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
-    return v->arch.paging.mode->page_fault(v, va, regs);
+    return paging_get_hostmode(v)->page_fault(v, va, regs);
 }
 
 /* Handle invlpg requests on vcpus.
@@ -241,7 +249,7 @@ paging_fault(unsigned long va, struct cp
  * or 0 if it's safe not to do so. */
 static inline int paging_invlpg(struct vcpu *v, unsigned long va)
 {
-    return v->arch.paging.mode->invlpg(v, va);
+    return paging_get_hostmode(v)->invlpg(v, va);
 }
 
 /* Translate a guest virtual address to the frame number that the
@@ -255,7 +263,29 @@ static inline unsigned long paging_gva_t
                                               unsigned long va,
                                               uint32_t *pfec)
 {
-    return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
+    return paging_get_hostmode(v)->gva_to_gfn(v, va, pfec);
+}
+
+/* Translates a guest virtual address to guest physical address
+ * where the specified cr3 is translated to host physical address
+ * using the specified p2m table.
+ * This allows page walks to be done in the guest or even in the
+ * nested guest; it returns the guest's or the nested guest's gfn.
+ * Use 'paddr_t' for the guest address so it won't overflow when the
+ * guest or nested guest is in 32bit PAE mode.
+ */
+static inline unsigned long paging_p2m_ga_to_gfn(struct vcpu *v,
+                                                 struct p2m_domain *p2m,
+                                                 const struct paging_mode *mode,
+                                                 unsigned long cr3,
+                                                 paddr_t ga,
+                                                 uint32_t *pfec)
+{
+    if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) )
+        return mode->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec);
+
+    /* shadow paging */
+    return paging_gva_to_gfn(v, ga, pfec);
 }
 
 /* Update all the things that are derived from the guest's CR3.
@@ -263,7 +293,7 @@ static inline unsigned long paging_gva_t
  * as the value to load into the host CR3 to schedule this vcpu */
 static inline void paging_update_cr3(struct vcpu *v)
 {
-    v->arch.paging.mode->update_cr3(v, 1);
+    paging_get_hostmode(v)->update_cr3(v, 1);
 }
 
 /* Update all the things that are derived from the guest's CR0/CR3/CR4.
@@ -271,7 +301,7 @@ static inline void paging_update_cr3(str
  * has changed, and when bringing up a VCPU for the first time. */
 static inline void paging_update_paging_modes(struct vcpu *v)
 {
-    v->arch.paging.mode->update_paging_modes(v);
+    paging_get_hostmode(v)->update_paging_modes(v);
 }
 
 
@@ -283,7 +313,7 @@ static inline int paging_write_guest_ent
 {
     if ( unlikely(paging_mode_enabled(v->domain) 
                   && v->arch.paging.mode != NULL) )
-        return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn);
+        return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn);
     else 
         return (!__copy_to_user(p, &new, sizeof(new)));
 }
@@ -299,7 +329,7 @@ static inline int paging_cmpxchg_guest_e
 {
     if ( unlikely(paging_mode_enabled(v->domain) 
                   && v->arch.paging.mode != NULL) )
-        return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn);
+        return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn);
     else 
         return (!cmpxchg_user(p, *old, new));
 }
@@ -334,10 +364,10 @@ static inline void paging_write_p2m_entr
     struct vcpu *v = current;
     if ( v->domain != d )
         v = d->vcpu ? d->vcpu[0] : NULL;
-    if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) )
+    if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) )
     {
-        return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn,
-                                                    new, level);
+        return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
+                                                       new, level);
     }
     else 
         safe_write_pte(p, new);
@@ -362,7 +392,7 @@ guest_map_l1e(struct vcpu *v, unsigned l
     l2_pgentry_t l2e;
 
     if ( unlikely(paging_mode_translate(v->domain)) )
-        return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn);
+        return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);
 
     /* Find this l1e and its enclosing l1mfn in the linear map */
     if ( __copy_from_user(&l2e, 
@@ -398,7 +428,7 @@ guest_get_eff_l1e(struct vcpu *v, unsign
         return;
     }
         
-    v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e);
+    paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e);
 }
 
 /* Read the guest's l1e that maps this address, from the kernel-mode

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel


* Re: [PATCH 14/14] Nested Virtualization: hap-on-hap
  2010-08-05 15:05 [PATCH 14/14] Nested Virtualization: hap-on-hap Christoph Egger
@ 2010-08-09 13:18 ` Tim Deegan
  2010-08-19 15:55   ` Christoph Egger
  0 siblings, 1 reply; 5+ messages in thread
From: Tim Deegan @ 2010-08-09 13:18 UTC (permalink / raw)
  To: Christoph Egger; +Cc: xen-devel@lists.xensource.com

Hi, 

This looks a lot nicer than the last version I reviewed.  I'm still
concerned about TLB and p2m flushes, though.

- I can't see how writes to the 'host' p2m table cause the 'shadow' p2m
  tables to be flushed.  I might just have missed it. 
- The p2m_flush operations don't look safe against other vcpus.  Mostly
  they're called with v==current, which looks OK, but what if two vcpus
  are running on the same p2m?  Also when p2m_get_nestedp2m() flushes
  the domain's LRU p2m, there's no shootdown if that p2m is in use on
  another pcpu.  That could happen if the VM has more vcpus than
  MAX_NESTEDP2M.  (Actually, that case is probably pretty broken
  generally.)

Cheers,

Tim.



-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)


* Re: [PATCH 14/14] Nested Virtualization: hap-on-hap
  2010-08-09 13:18 ` Tim Deegan
@ 2010-08-19 15:55   ` Christoph Egger
  2010-08-19 16:33     ` Tim Deegan
  0 siblings, 1 reply; 5+ messages in thread
From: Christoph Egger @ 2010-08-19 15:55 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel@lists.xensource.com

On Monday 09 August 2010 15:18:22 Tim Deegan wrote:
> Hi,
>
> This looks a lot nicer than the last version I reviewed.  I'm still
> concerned about TLB and p2m flushes, though.
>
> - I can't see how writes to the 'host' p2m table cause the 'shadow' p2m
>   tables to be flushed.  I might just have missed it.

The 'shadow' p2m is flushed when (see the sketch after this list):
- the l1 guest runs an instruction like INVLPGA (e.g. Windows 7 does so)
- the l1 guest sets up a VMCB where
     * the tlb_control is set
     * the asid changed
     * the nested cr3 changed (and there is no free nestedp2m slot)
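
A rough sketch of how that check could look at VMRUN emulation time; the
names nh_flushp2m / nh_p2m / VCPU_NESTEDHVM and the VMCB fields are taken
from this patch series where possible, but the bookkeeping shown here (in
particular nh_guest_asid) is an assumption for illustration, not the
actual code:

    /* Hypothetical sketch: decide during VMRUN emulation whether the
     * cached nested p2m must be discarded before entering the l2 guest. */
    static void nsvm_check_nestedp2m_flush(struct vcpu *v,
                                           struct vmcb_struct *ns_vmcb)
    {
        struct nestedhvm *hvm = &VCPU_NESTEDHVM(v);

        if ( ns_vmcb->tlb_control != 0 )
            hvm->nh_flushp2m = 1;    /* l1 guest asked for a TLB flush */

        if ( ns_vmcb->guest_asid != hvm->nh_guest_asid )
            hvm->nh_flushp2m = 1;    /* ASID changed; nh_guest_asid is an
                                      * assumed field */

        /* A changed nested cr3 is handled by p2m_get_nestedp2m(), which
         * recycles (and thereby flushes) the LRU nested p2m when no free
         * slot is left. */
        if ( hvm->nh_p2m != NULL && ns_vmcb->h_cr3 != hvm->nh_p2m->cr3 )
            hvm->nh_p2m = NULL;
    }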

> - The p2m_flush operations don't look safe against other vcpus.  Mostly
>   they're called with v==current, which looks OK, but what if two vcpus
>   are running on the same p2m?  Also when p2m_get_nestedp2m() flushes
>   the domain's LRU p2m, there's no shootdown if that p2m is in use on
>   another pcpu.  That could happen if the VM has more vcpus than
>   MAX_NESTEDP2M.  (Actually, that case is probably pretty broken
>   generally.)

Yes, this is indeed an issue that needs to be fixed. How do I do
a TLB shootdown across physical cpus which schedule
vcpus bound to the l1 guest?
The physical cpu must leave the l1/l2 guest on the tlb shootdown.
An optimization is to limit the tlb shootdown to those physical cpus
where "their" vcpus are in guestmode if this is possible to implement.

Christoph


-- 
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632


* Re: [PATCH 14/14] Nested Virtualization: hap-on-hap
  2010-08-19 15:55   ` Christoph Egger
@ 2010-08-19 16:33     ` Tim Deegan
  2010-09-01 14:27       ` Christoph Egger
  0 siblings, 1 reply; 5+ messages in thread
From: Tim Deegan @ 2010-08-19 16:33 UTC (permalink / raw)
  To: Christoph Egger; +Cc: xen-devel@lists.xensource.com

At 16:55 +0100 on 19 Aug (1282236902), Christoph Egger wrote:
> On Monday 09 August 2010 15:18:22 Tim Deegan wrote:
> > - I can't see how writes to the 'host' p2m table cause the 'shadow' p2m
> >   tables to be flushed.  I might just have missed it.
> 
> The 'shadow' p2m is flushed when
> - the l1 guest runs an instruction like INVLPGA (e.g. Windows 7 does so)
> - the l1 guest sets up a VMCB where
>      * the tlb_control is set
>      * the asid changed
>      * the nested cr3 changed (and there is no free nestedp2m slot)

OK, so the case I'm worried about is: if the L1's p2m is changed (by
ballooning, or the mem-event code, or by a page being marked as broken) 
then the shadow p2ms need to be updated or discarded because they might
contain mappings made before the change.

The equivalent problem exists in the normal shadow pagetables, which is
why the p2m code has to use a callback (shadow_write_p2m_entry()) to
write its entries.  The shadow code writes the entry and removes all
offending shadow PTEs at the same time.  You'll need something
equivalent here for safety.

BTW, it won't be enough to just declare that these are unsupported
combinations - if a nested-mode guest can give up a page of memory that
its l2 guest still has mappings of then it breaks the basic memory
safety of Xen. 
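
For illustration only, a minimal sketch of such a hook, modelled on
shadow_write_p2m_entry(); the function below and its call to
p2m_flush_nestedp2m() are an assumption about how this could be wired up,
not something in the posted patch:

    /* Hypothetical sketch: route host p2m writes through a callback that
     * also discards the nested p2m tables, so stale L2 mappings cannot
     * survive a change to the L1 p2m.  A real implementation would want
     * to be more selective than dropping every nested p2m. */
    static void
    nestedhvm_host_write_p2m_entry(struct vcpu *v, unsigned long gfn,
                                   l1_pgentry_t *p, mfn_t table_mfn,
                                   l1_pgentry_t new, unsigned int level)
    {
        struct domain *d = v->domain;

        hap_lock(d);
        safe_write_pte(p, new);
        hap_unlock(d);

        if ( nestedhvm_enabled(d) )
            p2m_flush_nestedp2m(d);
    }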

> > - The p2m_flush operations don't look safe against other vcpus.  Mostly
> >   they're called with v==current, which looks OK, but what if two vcpus
> >   are running on the same p2m?  Also when p2m_get_nestedp2m() flushes
> >   the domain's LRU p2m, there's no shootdown if that p2m is in use on
> >   another pcpu.  That could happen if the VM has more vcpus than
> >   MAX_NESTEDP2M.  (Actually, that case is probably pretty broken
> >   generally.)
> 
> Yes, this is indeed an issue that needs to be fixed. How do I do
> a TLB shootdown across physical cpus which schedule
> > vcpus bound to the l1 guest?

You can call on_selected_cpus() with the vcpu's v->vcpu_dirty_cpumask
(remembering to take smp_processor_id() out of the mask first).
If all you need is a TLB shootdown you can call flush_tlb_mask().
That will cause a VMEXIT on the target CPUs, so if you make it so that
they can't VMENTER again with the old p2m settings, that might be all you
need.
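
A rough sketch of that, assuming the shootdown is done at the point where
a nested p2m is about to be flushed or recycled (old-style cpus_* helpers;
the exact flush_tlb_mask() calling convention depends on the tree):

    /* Hypothetical sketch: kick every other pcpu that may be running on
     * this nested p2m out of guest mode and flush its TLB. */
    static void nestedp2m_shootdown(struct domain *d, struct p2m_domain *p2m)
    {
        struct vcpu *v;
        cpumask_t mask;

        cpus_clear(mask);
        for_each_vcpu ( d, v )
            if ( VCPU_NESTEDHVM(v).nh_p2m == p2m )
                cpus_or(mask, mask, v->vcpu_dirty_cpumask);

        cpu_clear(smp_processor_id(), mask);
        if ( !cpus_empty(mask) )
            flush_tlb_mask(&mask);    /* forces a VMEXIT on the targets */
    }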

> The physical cpu must leave the l1/l2 guest on the tlb shootdown.
> An optimization is to limit the tlb shootdown to those physical cpus
> where "their" vcpus are in guestmode if this is possible to implement.

You'd have to add another cpumask to express that, but that's doable.

Cheers,

Tim.

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)


* Re: [PATCH 14/14] Nested Virtualization: hap-on-hap
  2010-08-19 16:33     ` Tim Deegan
@ 2010-09-01 14:27       ` Christoph Egger
  0 siblings, 0 replies; 5+ messages in thread
From: Christoph Egger @ 2010-09-01 14:27 UTC (permalink / raw)
  To: Tim Deegan; +Cc: xen-devel@lists.xensource.com

On Thursday 19 August 2010 18:33:16 Tim Deegan wrote:
> At 16:55 +0100 on 19 Aug (1282236902), Christoph Egger wrote:
> > On Monday 09 August 2010 15:18:22 Tim Deegan wrote:
> > > - I can't see how writes to the 'host' p2m table cause the 'shadow' p2m
> > >   tables to be flushed.  I might just have missed it.
> >
> > The 'shadow' p2m is flushed when
> > - the l1 guest runs an instruction like INVLPGA (e.g. Windows 7 does so)
> > - the l1 guest sets up a VMCB where
> >      * the tlb_control is set
> >      * the asid changed
> >      * the nested cr3 changed (and there is no free nestedp2m slot)
>
> OK, so the case I'm worried about is: if the L1's p2m is changed (by
> ballooning, or the mem-event code, or by a page being marked as broken)
> then the shadow p2ms need to be updated or discarded because they might
> contain mappings made before the change.
>
> The equivalent problem exists in the normal shadow pagetables, which is
> why the p2m code has to use a callback (shadow_write_p2m_entry()) to
> write its entries.  The shadow code writes the entry and removes all
> offending shadow PTEs at the same time.  You'll need something
> equivalent here for safety.
>
> BTW, it won't be enough to just declare that these are unsupported
> combinations - if a nested-mode guest can give up a page of memory that
> its l2 guest still has mappings of then it breaks the basic memory
> safety of Xen.

I see. 


> > > - The p2m_flush operations don't look safe against other vcpus.  Mostly
> > >   they're called with v==current, which looks OK, but what if two vcpus
> > >   are running on the same p2m?  Also when p2m_get_nestedp2m() flushes
> > >   the domain's LRU p2m, there's no shootdown if that p2m is in use on
> > >   another pcpu.  That could happen if the VM has more vcpus than
> > >   MAX_NESTEDP2M.  (Actually, that case is probably pretty broken
> > >   generally.)
> >
> > Yes, this is indeed an issue that needs to be fixed. How do I do
> > a TLB shootdown across physical cpus which schedule
> > vcpus bound to the l1 guest?
>
> You can call on_selected_cpus() with the vcpu's v->vcpu_dirty_cpumask
> (remembering to take smp_processor_id() out of the mask first).
> If all you need is a TLB shootdown you can call flush_tlb_mask().
> That will cause a VMEXIT on the target CPUs so if you make it so that
> they can't VMENTER again with the old p2m settings that might be all you
> need.
>
> > The physical cpu must leave the l1/l2 guest on the tlb shootdown.
> > An optimization is to limit the tlb shootdown to those physical cpus
> > where "their" vcpus are in guestmode if this is possible to implement.
>
> You'd have to add another cpumask to express that, but that's doable.


Thanks. I implemented the tlb shootdown per p2m and do it on those
physical cpus whose vcpus are in guestmode.
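
Roughly, the idea is something like this (the p2m_dirty_cpumask field and
the helper names here are just for illustration, not necessarily the exact
code):

    /* Sketch: remember which pcpus entered guest mode on a given nested
     * p2m and shoot down only those when it is flushed. */
    static void nestedp2m_enter(struct p2m_domain *p2m)
    {
        /* called from VMRUN emulation, before entering the l2 guest */
        cpu_set(smp_processor_id(), p2m->p2m_dirty_cpumask);
    }

    static void nestedp2m_flush_dirty(struct p2m_domain *p2m)
    {
        cpumask_t mask = p2m->p2m_dirty_cpumask;

        cpu_clear(smp_processor_id(), mask);
        if ( !cpus_empty(mask) )
            flush_tlb_mask(&mask);    /* forces the targets out of guest mode */
        cpus_clear(p2m->p2m_dirty_cpumask);
    }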


I have implemented and fixed all feedback I got so far, except for
the "flush nestedp2m on hostp2m write" we talked above.
It is on my todo list.

I will send the fourth patch series for feedback; the "flush nestedp2m on 
hostp2m write" will be part of the patch series after next.

Christoph


-- 
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632


end of thread, other threads:[~2010-09-01 14:27 UTC | newest]

Thread overview: 5+ messages
2010-08-05 15:05 [PATCH 14/14] Nested Virtualization: hap-on-hap Christoph Egger
2010-08-09 13:18 ` Tim Deegan
2010-08-19 15:55   ` Christoph Egger
2010-08-19 16:33     ` Tim Deegan
2010-09-01 14:27       ` Christoph Egger
