* [PATCH 12/12] Nested Virtualization: hap-on-hap
@ 2010-12-20 16:13 Christoph Egger
2011-01-07 15:55 ` Tim Deegan
0 siblings, 1 reply; 11+ messages in thread
From: Christoph Egger @ 2010-12-20 16:13 UTC (permalink / raw)
To: xen-devel@lists.xensource.com
[-- Attachment #1: Type: text/plain, Size: 264 bytes --]
--
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
[-- Attachment #2: xen_nh12_haphap.diff --]
[-- Type: text/x-diff, Size: 60399 bytes --]
# HG changeset patch
# User cegger
# Date 1292857188 -3600
Implement Nested-on-Nested.
This allows the guest to run nested guest with hap enabled.
Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1149,12 +1149,40 @@ void hvm_inject_exception(unsigned int t
hvm_funcs.inject_exception(trapnr, errcode, cr2);
}
-bool_t hvm_hap_nested_page_fault(unsigned long gfn)
+int hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs)
{
p2m_type_t p2mt;
mfn_t mfn;
struct vcpu *v = current;
struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
+ unsigned long gfn = gpa >> PAGE_SHIFT;
+ int rv;
+
+ /* On Nested Virtualization, walk the guest page table.
+ * If this succeeds, all is fine.
+ * If this fails, inject a nested page fault into the guest.
+ */
+ if ( nestedhvm_enabled(v->domain)
+ && nestedhvm_vcpu_in_guestmode(v)
+ && nestedhvm_paging_mode_hap(v) )
+ {
+ /* The vcpu is in guest mode and the l1 guest
+ * uses hap. That means 'gpa' is in l2 guest
+ * physical address space.
+ * Fix the nested p2m or inject nested page fault
+ * into l1 guest if not fixable. The algorithm is
+ * the same as for shadow paging.
+ */
+ rv = nestedhvm_hap_nested_page_fault(v, gpa);
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_DONE:
+ return 1;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return 0;
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return -1;
+ }
+ }
mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
@@ -1257,6 +1285,15 @@ int hvm_set_efer(uint64_t value)
return X86EMUL_EXCEPTION;
}
+ if ( nestedhvm_enabled(v->domain) && cpu_has_svm &&
+ ((value & EFER_SVME) == 0 ) &&
+ ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) )
+ {
+ /* Cleared EFER.SVME: Flush all nestedp2m tables */
+ p2m_flush_nestedp2m(v->domain);
+ nestedhvm_vcpu_reset(v);
+ }
+
value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
v->arch.hvm_vcpu.guest_efer = value;
hvm_update_guest_efer(v);
@@ -1407,8 +1444,12 @@ int hvm_set_cr0(unsigned long value)
v->arch.hvm_vcpu.guest_cr[0] = value;
hvm_update_guest_cr(v, 0);
- if ( (value ^ old_value) & X86_CR0_PG )
- paging_update_paging_modes(v);
+ if ( (value ^ old_value) & X86_CR0_PG ) {
+ if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+ paging_update_nestedmode(v);
+ else
+ paging_update_paging_modes(v);
+ }
return X86EMUL_OKAY;
@@ -1475,8 +1516,12 @@ int hvm_set_cr4(unsigned long value)
hvm_update_guest_cr(v, 4);
/* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
- if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
- paging_update_paging_modes(v);
+ if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) {
+ if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+ paging_update_nestedmode(v);
+ else
+ paging_update_paging_modes(v);
+ }
return X86EMUL_OKAY;
@@ -1989,7 +2034,7 @@ static enum hvm_copy_result __hvm_copy(
void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
{
struct vcpu *curr = current;
- struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+ struct p2m_domain *p2m;
unsigned long gfn, mfn;
p2m_type_t p2mt;
char *p;
@@ -2002,6 +2047,8 @@ static enum hvm_copy_result __hvm_copy(
if ( in_atomic() )
return HVMCOPY_unhandleable;
+ p2m = p2m_get_hostp2m(curr->domain);
+
while ( todo > 0 )
{
count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/hvm/nestedhvm.c
--- a/xen/arch/x86/hvm/nestedhvm.c
+++ b/xen/arch/x86/hvm/nestedhvm.c
@@ -20,6 +20,7 @@
#include <asm/msr.h>
#include <asm/hvm/support.h> /* for HVM_DELIVER_NO_ERROR_CODE */
#include <asm/hvm/hvm.h>
+#include <asm/p2m.h> /* for struct p2m_domain */
#include <asm/hvm/nestedhvm.h>
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
#include <asm/paging.h> /* for paging_mode_hap() */
@@ -100,13 +101,43 @@ nestedhvm_vcpu_destroy(struct vcpu *v)
void
nestedhvm_vcpu_enter_guestmode(struct vcpu *v)
{
+ struct p2m_domain *p2m;
vcpu_nestedhvm(v).nv_guestmode = 1;
+
+ p2m = vcpu_nestedhvm(v).nv_p2m;
+ if (p2m == NULL)
+ /* p2m has either been invalidated or not yet assigned. */
+ return;
+
+ cpu_set(v->processor, p2m->p2m_dirty_cpumask);
}
void
nestedhvm_vcpu_exit_guestmode(struct vcpu *v)
{
+ struct p2m_domain *p2m;
vcpu_nestedhvm(v).nv_guestmode = 0;
+
+ p2m = vcpu_nestedhvm(v).nv_p2m;
+ if (p2m == NULL)
+ /* p2m has either been invalidated or not yet assigned. */
+ return;
+
+ cpu_clear(v->processor, p2m->p2m_dirty_cpumask);
+}
+
+bool_t
+nestedhvm_is_n2(struct vcpu *v)
+{
+ if (!nestedhvm_enabled(v->domain)
+ || nestedhvm_vmswitch_in_progress(v)
+ || !nestedhvm_paging_mode_hap(v))
+ return 0;
+
+ if (nestedhvm_vcpu_in_guestmode(v))
+ return 1;
+
+ return 0;
}
/* Common shadow IO Permission bitmap */
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/hvm/svm/nestedsvm.c
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -26,6 +26,7 @@
#include <asm/hvm/svm/svmdebug.h>
#include <asm/paging.h> /* paging_mode_hap */
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
+#include <asm/p2m.h> /* p2m_get_pagetable, p2m_get_nestedp2m */
static void
nestedsvm_vcpu_clgi(struct vcpu *v)
@@ -456,6 +457,10 @@ static int nsvm_vmcb_prepare4vmrun(struc
/* Nested paging mode */
if (nestedhvm_paging_mode_hap(v)) {
/* host nested paging + guest nested paging. */
+ host_vmcb->_np_enable = 1;
+
+ host_vmcb->_h_cr3 =
+ pagetable_get_paddr(p2m_get_pagetable(p2m_get_nestedp2m(v, ns_vmcb->_h_cr3)));
/* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
rc = hvm_set_cr3(ns_vmcb->_cr3);
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1004,14 +1004,16 @@ struct hvm_function_table * __init start
return &svm_function_table;
}
-static void svm_do_nested_pgfault(paddr_t gpa)
+static void svm_do_nested_pgfault(struct vcpu *v,
+ struct cpu_user_regs *regs, paddr_t gpa)
{
+ int ret;
unsigned long gfn = gpa >> PAGE_SHIFT;
mfn_t mfn;
p2m_type_t p2mt;
- struct p2m_domain *p2m;
+ struct p2m_domain *p2m = NULL;
- p2m = p2m_get_hostp2m(current->domain);
+ ret = hvm_hap_nested_page_fault(gpa, regs);
if ( tb_init_done )
{
@@ -1022,6 +1024,7 @@ static void svm_do_nested_pgfault(paddr_
uint32_t p2mt;
} _d;
+ p2m = p2m_get_p2m(v);
_d.gpa = gpa;
_d.qualification = 0;
_d.mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &_d.p2mt));
@@ -1029,14 +1032,26 @@ static void svm_do_nested_pgfault(paddr_
__trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
}
- if ( hvm_hap_nested_page_fault(gfn) )
+ switch (ret) {
+ case 0:
+ break;
+ case 1:
return;
+ case -1:
+ ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
+ /* inject #VMEXIT(NPF) into guest. */
+ nestedsvm_vmexit_defer(v, VMEXIT_NPF, regs->error_code, gpa);
+ return;
+ }
+ if ( p2m == NULL )
+ p2m = p2m_get_p2m(v);
/* Everything else is an error. */
mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
- gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
- gpa, mfn_x(mfn), p2mt);
- domain_crash(current->domain);
+ gdprintk(XENLOG_ERR,
+ "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
+ gpa, mfn_x(mfn), p2mt);
+ domain_crash(v->domain);
}
static void svm_fpu_dirty_intercept(void)
@@ -1630,6 +1645,8 @@ asmlinkage void svm_vmexit_handler(struc
struct vmcb_struct *ns_vmcb = nv->nv_vmcx;
uint64_t exitinfo1, exitinfo2;
+ paging_update_nestedmode(v);
+
/* Write real exitinfo1 back into virtual vmcb.
* nestedsvm_check_intercepts() expects to have the correct
* exitinfo1 value there.
@@ -1919,7 +1936,7 @@ asmlinkage void svm_vmexit_handler(struc
case VMEXIT_NPF:
perfc_incra(svmexits, VMEXIT_NPF_PERFC);
regs->error_code = vmcb->exitinfo1;
- svm_do_nested_pgfault(vmcb->exitinfo2);
+ svm_do_nested_pgfault(v, regs, vmcb->exitinfo2);
break;
case VMEXIT_IRET: {
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -2080,7 +2080,7 @@ static void ept_handle_violation(unsigne
}
if ( (qualification & EPT_GLA_VALID) &&
- hvm_hap_nested_page_fault(gfn) )
+ hvm_hap_nested_page_fault(gpa, guest_cpu_user_regs()) )
return;
/* Everything else is an error. */
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile
+++ b/xen/arch/x86/mm/hap/Makefile
@@ -3,6 +3,7 @@ obj-y += guest_walk_2level.o
obj-y += guest_walk_3level.o
obj-y += guest_walk_4level.o
obj-y += p2m-ept.o
+obj-y += nested_hap.o
guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/mm/hap/guest_walk.c
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -29,24 +29,32 @@
#define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
#define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
+#define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
+#define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
+
#if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
#include <asm/guest_pt.h>
#include <asm/p2m.h>
unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
- unsigned long cr3;
+ unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];
+ return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec);
+}
+
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
uint32_t missing;
mfn_t top_mfn;
void *top_map;
p2m_type_t p2mt;
walk_t gw;
- struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
/* Get the top-level table's MFN */
- cr3 = v->arch.hvm_vcpu.guest_cr[3];
top_mfn = gfn_to_mfn_unshare(p2m, cr3 >> PAGE_SHIFT, &p2mt, 0);
if ( p2m_is_paging(p2mt) )
{
@@ -72,7 +80,7 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
#if GUEST_PAGING_LEVELS == 3
top_map += (cr3 & ~(PAGE_MASK | 31));
#endif
- missing = guest_walk_tables(v, p2m, gva, &gw, pfec[0], top_mfn, top_map);
+ missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map);
unmap_domain_page(top_map);
/* Interpret the answer */
@@ -122,6 +130,15 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
return INVALID_GFN;
}
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
+ gdprintk(XENLOG_ERR,
+ "Guest paging level is greater than host paging level!\n");
+ domain_crash(v->domain);
+ return INVALID_GFN;
+}
#endif
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -40,6 +40,7 @@
#include <asm/p2m.h>
#include <asm/domain.h>
#include <xen/numa.h>
+#include <asm/hvm/nestedhvm.h>
#include "private.h"
@@ -580,6 +581,7 @@ void hap_domain_init(struct domain *d)
int hap_enable(struct domain *d, u32 mode)
{
unsigned int old_pages;
+ uint8_t i;
int rv = 0;
uint32_t oldmode;
@@ -622,6 +624,12 @@ int hap_enable(struct domain *d, u32 mod
goto out;
}
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ rv = p2m_alloc_table(d->arch.nested_p2m[i]);
+ if ( rv != 0 )
+ goto out;
+ }
+
out:
if (rv)
d->arch.paging.mode = oldmode;
@@ -631,6 +639,13 @@ int hap_enable(struct domain *d, u32 mod
void hap_final_teardown(struct domain *d)
{
+ uint8_t i;
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m_teardown(d->arch.nested_p2m[i]);
+ }
+
if ( d->arch.paging.hap.total_pages != 0 )
hap_teardown(d);
@@ -658,7 +673,7 @@ void hap_teardown(struct domain *d)
/* release the monitor table held by each vcpu */
for_each_vcpu ( d, v )
{
- if ( v->arch.paging.mode && paging_mode_external(d) )
+ if ( paging_get_hostmode(v) && paging_mode_external(d) )
{
mfn = pagetable_get_mfn(v->arch.monitor_table);
if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
@@ -726,6 +741,7 @@ static const struct paging_mode hap_pagi
void hap_vcpu_init(struct vcpu *v)
{
v->arch.paging.mode = &hap_paging_real_mode;
+ v->arch.paging.nestedmode = &hap_paging_real_mode;
}
/************************************************/
@@ -752,6 +768,15 @@ static int hap_page_fault(struct vcpu *v
*/
static int hap_invlpg(struct vcpu *v, unsigned long va)
{
+ if (nestedhvm_enabled(v->domain)) {
+ /* Emulate INVLPGA:
+ * Must perform the flush right now or an other vcpu may
+ * use it when we the next VMRUN emulation, otherwise.
+ */
+ p2m_flush(v, vcpu_nestedhvm(v).nv_p2m);
+ return 0;
+ }
+
HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n",
v->domain->domain_id, v->vcpu_id);
domain_crash(v->domain);
@@ -764,17 +789,22 @@ static void hap_update_cr3(struct vcpu *
hvm_update_guest_cr(v, 3);
}
+const struct paging_mode *
+hap_paging_get_mode(struct vcpu *v)
+{
+ return !hvm_paging_enabled(v) ? &hap_paging_real_mode :
+ hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
+ hvm_pae_enabled(v) ? &hap_paging_pae_mode :
+ &hap_paging_protected_mode;
+}
+
static void hap_update_paging_modes(struct vcpu *v)
{
struct domain *d = v->domain;
hap_lock(d);
- v->arch.paging.mode =
- !hvm_paging_enabled(v) ? &hap_paging_real_mode :
- hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
- hvm_pae_enabled(v) ? &hap_paging_pae_mode :
- &hap_paging_protected_mode;
+ v->arch.paging.mode = hap_paging_get_mode(v);
if ( pagetable_is_null(v->arch.monitor_table) )
{
@@ -835,38 +865,76 @@ static void
hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
{
+ struct domain *d = v->domain;
uint32_t old_flags;
- hap_lock(v->domain);
+ old_flags = l1e_get_flags(*p);
- old_flags = l1e_get_flags(*p);
+ /* We know always use the host p2m here, regardless if the vcpu
+ * is in host or guest mode. The vcpu can be in guest mode by
+ * a hypercall which passes a domain and chooses mostly the first
+ * vcpu.
+ * XXX This is the reason why this function can not be used re-used
+ * for updating the nestedp2m. Otherwise, hypercalls would randomly
+ * operate on host p2m and nested p2m.
+ */
+ if ( nestedhvm_enabled(d) ) {
+ mfn_t omfn = _mfn(l1e_get_pfn(*p));
+ p2m_type_t op2mt = p2m_flags_to_type(old_flags);
+
+ if ( p2m_is_valid(op2mt) ) {
+ mfn_t nmfn = _mfn(l1e_get_pfn(new));
+ p2m_type_t np2mt = p2m_flags_to_type(l1e_get_flags(new));
+
+ if ( p2m_is_valid(np2mt) && (mfn_x(omfn) != mfn_x(nmfn)) ) {
+ /* This GFN -> MFN is going to get removed. */
+ /* XXX There is a more efficient way to do that
+ * but it works for now.
+ * Note, p2m_flush_nestedp2m calls hap_lock() internally.
+ */
+ p2m_flush_nestedp2m(d);
+ }
+ }
+ }
+
+ hap_lock(d);
+
safe_write_pte(p, new);
if ( (old_flags & _PAGE_PRESENT)
&& (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
- flush_tlb_mask(&v->domain->domain_dirty_cpumask);
+ flush_tlb_mask(&d->domain_dirty_cpumask);
#if CONFIG_PAGING_LEVELS == 3
/* install P2M in monitor table for PAE Xen */
if ( level == 3 )
/* We have written to the p2m l3: need to sync the per-vcpu
* copies of it in the monitor tables */
- p2m_install_entry_in_monitors(v->domain, (l3_pgentry_t *)p);
+ p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
#endif
- hap_unlock(v->domain);
+ hap_unlock(d);
}
static unsigned long hap_gva_to_gfn_real_mode(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
return ((paddr_t)gva >> PAGE_SHIFT);
}
+static unsigned long hap_p2m_ga_to_gfn_real_mode(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
+ return (ga >> PAGE_SHIFT);
+}
+
+
/* Entry points into this mode of the hap code. */
static const struct paging_mode hap_paging_real_mode = {
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_real_mode,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_real_mode,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -877,6 +945,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_2_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_2_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -887,6 +956,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_3_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_3_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -897,6 +967,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_4_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_4_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/mm/hap/nested_hap.c
--- /dev/null
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -0,0 +1,231 @@
+/******************************************************************************
+ * arch/x86/mm/hap/nested_hap.c
+ *
+ * Code for Nested Virtualization
+ * Copyright (c) 2010 Advanced Micro Devices
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/support.h>
+
+#include <asm/hvm/nestedhvm.h>
+
+#include "private.h"
+
+/* AlGORITHM for NESTED PAGE FAULT
+ *
+ * NOTATION
+ * Levels: L0, L1, L2
+ * Guests: L1 guest, L2 guest
+ * Hypervisor: L0 hypervisor
+ * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA
+ *
+ * On L0, when #NPF happens, the handler function should do:
+ * hap_page_fault(GPA)
+ * {
+ * 1. If #NPF is from L1 guest, then we crash the guest VM (same as old
+ * code)
+ * 2. If #NPF is from L2 guest, then we continue from (3)
+ * 3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space.
+ * 4. Walk the h_cr3 page table
+ * 5. - if not present, then we inject #NPF back to L1 guest and
+ * re-launch L1 guest (L1 guest will either treat this #NPF as MMIO,
+ * or fix its p2m table for L2 guest)
+ * 6. - if present, then we will get the a new translated value L1-GPA
+ * (points to L1 machine memory)
+ * 7. * Use L1-GPA to walk L0 P2M table
+ * 8. - if not present, then crash the guest (should not happen)
+ * 9. - if present, then we get a new translated value MPA
+ * (points to real machine memory)
+ * 10. * Finally, use GPA and MPA to walk nested_p2m
+ * and fix the bits.
+ * }
+ *
+ */
+
+
+/********************************************/
+/* NESTED VIRT P2M FUNCTIONS */
+/********************************************/
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+void
+nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
+{
+ struct domain *d = p2m->domain;
+ uint32_t old_flags;
+
+ hap_lock(d);
+
+ old_flags = l1e_get_flags(*p);
+ safe_write_pte(p, new);
+ if (old_flags & _PAGE_PRESENT)
+ nestedhvm_vmcx_flushtlb(p2m);
+
+ hap_unlock(d);
+}
+
+/********************************************/
+/* NESTED VIRT FUNCTIONS */
+/********************************************/
+static void
+nestedhap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa)
+{
+ int rv;
+ ASSERT(p2m);
+ ASSERT(p2m->set_entry);
+
+ rv = p2m->set_entry(p2m, L2_gpa >> PAGE_SHIFT,
+ page_to_mfn(maddr_to_page(L0_gpa)),
+ 0 /*4K*/, p2m_ram_rw);
+ if (rv == 0) {
+ gdprintk(XENLOG_ERR,
+ "failed to set entry for 0x%"PRIx64" -> 0x%"PRIx64"\n",
+ L2_gpa, L0_gpa);
+ BUG();
+ }
+}
+
+/* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the
+ * walk is successful, the translated value is returned in L0_gpa. The return
+ * value tells the upper level what to do.
+ */
+static int
+nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa)
+{
+ mfn_t mfn;
+ p2m_type_t p2mt;
+
+ /* we use gfn_to_mfn_query() function to walk L0 P2M table */
+ mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt);
+
+ if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) )
+ return NESTEDHVM_PAGEFAULT_ERROR;
+
+ if ( !mfn_valid(mfn) )
+ return NESTEDHVM_PAGEFAULT_ERROR;
+
+ *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/* This function uses L2_gpa to walk the P2M page table in L1. If the
+ * walk is successful, the translated value is returned in
+ * L1_gpa. The result value tells what to do next.
+ */
+static int
+nestedhap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m,
+ paddr_t L2_gpa, paddr_t *L1_gpa)
+{
+ uint32_t pfec;
+ unsigned long nested_cr3, gfn;
+ const struct paging_mode *mode = paging_get_hostmode(v);
+
+ nested_cr3 = nhvm_vcpu_hostcr3(v);
+
+ /* walk the guest table */
+ gfn = paging_p2m_ga_to_gfn(v, p2m, mode, nested_cr3, L2_gpa, &pfec);
+
+ if ( gfn == INVALID_GFN )
+ return NESTEDHVM_PAGEFAULT_INJECT;
+
+ *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/*
+ * The following function, nestedhap_page_fault(), is for steps (3)--(10).
+ *
+ * Returns:
+ */
+int
+nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
+{
+ int rv;
+ paddr_t L1_gpa, L0_gpa;
+ struct domain *d = v->domain;
+ struct p2m_domain *p2m, *nested_p2m;
+
+ p2m = p2m_get_hostp2m(d); /* L0 p2m */
+ nested_p2m = p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+
+ /* walk the L1 P2M table, note we have to pass p2m
+ * and not nested_p2m here or we fail the walk forever,
+ * otherwise. */
+ rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
+
+ /* let caller to handle these two cases */
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_DONE:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ /* ==> we have to walk L0 P2M */
+ rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
+
+ /* let upper level caller to handle these two cases */
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_DONE:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ /* fix p2m_get_pagetable(nested_p2m) */
+ nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa);
+
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/********************************************/
+/* NESTED VIRT INITIALIZATION FUNCS */
+/********************************************/
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/mm/hap/private.h
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -23,11 +23,27 @@
/********************************************/
/* GUEST TRANSLATION FUNCS */
/********************************************/
-unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
-unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
-unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+
#endif /* __HAP_PRIVATE_H__ */
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
#include <public/mem_event.h>
#include <asm/mem_sharing.h>
#include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
/* Debugging and auditing of the P2M code? */
#define P2M_AUDIT 0
@@ -72,7 +73,7 @@ boolean_param("hap_1gb", opt_hap_1gb);
#define SUPERPAGE_PAGES (1UL << 9)
#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
{
unsigned long flags;
#ifdef __x86_64__
@@ -118,9 +119,9 @@ static void audit_p2m(struct p2m_domain
// Find the next level's P2M entry, checking for out-of-range gfn's...
// Returns NULL on error.
//
-static l1_pgentry_t *
+l1_pgentry_t *
p2m_find_entry(void *table, unsigned long *gfn_remainder,
- unsigned long gfn, u32 shift, u32 max)
+ unsigned long gfn, uint32_t shift, uint32_t max)
{
u32 index;
@@ -187,20 +188,17 @@ p2m_next_level(struct p2m_domain *p2m, m
switch ( type ) {
case PGT_l3_page_table:
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 4);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
break;
case PGT_l2_page_table:
#if CONFIG_PAGING_LEVELS == 3
/* for PAE mode, PDPE only has PCD/PWT/P bits available */
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
#endif
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
break;
case PGT_l1_page_table:
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
break;
default:
BUG();
@@ -227,14 +225,13 @@ p2m_next_level(struct p2m_domain *p2m, m
for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
{
new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
- paging_write_p2m_entry(p2m->domain, gfn,
- l1_entry+i, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 2);
}
unmap_domain_page(l1_entry);
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
__PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
}
@@ -261,15 +258,15 @@ p2m_next_level(struct p2m_domain *p2m, m
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
new_entry = l1e_from_pfn(pfn + i, flags);
- paging_write_p2m_entry(p2m->domain, gfn,
- l1_entry+i, *table_mfn, new_entry, 1);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 1);
}
unmap_domain_page(l1_entry);
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
__PAGE_HYPERVISOR|_PAGE_USER);
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ p2m_entry, *table_mfn, new_entry, 2);
}
*table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
@@ -1307,9 +1304,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
: l3e_empty();
entry_content.l1 = l3e_content.l3;
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 3);
-
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
}
/*
* When using PAE Xen, we only allow 33 bits of pseudo-physical
@@ -1344,8 +1339,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
entry_content = l1e_empty();
/* level 1 entry */
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 1);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
}
else if ( page_order == 9 )
{
@@ -1372,8 +1366,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
l2e_content = l2e_empty();
entry_content.l1 = l2e_content.l2;
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 2);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
}
/* Track the highest gfn for which we have ever had a valid mapping */
@@ -1726,10 +1719,13 @@ static void p2m_initialise(struct domain
INIT_PAGE_LIST_HEAD(&p2m->pod.single);
p2m->domain = d;
+ p2m->cr3 = CR3_EADDR;
p2m->set_entry = p2m_set_entry;
p2m->get_entry = p2m_gfn_to_mfn;
p2m->get_entry_current = p2m_gfn_to_mfn_current;
p2m->change_entry_type_global = p2m_change_type_global;
+ p2m->write_p2m_entry = paging_write_p2m_entry;
+ cpus_clear(p2m->p2m_dirty_cpumask);
if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
ept_p2m_init(d);
@@ -1737,6 +1733,25 @@ static void p2m_initialise(struct domain
return;
}
+static int
+p2m_init_nestedp2m(struct domain *d)
+{
+ uint8_t i;
+ struct p2m_domain *p2m;
+
+ spin_lock_init(&d->arch.nested_p2m_lock);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
+ if (p2m == NULL)
+ return -ENOMEM;
+ p2m_initialise(d, p2m);
+ p2m->get_entry_current = p2m->get_entry;
+ p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+ }
+
+ return 0;
+}
+
int p2m_init(struct domain *d)
{
struct p2m_domain *p2m;
@@ -1746,7 +1761,11 @@ int p2m_init(struct domain *d)
return -ENOMEM;
p2m_initialise(d, p2m);
- return 0;
+ /* Must initialise nestedp2m unconditionally
+ * since nestedhvm_enabled(d) returns false here.
+ * (p2m_init runs too early for HVM_PARAM_* options)
+ */
+ return p2m_init_nestedp2m(d);
}
void p2m_change_entry_type_global(struct p2m_domain *p2m,
@@ -1840,6 +1859,9 @@ int p2m_alloc_table(struct p2m_domain *p
p2m_invalid) )
goto error;
+ if (p2m_is_nestedp2m(p2m))
+ goto nesteddone;
+
/* Copy all existing mappings from the page list and m2p */
spin_lock(&p2m->domain->page_alloc_lock);
page_list_for_each(page, &p2m->domain->page_list)
@@ -1861,6 +1883,7 @@ int p2m_alloc_table(struct p2m_domain *p
}
spin_unlock(&p2m->domain->page_alloc_lock);
+ nesteddone:
P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
p2m_unlock(p2m);
return 0;
@@ -1886,6 +1909,9 @@ void p2m_teardown(struct p2m_domain *p2m
mfn_t mfn;
#endif
+ if (p2m == NULL)
+ return;
+
p2m_lock(p2m);
#ifdef __x86_64__
@@ -1904,11 +1930,26 @@ void p2m_teardown(struct p2m_domain *p2m
p2m_unlock(p2m);
}
+static void p2m_teardown_nestedp2m(struct domain *d)
+{
+ uint8_t i;
+
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ xfree(d->arch.nested_p2m[i]);
+ d->arch.nested_p2m[i] = NULL;
+ }
+}
+
void p2m_final_teardown(struct domain *d)
{
/* Iterate over all p2m tables per domain */
xfree(d->arch.p2m);
d->arch.p2m = NULL;
+
+ /* We must teardown unconditionally because
+ * we initialise them unconditionally.
+ */
+ p2m_teardown_nestedp2m(d);
}
#if P2M_AUDIT
@@ -2489,9 +2530,9 @@ void p2m_change_type_global(struct p2m_d
gfn = get_gpfn_from_mfn(mfn);
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- paging_write_p2m_entry(p2m->domain, gfn,
- (l1_pgentry_t *)&l3e[i3],
- l3mfn, l1e_content, 3);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l3e[i3],
+ l3mfn, l1e_content, 3);
continue;
}
@@ -2520,9 +2561,9 @@ void p2m_change_type_global(struct p2m_d
* L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- paging_write_p2m_entry(p2m->domain, gfn,
- (l1_pgentry_t *)&l2e[i2],
- l2mfn, l1e_content, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l2e[i2],
+ l2mfn, l1e_content, 2);
continue;
}
@@ -2544,8 +2585,8 @@ void p2m_change_type_global(struct p2m_d
/* create a new 1le entry with the new type */
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags);
- paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
- l1mfn, l1e_content, 1);
+ p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+ l1mfn, l1e_content, 1);
}
unmap_domain_page(l1e);
}
@@ -2844,6 +2885,186 @@ void p2m_mem_paging_resume(struct p2m_do
}
#endif /* __x86_64__ */
+static struct p2m_domain *
+p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
+{
+ int i, lru_index = -1;
+ struct p2m_domain *lrup2m, *tmp;
+
+ if (p2m == NULL) {
+ lru_index = MAX_NESTEDP2M - 1;
+ lrup2m = d->arch.nested_p2m[lru_index];
+ } else {
+ lrup2m = p2m;
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ if (d->arch.nested_p2m[i] == p2m) {
+ lru_index = i;
+ break;
+ }
+ }
+ }
+
+ ASSERT(lru_index >= 0);
+ if (lru_index == 0) {
+ return lrup2m;
+ }
+
+ /* move the other's down the array "list" */
+ for (i = lru_index - 1; i >= 0; i--) {
+ tmp = d->arch.nested_p2m[i];
+ d->arch.nested_p2m[i+1] = tmp;
+ }
+
+ /* make the entry the first one */
+ d->arch.nested_p2m[0] = lrup2m;
+
+ return lrup2m;
+}
+
+static int
+p2m_flush_locked(struct p2m_domain *p2m)
+{
+ if (p2m->cr3 == CR3_EADDR)
+ /* Microoptimisation: p2m is already empty.
+ * => about 0.3% speedup of overall system performance.
+ */
+ return 0;
+
+ p2m_teardown(p2m);
+ p2m_initialise(p2m->domain, p2m);
+ p2m->get_entry_current = p2m->get_entry;
+ p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+ BUG_ON(p2m_alloc_table(p2m) != 0);
+
+ ASSERT(p2m);
+ return 0;
+}
+
+void
+p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
+{
+ struct domain *d = p2m->domain;
+
+ ASSERT(v->domain == d);
+ vcpu_nestedhvm(v).nv_p2m = NULL;
+ spin_lock(&d->arch.nested_p2m_lock);
+ BUG_ON(p2m_flush_locked(p2m) != 0);
+ spin_unlock(&d->arch.nested_p2m_lock);
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(p2m);
+}
+
+void
+p2m_flush_nestedp2m(struct domain *d)
+{
+ int i;
+
+ spin_lock(&d->arch.nested_p2m_lock);
+ for (i = 0; i < MAX_NESTEDP2M; i++)
+ BUG_ON(p2m_flush_locked(d->arch.nested_p2m[i]) != 0);
+ spin_unlock(&d->arch.nested_p2m_lock);
+ flush_tlb_mask(&d->domain_dirty_cpumask);
+}
+
+struct p2m_domain *
+p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
+{
+ struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+ struct domain *d;
+ struct p2m_domain *p2m;
+ int i, rv;
+
+ if (cr3 == 0 || cr3 == CR3_EADDR)
+ cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+ if (nv->nv_flushp2m && nv->nv_p2m) {
+ nv->nv_p2m = NULL;
+ }
+
+ d = v->domain;
+ spin_lock(&d->arch.nested_p2m_lock);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m = d->arch.nested_p2m[i];
+ if (p2m->cr3 == cr3 && p2m == nv->nv_p2m) {
+ p2m_getlru_nestedp2m(d, p2m);
+ if (nv->nv_flushp2m) {
+ BUG_ON(p2m_flush_locked(p2m) != 0);
+ nv->nv_flushp2m = 0;
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(nv->nv_p2m);
+ }
+ p2m->cr3 = cr3;
+ spin_unlock(&d->arch.nested_p2m_lock);
+ return p2m;
+ }
+ if (p2m->cr3 == CR3_EADDR) { /* found unused p2m table */
+ nv->nv_flushp2m = 0;
+ p2m_getlru_nestedp2m(d, p2m);
+ nv->nv_p2m = p2m;
+ p2m->cr3 = cr3;
+ spin_unlock(&d->arch.nested_p2m_lock);
+ hvm_asid_flush_vcpu(v);
+ return p2m;
+ }
+ }
+
+ /* All p2m's are or were in use. We know the least recent used one.
+ * Destroy and re-initialize it.
+ */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m = p2m_getlru_nestedp2m(d, NULL);
+ rv = p2m_flush_locked(p2m);
+ if (rv == 0)
+ break;
+ }
+ nv->nv_p2m = p2m;
+ p2m->cr3 = cr3;
+ nv->nv_flushp2m = 0;
+ spin_unlock(&d->arch.nested_p2m_lock);
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(nv->nv_p2m);
+
+ return p2m;
+}
+
+struct p2m_domain *
+p2m_get_p2m(struct vcpu *v)
+{
+ if (!nestedhvm_is_n2(v))
+ return p2m_get_hostp2m(v->domain);
+
+ return p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+}
+
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+ unsigned long va,
+ uint32_t *pfec)
+{
+ struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain);
+ const struct paging_mode *hostmode = paging_get_hostmode(v);
+
+ if ( is_hvm_domain(v->domain)
+ && paging_mode_hap(v->domain)
+ && nestedhvm_is_n2(v) )
+ {
+ unsigned long gfn;
+ struct p2m_domain *p2m;
+ const struct paging_mode *mode;
+ uint64_t ncr3 = nhvm_vcpu_hostcr3(v);
+
+ /* translate l2 guest va into l2 guest gfn */
+ p2m = p2m_get_nestedp2m(v, ncr3);
+ mode = paging_get_nestedmode(v);
+ gfn = mode->gva_to_gfn(v, p2m, va, pfec);
+
+ /* translate l2 guest gfn into l1 guest gfn */
+ return hostmode->p2m_ga_to_gfn(v, hostp2m, ncr3,
+ gfn << PAGE_SHIFT, pfec);
+ }
+
+ return hostmode->gva_to_gfn(v, hostp2m, va, pfec);
+}
+
/*
* Local variables:
* mode: C
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -26,6 +26,7 @@
#include <asm/p2m.h>
#include <asm/hap.h>
#include <asm/guest_access.h>
+#include <asm/hvm/nestedhvm.h>
#include <xen/numa.h>
#include <xsm/xsm.h>
@@ -815,21 +816,58 @@ void paging_dump_vcpu_info(struct vcpu *
printk(" paging assistance: ");
if ( paging_mode_shadow(v->domain) )
{
- if ( v->arch.paging.mode )
+ if ( paging_get_hostmode(v) )
printk("shadowed %u-on-%u\n",
- v->arch.paging.mode->guest_levels,
- v->arch.paging.mode->shadow.shadow_levels);
+ paging_get_hostmode(v)->guest_levels,
+ paging_get_hostmode(v)->shadow.shadow_levels);
else
printk("not shadowed\n");
}
- else if ( paging_mode_hap(v->domain) && v->arch.paging.mode )
+ else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) )
printk("hap, %u levels\n",
- v->arch.paging.mode->guest_levels);
+ paging_get_hostmode(v)->guest_levels);
else
printk("none\n");
}
}
+const struct paging_mode *paging_get_mode(struct vcpu *v)
+{
+ if (!nestedhvm_is_n2(v))
+ return paging_get_hostmode(v);
+
+ return paging_get_nestedmode(v);
+}
+
+extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+
+void paging_update_nestedmode(struct vcpu *v)
+{
+ ASSERT(nestedhvm_enabled(v->domain));
+ if (nestedhvm_paging_mode_hap(v))
+ /* nested-on-nested */
+ v->arch.paging.nestedmode = hap_paging_get_mode(v);
+ else
+ /* TODO: shadow-on-shadow */
+ v->arch.paging.nestedmode = NULL;
+}
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn,
+ l1_pgentry_t new, unsigned int level)
+{
+ struct domain *d = p2m->domain;
+ struct vcpu *v = current;
+ if ( v->domain != d )
+ v = d->vcpu ? d->vcpu[0] : NULL;
+ if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) )
+ {
+ return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
+ new, level);
+ }
+ else
+ safe_write_pte(p, new);
+}
/*
* Local variables:
diff -r c4837a54b175 -r fe9dbf99f70f xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3768,7 +3768,8 @@ sh_invlpg(struct vcpu *v, unsigned long
static unsigned long
-sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
+sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
+ unsigned long va, uint32_t *pfec)
/* Called to translate a guest virtual address to what the *guest*
* pagetables would map it to. */
{
@@ -4820,7 +4821,7 @@ static mfn_t emulate_gva_to_mfn(struct v
struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
/* Translate the VA to a GFN */
- gfn = sh_gva_to_gfn(v, vaddr, &pfec);
+ gfn = sh_gva_to_gfn(v, p2m, vaddr, &pfec);
if ( gfn == INVALID_GFN )
{
if ( is_hvm_vcpu(v) )
diff -r c4837a54b175 -r fe9dbf99f70f xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -210,6 +210,8 @@ struct paging_domain {
struct paging_vcpu {
/* Pointers to mode-specific entry points. */
const struct paging_mode *mode;
+ /* Nested Virtualization: paging mode of nested guest */
+ const struct paging_mode *nestedmode;
/* HVM guest: last emulate was to a pagetable */
unsigned int last_write_was_pt:1;
/* HVM guest: last write emulation succeeds */
@@ -225,6 +227,7 @@ struct paging_vcpu {
#define MAX_CPUID_INPUT 40
typedef xen_domctl_cpuid_t cpuid_input_t;
+#define MAX_NESTEDP2M 10
struct p2m_domain;
struct time_scale {
int shift;
@@ -258,6 +261,10 @@ struct arch_domain
struct paging_domain paging;
struct p2m_domain *p2m;
+ /* nestedhvm: translate l2 guest physical to host physical */
+ struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
+ spinlock_t nested_p2m_lock;
+
/* NB. protected by d->event_lock and by irq_desc[irq].lock */
int *irq_pirq;
int *pirq_irq;
diff -r c4837a54b175 -r fe9dbf99f70f xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -374,7 +374,7 @@ static inline void hvm_set_info_guest(st
int hvm_debug_op(struct vcpu *v, int32_t op);
-bool_t hvm_hap_nested_page_fault(unsigned long gfn);
+int hvm_hap_nested_page_fault(paddr_t gpa, struct cpu_user_regs *regs);
#define hvm_msr_tsc_aux(v) ({ \
struct domain *__d = (v)->domain; \
diff -r c4837a54b175 -r fe9dbf99f70f xen/include/asm-x86/hvm/nestedhvm.h
--- a/xen/include/asm-x86/hvm/nestedhvm.h
+++ b/xen/include/asm-x86/hvm/nestedhvm.h
@@ -61,7 +61,9 @@ void nestedhvm_vcpu_iomap_put(unsigned l
#define nestedhvm_paging_mode_hap(v) (!!nhvm_vmcx_hap_enabled(v))
#define nestedhvm_vmswitch_in_progress(v) \
(!!vcpu_nestedhvm((v)).nv_vmswitch_in_progress)
-#define nestedhvm_vmcx_flushtlb(d) \
- flush_tlb_mask(&(d)->arch.hvm_domain.nh_dirty_cpumask)
+#define nestedhvm_vmcx_flushtlb(p2m) \
+ flush_tlb_mask(&((p2m)->p2m_dirty_cpumask))
+
+bool_t nestedhvm_is_n2(struct vcpu *v);
#endif /* _HVM_NESTEDHVM_H */
diff -r c4837a54b175 -r fe9dbf99f70f xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -174,7 +174,15 @@ struct p2m_domain {
/* Shadow translated domain: p2m mapping */
pagetable_t phys_table;
+ /* Same as domain_dirty_cpumask but limited to
+ * this p2m and those physical cpus whose vcpu's are in
+ * guestmode.
+ */
+ cpumask_t p2m_dirty_cpumask;
+
struct domain *domain; /* back pointer to domain */
+#define CR3_EADDR (~0ULL)
+ uint64_t cr3; /* to identify this p2m for re-use */
/* Pages used to construct the p2m */
struct page_list_head pages;
@@ -194,6 +202,10 @@ struct p2m_domain {
void (*change_entry_type_global)(struct p2m_domain *p2m,
p2m_type_t ot,
p2m_type_t nt);
+ void (*write_p2m_entry)(struct p2m_domain *p2m,
+ unsigned long gfn, l1_pgentry_t *p,
+ mfn_t table_mfn, l1_pgentry_t new,
+ unsigned int level);
/* Highest guest frame that's ever been mapped in the p2m */
unsigned long max_mapped_pfn;
@@ -227,8 +239,26 @@ struct p2m_domain {
/* get host p2m table */
#define p2m_get_hostp2m(d) ((d)->arch.p2m)
+/* Get p2m table (re)usable for specified cr3.
+ * Automatically destroys and re-initializes a p2m if none found.
+ * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used.
+ */
+struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3);
+
+/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m().
+ * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m().
+ */
+struct p2m_domain *p2m_get_p2m(struct vcpu *v);
+
+#define p2m_is_nestedp2m(p2m) ((p2m) != p2m_get_hostp2m((p2m->domain)))
+
#define p2m_get_pagetable(p2m) ((p2m)->phys_table)
+/* Flushes specified p2m table */
+void p2m_flush(struct vcpu *v, struct p2m_domain *p2m);
+/* Flushes all nested p2m tables */
+void p2m_flush_nestedp2m(struct domain *d);
+
/*
* The P2M lock. This protects all updates to the p2m table.
* Updates are expected to be safe against concurrent reads,
@@ -384,11 +414,21 @@ static inline unsigned long mfn_to_gfn(s
/* Init the datastructures for later use by the p2m code */
int p2m_init(struct domain *d);
+/* PTE flags for various types of p2m entry */
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn);
+
/* Allocate a new p2m table for a domain.
*
* Returns 0 for success or -errno. */
int p2m_alloc_table(struct p2m_domain *p2m);
+/* Find the next level's P2M entry, checking for out-of-range gfn's...
+ * Returns NULL on error.
+ */
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, uint32_t shift, uint32_t max);
+
/* Return all the p2m resources to Xen. */
void p2m_teardown(struct p2m_domain *p2m);
void p2m_final_teardown(struct domain *d);
@@ -462,6 +502,8 @@ p2m_type_t p2m_change_type(struct p2m_do
int set_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn);
int clear_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn);
+void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level);
#ifdef __x86_64__
/* Modify p2m table for shared gfn */
diff -r c4837a54b175 -r fe9dbf99f70f xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -108,8 +108,14 @@ struct paging_mode {
int (*page_fault )(struct vcpu *v, unsigned long va,
struct cpu_user_regs *regs);
int (*invlpg )(struct vcpu *v, unsigned long va);
- unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va,
+ unsigned long (*gva_to_gfn )(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long va,
uint32_t *pfec);
+ unsigned long (*p2m_ga_to_gfn )(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
void (*update_cr3 )(struct vcpu *v, int do_locking);
void (*update_paging_modes )(struct vcpu *v);
void (*write_p2m_entry )(struct vcpu *v, unsigned long gfn,
@@ -219,6 +225,10 @@ void paging_final_teardown(struct domain
* creation. */
int paging_enable(struct domain *d, u32 mode);
+#define paging_get_hostmode(v) ((v)->arch.paging.mode)
+#define paging_get_nestedmode(v) ((v)->arch.paging.nestedmode)
+const struct paging_mode *paging_get_mode(struct vcpu *v);
+void paging_update_nestedmode(struct vcpu *v);
/* Page fault handler
* Called from pagefault handler in Xen, and from the HVM trap handlers
@@ -233,7 +243,7 @@ static inline int
paging_fault(unsigned long va, struct cpu_user_regs *regs)
{
struct vcpu *v = current;
- return v->arch.paging.mode->page_fault(v, va, regs);
+ return paging_get_hostmode(v)->page_fault(v, va, regs);
}
/* Handle invlpg requests on vcpus.
@@ -241,7 +251,7 @@ paging_fault(unsigned long va, struct cp
* or 0 if it's safe not to do so. */
static inline int paging_invlpg(struct vcpu *v, unsigned long va)
{
- return v->arch.paging.mode->invlpg(v, va);
+ return paging_get_hostmode(v)->invlpg(v, va);
}
/* Translate a guest virtual address to the frame number that the
@@ -251,11 +261,30 @@ static inline int paging_invlpg(struct v
* walking the tables. The caller should set the PFEC_page_present bit
* in pfec[0]; in the failure case, that bit will be cleared if appropriate. */
#define INVALID_GFN (-1UL)
-static inline unsigned long paging_gva_to_gfn(struct vcpu *v,
- unsigned long va,
- uint32_t *pfec)
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+ unsigned long va,
+ uint32_t *pfec);
+
+/* Translates a guest virtual address to guest physical address
+ * where the specified cr3 is translated to host physical address
+ * using the specified p2m table.
+ * This allows to do page walks in the guest or even in the nested guest.
+ * It returns the guest's gfn or the nested guest's gfn.
+ * Use 'paddr_t' for the guest address so it won't overflow when
+ * guest or nested guest is in 32bit PAE mode.
+ */
+static inline unsigned long paging_p2m_ga_to_gfn(struct vcpu *v,
+ struct p2m_domain *p2m,
+ const struct paging_mode *mode,
+ unsigned long cr3,
+ paddr_t ga,
+ uint32_t *pfec)
{
- return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
+ if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) )
+ return mode->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec);
+
+ /* shadow paging */
+ return paging_gva_to_gfn(v, ga, pfec);
}
/* Update all the things that are derived from the guest's CR3.
@@ -263,7 +292,7 @@ static inline unsigned long paging_gva_t
* as the value to load into the host CR3 to schedule this vcpu */
static inline void paging_update_cr3(struct vcpu *v)
{
- v->arch.paging.mode->update_cr3(v, 1);
+ paging_get_hostmode(v)->update_cr3(v, 1);
}
/* Update all the things that are derived from the guest's CR0/CR3/CR4.
@@ -271,7 +300,7 @@ static inline void paging_update_cr3(str
* has changed, and when bringing up a VCPU for the first time. */
static inline void paging_update_paging_modes(struct vcpu *v)
{
- v->arch.paging.mode->update_paging_modes(v);
+ paging_get_hostmode(v)->update_paging_modes(v);
}
@@ -283,7 +312,7 @@ static inline int paging_write_guest_ent
{
if ( unlikely(paging_mode_enabled(v->domain)
&& v->arch.paging.mode != NULL) )
- return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn);
+ return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn);
else
return (!__copy_to_user(p, &new, sizeof(new)));
}
@@ -299,7 +328,7 @@ static inline int paging_cmpxchg_guest_e
{
if ( unlikely(paging_mode_enabled(v->domain)
&& v->arch.paging.mode != NULL) )
- return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn);
+ return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn);
else
return (!cmpxchg_user(p, *old, new));
}
@@ -327,21 +356,11 @@ static inline void safe_write_pte(l1_pge
* a pointer to the entry to be written, the MFN in which the entry resides,
* the new contents of the entry, and the level in the p2m tree at which
* we are writing. */
-static inline void paging_write_p2m_entry(struct domain *d, unsigned long gfn,
- l1_pgentry_t *p, mfn_t table_mfn,
- l1_pgentry_t new, unsigned int level)
-{
- struct vcpu *v = current;
- if ( v->domain != d )
- v = d->vcpu ? d->vcpu[0] : NULL;
- if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) )
- {
- return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn,
- new, level);
- }
- else
- safe_write_pte(p, new);
-}
+struct p2m_domain;
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn,
+ l1_pgentry_t new, unsigned int level);
/* Called from the guest to indicate that the a process is being
* torn down and its pagetables will soon be discarded */
@@ -362,7 +381,7 @@ guest_map_l1e(struct vcpu *v, unsigned l
l2_pgentry_t l2e;
if ( unlikely(paging_mode_translate(v->domain)) )
- return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn);
+ return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);
/* Find this l1e and its enclosing l1mfn in the linear map */
if ( __copy_from_user(&l2e,
@@ -398,7 +417,7 @@ guest_get_eff_l1e(struct vcpu *v, unsign
return;
}
- v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e);
+ paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e);
}
/* Read the guest's l1e that maps this address, from the kernel-mode
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2010-12-20 16:13 Christoph Egger
@ 2011-01-07 15:55 ` Tim Deegan
0 siblings, 0 replies; 11+ messages in thread
From: Tim Deegan @ 2011-01-07 15:55 UTC (permalink / raw)
To: Christoph Egger; +Cc: xen-devel@lists.xensource.com
Hi,
We still haven't resolved the issue of safely recycling a p2m table that
is in use on another CPU. I'll quote my last email in its entirety
here:
| At 16:27 +0000 on 20 Dec (1292862443), Christoph Egger wrote:
| > > > An other vcpu is in VMRUN emulation after a nestedp2m is assigned.
| > > > It will VMEXIT with a nested page fault.
| > >
| > > Why?
| >
| > Because the p2m is empty. The MMU can not do a page table walk.
| >
| > > > An other vcpu already running l2 guest.
| > > > It will VMEXIT with a nested page fault immediately.
| > >
| > > Hmm. It will exit for the TLB shootdown IPI, but I think you need
| to
| > > clear vcpu_nestedhvm(v).nh_p2m on the other vcpu to make sure it
| doesn't
| > > re-enter with the p2m you've just recycled.
| >
| > The p2m is empty so I don't see a problem when it gets recycled.
|
| It's only empty very briefly. You've assigned it to a vcpu which is
| about to take a nested fault and fill it with entries, right?
|
| What happens if the other vcpu is handling an SMI or executing a tight
| loop of register arithmetic for a few thousand cycles? What stops it
| seeing the new contents of the p2m?
One other issue I pointed out last time is still there in this patch:
> @@ -835,38 +865,76 @@ static void
> hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
> mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
> {
> + struct domain *d = v->domain;
> uint32_t old_flags;
>
> - hap_lock(v->domain);
> + old_flags = l1e_get_flags(*p);
>
> - old_flags = l1e_get_flags(*p);
> + /* We know always use the host p2m here, regardless if the vcpu
> + * is in host or guest mode. The vcpu can be in guest mode by
> + * a hypercall which passes a domain and chooses mostly the first
> + * vcpu.
> + * XXX This is the reason why this function can not be used re-used
> + * for updating the nestedp2m. Otherwise, hypercalls would randomly
> + * operate on host p2m and nested p2m.
> + */
> + if ( nestedhvm_enabled(d) ) {
> + mfn_t omfn = _mfn(l1e_get_pfn(*p));
> + p2m_type_t op2mt = p2m_flags_to_type(old_flags);
> +
> + if ( p2m_is_valid(op2mt) ) {
> + mfn_t nmfn = _mfn(l1e_get_pfn(new));
> + p2m_type_t np2mt = p2m_flags_to_type(l1e_get_flags(new));
> +
> + if ( p2m_is_valid(np2mt) && (mfn_x(omfn) != mfn_x(nmfn)) ) {
Checking that the mfns are the same isn't quite enough, as the
permissions might have changed (e.g. log-dirty mode removing write
access). You need to test for (new == *p) to be sure that you don't
need to flush.
Cheers,
Tim.
> + /* This GFN -> MFN is going to get removed. */
> + /* XXX There is a more efficient way to do that
> + * but it works for now.
> + * Note, p2m_flush_nestedp2m calls hap_lock() internally.
> + */
> + p2m_flush_nestedp2m(d);
> + }
> + }
> + }
> +
> + hap_lock(d);
> +
--
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Xen Platform Team
Citrix Systems UK Ltd. (Company #02937203, SL9 0BG)
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 12/12] Nested Virtualization: hap-on-hap
@ 2011-03-09 14:31 Christoph Egger
2011-03-22 14:59 ` Tim Deegan
2011-03-31 15:25 ` Christoph Egger
0 siblings, 2 replies; 11+ messages in thread
From: Christoph Egger @ 2011-03-09 14:31 UTC (permalink / raw)
To: xen-devel
[-- Attachment #1: Type: text/plain, Size: 264 bytes --]
--
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
[-- Attachment #2: xen_nh12_haphap.diff --]
[-- Type: text/x-diff, Size: 65861 bytes --]
# HG changeset patch
# User cegger
# Date 1299677057 -3600
Implement Nested-on-Nested.
This allows the guest to run nested guest with hap enabled.
Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1170,21 +1170,50 @@ void hvm_inject_exception(unsigned int t
hvm_funcs.inject_exception(trapnr, errcode, cr2);
}
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
- bool_t gla_valid,
- unsigned long gla,
- bool_t access_valid,
- bool_t access_r,
- bool_t access_w,
- bool_t access_x)
+int hvm_hap_nested_page_fault(unsigned long gpa,
+ bool_t gla_valid,
+ unsigned long gla,
+ bool_t access_valid,
+ bool_t access_r,
+ bool_t access_w,
+ bool_t access_x)
{
unsigned long gfn = gpa >> PAGE_SHIFT;
p2m_type_t p2mt;
p2m_access_t p2ma;
mfn_t mfn;
struct vcpu *v = current;
- struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
-
+ struct p2m_domain *p2m = NULL;
+
+ /* On Nested Virtualization, walk the guest page table.
+ * If this succeeds, all is fine.
+ * If this fails, inject a nested page fault into the guest.
+ */
+ if ( nestedhvm_enabled(v->domain)
+ && nestedhvm_vcpu_in_guestmode(v)
+ && nestedhvm_paging_mode_hap(v) )
+ {
+ int rv;
+
+ /* The vcpu is in guest mode and the l1 guest
+ * uses hap. That means 'gpa' is in l2 guest
+ * physical address space.
+ * Fix the nested p2m or inject nested page fault
+ * into l1 guest if not fixable. The algorithm is
+ * the same as for shadow paging.
+ */
+ rv = nestedhvm_hap_nested_page_fault(v, gpa);
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_DONE:
+ return 1;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return 0;
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return -1;
+ }
+ }
+
+ p2m = p2m_get_hostp2m(v->domain);
mfn = gfn_to_mfn_type_current(p2m, gfn, &p2mt, &p2ma, p2m_guest);
/* Check access permissions first, then handle faults */
@@ -1328,6 +1357,15 @@ int hvm_set_efer(uint64_t value)
return X86EMUL_EXCEPTION;
}
+ if ( nestedhvm_enabled(v->domain) && cpu_has_svm &&
+ ((value & EFER_SVME) == 0 ) &&
+ ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) )
+ {
+ /* Cleared EFER.SVME: Flush all nestedp2m tables */
+ p2m_flush_nestedp2m(v->domain);
+ nestedhvm_vcpu_reset(v);
+ }
+
value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
v->arch.hvm_vcpu.guest_efer = value;
hvm_update_guest_efer(v);
@@ -1478,8 +1516,12 @@ int hvm_set_cr0(unsigned long value)
v->arch.hvm_vcpu.guest_cr[0] = value;
hvm_update_guest_cr(v, 0);
- if ( (value ^ old_value) & X86_CR0_PG )
- paging_update_paging_modes(v);
+ if ( (value ^ old_value) & X86_CR0_PG ) {
+ if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+ paging_update_nestedmode(v);
+ else
+ paging_update_paging_modes(v);
+ }
return X86EMUL_OKAY;
@@ -1546,8 +1588,12 @@ int hvm_set_cr4(unsigned long value)
hvm_update_guest_cr(v, 4);
/* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
- if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
- paging_update_paging_modes(v);
+ if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) {
+ if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+ paging_update_nestedmode(v);
+ else
+ paging_update_paging_modes(v);
+ }
return X86EMUL_OKAY;
@@ -2060,7 +2106,7 @@ static enum hvm_copy_result __hvm_copy(
void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
{
struct vcpu *curr = current;
- struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+ struct p2m_domain *p2m;
unsigned long gfn, mfn;
p2m_type_t p2mt;
char *p;
@@ -2082,6 +2128,8 @@ static enum hvm_copy_result __hvm_copy(
return HVMCOPY_unhandleable;
#endif
+ p2m = p2m_get_hostp2m(curr->domain);
+
while ( todo > 0 )
{
count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/nestedhvm.c
--- a/xen/arch/x86/hvm/nestedhvm.c
+++ b/xen/arch/x86/hvm/nestedhvm.c
@@ -20,6 +20,7 @@
#include <asm/msr.h>
#include <asm/hvm/support.h> /* for HVM_DELIVER_NO_ERROR_CODE */
#include <asm/hvm/hvm.h>
+#include <asm/p2m.h> /* for struct p2m_domain */
#include <asm/hvm/nestedhvm.h>
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
#include <asm/paging.h> /* for paging_mode_hap() */
@@ -96,6 +97,54 @@ nestedhvm_vcpu_destroy(struct vcpu *v)
return nhvm_vcpu_destroy(v);
}
+static void
+nestedhvm_flushtlb_ipi(void *info)
+{
+ struct vcpu *v = current;
+ /* Just flush the ASID (or request a new one).
+ * This is cheaper than flush_tlb_local() and has
+ * the same desired effect.
+ */
+ hvm_asid_flush_core();
+ vcpu_nestedhvm(v).nv_p2m = NULL;
+}
+
+void
+nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m)
+{
+ int cpu = smp_processor_id();
+ if (cpu_isset(cpu, p2m->p2m_dirty_cpumask)) {
+ /* Avoid sending an IPI to myself.
+ * This is necessary to avoid a deadlock as this path
+ * can run with GIF disabled on AMD machines.
+ */
+ nestedhvm_flushtlb_ipi(NULL);
+ cpu_clear(cpu, p2m->p2m_dirty_cpumask);
+ }
+ on_selected_cpus(&p2m->p2m_dirty_cpumask, nestedhvm_flushtlb_ipi, NULL, 1);
+ cpus_clear(p2m->p2m_dirty_cpumask);
+}
+
+void
+nestedhvm_vmcx_flushtlbdomain(struct domain *d)
+{
+ on_selected_cpus(&d->domain_dirty_cpumask, nestedhvm_flushtlb_ipi, NULL, 1);
+}
+
+bool_t
+nestedhvm_is_n2(struct vcpu *v)
+{
+ if (!nestedhvm_enabled(v->domain)
+ || nestedhvm_vmswitch_in_progress(v)
+ || !nestedhvm_paging_mode_hap(v))
+ return 0;
+
+ if (nestedhvm_vcpu_in_guestmode(v))
+ return 1;
+
+ return 0;
+}
+
/* Common shadow IO Permission bitmap */
/* There four global patterns of io bitmap each guest can
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/svm/entry.S
--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -65,6 +65,7 @@ ENTRY(svm_asm_do_resume)
testl $~0,(r(dx),r(ax),1)
jnz .Lsvm_process_softirqs
+ call nsvm_p2m_handle_vmrun
call svm_asid_handle_vmrun
cmpb $0,addr_of(tb_init_done)
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/svm/nestedsvm.c
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -26,6 +26,7 @@
#include <asm/hvm/svm/svmdebug.h>
#include <asm/paging.h> /* paging_mode_hap */
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
+#include <asm/p2m.h> /* p2m_get_pagetable, p2m_get_nestedp2m */
static void
nestedsvm_vcpu_clgi(struct vcpu *v)
@@ -320,6 +321,18 @@ static int nsvm_vmrun_permissionmap(stru
return 0;
}
+static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v,
+ struct vmcb_struct *vvmcb, struct vmcb_struct *n2vmcb)
+{
+ struct p2m_domain *p2m;
+
+ ASSERT(v != NULL);
+ ASSERT(vvmcb != NULL);
+ ASSERT(n2vmcb != NULL);
+ p2m = p2m_get_nestedp2m(v, vvmcb->_h_cr3);
+ n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m));
+}
+
static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
{
struct nestedvcpu *nv = &vcpu_nestedhvm(v);
@@ -475,6 +488,9 @@ static int nsvm_vmcb_prepare4vmrun(struc
/* Nested paging mode */
if (nestedhvm_paging_mode_hap(v)) {
/* host nested paging + guest nested paging. */
+ n2vmcb->_np_enable = 1;
+
+ nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
/* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
rc = hvm_set_cr3(ns_vmcb->_cr3);
@@ -1323,6 +1339,27 @@ asmlinkage void nsvm_vcpu_switch(struct
}
}
+/* This runs after VMRUN emulation. It is possible
+ * that an other (v)cpu flushed the assigned nestedp2m
+ * during the VMRUN emulation. Check for this case
+ * and get a new one.
+ * Caller must ensure the GIF is cleared.
+ */
+asmlinkage void nsvm_p2m_handle_vmrun(void)
+{
+ struct vcpu *v = current;
+ struct nestedvcpu *nv;
+
+ if (!nestedhvm_enabled(v->domain))
+ return;
+
+ nv = &vcpu_nestedhvm(v);
+ if (nv->nv_p2m == NULL
+ && nestedhvm_vcpu_in_guestmode(v)
+ && nestedhvm_paging_mode_hap(v))
+ nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx);
+}
+
/* Interrupts, Virtual GIF */
int
nestedsvm_vcpu_interrupt(struct vcpu *v, const struct hvm_intack intack)
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1014,14 +1014,16 @@ struct hvm_function_table * __init start
return &svm_function_table;
}
-static void svm_do_nested_pgfault(paddr_t gpa)
+static void svm_do_nested_pgfault(struct vcpu *v,
+ struct cpu_user_regs *regs, paddr_t gpa)
{
+ int ret;
unsigned long gfn = gpa >> PAGE_SHIFT;
mfn_t mfn;
p2m_type_t p2mt;
- struct p2m_domain *p2m;
+ struct p2m_domain *p2m = NULL;
- p2m = p2m_get_hostp2m(current->domain);
+ ret = hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0);
if ( tb_init_done )
{
@@ -1032,6 +1034,7 @@ static void svm_do_nested_pgfault(paddr_
uint32_t p2mt;
} _d;
+ p2m = p2m_get_p2m(v);
_d.gpa = gpa;
_d.qualification = 0;
_d.mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &_d.p2mt));
@@ -1039,14 +1042,26 @@ static void svm_do_nested_pgfault(paddr_
__trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
}
- if ( hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0) )
+ switch (ret) {
+ case 0:
+ break;
+ case 1:
return;
+ case -1:
+ ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
+ /* inject #VMEXIT(NPF) into guest. */
+ nestedsvm_vmexit_defer(v, VMEXIT_NPF, regs->error_code, gpa);
+ return;
+ }
+ if ( p2m == NULL )
+ p2m = p2m_get_p2m(v);
/* Everything else is an error. */
mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
- gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
- gpa, mfn_x(mfn), p2mt);
- domain_crash(current->domain);
+ gdprintk(XENLOG_ERR,
+ "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
+ gpa, mfn_x(mfn), p2mt);
+ domain_crash(v->domain);
}
static void svm_fpu_dirty_intercept(void)
@@ -1659,6 +1674,8 @@ asmlinkage void svm_vmexit_handler(struc
struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
uint64_t exitinfo1, exitinfo2;
+ paging_update_nestedmode(v);
+
/* Write real exitinfo1 back into virtual vmcb.
* nestedsvm_check_intercepts() expects to have the correct
* exitinfo1 value there.
@@ -1948,7 +1965,7 @@ asmlinkage void svm_vmexit_handler(struc
case VMEXIT_NPF:
perfc_incra(svmexits, VMEXIT_NPF_PERFC);
regs->error_code = vmcb->exitinfo1;
- svm_do_nested_pgfault(vmcb->exitinfo2);
+ svm_do_nested_pgfault(v, regs, vmcb->exitinfo2);
break;
case VMEXIT_IRET: {
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile
+++ b/xen/arch/x86/mm/hap/Makefile
@@ -3,6 +3,7 @@ obj-y += guest_walk_2level.o
obj-y += guest_walk_3level.o
obj-y += guest_walk_4level.o
obj-y += p2m-ept.o
+obj-y += nested_hap.o
guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/guest_walk.c
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -29,24 +29,32 @@
#define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
#define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
+#define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
+#define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
+
#if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
#include <asm/guest_pt.h>
#include <asm/p2m.h>
unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
- unsigned long cr3;
+ unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];
+ return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec);
+}
+
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
uint32_t missing;
mfn_t top_mfn;
void *top_map;
p2m_type_t p2mt;
walk_t gw;
- struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
/* Get the top-level table's MFN */
- cr3 = v->arch.hvm_vcpu.guest_cr[3];
top_mfn = gfn_to_mfn_unshare(p2m, cr3 >> PAGE_SHIFT, &p2mt, 0);
if ( p2m_is_paging(p2mt) )
{
@@ -72,7 +80,7 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
#if GUEST_PAGING_LEVELS == 3
top_map += (cr3 & ~(PAGE_MASK | 31));
#endif
- missing = guest_walk_tables(v, p2m, gva, &gw, pfec[0], top_mfn, top_map);
+ missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map);
unmap_domain_page(top_map);
/* Interpret the answer */
@@ -122,6 +130,15 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
return INVALID_GFN;
}
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
+ gdprintk(XENLOG_ERR,
+ "Guest paging level is greater than host paging level!\n");
+ domain_crash(v->domain);
+ return INVALID_GFN;
+}
#endif
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -40,6 +40,7 @@
#include <asm/p2m.h>
#include <asm/domain.h>
#include <xen/numa.h>
+#include <asm/hvm/nestedhvm.h>
#include "private.h"
@@ -582,6 +583,7 @@ void hap_domain_init(struct domain *d)
int hap_enable(struct domain *d, u32 mode)
{
unsigned int old_pages;
+ uint8_t i;
int rv = 0;
domain_pause(d);
@@ -620,6 +622,12 @@ int hap_enable(struct domain *d, u32 mod
goto out;
}
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ rv = p2m_alloc_table(d->arch.nested_p2m[i]);
+ if ( rv != 0 )
+ goto out;
+ }
+
/* Now let other users see the new mode */
d->arch.paging.mode = mode | PG_HAP_enable;
@@ -630,6 +638,13 @@ int hap_enable(struct domain *d, u32 mod
void hap_final_teardown(struct domain *d)
{
+ uint8_t i;
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m_teardown(d->arch.nested_p2m[i]);
+ }
+
if ( d->arch.paging.hap.total_pages != 0 )
hap_teardown(d);
@@ -657,7 +672,7 @@ void hap_teardown(struct domain *d)
/* release the monitor table held by each vcpu */
for_each_vcpu ( d, v )
{
- if ( v->arch.paging.mode && paging_mode_external(d) )
+ if ( paging_get_hostmode(v) && paging_mode_external(d) )
{
mfn = pagetable_get_mfn(v->arch.monitor_table);
if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
@@ -725,6 +740,7 @@ static const struct paging_mode hap_pagi
void hap_vcpu_init(struct vcpu *v)
{
v->arch.paging.mode = &hap_paging_real_mode;
+ v->arch.paging.nestedmode = &hap_paging_real_mode;
}
/************************************************/
@@ -751,6 +767,15 @@ static int hap_page_fault(struct vcpu *v
*/
static int hap_invlpg(struct vcpu *v, unsigned long va)
{
+ if (nestedhvm_enabled(v->domain)) {
+ /* Emulate INVLPGA:
+ * Must perform the flush right now or an other vcpu may
+ * use it when we use the next VMRUN emulation, otherwise.
+ */
+ p2m_flush(v, vcpu_nestedhvm(v).nv_p2m);
+ return 1;
+ }
+
HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n",
v->domain->domain_id, v->vcpu_id);
domain_crash(v->domain);
@@ -763,17 +788,22 @@ static void hap_update_cr3(struct vcpu *
hvm_update_guest_cr(v, 3);
}
+const struct paging_mode *
+hap_paging_get_mode(struct vcpu *v)
+{
+ return !hvm_paging_enabled(v) ? &hap_paging_real_mode :
+ hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
+ hvm_pae_enabled(v) ? &hap_paging_pae_mode :
+ &hap_paging_protected_mode;
+}
+
static void hap_update_paging_modes(struct vcpu *v)
{
struct domain *d = v->domain;
hap_lock(d);
- v->arch.paging.mode =
- !hvm_paging_enabled(v) ? &hap_paging_real_mode :
- hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
- hvm_pae_enabled(v) ? &hap_paging_pae_mode :
- &hap_paging_protected_mode;
+ v->arch.paging.mode = hap_paging_get_mode(v);
if ( pagetable_is_null(v->arch.monitor_table) )
{
@@ -834,38 +864,81 @@ static void
hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
{
- uint32_t old_flags;
+ struct domain *d = v->domain;
+ uint32_t old_flags = l1e_get_flags(*p);
+ p2m_type_t op2mt = p2m_flags_to_type(old_flags);
- hap_lock(v->domain);
+ /* We know always use the host p2m here, regardless if the vcpu
+ * is in host or guest mode. The vcpu can be in guest mode by
+ * a hypercall which passes a domain and chooses mostly the first
+ * vcpu.
+ * XXX This is the reason why this function can not be used re-used
+ * for updating the nestedp2m. Otherwise, hypercalls would randomly
+ * operate on host p2m and nested p2m.
+ */
+ if ( nestedhvm_enabled(d)
+ && p2m_is_valid(op2mt) )
+ {
+ if ( l1e_get_intpte(new) != l1e_get_intpte(*p) ) {
+ p2m_type_t np2mt = p2m_flags_to_type(l1e_get_flags(new));
- old_flags = l1e_get_flags(*p);
+ /* Skip flush on vram tracking or XP mode in Win7 hang
+ * very early in the virtual BIOS (long before the bootloader
+ * runs), otherwise. VRAM tracking happens so often that
+ * flushing and fixing the nestedp2m doesn't let XP mode
+ * proceed to boot.
+ */
+ if ( !((op2mt == p2m_ram_rw && np2mt == p2m_ram_logdirty)
+ || (op2mt == p2m_ram_logdirty && np2mt == p2m_ram_rw)) )
+ {
+ /* This GFN -> MFN is going to get removed. */
+ /* XXX There is a more efficient way to do that
+ * but it works for now.
+ * Note, p2m_flush_nestedp2m calls hap_lock() internally.
+ */
+ p2m_flush_nestedp2m(d);
+ }
+ }
+ }
+
+ hap_lock(d);
+
safe_write_pte(p, new);
if ( (old_flags & _PAGE_PRESENT)
&& (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
- flush_tlb_mask(&v->domain->domain_dirty_cpumask);
+ flush_tlb_mask(&d->domain_dirty_cpumask);
#if CONFIG_PAGING_LEVELS == 3
/* install P2M in monitor table for PAE Xen */
if ( level == 3 )
/* We have written to the p2m l3: need to sync the per-vcpu
* copies of it in the monitor tables */
- p2m_install_entry_in_monitors(v->domain, (l3_pgentry_t *)p);
+ p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
#endif
- hap_unlock(v->domain);
+ hap_unlock(d);
}
static unsigned long hap_gva_to_gfn_real_mode(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
return ((paddr_t)gva >> PAGE_SHIFT);
}
+static unsigned long hap_p2m_ga_to_gfn_real_mode(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
+ return (ga >> PAGE_SHIFT);
+}
+
+
/* Entry points into this mode of the hap code. */
static const struct paging_mode hap_paging_real_mode = {
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_real_mode,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_real_mode,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -876,6 +949,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_2_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_2_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -886,6 +960,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_3_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_3_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -896,6 +971,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_4_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_4_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/nested_hap.c
--- /dev/null
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -0,0 +1,236 @@
+/******************************************************************************
+ * arch/x86/mm/hap/nested_hap.c
+ *
+ * Code for Nested Virtualization
+ * Copyright (c) 2011 Advanced Micro Devices
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/support.h>
+
+#include <asm/hvm/nestedhvm.h>
+
+#include "private.h"
+
+/* AlGORITHM for NESTED PAGE FAULT
+ *
+ * NOTATION
+ * Levels: L0, L1, L2
+ * Guests: L1 guest, L2 guest
+ * Hypervisor: L0 hypervisor
+ * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA
+ *
+ * On L0, when #NPF happens, the handler function should do:
+ * hap_page_fault(GPA)
+ * {
+ * 1. If #NPF is from L1 guest, then we crash the guest VM (same as old
+ * code)
+ * 2. If #NPF is from L2 guest, then we continue from (3)
+ * 3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space.
+ * 4. Walk the h_cr3 page table
+ * 5. - if not present, then we inject #NPF back to L1 guest and
+ * re-launch L1 guest (L1 guest will either treat this #NPF as MMIO,
+ * or fix its p2m table for L2 guest)
+ * 6. - if present, then we will get the a new translated value L1-GPA
+ * (points to L1 machine memory)
+ * 7. * Use L1-GPA to walk L0 P2M table
+ * 8. - if not present, then crash the guest (should not happen)
+ * 9. - if present, then we get a new translated value MPA
+ * (points to real machine memory)
+ * 10. * Finally, use GPA and MPA to walk nested_p2m
+ * and fix the bits.
+ * }
+ *
+ */
+
+
+/********************************************/
+/* NESTED VIRT P2M FUNCTIONS */
+/********************************************/
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+void
+nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
+{
+ struct domain *d = p2m->domain;
+ uint32_t old_flags;
+
+ hap_lock(d);
+
+ old_flags = l1e_get_flags(*p);
+ safe_write_pte(p, new);
+ if (old_flags & _PAGE_PRESENT)
+ nestedhvm_vmcx_flushtlb(p2m);
+
+ hap_unlock(d);
+}
+
+/********************************************/
+/* NESTED VIRT FUNCTIONS */
+/********************************************/
+static void
+nestedhap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa,
+ p2m_type_t p2mt, p2m_access_t p2ma)
+{
+ int rv;
+ ASSERT(p2m);
+ ASSERT(p2m->set_entry);
+
+ rv = p2m->set_entry(p2m, L2_gpa >> PAGE_SHIFT,
+ page_to_mfn(maddr_to_page(L0_gpa)),
+ 0 /*4K*/, p2mt, p2ma);
+ if (rv == 0) {
+ gdprintk(XENLOG_ERR,
+ "failed to set entry for 0x%"PRIx64" -> 0x%"PRIx64"\n",
+ L2_gpa, L0_gpa);
+ BUG();
+ }
+}
+
+/* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the
+ * walk is successful, the translated value is returned in L0_gpa. The return
+ * value tells the upper level what to do.
+ */
+static int
+nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa)
+{
+ mfn_t mfn;
+ p2m_type_t p2mt;
+
+ /* we use gfn_to_mfn_query() function to walk L0 P2M table */
+ mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt);
+
+ if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) )
+ return NESTEDHVM_PAGEFAULT_ERROR;
+
+ if ( !mfn_valid(mfn) )
+ return NESTEDHVM_PAGEFAULT_ERROR;
+
+ *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/* This function uses L2_gpa to walk the P2M page table in L1. If the
+ * walk is successful, the translated value is returned in
+ * L1_gpa. The result value tells what to do next.
+ */
+static int
+nestedhap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m,
+ paddr_t L2_gpa, paddr_t *L1_gpa)
+{
+ uint32_t pfec;
+ unsigned long nested_cr3, gfn;
+ const struct paging_mode *mode = paging_get_hostmode(v);
+
+ nested_cr3 = nhvm_vcpu_hostcr3(v);
+
+ /* walk the guest table */
+ gfn = paging_p2m_ga_to_gfn(v, p2m, mode, nested_cr3, L2_gpa, &pfec);
+
+ if ( gfn == INVALID_GFN )
+ return NESTEDHVM_PAGEFAULT_INJECT;
+
+ *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/*
+ * The following function, nestedhap_page_fault(), is for steps (3)--(10).
+ *
+ * Returns:
+ */
+int
+nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
+{
+ int rv;
+ paddr_t L1_gpa, L0_gpa;
+ struct domain *d = v->domain;
+ struct p2m_domain *p2m, *nested_p2m;
+
+ p2m = p2m_get_hostp2m(d); /* L0 p2m */
+ nested_p2m = p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+
+ /* walk the L1 P2M table, note we have to pass p2m
+ * and not nested_p2m here or we fail the walk forever,
+ * otherwise. */
+ rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
+
+ /* let caller to handle these two cases */
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_DONE:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ /* ==> we have to walk L0 P2M */
+ rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
+
+ /* let upper level caller to handle these two cases */
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_DONE:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ nestedp2m_lock(d);
+ /* fix p2m_get_pagetable(nested_p2m) */
+ nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa,
+ p2m_ram_rw,
+ p2m_access_rwx /* FIXME: Should use same permission as l1 guest */);
+ nestedp2m_unlock(d);
+
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/********************************************/
+/* NESTED VIRT INITIALIZATION FUNCS */
+/********************************************/
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/private.h
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -23,11 +23,27 @@
/********************************************/
/* GUEST TRANSLATION FUNCS */
/********************************************/
-unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
-unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
-unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+
#endif /* __HAP_PRIVATE_H__ */
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
#include <public/mem_event.h>
#include <asm/mem_sharing.h>
#include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
/* Debugging and auditing of the P2M code? */
#define P2M_AUDIT 0
@@ -75,7 +76,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
#define SUPERPAGE_PAGES (1UL << 9)
#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
{
unsigned long flags;
#ifdef __x86_64__
@@ -121,9 +122,9 @@ static void audit_p2m(struct p2m_domain
// Find the next level's P2M entry, checking for out-of-range gfn's...
// Returns NULL on error.
//
-static l1_pgentry_t *
+l1_pgentry_t *
p2m_find_entry(void *table, unsigned long *gfn_remainder,
- unsigned long gfn, u32 shift, u32 max)
+ unsigned long gfn, uint32_t shift, uint32_t max)
{
u32 index;
@@ -224,20 +225,17 @@ p2m_next_level(struct p2m_domain *p2m, m
switch ( type ) {
case PGT_l3_page_table:
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 4);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
break;
case PGT_l2_page_table:
#if CONFIG_PAGING_LEVELS == 3
/* for PAE mode, PDPE only has PCD/PWT/P bits available */
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
#endif
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
break;
case PGT_l1_page_table:
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
break;
default:
BUG();
@@ -264,14 +262,13 @@ p2m_next_level(struct p2m_domain *p2m, m
for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
{
new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
- paging_write_p2m_entry(p2m->domain, gfn,
- l1_entry+i, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 2);
}
unmap_domain_page(l1_entry);
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
__PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
}
@@ -298,15 +295,15 @@ p2m_next_level(struct p2m_domain *p2m, m
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
new_entry = l1e_from_pfn(pfn + i, flags);
- paging_write_p2m_entry(p2m->domain, gfn,
- l1_entry+i, *table_mfn, new_entry, 1);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 1);
}
unmap_domain_page(l1_entry);
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
__PAGE_HYPERVISOR|_PAGE_USER);
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ p2m_entry, *table_mfn, new_entry, 2);
}
*table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
@@ -1369,8 +1366,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
: l3e_empty();
entry_content.l1 = l3e_content.l3;
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
/* Free old intermediate tables if necessary */
@@ -1410,8 +1406,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
entry_content = l1e_empty();
/* level 1 entry */
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 1);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
}
else if ( page_order == 9 )
@@ -1440,8 +1435,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
l2e_content = l2e_empty();
entry_content.l1 = l2e_content.l2;
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 2);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
/* Free old intermediate tables if necessary */
@@ -1806,10 +1800,13 @@ static void p2m_initialise(struct domain
p2m->domain = d;
p2m->default_access = p2m_access_rwx;
+ p2m->cr3 = CR3_EADDR;
p2m->set_entry = p2m_set_entry;
p2m->get_entry = p2m_gfn_to_mfn;
p2m->get_entry_current = p2m_gfn_to_mfn_current;
p2m->change_entry_type_global = p2m_change_type_global;
+ p2m->write_p2m_entry = paging_write_p2m_entry;
+ cpus_clear(p2m->p2m_dirty_cpumask);
if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
ept_p2m_init(d);
@@ -1817,6 +1814,25 @@ static void p2m_initialise(struct domain
return;
}
+static int
+p2m_init_nestedp2m(struct domain *d)
+{
+ uint8_t i;
+ struct p2m_domain *p2m;
+
+ nestedp2m_lock_init(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
+ if (p2m == NULL)
+ return -ENOMEM;
+ p2m_initialise(d, p2m);
+ p2m->get_entry_current = p2m->get_entry;
+ p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+ }
+
+ return 0;
+}
+
int p2m_init(struct domain *d)
{
struct p2m_domain *p2m;
@@ -1825,8 +1841,12 @@ int p2m_init(struct domain *d)
if ( p2m == NULL )
return -ENOMEM;
p2m_initialise(d, p2m);
-
- return 0;
+
+ /* Must initialise nestedp2m unconditionally
+ * since nestedhvm_enabled(d) returns false here.
+ * (p2m_init runs too early for HVM_PARAM_* options)
+ */
+ return p2m_init_nestedp2m(d);
}
void p2m_change_entry_type_global(struct p2m_domain *p2m,
@@ -1919,6 +1939,9 @@ int p2m_alloc_table(struct p2m_domain *p
p2m_invalid, p2m->default_access) )
goto error;
+ if (p2m_is_nestedp2m(p2m))
+ goto nesteddone;
+
/* Copy all existing mappings from the page list and m2p */
spin_lock(&p2m->domain->page_alloc_lock);
page_list_for_each(page, &p2m->domain->page_list)
@@ -1940,6 +1963,7 @@ int p2m_alloc_table(struct p2m_domain *p
}
spin_unlock(&p2m->domain->page_alloc_lock);
+ nesteddone:
P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
p2m_unlock(p2m);
return 0;
@@ -1966,6 +1990,9 @@ void p2m_teardown(struct p2m_domain *p2m
mfn_t mfn;
#endif
+ if (p2m == NULL)
+ return;
+
p2m_lock(p2m);
#ifdef __x86_64__
@@ -1984,11 +2011,26 @@ void p2m_teardown(struct p2m_domain *p2m
p2m_unlock(p2m);
}
+static void p2m_teardown_nestedp2m(struct domain *d)
+{
+ uint8_t i;
+
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ xfree(d->arch.nested_p2m[i]);
+ d->arch.nested_p2m[i] = NULL;
+ }
+}
+
void p2m_final_teardown(struct domain *d)
{
/* Iterate over all p2m tables per domain */
xfree(d->arch.p2m);
d->arch.p2m = NULL;
+
+ /* We must teardown unconditionally because
+ * we initialise them unconditionally.
+ */
+ p2m_teardown_nestedp2m(d);
}
#if P2M_AUDIT
@@ -2573,9 +2615,9 @@ void p2m_change_type_global(struct p2m_d
gfn = get_gpfn_from_mfn(mfn);
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- paging_write_p2m_entry(p2m->domain, gfn,
- (l1_pgentry_t *)&l3e[i3],
- l3mfn, l1e_content, 3);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l3e[i3],
+ l3mfn, l1e_content, 3);
continue;
}
@@ -2604,9 +2646,9 @@ void p2m_change_type_global(struct p2m_d
* L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- paging_write_p2m_entry(p2m->domain, gfn,
- (l1_pgentry_t *)&l2e[i2],
- l2mfn, l1e_content, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l2e[i2],
+ l2mfn, l1e_content, 2);
continue;
}
@@ -2628,8 +2670,8 @@ void p2m_change_type_global(struct p2m_d
/* create a new 1le entry with the new type */
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags);
- paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
- l1mfn, l1e_content, 1);
+ p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+ l1mfn, l1e_content, 1);
}
unmap_domain_page(l1e);
}
@@ -3048,6 +3090,179 @@ void p2m_mem_access_resume(struct p2m_do
}
#endif /* __x86_64__ */
+static struct p2m_domain *
+p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
+{
+ int i, lru_index = -1;
+ struct p2m_domain *lrup2m, *tmp;
+
+ if (p2m == NULL) {
+ lru_index = MAX_NESTEDP2M - 1;
+ lrup2m = d->arch.nested_p2m[lru_index];
+ } else {
+ lrup2m = p2m;
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ if (d->arch.nested_p2m[i] == p2m) {
+ lru_index = i;
+ break;
+ }
+ }
+ }
+
+ ASSERT(lru_index >= 0);
+ if (lru_index == 0) {
+ return lrup2m;
+ }
+
+ /* move the other's down the array "list" */
+ for (i = lru_index - 1; i >= 0; i--) {
+ tmp = d->arch.nested_p2m[i];
+ d->arch.nested_p2m[i+1] = tmp;
+ }
+
+ /* make the entry the first one */
+ d->arch.nested_p2m[0] = lrup2m;
+
+ return lrup2m;
+}
+
+static int
+p2m_flush_locked(struct p2m_domain *p2m)
+{
+ ASSERT(p2m);
+ if (p2m->cr3 == CR3_EADDR)
+ /* Microoptimisation: p2m is already empty.
+ * => about 0.3% speedup of overall system performance.
+ */
+ return 0;
+
+ p2m_teardown(p2m);
+ p2m_initialise(p2m->domain, p2m);
+ p2m->get_entry_current = p2m->get_entry;
+ p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+ return p2m_alloc_table(p2m);
+}
+
+void
+p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
+{
+ struct domain *d = p2m->domain;
+
+ ASSERT(v->domain == d);
+ vcpu_nestedhvm(v).nv_p2m = NULL;
+ nestedp2m_lock(d);
+ BUG_ON(p2m_flush_locked(p2m) != 0);
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(p2m);
+ nestedp2m_unlock(d);
+}
+
+void
+p2m_flush_nestedp2m(struct domain *d)
+{
+ int i;
+
+ nestedp2m_lock(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ struct p2m_domain *p2m = d->arch.nested_p2m[i];
+ BUG_ON(p2m_flush_locked(p2m) != 0);
+ cpus_clear(p2m->p2m_dirty_cpumask);
+ }
+ nestedhvm_vmcx_flushtlbdomain(d);
+ nestedp2m_unlock(d);
+}
+
+struct p2m_domain *
+p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
+{
+ struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+ struct domain *d;
+ struct p2m_domain *p2m;
+ int i, rv;
+
+ if (cr3 == 0 || cr3 == CR3_EADDR)
+ cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+ if (nv->nv_flushp2m && nv->nv_p2m) {
+ nv->nv_p2m = NULL;
+ }
+
+ d = v->domain;
+ nestedp2m_lock(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m = d->arch.nested_p2m[i];
+ if ((p2m->cr3 != cr3 && p2m->cr3 != CR3_EADDR) || (p2m != nv->nv_p2m))
+ continue;
+
+ nv->nv_flushp2m = 0;
+ p2m_getlru_nestedp2m(d, p2m);
+ nv->nv_p2m = p2m;
+ if (p2m->cr3 == CR3_EADDR)
+ hvm_asid_flush_vcpu(v);
+ p2m->cr3 = cr3;
+ cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+ nestedp2m_unlock(d);
+ return p2m;
+ }
+
+ /* All p2m's are or were in use. Take the least recent used one,
+ * flush it and reuse.
+ */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m = p2m_getlru_nestedp2m(d, NULL);
+ rv = p2m_flush_locked(p2m);
+ if (rv == 0)
+ break;
+ }
+ nv->nv_p2m = p2m;
+ p2m->cr3 = cr3;
+ nv->nv_flushp2m = 0;
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(nv->nv_p2m);
+ cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+ nestedp2m_unlock(d);
+
+ return p2m;
+}
+
+struct p2m_domain *
+p2m_get_p2m(struct vcpu *v)
+{
+ if (!nestedhvm_is_n2(v))
+ return p2m_get_hostp2m(v->domain);
+
+ return p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+}
+
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+ unsigned long va,
+ uint32_t *pfec)
+{
+ struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain);
+ const struct paging_mode *hostmode = paging_get_hostmode(v);
+
+ if ( is_hvm_domain(v->domain)
+ && paging_mode_hap(v->domain)
+ && nestedhvm_is_n2(v) )
+ {
+ unsigned long gfn;
+ struct p2m_domain *p2m;
+ const struct paging_mode *mode;
+ uint64_t ncr3 = nhvm_vcpu_hostcr3(v);
+
+ /* translate l2 guest va into l2 guest gfn */
+ p2m = p2m_get_nestedp2m(v, ncr3);
+ mode = paging_get_nestedmode(v);
+ gfn = mode->gva_to_gfn(v, p2m, va, pfec);
+
+ /* translate l2 guest gfn into l1 guest gfn */
+ return hostmode->p2m_ga_to_gfn(v, hostp2m, ncr3,
+ gfn << PAGE_SHIFT, pfec);
+ }
+
+ return hostmode->gva_to_gfn(v, hostp2m, va, pfec);
+}
+
/*
* Local variables:
* mode: C
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -26,6 +26,7 @@
#include <asm/p2m.h>
#include <asm/hap.h>
#include <asm/guest_access.h>
+#include <asm/hvm/nestedhvm.h>
#include <xen/numa.h>
#include <xsm/xsm.h>
@@ -851,21 +852,58 @@ void paging_dump_vcpu_info(struct vcpu *
printk(" paging assistance: ");
if ( paging_mode_shadow(v->domain) )
{
- if ( v->arch.paging.mode )
+ if ( paging_get_hostmode(v) )
printk("shadowed %u-on-%u\n",
- v->arch.paging.mode->guest_levels,
- v->arch.paging.mode->shadow.shadow_levels);
+ paging_get_hostmode(v)->guest_levels,
+ paging_get_hostmode(v)->shadow.shadow_levels);
else
printk("not shadowed\n");
}
- else if ( paging_mode_hap(v->domain) && v->arch.paging.mode )
+ else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) )
printk("hap, %u levels\n",
- v->arch.paging.mode->guest_levels);
+ paging_get_hostmode(v)->guest_levels);
else
printk("none\n");
}
}
+const struct paging_mode *paging_get_mode(struct vcpu *v)
+{
+ if (!nestedhvm_is_n2(v))
+ return paging_get_hostmode(v);
+
+ return paging_get_nestedmode(v);
+}
+
+extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+
+void paging_update_nestedmode(struct vcpu *v)
+{
+ ASSERT(nestedhvm_enabled(v->domain));
+ if (nestedhvm_paging_mode_hap(v))
+ /* nested-on-nested */
+ v->arch.paging.nestedmode = hap_paging_get_mode(v);
+ else
+ /* TODO: shadow-on-shadow */
+ v->arch.paging.nestedmode = NULL;
+}
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn,
+ l1_pgentry_t new, unsigned int level)
+{
+ struct domain *d = p2m->domain;
+ struct vcpu *v = current;
+ if ( v->domain != d )
+ v = d->vcpu ? d->vcpu[0] : NULL;
+ if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) )
+ {
+ return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
+ new, level);
+ }
+ else
+ safe_write_pte(p, new);
+}
/*
* Local variables:
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3768,7 +3768,8 @@ sh_invlpg(struct vcpu *v, unsigned long
static unsigned long
-sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
+sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
+ unsigned long va, uint32_t *pfec)
/* Called to translate a guest virtual address to what the *guest*
* pagetables would map it to. */
{
@@ -4820,7 +4821,7 @@ static mfn_t emulate_gva_to_mfn(struct v
struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
/* Translate the VA to a GFN */
- gfn = sh_gva_to_gfn(v, vaddr, &pfec);
+ gfn = sh_gva_to_gfn(v, p2m, vaddr, &pfec);
if ( gfn == INVALID_GFN )
{
if ( is_hvm_vcpu(v) )
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -210,6 +210,8 @@ struct paging_domain {
struct paging_vcpu {
/* Pointers to mode-specific entry points. */
const struct paging_mode *mode;
+ /* Nested Virtualization: paging mode of nested guest */
+ const struct paging_mode *nestedmode;
/* HVM guest: last emulate was to a pagetable */
unsigned int last_write_was_pt:1;
/* HVM guest: last write emulation succeeds */
@@ -225,6 +227,7 @@ struct paging_vcpu {
#define MAX_CPUID_INPUT 40
typedef xen_domctl_cpuid_t cpuid_input_t;
+#define MAX_NESTEDP2M 10
struct p2m_domain;
struct time_scale {
int shift;
@@ -258,6 +261,12 @@ struct arch_domain
struct paging_domain paging;
struct p2m_domain *p2m;
+ /* nestedhvm: translate l2 guest physical to host physical */
+ struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
+ spinlock_t nested_p2m_lock;
+ int nested_p2m_locker;
+ const char *nested_p2m_function;
+
/* NB. protected by d->event_lock and by irq_desc[irq].lock */
int *irq_pirq;
int *pirq_irq;
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -374,12 +374,12 @@ static inline void hvm_set_info_guest(st
int hvm_debug_op(struct vcpu *v, int32_t op);
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
- bool_t gla_valid, unsigned long gla,
- bool_t access_valid,
- bool_t access_r,
- bool_t access_w,
- bool_t access_x);
+int hvm_hap_nested_page_fault(unsigned long gpa,
+ bool_t gla_valid, unsigned long gla,
+ bool_t access_valid,
+ bool_t access_r,
+ bool_t access_w,
+ bool_t access_x);
#define hvm_msr_tsc_aux(v) ({ \
struct domain *__d = (v)->domain; \
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/hvm/nestedhvm.h
--- a/xen/include/asm-x86/hvm/nestedhvm.h
+++ b/xen/include/asm-x86/hvm/nestedhvm.h
@@ -60,4 +60,9 @@ unsigned long *nestedhvm_vcpu_iomap_get(
#define nestedhvm_vmswitch_in_progress(v) \
(!!vcpu_nestedhvm((v)).nv_vmswitch_in_progress)
+void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m);
+void nestedhvm_vmcx_flushtlbdomain(struct domain *d);
+
+bool_t nestedhvm_is_n2(struct vcpu *v);
+
#endif /* _HVM_NESTEDHVM_H */
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -199,7 +199,15 @@ struct p2m_domain {
/* Shadow translated domain: p2m mapping */
pagetable_t phys_table;
+ /* Same as domain_dirty_cpumask but limited to
+ * this p2m and those physical cpus whose vcpu's are in
+ * guestmode.
+ */
+ cpumask_t p2m_dirty_cpumask;
+
struct domain *domain; /* back pointer to domain */
+#define CR3_EADDR (~0ULL)
+ uint64_t cr3; /* to identify this p2m for re-use */
/* Pages used to construct the p2m */
struct page_list_head pages;
@@ -223,6 +231,11 @@ struct p2m_domain {
p2m_type_t ot,
p2m_type_t nt);
+ void (*write_p2m_entry)(struct p2m_domain *p2m,
+ unsigned long gfn, l1_pgentry_t *p,
+ mfn_t table_mfn, l1_pgentry_t new,
+ unsigned int level);
+
/* Default P2M access type for each page in the the domain: new pages,
* swapped in pages, cleared pages, and pages that are ambiquously
* retyped get this access type. See definition of p2m_access_t. */
@@ -264,8 +277,26 @@ struct p2m_domain {
/* get host p2m table */
#define p2m_get_hostp2m(d) ((d)->arch.p2m)
+/* Get p2m table (re)usable for specified cr3.
+ * Automatically destroys and re-initializes a p2m if none found.
+ * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used.
+ */
+struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3);
+
+/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m().
+ * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m().
+ */
+struct p2m_domain *p2m_get_p2m(struct vcpu *v);
+
+#define p2m_is_nestedp2m(p2m) ((p2m) != p2m_get_hostp2m((p2m->domain)))
+
#define p2m_get_pagetable(p2m) ((p2m)->phys_table)
+/* Flushes specified p2m table */
+void p2m_flush(struct vcpu *v, struct p2m_domain *p2m);
+/* Flushes all nested p2m tables */
+void p2m_flush_nestedp2m(struct domain *d);
+
/*
* The P2M lock. This protects all updates to the p2m table.
* Updates are expected to be safe against concurrent reads,
@@ -307,6 +338,38 @@ struct p2m_domain {
(current->processor == (_p2m)->locker)
+#define nestedp2m_lock_init(_domain) \
+ do { \
+ spin_lock_init(&(_domain)->arch.nested_p2m_lock); \
+ (_domain)->arch.nested_p2m_locker = -1; \
+ (_domain)->arch.nested_p2m_function = "nobody"; \
+ } while (0)
+
+#define nestedp2m_locked_by_me(_domain) \
+ (current->processor == (_domain)->arch.nested_p2m_locker)
+
+#define nestedp2m_lock(_domain) \
+ do { \
+ if ( nestedp2m_locked_by_me(_domain) ) \
+ { \
+ printk("Error: p2m lock held by %s\n", \
+ (_domain)->arch.nested_p2m_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_domain)->arch.nested_p2m_lock); \
+ ASSERT((_domain)->arch.nested_p2m_locker == -1); \
+ (_domain)->arch.nested_p2m_locker = current->processor; \
+ (_domain)->arch.nested_p2m_function = __func__; \
+ } while (0)
+
+#define nestedp2m_unlock(_domain) \
+ do { \
+ ASSERT(nestedp2m_locked_by_me(_domain)); \
+ (_domain)->arch.nested_p2m_locker = -1; \
+ (_domain)->arch.nested_p2m_function = "nobody"; \
+ spin_unlock(&(_domain)->arch.nested_p2m_lock); \
+ } while (0)
+
/* Extract the type from the PTE flags that store it */
static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
{
@@ -424,11 +487,21 @@ static inline unsigned long mfn_to_gfn(s
/* Init the datastructures for later use by the p2m code */
int p2m_init(struct domain *d);
+/* PTE flags for various types of p2m entry */
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn);
+
/* Allocate a new p2m table for a domain.
*
* Returns 0 for success or -errno. */
int p2m_alloc_table(struct p2m_domain *p2m);
+/* Find the next level's P2M entry, checking for out-of-range gfn's...
+ * Returns NULL on error.
+ */
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, uint32_t shift, uint32_t max);
+
/* Return all the p2m resources to Xen. */
void p2m_teardown(struct p2m_domain *p2m);
void p2m_final_teardown(struct domain *d);
@@ -502,6 +575,8 @@ p2m_type_t p2m_change_type(struct p2m_do
int set_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn);
int clear_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn);
+void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level);
#ifdef __x86_64__
/* Modify p2m table for shared gfn */
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -108,8 +108,14 @@ struct paging_mode {
int (*page_fault )(struct vcpu *v, unsigned long va,
struct cpu_user_regs *regs);
int (*invlpg )(struct vcpu *v, unsigned long va);
- unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va,
+ unsigned long (*gva_to_gfn )(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long va,
uint32_t *pfec);
+ unsigned long (*p2m_ga_to_gfn )(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
void (*update_cr3 )(struct vcpu *v, int do_locking);
void (*update_paging_modes )(struct vcpu *v);
void (*write_p2m_entry )(struct vcpu *v, unsigned long gfn,
@@ -219,6 +225,10 @@ void paging_final_teardown(struct domain
* creation. */
int paging_enable(struct domain *d, u32 mode);
+#define paging_get_hostmode(v) ((v)->arch.paging.mode)
+#define paging_get_nestedmode(v) ((v)->arch.paging.nestedmode)
+const struct paging_mode *paging_get_mode(struct vcpu *v);
+void paging_update_nestedmode(struct vcpu *v);
/* Page fault handler
* Called from pagefault handler in Xen, and from the HVM trap handlers
@@ -233,7 +243,7 @@ static inline int
paging_fault(unsigned long va, struct cpu_user_regs *regs)
{
struct vcpu *v = current;
- return v->arch.paging.mode->page_fault(v, va, regs);
+ return paging_get_hostmode(v)->page_fault(v, va, regs);
}
/* Handle invlpg requests on vcpus.
@@ -241,7 +251,7 @@ paging_fault(unsigned long va, struct cp
* or 0 if it's safe not to do so. */
static inline int paging_invlpg(struct vcpu *v, unsigned long va)
{
- return v->arch.paging.mode->invlpg(v, va);
+ return paging_get_hostmode(v)->invlpg(v, va);
}
/* Translate a guest virtual address to the frame number that the
@@ -251,11 +261,30 @@ static inline int paging_invlpg(struct v
* walking the tables. The caller should set the PFEC_page_present bit
* in pfec[0]; in the failure case, that bit will be cleared if appropriate. */
#define INVALID_GFN (-1UL)
-static inline unsigned long paging_gva_to_gfn(struct vcpu *v,
- unsigned long va,
- uint32_t *pfec)
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+ unsigned long va,
+ uint32_t *pfec);
+
+/* Translates a guest virtual address to guest physical address
+ * where the specified cr3 is translated to host physical address
+ * using the specified p2m table.
+ * This allows to do page walks in the guest or even in the nested guest.
+ * It returns the guest's gfn or the nested guest's gfn.
+ * Use 'paddr_t' for the guest address so it won't overflow when
+ * guest or nested guest is in 32bit PAE mode.
+ */
+static inline unsigned long paging_p2m_ga_to_gfn(struct vcpu *v,
+ struct p2m_domain *p2m,
+ const struct paging_mode *mode,
+ unsigned long cr3,
+ paddr_t ga,
+ uint32_t *pfec)
{
- return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
+ if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) )
+ return mode->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec);
+
+ /* shadow paging */
+ return paging_gva_to_gfn(v, ga, pfec);
}
/* Update all the things that are derived from the guest's CR3.
@@ -263,7 +292,7 @@ static inline unsigned long paging_gva_t
* as the value to load into the host CR3 to schedule this vcpu */
static inline void paging_update_cr3(struct vcpu *v)
{
- v->arch.paging.mode->update_cr3(v, 1);
+ paging_get_hostmode(v)->update_cr3(v, 1);
}
/* Update all the things that are derived from the guest's CR0/CR3/CR4.
@@ -271,7 +300,7 @@ static inline void paging_update_cr3(str
* has changed, and when bringing up a VCPU for the first time. */
static inline void paging_update_paging_modes(struct vcpu *v)
{
- v->arch.paging.mode->update_paging_modes(v);
+ paging_get_hostmode(v)->update_paging_modes(v);
}
@@ -283,7 +312,7 @@ static inline int paging_write_guest_ent
{
if ( unlikely(paging_mode_enabled(v->domain)
&& v->arch.paging.mode != NULL) )
- return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn);
+ return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn);
else
return (!__copy_to_user(p, &new, sizeof(new)));
}
@@ -299,7 +328,7 @@ static inline int paging_cmpxchg_guest_e
{
if ( unlikely(paging_mode_enabled(v->domain)
&& v->arch.paging.mode != NULL) )
- return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn);
+ return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn);
else
return (!cmpxchg_user(p, *old, new));
}
@@ -327,21 +356,11 @@ static inline void safe_write_pte(l1_pge
* a pointer to the entry to be written, the MFN in which the entry resides,
* the new contents of the entry, and the level in the p2m tree at which
* we are writing. */
-static inline void paging_write_p2m_entry(struct domain *d, unsigned long gfn,
- l1_pgentry_t *p, mfn_t table_mfn,
- l1_pgentry_t new, unsigned int level)
-{
- struct vcpu *v = current;
- if ( v->domain != d )
- v = d->vcpu ? d->vcpu[0] : NULL;
- if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) )
- {
- return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn,
- new, level);
- }
- else
- safe_write_pte(p, new);
-}
+struct p2m_domain;
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn,
+ l1_pgentry_t new, unsigned int level);
/* Called from the guest to indicate that the a process is being
* torn down and its pagetables will soon be discarded */
@@ -362,7 +381,7 @@ guest_map_l1e(struct vcpu *v, unsigned l
l2_pgentry_t l2e;
if ( unlikely(paging_mode_translate(v->domain)) )
- return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn);
+ return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);
/* Find this l1e and its enclosing l1mfn in the linear map */
if ( __copy_from_user(&l2e,
@@ -398,7 +417,7 @@ guest_get_eff_l1e(struct vcpu *v, unsign
return;
}
- v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e);
+ paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e);
}
/* Read the guest's l1e that maps this address, from the kernel-mode
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2011-03-09 14:31 [PATCH 12/12] Nested Virtualization: hap-on-hap Christoph Egger
@ 2011-03-22 14:59 ` Tim Deegan
2011-03-31 15:25 ` Christoph Egger
1 sibling, 0 replies; 11+ messages in thread
From: Tim Deegan @ 2011-03-22 14:59 UTC (permalink / raw)
To: Christoph Egger; +Cc: xen-devel@lists.xensource.com
Hi,
Looks like you've sorted out shooting down old users of a p2m table.
hap_write_p2m_entry still isn't right, though:
> @@ -834,38 +864,81 @@ static void
> hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
> mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
> {
> - uint32_t old_flags;
> + struct domain *d = v->domain;
> + uint32_t old_flags = l1e_get_flags(*p);
You have moved this read outside the hap_lock. Please put it back.
> + p2m_type_t op2mt = p2m_flags_to_type(old_flags);
>
> - hap_lock(v->domain);
> + /* We know always use the host p2m here, regardless if the vcpu
> + * is in host or guest mode. The vcpu can be in guest mode by
> + * a hypercall which passes a domain and chooses mostly the first
> + * vcpu.
> + * XXX This is the reason why this function can not be used re-used
> + * for updating the nestedp2m. Otherwise, hypercalls would randomly
> + * operate on host p2m and nested p2m.
> + */
> + if ( nestedhvm_enabled(d)
> + && p2m_is_valid(op2mt) )
> + {
> + if ( l1e_get_intpte(new) != l1e_get_intpte(*p) ) {
> + p2m_type_t np2mt = p2m_flags_to_type(l1e_get_flags(new));
>
> - old_flags = l1e_get_flags(*p);
> + /* Skip flush on vram tracking or XP mode in Win7 hang
> + * very early in the virtual BIOS (long before the bootloader
> + * runs), otherwise. VRAM tracking happens so often that
> + * flushing and fixing the nestedp2m doesn't let XP mode
> + * proceed to boot.
> + */
> + if ( !((op2mt == p2m_ram_rw && np2mt == p2m_ram_logdirty)
> + || (op2mt == p2m_ram_logdirty && np2mt == p2m_ram_rw)) )
That's not safe. If the MFN has changed, you _have_ to flush, even if
you're replacing a logdirty entry with a r/w one. And if you're
replacing a r/w entry with a logdirty one, you _have_ to flush or
logdirty doesn't work correctly. If that case is too slow then you
should batch the flushes somehow, not just skip them.
Cheers,
Tim.
> + {
> + /* This GFN -> MFN is going to get removed. */
> + /* XXX There is a more efficient way to do that
> + * but it works for now.
> + * Note, p2m_flush_nestedp2m calls hap_lock() internally.
> + */
> + p2m_flush_nestedp2m(d);
> + }
> + }
> + }
> +
> + hap_lock(d);
> +
> safe_write_pte(p, new);
> if ( (old_flags & _PAGE_PRESENT)
> && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
> - flush_tlb_mask(&v->domain->domain_dirty_cpumask);
> + flush_tlb_mask(&d->domain_dirty_cpumask);
>
> #if CONFIG_PAGING_LEVELS == 3
> /* install P2M in monitor table for PAE Xen */
> if ( level == 3 )
> /* We have written to the p2m l3: need to sync the per-vcpu
> * copies of it in the monitor tables */
> - p2m_install_entry_in_monitors(v->domain, (l3_pgentry_t *)p);
> + p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
> #endif
>
> - hap_unlock(v->domain);
> + hap_unlock(d);
> }
--
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Xen Platform Team
Citrix Systems UK Ltd. (Company #02937203, SL9 0BG)
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2011-03-09 14:31 [PATCH 12/12] Nested Virtualization: hap-on-hap Christoph Egger
2011-03-22 14:59 ` Tim Deegan
@ 2011-03-31 15:25 ` Christoph Egger
2011-04-05 15:48 ` Christoph Egger
1 sibling, 1 reply; 11+ messages in thread
From: Christoph Egger @ 2011-03-31 15:25 UTC (permalink / raw)
To: xen-devel; +Cc: Tim Deegan
[-- Attachment #1: Type: text/plain, Size: 337 bytes --]
This is the new version. I fixed the open items from Tim's last review.
--
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
[-- Attachment #2: xen_nh12_haphap.diff --]
[-- Type: text/x-diff, Size: 65861 bytes --]
# HG changeset patch
# User cegger
# Date 1299677057 -3600
Implement Nested-on-Nested.
This allows the guest to run nested guest with hap enabled.
Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1170,21 +1170,50 @@ void hvm_inject_exception(unsigned int t
hvm_funcs.inject_exception(trapnr, errcode, cr2);
}
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
- bool_t gla_valid,
- unsigned long gla,
- bool_t access_valid,
- bool_t access_r,
- bool_t access_w,
- bool_t access_x)
+int hvm_hap_nested_page_fault(unsigned long gpa,
+ bool_t gla_valid,
+ unsigned long gla,
+ bool_t access_valid,
+ bool_t access_r,
+ bool_t access_w,
+ bool_t access_x)
{
unsigned long gfn = gpa >> PAGE_SHIFT;
p2m_type_t p2mt;
p2m_access_t p2ma;
mfn_t mfn;
struct vcpu *v = current;
- struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
-
+ struct p2m_domain *p2m = NULL;
+
+ /* On Nested Virtualization, walk the guest page table.
+ * If this succeeds, all is fine.
+ * If this fails, inject a nested page fault into the guest.
+ */
+ if ( nestedhvm_enabled(v->domain)
+ && nestedhvm_vcpu_in_guestmode(v)
+ && nestedhvm_paging_mode_hap(v) )
+ {
+ int rv;
+
+ /* The vcpu is in guest mode and the l1 guest
+ * uses hap. That means 'gpa' is in l2 guest
+ * physical address space.
+ * Fix the nested p2m or inject nested page fault
+ * into l1 guest if not fixable. The algorithm is
+ * the same as for shadow paging.
+ */
+ rv = nestedhvm_hap_nested_page_fault(v, gpa);
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_DONE:
+ return 1;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return 0;
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return -1;
+ }
+ }
+
+ p2m = p2m_get_hostp2m(v->domain);
mfn = gfn_to_mfn_type_current(p2m, gfn, &p2mt, &p2ma, p2m_guest);
/* Check access permissions first, then handle faults */
@@ -1328,6 +1357,15 @@ int hvm_set_efer(uint64_t value)
return X86EMUL_EXCEPTION;
}
+ if ( nestedhvm_enabled(v->domain) && cpu_has_svm &&
+ ((value & EFER_SVME) == 0 ) &&
+ ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) )
+ {
+ /* Cleared EFER.SVME: Flush all nestedp2m tables */
+ p2m_flush_nestedp2m(v->domain);
+ nestedhvm_vcpu_reset(v);
+ }
+
value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
v->arch.hvm_vcpu.guest_efer = value;
hvm_update_guest_efer(v);
@@ -1478,8 +1516,12 @@ int hvm_set_cr0(unsigned long value)
v->arch.hvm_vcpu.guest_cr[0] = value;
hvm_update_guest_cr(v, 0);
- if ( (value ^ old_value) & X86_CR0_PG )
- paging_update_paging_modes(v);
+ if ( (value ^ old_value) & X86_CR0_PG ) {
+ if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+ paging_update_nestedmode(v);
+ else
+ paging_update_paging_modes(v);
+ }
return X86EMUL_OKAY;
@@ -1546,8 +1588,12 @@ int hvm_set_cr4(unsigned long value)
hvm_update_guest_cr(v, 4);
/* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
- if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
- paging_update_paging_modes(v);
+ if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) {
+ if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+ paging_update_nestedmode(v);
+ else
+ paging_update_paging_modes(v);
+ }
return X86EMUL_OKAY;
@@ -2060,7 +2106,7 @@ static enum hvm_copy_result __hvm_copy(
void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
{
struct vcpu *curr = current;
- struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+ struct p2m_domain *p2m;
unsigned long gfn, mfn;
p2m_type_t p2mt;
char *p;
@@ -2082,6 +2128,8 @@ static enum hvm_copy_result __hvm_copy(
return HVMCOPY_unhandleable;
#endif
+ p2m = p2m_get_hostp2m(curr->domain);
+
while ( todo > 0 )
{
count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/nestedhvm.c
--- a/xen/arch/x86/hvm/nestedhvm.c
+++ b/xen/arch/x86/hvm/nestedhvm.c
@@ -20,6 +20,7 @@
#include <asm/msr.h>
#include <asm/hvm/support.h> /* for HVM_DELIVER_NO_ERROR_CODE */
#include <asm/hvm/hvm.h>
+#include <asm/p2m.h> /* for struct p2m_domain */
#include <asm/hvm/nestedhvm.h>
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
#include <asm/paging.h> /* for paging_mode_hap() */
@@ -96,6 +97,54 @@ nestedhvm_vcpu_destroy(struct vcpu *v)
return nhvm_vcpu_destroy(v);
}
+static void
+nestedhvm_flushtlb_ipi(void *info)
+{
+ struct vcpu *v = current;
+ /* Just flush the ASID (or request a new one).
+ * This is cheaper than flush_tlb_local() and has
+ * the same desired effect.
+ */
+ hvm_asid_flush_core();
+ vcpu_nestedhvm(v).nv_p2m = NULL;
+}
+
+void
+nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m)
+{
+ int cpu = smp_processor_id();
+ if (cpu_isset(cpu, p2m->p2m_dirty_cpumask)) {
+ /* Avoid sending an IPI to myself.
+ * This is necessary to avoid a deadlock as this path
+ * can run with GIF disabled on AMD machines.
+ */
+ nestedhvm_flushtlb_ipi(NULL);
+ cpu_clear(cpu, p2m->p2m_dirty_cpumask);
+ }
+ on_selected_cpus(&p2m->p2m_dirty_cpumask, nestedhvm_flushtlb_ipi, NULL, 1);
+ cpus_clear(p2m->p2m_dirty_cpumask);
+}
+
+void
+nestedhvm_vmcx_flushtlbdomain(struct domain *d)
+{
+ on_selected_cpus(&d->domain_dirty_cpumask, nestedhvm_flushtlb_ipi, NULL, 1);
+}
+
+bool_t
+nestedhvm_is_n2(struct vcpu *v)
+{
+ if (!nestedhvm_enabled(v->domain)
+ || nestedhvm_vmswitch_in_progress(v)
+ || !nestedhvm_paging_mode_hap(v))
+ return 0;
+
+ if (nestedhvm_vcpu_in_guestmode(v))
+ return 1;
+
+ return 0;
+}
+
/* Common shadow IO Permission bitmap */
/* There four global patterns of io bitmap each guest can
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/svm/entry.S
--- a/xen/arch/x86/hvm/svm/entry.S
+++ b/xen/arch/x86/hvm/svm/entry.S
@@ -65,6 +65,7 @@ ENTRY(svm_asm_do_resume)
testl $~0,(r(dx),r(ax),1)
jnz .Lsvm_process_softirqs
+ call nsvm_p2m_handle_vmrun
call svm_asid_handle_vmrun
cmpb $0,addr_of(tb_init_done)
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/svm/nestedsvm.c
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -26,6 +26,7 @@
#include <asm/hvm/svm/svmdebug.h>
#include <asm/paging.h> /* paging_mode_hap */
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
+#include <asm/p2m.h> /* p2m_get_pagetable, p2m_get_nestedp2m */
static void
nestedsvm_vcpu_clgi(struct vcpu *v)
@@ -320,6 +321,18 @@ static int nsvm_vmrun_permissionmap(stru
return 0;
}
+static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v,
+ struct vmcb_struct *vvmcb, struct vmcb_struct *n2vmcb)
+{
+ struct p2m_domain *p2m;
+
+ ASSERT(v != NULL);
+ ASSERT(vvmcb != NULL);
+ ASSERT(n2vmcb != NULL);
+ p2m = p2m_get_nestedp2m(v, vvmcb->_h_cr3);
+ n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m));
+}
+
static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
{
struct nestedvcpu *nv = &vcpu_nestedhvm(v);
@@ -475,6 +488,9 @@ static int nsvm_vmcb_prepare4vmrun(struc
/* Nested paging mode */
if (nestedhvm_paging_mode_hap(v)) {
/* host nested paging + guest nested paging. */
+ n2vmcb->_np_enable = 1;
+
+ nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
/* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
rc = hvm_set_cr3(ns_vmcb->_cr3);
@@ -1323,6 +1339,27 @@ asmlinkage void nsvm_vcpu_switch(struct
}
}
+/* This runs after VMRUN emulation. It is possible
+ * that an other (v)cpu flushed the assigned nestedp2m
+ * during the VMRUN emulation. Check for this case
+ * and get a new one.
+ * Caller must ensure the GIF is cleared.
+ */
+asmlinkage void nsvm_p2m_handle_vmrun(void)
+{
+ struct vcpu *v = current;
+ struct nestedvcpu *nv;
+
+ if (!nestedhvm_enabled(v->domain))
+ return;
+
+ nv = &vcpu_nestedhvm(v);
+ if (nv->nv_p2m == NULL
+ && nestedhvm_vcpu_in_guestmode(v)
+ && nestedhvm_paging_mode_hap(v))
+ nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx);
+}
+
/* Interrupts, Virtual GIF */
int
nestedsvm_vcpu_interrupt(struct vcpu *v, const struct hvm_intack intack)
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1014,14 +1014,16 @@ struct hvm_function_table * __init start
return &svm_function_table;
}
-static void svm_do_nested_pgfault(paddr_t gpa)
+static void svm_do_nested_pgfault(struct vcpu *v,
+ struct cpu_user_regs *regs, paddr_t gpa)
{
+ int ret;
unsigned long gfn = gpa >> PAGE_SHIFT;
mfn_t mfn;
p2m_type_t p2mt;
- struct p2m_domain *p2m;
+ struct p2m_domain *p2m = NULL;
- p2m = p2m_get_hostp2m(current->domain);
+ ret = hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0);
if ( tb_init_done )
{
@@ -1032,6 +1034,7 @@ static void svm_do_nested_pgfault(paddr_
uint32_t p2mt;
} _d;
+ p2m = p2m_get_p2m(v);
_d.gpa = gpa;
_d.qualification = 0;
_d.mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &_d.p2mt));
@@ -1039,14 +1042,26 @@ static void svm_do_nested_pgfault(paddr_
__trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
}
- if ( hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0) )
+ switch (ret) {
+ case 0:
+ break;
+ case 1:
return;
+ case -1:
+ ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
+ /* inject #VMEXIT(NPF) into guest. */
+ nestedsvm_vmexit_defer(v, VMEXIT_NPF, regs->error_code, gpa);
+ return;
+ }
+ if ( p2m == NULL )
+ p2m = p2m_get_p2m(v);
/* Everything else is an error. */
mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
- gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
- gpa, mfn_x(mfn), p2mt);
- domain_crash(current->domain);
+ gdprintk(XENLOG_ERR,
+ "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
+ gpa, mfn_x(mfn), p2mt);
+ domain_crash(v->domain);
}
static void svm_fpu_dirty_intercept(void)
@@ -1659,6 +1674,8 @@ asmlinkage void svm_vmexit_handler(struc
struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
uint64_t exitinfo1, exitinfo2;
+ paging_update_nestedmode(v);
+
/* Write real exitinfo1 back into virtual vmcb.
* nestedsvm_check_intercepts() expects to have the correct
* exitinfo1 value there.
@@ -1948,7 +1965,7 @@ asmlinkage void svm_vmexit_handler(struc
case VMEXIT_NPF:
perfc_incra(svmexits, VMEXIT_NPF_PERFC);
regs->error_code = vmcb->exitinfo1;
- svm_do_nested_pgfault(vmcb->exitinfo2);
+ svm_do_nested_pgfault(v, regs, vmcb->exitinfo2);
break;
case VMEXIT_IRET: {
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile
+++ b/xen/arch/x86/mm/hap/Makefile
@@ -3,6 +3,7 @@ obj-y += guest_walk_2level.o
obj-y += guest_walk_3level.o
obj-y += guest_walk_4level.o
obj-y += p2m-ept.o
+obj-y += nested_hap.o
guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/guest_walk.c
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -29,24 +29,32 @@
#define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
#define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
+#define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
+#define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
+
#if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
#include <asm/guest_pt.h>
#include <asm/p2m.h>
unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
- unsigned long cr3;
+ unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];
+ return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec);
+}
+
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
uint32_t missing;
mfn_t top_mfn;
void *top_map;
p2m_type_t p2mt;
walk_t gw;
- struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
/* Get the top-level table's MFN */
- cr3 = v->arch.hvm_vcpu.guest_cr[3];
top_mfn = gfn_to_mfn_unshare(p2m, cr3 >> PAGE_SHIFT, &p2mt, 0);
if ( p2m_is_paging(p2mt) )
{
@@ -72,7 +80,7 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
#if GUEST_PAGING_LEVELS == 3
top_map += (cr3 & ~(PAGE_MASK | 31));
#endif
- missing = guest_walk_tables(v, p2m, gva, &gw, pfec[0], top_mfn, top_map);
+ missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map);
unmap_domain_page(top_map);
/* Interpret the answer */
@@ -122,6 +130,15 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
return INVALID_GFN;
}
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
+ gdprintk(XENLOG_ERR,
+ "Guest paging level is greater than host paging level!\n");
+ domain_crash(v->domain);
+ return INVALID_GFN;
+}
#endif
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -40,6 +40,7 @@
#include <asm/p2m.h>
#include <asm/domain.h>
#include <xen/numa.h>
+#include <asm/hvm/nestedhvm.h>
#include "private.h"
@@ -582,6 +583,7 @@ void hap_domain_init(struct domain *d)
int hap_enable(struct domain *d, u32 mode)
{
unsigned int old_pages;
+ uint8_t i;
int rv = 0;
domain_pause(d);
@@ -620,6 +622,12 @@ int hap_enable(struct domain *d, u32 mod
goto out;
}
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ rv = p2m_alloc_table(d->arch.nested_p2m[i]);
+ if ( rv != 0 )
+ goto out;
+ }
+
/* Now let other users see the new mode */
d->arch.paging.mode = mode | PG_HAP_enable;
@@ -630,6 +638,13 @@ int hap_enable(struct domain *d, u32 mod
void hap_final_teardown(struct domain *d)
{
+ uint8_t i;
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m_teardown(d->arch.nested_p2m[i]);
+ }
+
if ( d->arch.paging.hap.total_pages != 0 )
hap_teardown(d);
@@ -657,7 +672,7 @@ void hap_teardown(struct domain *d)
/* release the monitor table held by each vcpu */
for_each_vcpu ( d, v )
{
- if ( v->arch.paging.mode && paging_mode_external(d) )
+ if ( paging_get_hostmode(v) && paging_mode_external(d) )
{
mfn = pagetable_get_mfn(v->arch.monitor_table);
if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
@@ -725,6 +740,7 @@ static const struct paging_mode hap_pagi
void hap_vcpu_init(struct vcpu *v)
{
v->arch.paging.mode = &hap_paging_real_mode;
+ v->arch.paging.nestedmode = &hap_paging_real_mode;
}
/************************************************/
@@ -751,6 +767,15 @@ static int hap_page_fault(struct vcpu *v
*/
static int hap_invlpg(struct vcpu *v, unsigned long va)
{
+ if (nestedhvm_enabled(v->domain)) {
+ /* Emulate INVLPGA:
+ * Must perform the flush right now or an other vcpu may
+ * use it when we use the next VMRUN emulation, otherwise.
+ */
+ p2m_flush(v, vcpu_nestedhvm(v).nv_p2m);
+ return 1;
+ }
+
HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n",
v->domain->domain_id, v->vcpu_id);
domain_crash(v->domain);
@@ -763,17 +788,22 @@ static void hap_update_cr3(struct vcpu *
hvm_update_guest_cr(v, 3);
}
+const struct paging_mode *
+hap_paging_get_mode(struct vcpu *v)
+{
+ return !hvm_paging_enabled(v) ? &hap_paging_real_mode :
+ hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
+ hvm_pae_enabled(v) ? &hap_paging_pae_mode :
+ &hap_paging_protected_mode;
+}
+
static void hap_update_paging_modes(struct vcpu *v)
{
struct domain *d = v->domain;
hap_lock(d);
- v->arch.paging.mode =
- !hvm_paging_enabled(v) ? &hap_paging_real_mode :
- hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
- hvm_pae_enabled(v) ? &hap_paging_pae_mode :
- &hap_paging_protected_mode;
+ v->arch.paging.mode = hap_paging_get_mode(v);
if ( pagetable_is_null(v->arch.monitor_table) )
{
@@ -834,38 +864,81 @@ static void
hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
{
- uint32_t old_flags;
+ struct domain *d = v->domain;
+ uint32_t old_flags = l1e_get_flags(*p);
+ p2m_type_t op2mt = p2m_flags_to_type(old_flags);
- hap_lock(v->domain);
+ /* We know always use the host p2m here, regardless if the vcpu
+ * is in host or guest mode. The vcpu can be in guest mode by
+ * a hypercall which passes a domain and chooses mostly the first
+ * vcpu.
+ * XXX This is the reason why this function can not be used re-used
+ * for updating the nestedp2m. Otherwise, hypercalls would randomly
+ * operate on host p2m and nested p2m.
+ */
+ if ( nestedhvm_enabled(d)
+ && p2m_is_valid(op2mt) )
+ {
+ if ( l1e_get_intpte(new) != l1e_get_intpte(*p) ) {
+ p2m_type_t np2mt = p2m_flags_to_type(l1e_get_flags(new));
- old_flags = l1e_get_flags(*p);
+ /* Skip flush on vram tracking or XP mode in Win7 hang
+ * very early in the virtual BIOS (long before the bootloader
+ * runs), otherwise. VRAM tracking happens so often that
+ * flushing and fixing the nestedp2m doesn't let XP mode
+ * proceed to boot.
+ */
+ if ( !((op2mt == p2m_ram_rw && np2mt == p2m_ram_logdirty)
+ || (op2mt == p2m_ram_logdirty && np2mt == p2m_ram_rw)) )
+ {
+ /* This GFN -> MFN is going to get removed. */
+ /* XXX There is a more efficient way to do that
+ * but it works for now.
+ * Note, p2m_flush_nestedp2m calls hap_lock() internally.
+ */
+ p2m_flush_nestedp2m(d);
+ }
+ }
+ }
+
+ hap_lock(d);
+
safe_write_pte(p, new);
if ( (old_flags & _PAGE_PRESENT)
&& (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
- flush_tlb_mask(&v->domain->domain_dirty_cpumask);
+ flush_tlb_mask(&d->domain_dirty_cpumask);
#if CONFIG_PAGING_LEVELS == 3
/* install P2M in monitor table for PAE Xen */
if ( level == 3 )
/* We have written to the p2m l3: need to sync the per-vcpu
* copies of it in the monitor tables */
- p2m_install_entry_in_monitors(v->domain, (l3_pgentry_t *)p);
+ p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
#endif
- hap_unlock(v->domain);
+ hap_unlock(d);
}
static unsigned long hap_gva_to_gfn_real_mode(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
return ((paddr_t)gva >> PAGE_SHIFT);
}
+static unsigned long hap_p2m_ga_to_gfn_real_mode(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
+ return (ga >> PAGE_SHIFT);
+}
+
+
/* Entry points into this mode of the hap code. */
static const struct paging_mode hap_paging_real_mode = {
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_real_mode,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_real_mode,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -876,6 +949,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_2_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_2_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -886,6 +960,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_3_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_3_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -896,6 +971,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_4_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_4_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/nested_hap.c
--- /dev/null
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -0,0 +1,236 @@
+/******************************************************************************
+ * arch/x86/mm/hap/nested_hap.c
+ *
+ * Code for Nested Virtualization
+ * Copyright (c) 2011 Advanced Micro Devices
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/support.h>
+
+#include <asm/hvm/nestedhvm.h>
+
+#include "private.h"
+
+/* AlGORITHM for NESTED PAGE FAULT
+ *
+ * NOTATION
+ * Levels: L0, L1, L2
+ * Guests: L1 guest, L2 guest
+ * Hypervisor: L0 hypervisor
+ * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA
+ *
+ * On L0, when #NPF happens, the handler function should do:
+ * hap_page_fault(GPA)
+ * {
+ * 1. If #NPF is from L1 guest, then we crash the guest VM (same as old
+ * code)
+ * 2. If #NPF is from L2 guest, then we continue from (3)
+ * 3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space.
+ * 4. Walk the h_cr3 page table
+ * 5. - if not present, then we inject #NPF back to L1 guest and
+ * re-launch L1 guest (L1 guest will either treat this #NPF as MMIO,
+ * or fix its p2m table for L2 guest)
+ * 6. - if present, then we will get the a new translated value L1-GPA
+ * (points to L1 machine memory)
+ * 7. * Use L1-GPA to walk L0 P2M table
+ * 8. - if not present, then crash the guest (should not happen)
+ * 9. - if present, then we get a new translated value MPA
+ * (points to real machine memory)
+ * 10. * Finally, use GPA and MPA to walk nested_p2m
+ * and fix the bits.
+ * }
+ *
+ */
+
+
+/********************************************/
+/* NESTED VIRT P2M FUNCTIONS */
+/********************************************/
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+void
+nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
+{
+ struct domain *d = p2m->domain;
+ uint32_t old_flags;
+
+ hap_lock(d);
+
+ old_flags = l1e_get_flags(*p);
+ safe_write_pte(p, new);
+ if (old_flags & _PAGE_PRESENT)
+ nestedhvm_vmcx_flushtlb(p2m);
+
+ hap_unlock(d);
+}
+
+/********************************************/
+/* NESTED VIRT FUNCTIONS */
+/********************************************/
+static void
+nestedhap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa,
+ p2m_type_t p2mt, p2m_access_t p2ma)
+{
+ int rv;
+ ASSERT(p2m);
+ ASSERT(p2m->set_entry);
+
+ rv = p2m->set_entry(p2m, L2_gpa >> PAGE_SHIFT,
+ page_to_mfn(maddr_to_page(L0_gpa)),
+ 0 /*4K*/, p2mt, p2ma);
+ if (rv == 0) {
+ gdprintk(XENLOG_ERR,
+ "failed to set entry for 0x%"PRIx64" -> 0x%"PRIx64"\n",
+ L2_gpa, L0_gpa);
+ BUG();
+ }
+}
+
+/* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the
+ * walk is successful, the translated value is returned in L0_gpa. The return
+ * value tells the upper level what to do.
+ */
+static int
+nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa)
+{
+ mfn_t mfn;
+ p2m_type_t p2mt;
+
+ /* we use gfn_to_mfn_query() function to walk L0 P2M table */
+ mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt);
+
+ if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) )
+ return NESTEDHVM_PAGEFAULT_ERROR;
+
+ if ( !mfn_valid(mfn) )
+ return NESTEDHVM_PAGEFAULT_ERROR;
+
+ *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/* This function uses L2_gpa to walk the P2M page table in L1. If the
+ * walk is successful, the translated value is returned in
+ * L1_gpa. The result value tells what to do next.
+ */
+static int
+nestedhap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m,
+ paddr_t L2_gpa, paddr_t *L1_gpa)
+{
+ uint32_t pfec;
+ unsigned long nested_cr3, gfn;
+ const struct paging_mode *mode = paging_get_hostmode(v);
+
+ nested_cr3 = nhvm_vcpu_hostcr3(v);
+
+ /* walk the guest table */
+ gfn = paging_p2m_ga_to_gfn(v, p2m, mode, nested_cr3, L2_gpa, &pfec);
+
+ if ( gfn == INVALID_GFN )
+ return NESTEDHVM_PAGEFAULT_INJECT;
+
+ *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/*
+ * The following function, nestedhap_page_fault(), is for steps (3)--(10).
+ *
+ * Returns:
+ */
+int
+nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
+{
+ int rv;
+ paddr_t L1_gpa, L0_gpa;
+ struct domain *d = v->domain;
+ struct p2m_domain *p2m, *nested_p2m;
+
+ p2m = p2m_get_hostp2m(d); /* L0 p2m */
+ nested_p2m = p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+
+ /* walk the L1 P2M table, note we have to pass p2m
+ * and not nested_p2m here or we fail the walk forever,
+ * otherwise. */
+ rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
+
+ /* let caller to handle these two cases */
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_DONE:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ /* ==> we have to walk L0 P2M */
+ rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
+
+ /* let upper level caller to handle these two cases */
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_DONE:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ nestedp2m_lock(d);
+ /* fix p2m_get_pagetable(nested_p2m) */
+ nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa,
+ p2m_ram_rw,
+ p2m_access_rwx /* FIXME: Should use same permission as l1 guest */);
+ nestedp2m_unlock(d);
+
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/********************************************/
+/* NESTED VIRT INITIALIZATION FUNCS */
+/********************************************/
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/hap/private.h
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -23,11 +23,27 @@
/********************************************/
/* GUEST TRANSLATION FUNCS */
/********************************************/
-unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
-unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
-unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+
#endif /* __HAP_PRIVATE_H__ */
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
#include <public/mem_event.h>
#include <asm/mem_sharing.h>
#include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
/* Debugging and auditing of the P2M code? */
#define P2M_AUDIT 0
@@ -75,7 +76,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
#define SUPERPAGE_PAGES (1UL << 9)
#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
{
unsigned long flags;
#ifdef __x86_64__
@@ -121,9 +122,9 @@ static void audit_p2m(struct p2m_domain
// Find the next level's P2M entry, checking for out-of-range gfn's...
// Returns NULL on error.
//
-static l1_pgentry_t *
+l1_pgentry_t *
p2m_find_entry(void *table, unsigned long *gfn_remainder,
- unsigned long gfn, u32 shift, u32 max)
+ unsigned long gfn, uint32_t shift, uint32_t max)
{
u32 index;
@@ -224,20 +225,17 @@ p2m_next_level(struct p2m_domain *p2m, m
switch ( type ) {
case PGT_l3_page_table:
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 4);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
break;
case PGT_l2_page_table:
#if CONFIG_PAGING_LEVELS == 3
/* for PAE mode, PDPE only has PCD/PWT/P bits available */
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
#endif
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
break;
case PGT_l1_page_table:
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
break;
default:
BUG();
@@ -264,14 +262,13 @@ p2m_next_level(struct p2m_domain *p2m, m
for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
{
new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
- paging_write_p2m_entry(p2m->domain, gfn,
- l1_entry+i, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 2);
}
unmap_domain_page(l1_entry);
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
__PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
}
@@ -298,15 +295,15 @@ p2m_next_level(struct p2m_domain *p2m, m
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
new_entry = l1e_from_pfn(pfn + i, flags);
- paging_write_p2m_entry(p2m->domain, gfn,
- l1_entry+i, *table_mfn, new_entry, 1);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 1);
}
unmap_domain_page(l1_entry);
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
__PAGE_HYPERVISOR|_PAGE_USER);
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ p2m_entry, *table_mfn, new_entry, 2);
}
*table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
@@ -1369,8 +1366,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
: l3e_empty();
entry_content.l1 = l3e_content.l3;
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
/* Free old intermediate tables if necessary */
@@ -1410,8 +1406,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
entry_content = l1e_empty();
/* level 1 entry */
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 1);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
}
else if ( page_order == 9 )
@@ -1440,8 +1435,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
l2e_content = l2e_empty();
entry_content.l1 = l2e_content.l2;
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 2);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
/* Free old intermediate tables if necessary */
@@ -1806,10 +1800,13 @@ static void p2m_initialise(struct domain
p2m->domain = d;
p2m->default_access = p2m_access_rwx;
+ p2m->cr3 = CR3_EADDR;
p2m->set_entry = p2m_set_entry;
p2m->get_entry = p2m_gfn_to_mfn;
p2m->get_entry_current = p2m_gfn_to_mfn_current;
p2m->change_entry_type_global = p2m_change_type_global;
+ p2m->write_p2m_entry = paging_write_p2m_entry;
+ cpus_clear(p2m->p2m_dirty_cpumask);
if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
ept_p2m_init(d);
@@ -1817,6 +1814,25 @@ static void p2m_initialise(struct domain
return;
}
+static int
+p2m_init_nestedp2m(struct domain *d)
+{
+ uint8_t i;
+ struct p2m_domain *p2m;
+
+ nestedp2m_lock_init(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
+ if (p2m == NULL)
+ return -ENOMEM;
+ p2m_initialise(d, p2m);
+ p2m->get_entry_current = p2m->get_entry;
+ p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+ }
+
+ return 0;
+}
+
int p2m_init(struct domain *d)
{
struct p2m_domain *p2m;
@@ -1825,8 +1841,12 @@ int p2m_init(struct domain *d)
if ( p2m == NULL )
return -ENOMEM;
p2m_initialise(d, p2m);
-
- return 0;
+
+ /* Must initialise nestedp2m unconditionally
+ * since nestedhvm_enabled(d) returns false here.
+ * (p2m_init runs too early for HVM_PARAM_* options)
+ */
+ return p2m_init_nestedp2m(d);
}
void p2m_change_entry_type_global(struct p2m_domain *p2m,
@@ -1919,6 +1939,9 @@ int p2m_alloc_table(struct p2m_domain *p
p2m_invalid, p2m->default_access) )
goto error;
+ if (p2m_is_nestedp2m(p2m))
+ goto nesteddone;
+
/* Copy all existing mappings from the page list and m2p */
spin_lock(&p2m->domain->page_alloc_lock);
page_list_for_each(page, &p2m->domain->page_list)
@@ -1940,6 +1963,7 @@ int p2m_alloc_table(struct p2m_domain *p
}
spin_unlock(&p2m->domain->page_alloc_lock);
+ nesteddone:
P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
p2m_unlock(p2m);
return 0;
@@ -1966,6 +1990,9 @@ void p2m_teardown(struct p2m_domain *p2m
mfn_t mfn;
#endif
+ if (p2m == NULL)
+ return;
+
p2m_lock(p2m);
#ifdef __x86_64__
@@ -1984,11 +2011,26 @@ void p2m_teardown(struct p2m_domain *p2m
p2m_unlock(p2m);
}
+static void p2m_teardown_nestedp2m(struct domain *d)
+{
+ uint8_t i;
+
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ xfree(d->arch.nested_p2m[i]);
+ d->arch.nested_p2m[i] = NULL;
+ }
+}
+
void p2m_final_teardown(struct domain *d)
{
/* Iterate over all p2m tables per domain */
xfree(d->arch.p2m);
d->arch.p2m = NULL;
+
+ /* We must teardown unconditionally because
+ * we initialise them unconditionally.
+ */
+ p2m_teardown_nestedp2m(d);
}
#if P2M_AUDIT
@@ -2573,9 +2615,9 @@ void p2m_change_type_global(struct p2m_d
gfn = get_gpfn_from_mfn(mfn);
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- paging_write_p2m_entry(p2m->domain, gfn,
- (l1_pgentry_t *)&l3e[i3],
- l3mfn, l1e_content, 3);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l3e[i3],
+ l3mfn, l1e_content, 3);
continue;
}
@@ -2604,9 +2646,9 @@ void p2m_change_type_global(struct p2m_d
* L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- paging_write_p2m_entry(p2m->domain, gfn,
- (l1_pgentry_t *)&l2e[i2],
- l2mfn, l1e_content, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l2e[i2],
+ l2mfn, l1e_content, 2);
continue;
}
@@ -2628,8 +2670,8 @@ void p2m_change_type_global(struct p2m_d
/* create a new 1le entry with the new type */
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags);
- paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
- l1mfn, l1e_content, 1);
+ p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+ l1mfn, l1e_content, 1);
}
unmap_domain_page(l1e);
}
@@ -3048,6 +3090,179 @@ void p2m_mem_access_resume(struct p2m_do
}
#endif /* __x86_64__ */
+static struct p2m_domain *
+p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
+{
+ int i, lru_index = -1;
+ struct p2m_domain *lrup2m, *tmp;
+
+ if (p2m == NULL) {
+ lru_index = MAX_NESTEDP2M - 1;
+ lrup2m = d->arch.nested_p2m[lru_index];
+ } else {
+ lrup2m = p2m;
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ if (d->arch.nested_p2m[i] == p2m) {
+ lru_index = i;
+ break;
+ }
+ }
+ }
+
+ ASSERT(lru_index >= 0);
+ if (lru_index == 0) {
+ return lrup2m;
+ }
+
+ /* move the other's down the array "list" */
+ for (i = lru_index - 1; i >= 0; i--) {
+ tmp = d->arch.nested_p2m[i];
+ d->arch.nested_p2m[i+1] = tmp;
+ }
+
+ /* make the entry the first one */
+ d->arch.nested_p2m[0] = lrup2m;
+
+ return lrup2m;
+}
+
+static int
+p2m_flush_locked(struct p2m_domain *p2m)
+{
+ ASSERT(p2m);
+ if (p2m->cr3 == CR3_EADDR)
+ /* Microoptimisation: p2m is already empty.
+ * => about 0.3% speedup of overall system performance.
+ */
+ return 0;
+
+ p2m_teardown(p2m);
+ p2m_initialise(p2m->domain, p2m);
+ p2m->get_entry_current = p2m->get_entry;
+ p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+ return p2m_alloc_table(p2m);
+}
+
+void
+p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
+{
+ struct domain *d = p2m->domain;
+
+ ASSERT(v->domain == d);
+ vcpu_nestedhvm(v).nv_p2m = NULL;
+ nestedp2m_lock(d);
+ BUG_ON(p2m_flush_locked(p2m) != 0);
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(p2m);
+ nestedp2m_unlock(d);
+}
+
+void
+p2m_flush_nestedp2m(struct domain *d)
+{
+ int i;
+
+ nestedp2m_lock(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ struct p2m_domain *p2m = d->arch.nested_p2m[i];
+ BUG_ON(p2m_flush_locked(p2m) != 0);
+ cpus_clear(p2m->p2m_dirty_cpumask);
+ }
+ nestedhvm_vmcx_flushtlbdomain(d);
+ nestedp2m_unlock(d);
+}
+
+struct p2m_domain *
+p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
+{
+ struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+ struct domain *d;
+ struct p2m_domain *p2m;
+ int i, rv;
+
+ if (cr3 == 0 || cr3 == CR3_EADDR)
+ cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+ if (nv->nv_flushp2m && nv->nv_p2m) {
+ nv->nv_p2m = NULL;
+ }
+
+ d = v->domain;
+ nestedp2m_lock(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m = d->arch.nested_p2m[i];
+ if ((p2m->cr3 != cr3 && p2m->cr3 != CR3_EADDR) || (p2m != nv->nv_p2m))
+ continue;
+
+ nv->nv_flushp2m = 0;
+ p2m_getlru_nestedp2m(d, p2m);
+ nv->nv_p2m = p2m;
+ if (p2m->cr3 == CR3_EADDR)
+ hvm_asid_flush_vcpu(v);
+ p2m->cr3 = cr3;
+ cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+ nestedp2m_unlock(d);
+ return p2m;
+ }
+
+ /* All p2m's are or were in use. Take the least recent used one,
+ * flush it and reuse.
+ */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m = p2m_getlru_nestedp2m(d, NULL);
+ rv = p2m_flush_locked(p2m);
+ if (rv == 0)
+ break;
+ }
+ nv->nv_p2m = p2m;
+ p2m->cr3 = cr3;
+ nv->nv_flushp2m = 0;
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(nv->nv_p2m);
+ cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+ nestedp2m_unlock(d);
+
+ return p2m;
+}
+
+struct p2m_domain *
+p2m_get_p2m(struct vcpu *v)
+{
+ if (!nestedhvm_is_n2(v))
+ return p2m_get_hostp2m(v->domain);
+
+ return p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+}
+
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+ unsigned long va,
+ uint32_t *pfec)
+{
+ struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain);
+ const struct paging_mode *hostmode = paging_get_hostmode(v);
+
+ if ( is_hvm_domain(v->domain)
+ && paging_mode_hap(v->domain)
+ && nestedhvm_is_n2(v) )
+ {
+ unsigned long gfn;
+ struct p2m_domain *p2m;
+ const struct paging_mode *mode;
+ uint64_t ncr3 = nhvm_vcpu_hostcr3(v);
+
+ /* translate l2 guest va into l2 guest gfn */
+ p2m = p2m_get_nestedp2m(v, ncr3);
+ mode = paging_get_nestedmode(v);
+ gfn = mode->gva_to_gfn(v, p2m, va, pfec);
+
+ /* translate l2 guest gfn into l1 guest gfn */
+ return hostmode->p2m_ga_to_gfn(v, hostp2m, ncr3,
+ gfn << PAGE_SHIFT, pfec);
+ }
+
+ return hostmode->gva_to_gfn(v, hostp2m, va, pfec);
+}
+
/*
* Local variables:
* mode: C
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -26,6 +26,7 @@
#include <asm/p2m.h>
#include <asm/hap.h>
#include <asm/guest_access.h>
+#include <asm/hvm/nestedhvm.h>
#include <xen/numa.h>
#include <xsm/xsm.h>
@@ -851,21 +852,58 @@ void paging_dump_vcpu_info(struct vcpu *
printk(" paging assistance: ");
if ( paging_mode_shadow(v->domain) )
{
- if ( v->arch.paging.mode )
+ if ( paging_get_hostmode(v) )
printk("shadowed %u-on-%u\n",
- v->arch.paging.mode->guest_levels,
- v->arch.paging.mode->shadow.shadow_levels);
+ paging_get_hostmode(v)->guest_levels,
+ paging_get_hostmode(v)->shadow.shadow_levels);
else
printk("not shadowed\n");
}
- else if ( paging_mode_hap(v->domain) && v->arch.paging.mode )
+ else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) )
printk("hap, %u levels\n",
- v->arch.paging.mode->guest_levels);
+ paging_get_hostmode(v)->guest_levels);
else
printk("none\n");
}
}
+const struct paging_mode *paging_get_mode(struct vcpu *v)
+{
+ if (!nestedhvm_is_n2(v))
+ return paging_get_hostmode(v);
+
+ return paging_get_nestedmode(v);
+}
+
+extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+
+void paging_update_nestedmode(struct vcpu *v)
+{
+ ASSERT(nestedhvm_enabled(v->domain));
+ if (nestedhvm_paging_mode_hap(v))
+ /* nested-on-nested */
+ v->arch.paging.nestedmode = hap_paging_get_mode(v);
+ else
+ /* TODO: shadow-on-shadow */
+ v->arch.paging.nestedmode = NULL;
+}
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn,
+ l1_pgentry_t new, unsigned int level)
+{
+ struct domain *d = p2m->domain;
+ struct vcpu *v = current;
+ if ( v->domain != d )
+ v = d->vcpu ? d->vcpu[0] : NULL;
+ if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) )
+ {
+ return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
+ new, level);
+ }
+ else
+ safe_write_pte(p, new);
+}
/*
* Local variables:
diff -r 3ab405e67be6 -r 98598880e482 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -3768,7 +3768,8 @@ sh_invlpg(struct vcpu *v, unsigned long
static unsigned long
-sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
+sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
+ unsigned long va, uint32_t *pfec)
/* Called to translate a guest virtual address to what the *guest*
* pagetables would map it to. */
{
@@ -4820,7 +4821,7 @@ static mfn_t emulate_gva_to_mfn(struct v
struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
/* Translate the VA to a GFN */
- gfn = sh_gva_to_gfn(v, vaddr, &pfec);
+ gfn = sh_gva_to_gfn(v, p2m, vaddr, &pfec);
if ( gfn == INVALID_GFN )
{
if ( is_hvm_vcpu(v) )
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -210,6 +210,8 @@ struct paging_domain {
struct paging_vcpu {
/* Pointers to mode-specific entry points. */
const struct paging_mode *mode;
+ /* Nested Virtualization: paging mode of nested guest */
+ const struct paging_mode *nestedmode;
/* HVM guest: last emulate was to a pagetable */
unsigned int last_write_was_pt:1;
/* HVM guest: last write emulation succeeds */
@@ -225,6 +227,7 @@ struct paging_vcpu {
#define MAX_CPUID_INPUT 40
typedef xen_domctl_cpuid_t cpuid_input_t;
+#define MAX_NESTEDP2M 10
struct p2m_domain;
struct time_scale {
int shift;
@@ -258,6 +261,12 @@ struct arch_domain
struct paging_domain paging;
struct p2m_domain *p2m;
+ /* nestedhvm: translate l2 guest physical to host physical */
+ struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
+ spinlock_t nested_p2m_lock;
+ int nested_p2m_locker;
+ const char *nested_p2m_function;
+
/* NB. protected by d->event_lock and by irq_desc[irq].lock */
int *irq_pirq;
int *pirq_irq;
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -374,12 +374,12 @@ static inline void hvm_set_info_guest(st
int hvm_debug_op(struct vcpu *v, int32_t op);
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
- bool_t gla_valid, unsigned long gla,
- bool_t access_valid,
- bool_t access_r,
- bool_t access_w,
- bool_t access_x);
+int hvm_hap_nested_page_fault(unsigned long gpa,
+ bool_t gla_valid, unsigned long gla,
+ bool_t access_valid,
+ bool_t access_r,
+ bool_t access_w,
+ bool_t access_x);
#define hvm_msr_tsc_aux(v) ({ \
struct domain *__d = (v)->domain; \
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/hvm/nestedhvm.h
--- a/xen/include/asm-x86/hvm/nestedhvm.h
+++ b/xen/include/asm-x86/hvm/nestedhvm.h
@@ -60,4 +60,9 @@ unsigned long *nestedhvm_vcpu_iomap_get(
#define nestedhvm_vmswitch_in_progress(v) \
(!!vcpu_nestedhvm((v)).nv_vmswitch_in_progress)
+void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m);
+void nestedhvm_vmcx_flushtlbdomain(struct domain *d);
+
+bool_t nestedhvm_is_n2(struct vcpu *v);
+
#endif /* _HVM_NESTEDHVM_H */
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -199,7 +199,15 @@ struct p2m_domain {
/* Shadow translated domain: p2m mapping */
pagetable_t phys_table;
+ /* Same as domain_dirty_cpumask but limited to
+ * this p2m and those physical cpus whose vcpu's are in
+ * guestmode.
+ */
+ cpumask_t p2m_dirty_cpumask;
+
struct domain *domain; /* back pointer to domain */
+#define CR3_EADDR (~0ULL)
+ uint64_t cr3; /* to identify this p2m for re-use */
/* Pages used to construct the p2m */
struct page_list_head pages;
@@ -223,6 +231,11 @@ struct p2m_domain {
p2m_type_t ot,
p2m_type_t nt);
+ void (*write_p2m_entry)(struct p2m_domain *p2m,
+ unsigned long gfn, l1_pgentry_t *p,
+ mfn_t table_mfn, l1_pgentry_t new,
+ unsigned int level);
+
/* Default P2M access type for each page in the the domain: new pages,
* swapped in pages, cleared pages, and pages that are ambiquously
* retyped get this access type. See definition of p2m_access_t. */
@@ -264,8 +277,26 @@ struct p2m_domain {
/* get host p2m table */
#define p2m_get_hostp2m(d) ((d)->arch.p2m)
+/* Get p2m table (re)usable for specified cr3.
+ * Automatically destroys and re-initializes a p2m if none found.
+ * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used.
+ */
+struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3);
+
+/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m().
+ * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m().
+ */
+struct p2m_domain *p2m_get_p2m(struct vcpu *v);
+
+#define p2m_is_nestedp2m(p2m) ((p2m) != p2m_get_hostp2m((p2m->domain)))
+
#define p2m_get_pagetable(p2m) ((p2m)->phys_table)
+/* Flushes specified p2m table */
+void p2m_flush(struct vcpu *v, struct p2m_domain *p2m);
+/* Flushes all nested p2m tables */
+void p2m_flush_nestedp2m(struct domain *d);
+
/*
* The P2M lock. This protects all updates to the p2m table.
* Updates are expected to be safe against concurrent reads,
@@ -307,6 +338,38 @@ struct p2m_domain {
(current->processor == (_p2m)->locker)
+#define nestedp2m_lock_init(_domain) \
+ do { \
+ spin_lock_init(&(_domain)->arch.nested_p2m_lock); \
+ (_domain)->arch.nested_p2m_locker = -1; \
+ (_domain)->arch.nested_p2m_function = "nobody"; \
+ } while (0)
+
+#define nestedp2m_locked_by_me(_domain) \
+ (current->processor == (_domain)->arch.nested_p2m_locker)
+
+#define nestedp2m_lock(_domain) \
+ do { \
+ if ( nestedp2m_locked_by_me(_domain) ) \
+ { \
+ printk("Error: p2m lock held by %s\n", \
+ (_domain)->arch.nested_p2m_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_domain)->arch.nested_p2m_lock); \
+ ASSERT((_domain)->arch.nested_p2m_locker == -1); \
+ (_domain)->arch.nested_p2m_locker = current->processor; \
+ (_domain)->arch.nested_p2m_function = __func__; \
+ } while (0)
+
+#define nestedp2m_unlock(_domain) \
+ do { \
+ ASSERT(nestedp2m_locked_by_me(_domain)); \
+ (_domain)->arch.nested_p2m_locker = -1; \
+ (_domain)->arch.nested_p2m_function = "nobody"; \
+ spin_unlock(&(_domain)->arch.nested_p2m_lock); \
+ } while (0)
+
/* Extract the type from the PTE flags that store it */
static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
{
@@ -424,11 +487,21 @@ static inline unsigned long mfn_to_gfn(s
/* Init the datastructures for later use by the p2m code */
int p2m_init(struct domain *d);
+/* PTE flags for various types of p2m entry */
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn);
+
/* Allocate a new p2m table for a domain.
*
* Returns 0 for success or -errno. */
int p2m_alloc_table(struct p2m_domain *p2m);
+/* Find the next level's P2M entry, checking for out-of-range gfn's...
+ * Returns NULL on error.
+ */
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, uint32_t shift, uint32_t max);
+
/* Return all the p2m resources to Xen. */
void p2m_teardown(struct p2m_domain *p2m);
void p2m_final_teardown(struct domain *d);
@@ -502,6 +575,8 @@ p2m_type_t p2m_change_type(struct p2m_do
int set_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn);
int clear_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn);
+void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level);
#ifdef __x86_64__
/* Modify p2m table for shared gfn */
diff -r 3ab405e67be6 -r 98598880e482 xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -108,8 +108,14 @@ struct paging_mode {
int (*page_fault )(struct vcpu *v, unsigned long va,
struct cpu_user_regs *regs);
int (*invlpg )(struct vcpu *v, unsigned long va);
- unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va,
+ unsigned long (*gva_to_gfn )(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long va,
uint32_t *pfec);
+ unsigned long (*p2m_ga_to_gfn )(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
void (*update_cr3 )(struct vcpu *v, int do_locking);
void (*update_paging_modes )(struct vcpu *v);
void (*write_p2m_entry )(struct vcpu *v, unsigned long gfn,
@@ -219,6 +225,10 @@ void paging_final_teardown(struct domain
* creation. */
int paging_enable(struct domain *d, u32 mode);
+#define paging_get_hostmode(v) ((v)->arch.paging.mode)
+#define paging_get_nestedmode(v) ((v)->arch.paging.nestedmode)
+const struct paging_mode *paging_get_mode(struct vcpu *v);
+void paging_update_nestedmode(struct vcpu *v);
/* Page fault handler
* Called from pagefault handler in Xen, and from the HVM trap handlers
@@ -233,7 +243,7 @@ static inline int
paging_fault(unsigned long va, struct cpu_user_regs *regs)
{
struct vcpu *v = current;
- return v->arch.paging.mode->page_fault(v, va, regs);
+ return paging_get_hostmode(v)->page_fault(v, va, regs);
}
/* Handle invlpg requests on vcpus.
@@ -241,7 +251,7 @@ paging_fault(unsigned long va, struct cp
* or 0 if it's safe not to do so. */
static inline int paging_invlpg(struct vcpu *v, unsigned long va)
{
- return v->arch.paging.mode->invlpg(v, va);
+ return paging_get_hostmode(v)->invlpg(v, va);
}
/* Translate a guest virtual address to the frame number that the
@@ -251,11 +261,30 @@ static inline int paging_invlpg(struct v
* walking the tables. The caller should set the PFEC_page_present bit
* in pfec[0]; in the failure case, that bit will be cleared if appropriate. */
#define INVALID_GFN (-1UL)
-static inline unsigned long paging_gva_to_gfn(struct vcpu *v,
- unsigned long va,
- uint32_t *pfec)
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+ unsigned long va,
+ uint32_t *pfec);
+
+/* Translates a guest virtual address to guest physical address
+ * where the specified cr3 is translated to host physical address
+ * using the specified p2m table.
+ * This allows to do page walks in the guest or even in the nested guest.
+ * It returns the guest's gfn or the nested guest's gfn.
+ * Use 'paddr_t' for the guest address so it won't overflow when
+ * guest or nested guest is in 32bit PAE mode.
+ */
+static inline unsigned long paging_p2m_ga_to_gfn(struct vcpu *v,
+ struct p2m_domain *p2m,
+ const struct paging_mode *mode,
+ unsigned long cr3,
+ paddr_t ga,
+ uint32_t *pfec)
{
- return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
+ if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) )
+ return mode->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec);
+
+ /* shadow paging */
+ return paging_gva_to_gfn(v, ga, pfec);
}
/* Update all the things that are derived from the guest's CR3.
@@ -263,7 +292,7 @@ static inline unsigned long paging_gva_t
* as the value to load into the host CR3 to schedule this vcpu */
static inline void paging_update_cr3(struct vcpu *v)
{
- v->arch.paging.mode->update_cr3(v, 1);
+ paging_get_hostmode(v)->update_cr3(v, 1);
}
/* Update all the things that are derived from the guest's CR0/CR3/CR4.
@@ -271,7 +300,7 @@ static inline void paging_update_cr3(str
* has changed, and when bringing up a VCPU for the first time. */
static inline void paging_update_paging_modes(struct vcpu *v)
{
- v->arch.paging.mode->update_paging_modes(v);
+ paging_get_hostmode(v)->update_paging_modes(v);
}
@@ -283,7 +312,7 @@ static inline int paging_write_guest_ent
{
if ( unlikely(paging_mode_enabled(v->domain)
&& v->arch.paging.mode != NULL) )
- return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn);
+ return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn);
else
return (!__copy_to_user(p, &new, sizeof(new)));
}
@@ -299,7 +328,7 @@ static inline int paging_cmpxchg_guest_e
{
if ( unlikely(paging_mode_enabled(v->domain)
&& v->arch.paging.mode != NULL) )
- return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn);
+ return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn);
else
return (!cmpxchg_user(p, *old, new));
}
@@ -327,21 +356,11 @@ static inline void safe_write_pte(l1_pge
* a pointer to the entry to be written, the MFN in which the entry resides,
* the new contents of the entry, and the level in the p2m tree at which
* we are writing. */
-static inline void paging_write_p2m_entry(struct domain *d, unsigned long gfn,
- l1_pgentry_t *p, mfn_t table_mfn,
- l1_pgentry_t new, unsigned int level)
-{
- struct vcpu *v = current;
- if ( v->domain != d )
- v = d->vcpu ? d->vcpu[0] : NULL;
- if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) )
- {
- return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn,
- new, level);
- }
- else
- safe_write_pte(p, new);
-}
+struct p2m_domain;
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn,
+ l1_pgentry_t new, unsigned int level);
/* Called from the guest to indicate that the a process is being
* torn down and its pagetables will soon be discarded */
@@ -362,7 +381,7 @@ guest_map_l1e(struct vcpu *v, unsigned l
l2_pgentry_t l2e;
if ( unlikely(paging_mode_translate(v->domain)) )
- return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn);
+ return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);
/* Find this l1e and its enclosing l1mfn in the linear map */
if ( __copy_from_user(&l2e,
@@ -398,7 +417,7 @@ guest_get_eff_l1e(struct vcpu *v, unsign
return;
}
- v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e);
+ paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e);
}
/* Read the guest's l1e that maps this address, from the kernel-mode
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2011-03-31 15:25 ` Christoph Egger
@ 2011-04-05 15:48 ` Christoph Egger
2011-04-06 10:29 ` Tim Deegan
2011-04-29 9:03 ` Jan Beulich
0 siblings, 2 replies; 11+ messages in thread
From: Christoph Egger @ 2011-04-05 15:48 UTC (permalink / raw)
To: xen-devel@lists.xensource.com; +Cc: Tim Deegan
[-- Attachment #1: Type: text/plain, Size: 502 bytes --]
On 03/31/11 17:25, Christoph Egger wrote:
>
> This is the new version. I fixed the open items from Tim's last review.
Sorry, I mistakenly resent an older version and noticed it just now.
This time this is the latest version.
Christoph
--
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85689 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
[-- Attachment #2: xen_nh12_haphap.diff --]
[-- Type: text/plain, Size: 66809 bytes --]
# HG changeset patch
# User cegger
# Date 1302011049 -7200
Implement Nested-on-Nested.
This allows the guest to run nested guest with hap enabled.
Signed-off-by: Christoph Egger <Christoph.Egger@amd.com>
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1170,21 +1170,50 @@ void hvm_inject_exception(unsigned int t
hvm_funcs.inject_exception(trapnr, errcode, cr2);
}
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
- bool_t gla_valid,
- unsigned long gla,
- bool_t access_valid,
- bool_t access_r,
- bool_t access_w,
- bool_t access_x)
+int hvm_hap_nested_page_fault(unsigned long gpa,
+ bool_t gla_valid,
+ unsigned long gla,
+ bool_t access_valid,
+ bool_t access_r,
+ bool_t access_w,
+ bool_t access_x)
{
unsigned long gfn = gpa >> PAGE_SHIFT;
p2m_type_t p2mt;
p2m_access_t p2ma;
mfn_t mfn;
struct vcpu *v = current;
- struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
-
+ struct p2m_domain *p2m = NULL;
+
+ /* On Nested Virtualization, walk the guest page table.
+ * If this succeeds, all is fine.
+ * If this fails, inject a nested page fault into the guest.
+ */
+ if ( nestedhvm_enabled(v->domain)
+ && nestedhvm_vcpu_in_guestmode(v)
+ && nestedhvm_paging_mode_hap(v) )
+ {
+ int rv;
+
+ /* The vcpu is in guest mode and the l1 guest
+ * uses hap. That means 'gpa' is in l2 guest
+ * physical address space.
+ * Fix the nested p2m or inject nested page fault
+ * into l1 guest if not fixable. The algorithm is
+ * the same as for shadow paging.
+ */
+ rv = nestedhvm_hap_nested_page_fault(v, gpa);
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_DONE:
+ return 1;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return 0;
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return -1;
+ }
+ }
+
+ p2m = p2m_get_hostp2m(v->domain);
mfn = gfn_to_mfn_type_current(p2m, gfn, &p2mt, &p2ma, p2m_guest);
/* Check access permissions first, then handle faults */
@@ -1328,6 +1357,15 @@ int hvm_set_efer(uint64_t value)
return X86EMUL_EXCEPTION;
}
+ if ( nestedhvm_enabled(v->domain) && cpu_has_svm &&
+ ((value & EFER_SVME) == 0 ) &&
+ ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) )
+ {
+ /* Cleared EFER.SVME: Flush all nestedp2m tables */
+ p2m_flush_nestedp2m(v->domain);
+ nestedhvm_vcpu_reset(v);
+ }
+
value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA;
v->arch.hvm_vcpu.guest_efer = value;
hvm_update_guest_efer(v);
@@ -1478,8 +1516,12 @@ int hvm_set_cr0(unsigned long value)
v->arch.hvm_vcpu.guest_cr[0] = value;
hvm_update_guest_cr(v, 0);
- if ( (value ^ old_value) & X86_CR0_PG )
- paging_update_paging_modes(v);
+ if ( (value ^ old_value) & X86_CR0_PG ) {
+ if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+ paging_update_nestedmode(v);
+ else
+ paging_update_paging_modes(v);
+ }
return X86EMUL_OKAY;
@@ -1546,8 +1588,12 @@ int hvm_set_cr4(unsigned long value)
hvm_update_guest_cr(v, 4);
/* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. */
- if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) )
- paging_update_paging_modes(v);
+ if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) {
+ if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) )
+ paging_update_nestedmode(v);
+ else
+ paging_update_paging_modes(v);
+ }
return X86EMUL_OKAY;
@@ -2060,7 +2106,7 @@ static enum hvm_copy_result __hvm_copy(
void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec)
{
struct vcpu *curr = current;
- struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+ struct p2m_domain *p2m;
unsigned long gfn, mfn;
p2m_type_t p2mt;
char *p;
@@ -2082,6 +2128,8 @@ static enum hvm_copy_result __hvm_copy(
return HVMCOPY_unhandleable;
#endif
+ p2m = p2m_get_hostp2m(curr->domain);
+
while ( todo > 0 )
{
count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo);
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/hvm/nestedhvm.c
--- a/xen/arch/x86/hvm/nestedhvm.c
+++ b/xen/arch/x86/hvm/nestedhvm.c
@@ -20,6 +20,7 @@
#include <asm/msr.h>
#include <asm/hvm/support.h> /* for HVM_DELIVER_NO_ERROR_CODE */
#include <asm/hvm/hvm.h>
+#include <asm/p2m.h> /* for struct p2m_domain */
#include <asm/hvm/nestedhvm.h>
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
#include <asm/paging.h> /* for paging_mode_hap() */
@@ -96,6 +97,54 @@ nestedhvm_vcpu_destroy(struct vcpu *v)
return nhvm_vcpu_destroy(v);
}
+static void
+nestedhvm_flushtlb_ipi(void *info)
+{
+ struct vcpu *v = current;
+ struct domain *d = info;
+
+ ASSERT(d != NULL);
+ if (v->domain != d) {
+ /* This cpu doesn't belong to the domain */
+ return;
+ }
+
+ /* Just flush the ASID (or request a new one).
+ * This is cheaper than flush_tlb_local() and has
+ * the same desired effect.
+ */
+ hvm_asid_flush_core();
+ vcpu_nestedhvm(v).nv_p2m = NULL;
+}
+
+void
+nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m)
+{
+ on_selected_cpus(&p2m->p2m_dirty_cpumask, nestedhvm_flushtlb_ipi,
+ p2m->domain, 1);
+ cpus_clear(p2m->p2m_dirty_cpumask);
+}
+
+void
+nestedhvm_vmcx_flushtlbdomain(struct domain *d)
+{
+ on_selected_cpus(&d->domain_dirty_cpumask, nestedhvm_flushtlb_ipi, d, 1);
+}
+
+bool_t
+nestedhvm_is_n2(struct vcpu *v)
+{
+ if (!nestedhvm_enabled(v->domain)
+ || nestedhvm_vmswitch_in_progress(v)
+ || !nestedhvm_paging_mode_hap(v))
+ return 0;
+
+ if (nestedhvm_vcpu_in_guestmode(v))
+ return 1;
+
+ return 0;
+}
+
/* Common shadow IO Permission bitmap */
/* There four global patterns of io bitmap each guest can
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/hvm/svm/nestedsvm.c
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -26,6 +26,7 @@
#include <asm/hvm/svm/svmdebug.h>
#include <asm/paging.h> /* paging_mode_hap */
#include <asm/event.h> /* for local_event_delivery_(en|dis)able */
+#include <asm/p2m.h> /* p2m_get_pagetable, p2m_get_nestedp2m */
static void
nestedsvm_vcpu_clgi(struct vcpu *v)
@@ -320,6 +321,18 @@ static int nsvm_vmrun_permissionmap(stru
return 0;
}
+static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v,
+ struct vmcb_struct *vvmcb, struct vmcb_struct *n2vmcb)
+{
+ struct p2m_domain *p2m;
+
+ ASSERT(v != NULL);
+ ASSERT(vvmcb != NULL);
+ ASSERT(n2vmcb != NULL);
+ p2m = p2m_get_nestedp2m(v, vvmcb->_h_cr3);
+ n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m));
+}
+
static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs)
{
struct nestedvcpu *nv = &vcpu_nestedhvm(v);
@@ -475,6 +488,9 @@ static int nsvm_vmcb_prepare4vmrun(struc
/* Nested paging mode */
if (nestedhvm_paging_mode_hap(v)) {
/* host nested paging + guest nested paging. */
+ n2vmcb->_np_enable = 1;
+
+ nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
/* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
rc = hvm_set_cr3(ns_vmcb->_cr3);
@@ -1318,8 +1334,20 @@ asmlinkage void nsvm_vcpu_switch(struct
ret = nsvm_vcpu_vmrun(v, regs);
if (ret < 0)
goto vmexit;
+
+ ASSERT(nestedhvm_vcpu_in_guestmode(v));
nv->nv_vmentry_pending = 0;
- return;
+ }
+
+ if (nestedhvm_vcpu_in_guestmode(v)
+ && nestedhvm_paging_mode_hap(v))
+ {
+ /* In case left the l2 guest due to a physical interrupt (e.g. IPI)
+ * that is not for the l1 guest then we continue running the l2 guest
+ * but check if the nestedp2m is still valid.
+ */
+ if (nv->nv_p2m == NULL)
+ nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx);
}
}
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1014,14 +1014,16 @@ struct hvm_function_table * __init start
return &svm_function_table;
}
-static void svm_do_nested_pgfault(paddr_t gpa)
+static void svm_do_nested_pgfault(struct vcpu *v,
+ struct cpu_user_regs *regs, paddr_t gpa)
{
+ int ret;
unsigned long gfn = gpa >> PAGE_SHIFT;
mfn_t mfn;
p2m_type_t p2mt;
- struct p2m_domain *p2m;
+ struct p2m_domain *p2m = NULL;
- p2m = p2m_get_hostp2m(current->domain);
+ ret = hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0);
if ( tb_init_done )
{
@@ -1032,6 +1034,7 @@ static void svm_do_nested_pgfault(paddr_
uint32_t p2mt;
} _d;
+ p2m = p2m_get_p2m(v);
_d.gpa = gpa;
_d.qualification = 0;
_d.mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &_d.p2mt));
@@ -1039,14 +1042,26 @@ static void svm_do_nested_pgfault(paddr_
__trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
}
- if ( hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0) )
+ switch (ret) {
+ case 0:
+ break;
+ case 1:
return;
+ case -1:
+ ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
+ /* inject #VMEXIT(NPF) into guest. */
+ nestedsvm_vmexit_defer(v, VMEXIT_NPF, regs->error_code, gpa);
+ return;
+ }
+ if ( p2m == NULL )
+ p2m = p2m_get_p2m(v);
/* Everything else is an error. */
mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt);
- gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
- gpa, mfn_x(mfn), p2mt);
- domain_crash(current->domain);
+ gdprintk(XENLOG_ERR,
+ "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
+ gpa, mfn_x(mfn), p2mt);
+ domain_crash(v->domain);
}
static void svm_fpu_dirty_intercept(void)
@@ -1659,6 +1674,8 @@ asmlinkage void svm_vmexit_handler(struc
struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
uint64_t exitinfo1, exitinfo2;
+ paging_update_nestedmode(v);
+
/* Write real exitinfo1 back into virtual vmcb.
* nestedsvm_check_intercepts() expects to have the correct
* exitinfo1 value there.
@@ -1948,7 +1965,7 @@ asmlinkage void svm_vmexit_handler(struc
case VMEXIT_NPF:
perfc_incra(svmexits, VMEXIT_NPF_PERFC);
regs->error_code = vmcb->exitinfo1;
- svm_do_nested_pgfault(vmcb->exitinfo2);
+ svm_do_nested_pgfault(v, regs, vmcb->exitinfo2);
break;
case VMEXIT_IRET: {
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/Makefile
--- a/xen/arch/x86/mm/hap/Makefile
+++ b/xen/arch/x86/mm/hap/Makefile
@@ -3,6 +3,7 @@ obj-y += guest_walk_2level.o
obj-y += guest_walk_3level.o
obj-y += guest_walk_4level.o
obj-y += p2m-ept.o
+obj-y += nested_hap.o
guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/guest_walk.c
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -29,24 +29,32 @@
#define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels
#define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels)
+#define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels
+#define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels)
+
#if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS
#include <asm/guest_pt.h>
#include <asm/p2m.h>
unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
- unsigned long cr3;
+ unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3];
+ return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec);
+}
+
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
uint32_t missing;
mfn_t top_mfn;
void *top_map;
p2m_type_t p2mt;
walk_t gw;
- struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
/* Get the top-level table's MFN */
- cr3 = v->arch.hvm_vcpu.guest_cr[3];
top_mfn = gfn_to_mfn_unshare(p2m, cr3 >> PAGE_SHIFT, &p2mt, 0);
if ( p2m_is_paging(p2mt) )
{
@@ -72,7 +80,7 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
#if GUEST_PAGING_LEVELS == 3
top_map += (cr3 & ~(PAGE_MASK | 31));
#endif
- missing = guest_walk_tables(v, p2m, gva, &gw, pfec[0], top_mfn, top_map);
+ missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map);
unmap_domain_page(top_map);
/* Interpret the answer */
@@ -122,6 +130,15 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN
return INVALID_GFN;
}
+unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
+ gdprintk(XENLOG_ERR,
+ "Guest paging level is greater than host paging level!\n");
+ domain_crash(v->domain);
+ return INVALID_GFN;
+}
#endif
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/hap.c
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -40,6 +40,7 @@
#include <asm/p2m.h>
#include <asm/domain.h>
#include <xen/numa.h>
+#include <asm/hvm/nestedhvm.h>
#include "private.h"
@@ -582,6 +583,7 @@ void hap_domain_init(struct domain *d)
int hap_enable(struct domain *d, u32 mode)
{
unsigned int old_pages;
+ uint8_t i;
int rv = 0;
domain_pause(d);
@@ -620,6 +622,12 @@ int hap_enable(struct domain *d, u32 mod
goto out;
}
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ rv = p2m_alloc_table(d->arch.nested_p2m[i]);
+ if ( rv != 0 )
+ goto out;
+ }
+
/* Now let other users see the new mode */
d->arch.paging.mode = mode | PG_HAP_enable;
@@ -630,6 +638,13 @@ int hap_enable(struct domain *d, u32 mod
void hap_final_teardown(struct domain *d)
{
+ uint8_t i;
+
+ /* Destroy nestedp2m's first */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m_teardown(d->arch.nested_p2m[i]);
+ }
+
if ( d->arch.paging.hap.total_pages != 0 )
hap_teardown(d);
@@ -657,7 +672,7 @@ void hap_teardown(struct domain *d)
/* release the monitor table held by each vcpu */
for_each_vcpu ( d, v )
{
- if ( v->arch.paging.mode && paging_mode_external(d) )
+ if ( paging_get_hostmode(v) && paging_mode_external(d) )
{
mfn = pagetable_get_mfn(v->arch.monitor_table);
if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
@@ -725,6 +740,7 @@ static const struct paging_mode hap_pagi
void hap_vcpu_init(struct vcpu *v)
{
v->arch.paging.mode = &hap_paging_real_mode;
+ v->arch.paging.nestedmode = &hap_paging_real_mode;
}
/************************************************/
@@ -751,6 +767,15 @@ static int hap_page_fault(struct vcpu *v
*/
static int hap_invlpg(struct vcpu *v, unsigned long va)
{
+ if (nestedhvm_enabled(v->domain)) {
+ /* Emulate INVLPGA:
+ * Must perform the flush right now or an other vcpu may
+ * use it when we use the next VMRUN emulation, otherwise.
+ */
+ p2m_flush(v, vcpu_nestedhvm(v).nv_p2m);
+ return 1;
+ }
+
HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n",
v->domain->domain_id, v->vcpu_id);
domain_crash(v->domain);
@@ -763,17 +788,22 @@ static void hap_update_cr3(struct vcpu *
hvm_update_guest_cr(v, 3);
}
+const struct paging_mode *
+hap_paging_get_mode(struct vcpu *v)
+{
+ return !hvm_paging_enabled(v) ? &hap_paging_real_mode :
+ hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
+ hvm_pae_enabled(v) ? &hap_paging_pae_mode :
+ &hap_paging_protected_mode;
+}
+
static void hap_update_paging_modes(struct vcpu *v)
{
struct domain *d = v->domain;
hap_lock(d);
- v->arch.paging.mode =
- !hvm_paging_enabled(v) ? &hap_paging_real_mode :
- hvm_long_mode_enabled(v) ? &hap_paging_long_mode :
- hvm_pae_enabled(v) ? &hap_paging_pae_mode :
- &hap_paging_protected_mode;
+ v->arch.paging.mode = hap_paging_get_mode(v);
if ( pagetable_is_null(v->arch.monitor_table) )
{
@@ -834,38 +864,70 @@ static void
hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p,
mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
{
+ struct domain *d = v->domain;
uint32_t old_flags;
+ bool_t flush_nestedp2m = 0;
- hap_lock(v->domain);
+ /* We know always use the host p2m here, regardless if the vcpu
+ * is in host or guest mode. The vcpu can be in guest mode by
+ * a hypercall which passes a domain and chooses mostly the first
+ * vcpu.
+ * XXX This is the reason why this function can not be used re-used
+ * for updating the nestedp2m. Otherwise, hypercalls would randomly
+ * operate on host p2m and nested p2m.
+ */
+ hap_lock(d);
old_flags = l1e_get_flags(*p);
+
+ if ( nestedhvm_enabled(d) && (old_flags & _PAGE_PRESENT) ) {
+ /* We are replacing a valid entry so we need to flush nested p2ms,
+ * unless the only change is an increase in access rights. */
+ mfn_t omfn = _mfn(l1e_get_pfn(*p));
+ mfn_t nmfn = _mfn(l1e_get_pfn(new));
+ flush_nestedp2m = !( mfn_x(omfn) == mfn_x(nmfn)
+ && perms_strictly_increased(old_flags, l1e_get_flags(new)) );
+ }
+
safe_write_pte(p, new);
if ( (old_flags & _PAGE_PRESENT)
&& (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) )
- flush_tlb_mask(&v->domain->domain_dirty_cpumask);
+ flush_tlb_mask(&d->domain_dirty_cpumask);
#if CONFIG_PAGING_LEVELS == 3
/* install P2M in monitor table for PAE Xen */
if ( level == 3 )
/* We have written to the p2m l3: need to sync the per-vcpu
* copies of it in the monitor tables */
- p2m_install_entry_in_monitors(v->domain, (l3_pgentry_t *)p);
+ p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p);
#endif
- hap_unlock(v->domain);
+ hap_unlock(d);
+
+ if ( flush_nestedp2m )
+ p2m_flush_nestedp2m(d);
}
static unsigned long hap_gva_to_gfn_real_mode(
- struct vcpu *v, unsigned long gva, uint32_t *pfec)
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
return ((paddr_t)gva >> PAGE_SHIFT);
}
+static unsigned long hap_p2m_ga_to_gfn_real_mode(
+ struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec)
+{
+ return (ga >> PAGE_SHIFT);
+}
+
+
/* Entry points into this mode of the hap code. */
static const struct paging_mode hap_paging_real_mode = {
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_real_mode,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_real_mode,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -876,6 +938,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_2_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_2_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -886,6 +949,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_3_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_3_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
@@ -896,6 +960,7 @@ static const struct paging_mode hap_pagi
.page_fault = hap_page_fault,
.invlpg = hap_invlpg,
.gva_to_gfn = hap_gva_to_gfn_4_levels,
+ .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_4_levels,
.update_cr3 = hap_update_cr3,
.update_paging_modes = hap_update_paging_modes,
.write_p2m_entry = hap_write_p2m_entry,
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/nested_hap.c
--- /dev/null
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -0,0 +1,236 @@
+/******************************************************************************
+ * arch/x86/mm/hap/nested_hap.c
+ *
+ * Code for Nested Virtualization
+ * Copyright (c) 2011 Advanced Micro Devices
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <asm/domain.h>
+#include <asm/page.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <asm/mem_event.h>
+#include <public/mem_event.h>
+#include <asm/mem_sharing.h>
+#include <xen/event.h>
+#include <asm/hap.h>
+#include <asm/hvm/support.h>
+
+#include <asm/hvm/nestedhvm.h>
+
+#include "private.h"
+
+/* AlGORITHM for NESTED PAGE FAULT
+ *
+ * NOTATION
+ * Levels: L0, L1, L2
+ * Guests: L1 guest, L2 guest
+ * Hypervisor: L0 hypervisor
+ * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA
+ *
+ * On L0, when #NPF happens, the handler function should do:
+ * hap_page_fault(GPA)
+ * {
+ * 1. If #NPF is from L1 guest, then we crash the guest VM (same as old
+ * code)
+ * 2. If #NPF is from L2 guest, then we continue from (3)
+ * 3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space.
+ * 4. Walk the h_cr3 page table
+ * 5. - if not present, then we inject #NPF back to L1 guest and
+ * re-launch L1 guest (L1 guest will either treat this #NPF as MMIO,
+ * or fix its p2m table for L2 guest)
+ * 6. - if present, then we will get the a new translated value L1-GPA
+ * (points to L1 machine memory)
+ * 7. * Use L1-GPA to walk L0 P2M table
+ * 8. - if not present, then crash the guest (should not happen)
+ * 9. - if present, then we get a new translated value MPA
+ * (points to real machine memory)
+ * 10. * Finally, use GPA and MPA to walk nested_p2m
+ * and fix the bits.
+ * }
+ *
+ */
+
+
+/********************************************/
+/* NESTED VIRT P2M FUNCTIONS */
+/********************************************/
+/* Override macros from asm/page.h to make them work with mfn_t */
+#undef mfn_valid
+#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
+#undef page_to_mfn
+#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
+
+void
+nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level)
+{
+ struct domain *d = p2m->domain;
+ uint32_t old_flags;
+
+ hap_lock(d);
+
+ old_flags = l1e_get_flags(*p);
+ safe_write_pte(p, new);
+ if (old_flags & _PAGE_PRESENT)
+ nestedhvm_vmcx_flushtlb(p2m);
+
+ hap_unlock(d);
+}
+
+/********************************************/
+/* NESTED VIRT FUNCTIONS */
+/********************************************/
+static void
+nestedhap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa,
+ p2m_type_t p2mt, p2m_access_t p2ma)
+{
+ int rv;
+ ASSERT(p2m);
+ ASSERT(p2m->set_entry);
+
+ rv = p2m->set_entry(p2m, L2_gpa >> PAGE_SHIFT,
+ page_to_mfn(maddr_to_page(L0_gpa)),
+ 0 /*4K*/, p2mt, p2ma);
+ if (rv == 0) {
+ gdprintk(XENLOG_ERR,
+ "failed to set entry for 0x%"PRIx64" -> 0x%"PRIx64"\n",
+ L2_gpa, L0_gpa);
+ BUG();
+ }
+}
+
+/* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the
+ * walk is successful, the translated value is returned in L0_gpa. The return
+ * value tells the upper level what to do.
+ */
+static int
+nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa)
+{
+ mfn_t mfn;
+ p2m_type_t p2mt;
+
+ /* we use gfn_to_mfn_query() function to walk L0 P2M table */
+ mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt);
+
+ if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) )
+ return NESTEDHVM_PAGEFAULT_ERROR;
+
+ if ( !mfn_valid(mfn) )
+ return NESTEDHVM_PAGEFAULT_ERROR;
+
+ *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK);
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/* This function uses L2_gpa to walk the P2M page table in L1. If the
+ * walk is successful, the translated value is returned in
+ * L1_gpa. The result value tells what to do next.
+ */
+static int
+nestedhap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m,
+ paddr_t L2_gpa, paddr_t *L1_gpa)
+{
+ uint32_t pfec;
+ unsigned long nested_cr3, gfn;
+ const struct paging_mode *mode = paging_get_hostmode(v);
+
+ nested_cr3 = nhvm_vcpu_hostcr3(v);
+
+ /* walk the guest table */
+ gfn = paging_p2m_ga_to_gfn(v, p2m, mode, nested_cr3, L2_gpa, &pfec);
+
+ if ( gfn == INVALID_GFN )
+ return NESTEDHVM_PAGEFAULT_INJECT;
+
+ *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/*
+ * The following function, nestedhap_page_fault(), is for steps (3)--(10).
+ *
+ * Returns:
+ */
+int
+nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa)
+{
+ int rv;
+ paddr_t L1_gpa, L0_gpa;
+ struct domain *d = v->domain;
+ struct p2m_domain *p2m, *nested_p2m;
+
+ p2m = p2m_get_hostp2m(d); /* L0 p2m */
+ nested_p2m = p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+
+ /* walk the L1 P2M table, note we have to pass p2m
+ * and not nested_p2m here or we fail the walk forever,
+ * otherwise. */
+ rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa);
+
+ /* let caller to handle these two cases */
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_DONE:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ /* ==> we have to walk L0 P2M */
+ rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa);
+
+ /* let upper level caller to handle these two cases */
+ switch (rv) {
+ case NESTEDHVM_PAGEFAULT_INJECT:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_ERROR:
+ return rv;
+ case NESTEDHVM_PAGEFAULT_DONE:
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ nestedp2m_lock(d);
+ /* fix p2m_get_pagetable(nested_p2m) */
+ nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa,
+ p2m_ram_rw,
+ p2m_access_rwx /* FIXME: Should use same permission as l1 guest */);
+ nestedp2m_unlock(d);
+
+ return NESTEDHVM_PAGEFAULT_DONE;
+}
+
+/********************************************/
+/* NESTED VIRT INITIALIZATION FUNCS */
+/********************************************/
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/private.h
--- a/xen/arch/x86/mm/hap/private.h
+++ b/xen/arch/x86/mm/hap/private.h
@@ -23,11 +23,27 @@
/********************************************/
/* GUEST TRANSLATION FUNCS */
/********************************************/
-unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
-unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
-unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva,
+unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long gva,
uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v,
+ struct p2m_domain *p2m, unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
+
#endif /* __HAP_PRIVATE_H__ */
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -34,6 +34,7 @@
#include <public/mem_event.h>
#include <asm/mem_sharing.h>
#include <xen/event.h>
+#include <asm/hvm/nestedhvm.h>
/* Debugging and auditing of the P2M code? */
#define P2M_AUDIT 0
@@ -75,7 +76,7 @@ boolean_param("hap_2mb", opt_hap_2mb);
#define SUPERPAGE_PAGES (1UL << 9)
#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
{
unsigned long flags;
#ifdef __x86_64__
@@ -121,9 +122,9 @@ static void audit_p2m(struct p2m_domain
// Find the next level's P2M entry, checking for out-of-range gfn's...
// Returns NULL on error.
//
-static l1_pgentry_t *
+l1_pgentry_t *
p2m_find_entry(void *table, unsigned long *gfn_remainder,
- unsigned long gfn, u32 shift, u32 max)
+ unsigned long gfn, uint32_t shift, uint32_t max)
{
u32 index;
@@ -224,20 +225,17 @@ p2m_next_level(struct p2m_domain *p2m, m
switch ( type ) {
case PGT_l3_page_table:
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 4);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4);
break;
case PGT_l2_page_table:
#if CONFIG_PAGING_LEVELS == 3
/* for PAE mode, PDPE only has PCD/PWT/P bits available */
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT);
#endif
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
break;
case PGT_l1_page_table:
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2);
break;
default:
BUG();
@@ -264,14 +262,13 @@ p2m_next_level(struct p2m_domain *p2m, m
for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
{
new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
- paging_write_p2m_entry(p2m->domain, gfn,
- l1_entry+i, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 2);
}
unmap_domain_page(l1_entry);
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
__PAGE_HYPERVISOR|_PAGE_USER); //disable PSE
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
}
@@ -298,15 +295,15 @@ p2m_next_level(struct p2m_domain *p2m, m
for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
{
new_entry = l1e_from_pfn(pfn + i, flags);
- paging_write_p2m_entry(p2m->domain, gfn,
- l1_entry+i, *table_mfn, new_entry, 1);
+ p2m->write_p2m_entry(p2m, gfn,
+ l1_entry+i, *table_mfn, new_entry, 1);
}
unmap_domain_page(l1_entry);
new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
__PAGE_HYPERVISOR|_PAGE_USER);
- paging_write_p2m_entry(p2m->domain, gfn,
- p2m_entry, *table_mfn, new_entry, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ p2m_entry, *table_mfn, new_entry, 2);
}
*table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
@@ -1369,8 +1366,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE)
: l3e_empty();
entry_content.l1 = l3e_content.l3;
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 3);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
/* Free old intermediate tables if necessary */
@@ -1410,8 +1406,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
entry_content = l1e_empty();
/* level 1 entry */
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 1);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
}
else if ( page_order == 9 )
@@ -1440,8 +1435,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
l2e_content = l2e_empty();
entry_content.l1 = l2e_content.l2;
- paging_write_p2m_entry(p2m->domain, gfn, p2m_entry,
- table_mfn, entry_content, 2);
+ p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2);
/* NB: paging_write_p2m_entry() handles tlb flushes properly */
/* Free old intermediate tables if necessary */
@@ -1806,10 +1800,13 @@ static void p2m_initialise(struct domain
p2m->domain = d;
p2m->default_access = p2m_access_rwx;
+ p2m->cr3 = CR3_EADDR;
p2m->set_entry = p2m_set_entry;
p2m->get_entry = p2m_gfn_to_mfn;
p2m->get_entry_current = p2m_gfn_to_mfn_current;
p2m->change_entry_type_global = p2m_change_type_global;
+ p2m->write_p2m_entry = paging_write_p2m_entry;
+ cpus_clear(p2m->p2m_dirty_cpumask);
if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
ept_p2m_init(d);
@@ -1817,6 +1814,25 @@ static void p2m_initialise(struct domain
return;
}
+static int
+p2m_init_nestedp2m(struct domain *d)
+{
+ uint8_t i;
+ struct p2m_domain *p2m;
+
+ nestedp2m_lock_init(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain);
+ if (p2m == NULL)
+ return -ENOMEM;
+ p2m_initialise(d, p2m);
+ p2m->get_entry_current = p2m->get_entry;
+ p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+ }
+
+ return 0;
+}
+
int p2m_init(struct domain *d)
{
struct p2m_domain *p2m;
@@ -1825,8 +1841,12 @@ int p2m_init(struct domain *d)
if ( p2m == NULL )
return -ENOMEM;
p2m_initialise(d, p2m);
-
- return 0;
+
+ /* Must initialise nestedp2m unconditionally
+ * since nestedhvm_enabled(d) returns false here.
+ * (p2m_init runs too early for HVM_PARAM_* options)
+ */
+ return p2m_init_nestedp2m(d);
}
void p2m_change_entry_type_global(struct p2m_domain *p2m,
@@ -1919,6 +1939,9 @@ int p2m_alloc_table(struct p2m_domain *p
p2m_invalid, p2m->default_access) )
goto error;
+ if (p2m_is_nestedp2m(p2m))
+ goto nesteddone;
+
/* Copy all existing mappings from the page list and m2p */
spin_lock(&p2m->domain->page_alloc_lock);
page_list_for_each(page, &p2m->domain->page_list)
@@ -1940,6 +1963,7 @@ int p2m_alloc_table(struct p2m_domain *p
}
spin_unlock(&p2m->domain->page_alloc_lock);
+ nesteddone:
P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
p2m_unlock(p2m);
return 0;
@@ -1966,6 +1990,9 @@ void p2m_teardown(struct p2m_domain *p2m
mfn_t mfn;
#endif
+ if (p2m == NULL)
+ return;
+
p2m_lock(p2m);
#ifdef __x86_64__
@@ -1984,11 +2011,26 @@ void p2m_teardown(struct p2m_domain *p2m
p2m_unlock(p2m);
}
+static void p2m_teardown_nestedp2m(struct domain *d)
+{
+ uint8_t i;
+
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ xfree(d->arch.nested_p2m[i]);
+ d->arch.nested_p2m[i] = NULL;
+ }
+}
+
void p2m_final_teardown(struct domain *d)
{
/* Iterate over all p2m tables per domain */
xfree(d->arch.p2m);
d->arch.p2m = NULL;
+
+ /* We must teardown unconditionally because
+ * we initialise them unconditionally.
+ */
+ p2m_teardown_nestedp2m(d);
}
#if P2M_AUDIT
@@ -2573,9 +2615,9 @@ void p2m_change_type_global(struct p2m_d
gfn = get_gpfn_from_mfn(mfn);
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- paging_write_p2m_entry(p2m->domain, gfn,
- (l1_pgentry_t *)&l3e[i3],
- l3mfn, l1e_content, 3);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l3e[i3],
+ l3mfn, l1e_content, 3);
continue;
}
@@ -2604,9 +2646,9 @@ void p2m_change_type_global(struct p2m_d
* L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- paging_write_p2m_entry(p2m->domain, gfn,
- (l1_pgentry_t *)&l2e[i2],
- l2mfn, l1e_content, 2);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l2e[i2],
+ l2mfn, l1e_content, 2);
continue;
}
@@ -2628,8 +2670,8 @@ void p2m_change_type_global(struct p2m_d
/* create a new 1le entry with the new type */
flags = p2m_type_to_flags(nt, _mfn(mfn));
l1e_content = l1e_from_pfn(mfn, flags);
- paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1],
- l1mfn, l1e_content, 1);
+ p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
+ l1mfn, l1e_content, 1);
}
unmap_domain_page(l1e);
}
@@ -3048,6 +3090,182 @@ void p2m_mem_access_resume(struct p2m_do
}
#endif /* __x86_64__ */
+static struct p2m_domain *
+p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m)
+{
+ int i, lru_index = -1;
+ struct p2m_domain *lrup2m, *tmp;
+
+ if (p2m == NULL) {
+ lru_index = MAX_NESTEDP2M - 1;
+ lrup2m = d->arch.nested_p2m[lru_index];
+ } else {
+ lrup2m = p2m;
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ if (d->arch.nested_p2m[i] == p2m) {
+ lru_index = i;
+ break;
+ }
+ }
+ }
+
+ ASSERT(lru_index >= 0);
+ if (lru_index == 0) {
+ return lrup2m;
+ }
+
+ /* move the other's down the array "list" */
+ for (i = lru_index - 1; i >= 0; i--) {
+ tmp = d->arch.nested_p2m[i];
+ d->arch.nested_p2m[i+1] = tmp;
+ }
+
+ /* make the entry the first one */
+ d->arch.nested_p2m[0] = lrup2m;
+
+ return lrup2m;
+}
+
+static int
+p2m_flush_locked(struct p2m_domain *p2m)
+{
+ ASSERT(p2m);
+ if (p2m->cr3 == CR3_EADDR)
+ /* Microoptimisation: p2m is already empty.
+ * => about 0.3% speedup of overall system performance.
+ */
+ return 0;
+
+ p2m_teardown(p2m);
+ p2m_initialise(p2m->domain, p2m);
+ p2m->get_entry_current = p2m->get_entry;
+ p2m->write_p2m_entry = nestedp2m_write_p2m_entry;
+ return p2m_alloc_table(p2m);
+}
+
+void
+p2m_flush(struct vcpu *v, struct p2m_domain *p2m)
+{
+ struct domain *d = p2m->domain;
+
+ ASSERT(v->domain == d);
+ vcpu_nestedhvm(v).nv_p2m = NULL;
+ nestedp2m_lock(d);
+ BUG_ON(p2m_flush_locked(p2m) != 0);
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(p2m);
+ nestedp2m_unlock(d);
+}
+
+void
+p2m_flush_nestedp2m(struct domain *d)
+{
+ int i;
+
+ nestedp2m_lock(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ struct p2m_domain *p2m = d->arch.nested_p2m[i];
+ BUG_ON(p2m_flush_locked(p2m) != 0);
+ cpus_clear(p2m->p2m_dirty_cpumask);
+ }
+ nestedhvm_vmcx_flushtlbdomain(d);
+ nestedp2m_unlock(d);
+}
+
+struct p2m_domain *
+p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3)
+{
+ /* Use volatile to prevent gcc to cache nv->nv_p2m in a cpu register as
+ * this may change within the loop by an other (v)cpu.
+ */
+ volatile struct nestedvcpu *nv = &vcpu_nestedhvm(v);
+ struct domain *d;
+ struct p2m_domain *p2m;
+ int i, rv;
+
+ if (cr3 == 0 || cr3 == CR3_EADDR)
+ cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+ if (nv->nv_flushp2m && nv->nv_p2m) {
+ nv->nv_p2m = NULL;
+ }
+
+ d = v->domain;
+ nestedp2m_lock(d);
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m = d->arch.nested_p2m[i];
+ if ((p2m->cr3 != cr3 && p2m->cr3 != CR3_EADDR) || (p2m != nv->nv_p2m))
+ continue;
+
+ nv->nv_flushp2m = 0;
+ p2m_getlru_nestedp2m(d, p2m);
+ nv->nv_p2m = p2m;
+ if (p2m->cr3 == CR3_EADDR)
+ hvm_asid_flush_vcpu(v);
+ p2m->cr3 = cr3;
+ cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+ nestedp2m_unlock(d);
+ return p2m;
+ }
+
+ /* All p2m's are or were in use. Take the least recent used one,
+ * flush it and reuse.
+ */
+ for (i = 0; i < MAX_NESTEDP2M; i++) {
+ p2m = p2m_getlru_nestedp2m(d, NULL);
+ rv = p2m_flush_locked(p2m);
+ if (rv == 0)
+ break;
+ }
+ nv->nv_p2m = p2m;
+ p2m->cr3 = cr3;
+ nv->nv_flushp2m = 0;
+ hvm_asid_flush_vcpu(v);
+ nestedhvm_vmcx_flushtlb(nv->nv_p2m);
+ cpu_set(v->processor, p2m->p2m_dirty_cpumask);
+ nestedp2m_unlock(d);
+
+ return p2m;
+}
+
+struct p2m_domain *
+p2m_get_p2m(struct vcpu *v)
+{
+ if (!nestedhvm_is_n2(v))
+ return p2m_get_hostp2m(v->domain);
+
+ return p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v));
+}
+
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+ unsigned long va,
+ uint32_t *pfec)
+{
+ struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain);
+ const struct paging_mode *hostmode = paging_get_hostmode(v);
+
+ if ( is_hvm_domain(v->domain)
+ && paging_mode_hap(v->domain)
+ && nestedhvm_is_n2(v) )
+ {
+ unsigned long gfn;
+ struct p2m_domain *p2m;
+ const struct paging_mode *mode;
+ uint64_t ncr3 = nhvm_vcpu_hostcr3(v);
+
+ /* translate l2 guest va into l2 guest gfn */
+ p2m = p2m_get_nestedp2m(v, ncr3);
+ mode = paging_get_nestedmode(v);
+ gfn = mode->gva_to_gfn(v, p2m, va, pfec);
+
+ /* translate l2 guest gfn into l1 guest gfn */
+ return hostmode->p2m_ga_to_gfn(v, hostp2m, ncr3,
+ gfn << PAGE_SHIFT, pfec);
+ }
+
+ return hostmode->gva_to_gfn(v, hostp2m, va, pfec);
+}
+
/*
* Local variables:
* mode: C
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/paging.c
--- a/xen/arch/x86/mm/paging.c
+++ b/xen/arch/x86/mm/paging.c
@@ -26,6 +26,7 @@
#include <asm/p2m.h>
#include <asm/hap.h>
#include <asm/guest_access.h>
+#include <asm/hvm/nestedhvm.h>
#include <xen/numa.h>
#include <xsm/xsm.h>
@@ -851,21 +852,58 @@ void paging_dump_vcpu_info(struct vcpu *
printk(" paging assistance: ");
if ( paging_mode_shadow(v->domain) )
{
- if ( v->arch.paging.mode )
+ if ( paging_get_hostmode(v) )
printk("shadowed %u-on-%u\n",
- v->arch.paging.mode->guest_levels,
- v->arch.paging.mode->shadow.shadow_levels);
+ paging_get_hostmode(v)->guest_levels,
+ paging_get_hostmode(v)->shadow.shadow_levels);
else
printk("not shadowed\n");
}
- else if ( paging_mode_hap(v->domain) && v->arch.paging.mode )
+ else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) )
printk("hap, %u levels\n",
- v->arch.paging.mode->guest_levels);
+ paging_get_hostmode(v)->guest_levels);
else
printk("none\n");
}
}
+const struct paging_mode *paging_get_mode(struct vcpu *v)
+{
+ if (!nestedhvm_is_n2(v))
+ return paging_get_hostmode(v);
+
+ return paging_get_nestedmode(v);
+}
+
+extern const struct paging_mode *hap_paging_get_mode(struct vcpu *);
+
+void paging_update_nestedmode(struct vcpu *v)
+{
+ ASSERT(nestedhvm_enabled(v->domain));
+ if (nestedhvm_paging_mode_hap(v))
+ /* nested-on-nested */
+ v->arch.paging.nestedmode = hap_paging_get_mode(v);
+ else
+ /* TODO: shadow-on-shadow */
+ v->arch.paging.nestedmode = NULL;
+}
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn,
+ l1_pgentry_t new, unsigned int level)
+{
+ struct domain *d = p2m->domain;
+ struct vcpu *v = current;
+ if ( v->domain != d )
+ v = d->vcpu ? d->vcpu[0] : NULL;
+ if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) )
+ {
+ return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn,
+ new, level);
+ }
+ else
+ safe_write_pte(p, new);
+}
/*
* Local variables:
diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -837,22 +837,6 @@ shadow_write_entries(void *d, void *s, i
if ( map != NULL ) sh_unmap_domain_page(map);
}
-static inline int
-perms_strictly_increased(u32 old_flags, u32 new_flags)
-/* Given the flags of two entries, are the new flags a strict
- * increase in rights over the old ones? */
-{
- u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
- u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
- /* Flip the NX bit, since it's the only one that decreases rights;
- * we calculate as if it were an "X" bit. */
- of ^= _PAGE_NX_BIT;
- nf ^= _PAGE_NX_BIT;
- /* If the changed bits are all set in the new flags, then rights strictly
- * increased between old and new. */
- return ((of | (of ^ nf)) == nf);
-}
-
/* type is only used to distinguish grant map pages from ordinary RAM
* i.e. non-p2m_is_grant() pages are treated as p2m_ram_rw. */
static int inline
@@ -3768,7 +3752,8 @@ sh_invlpg(struct vcpu *v, unsigned long
static unsigned long
-sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec)
+sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
+ unsigned long va, uint32_t *pfec)
/* Called to translate a guest virtual address to what the *guest*
* pagetables would map it to. */
{
@@ -4820,7 +4805,7 @@ static mfn_t emulate_gva_to_mfn(struct v
struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
/* Translate the VA to a GFN */
- gfn = sh_gva_to_gfn(v, vaddr, &pfec);
+ gfn = sh_gva_to_gfn(v, p2m, vaddr, &pfec);
if ( gfn == INVALID_GFN )
{
if ( is_hvm_vcpu(v) )
diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/domain.h
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -210,6 +210,8 @@ struct paging_domain {
struct paging_vcpu {
/* Pointers to mode-specific entry points. */
const struct paging_mode *mode;
+ /* Nested Virtualization: paging mode of nested guest */
+ const struct paging_mode *nestedmode;
/* HVM guest: last emulate was to a pagetable */
unsigned int last_write_was_pt:1;
/* HVM guest: last write emulation succeeds */
@@ -225,6 +227,7 @@ struct paging_vcpu {
#define MAX_CPUID_INPUT 40
typedef xen_domctl_cpuid_t cpuid_input_t;
+#define MAX_NESTEDP2M 10
struct p2m_domain;
struct time_scale {
int shift;
@@ -258,6 +261,12 @@ struct arch_domain
struct paging_domain paging;
struct p2m_domain *p2m;
+ /* nestedhvm: translate l2 guest physical to host physical */
+ struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
+ spinlock_t nested_p2m_lock;
+ int nested_p2m_locker;
+ const char *nested_p2m_function;
+
/* NB. protected by d->event_lock and by irq_desc[irq].lock */
int *irq_pirq;
int *pirq_irq;
diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/hvm/hvm.h
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -374,12 +374,12 @@ static inline void hvm_set_info_guest(st
int hvm_debug_op(struct vcpu *v, int32_t op);
-bool_t hvm_hap_nested_page_fault(unsigned long gpa,
- bool_t gla_valid, unsigned long gla,
- bool_t access_valid,
- bool_t access_r,
- bool_t access_w,
- bool_t access_x);
+int hvm_hap_nested_page_fault(unsigned long gpa,
+ bool_t gla_valid, unsigned long gla,
+ bool_t access_valid,
+ bool_t access_r,
+ bool_t access_w,
+ bool_t access_x);
#define hvm_msr_tsc_aux(v) ({ \
struct domain *__d = (v)->domain; \
diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/hvm/nestedhvm.h
--- a/xen/include/asm-x86/hvm/nestedhvm.h
+++ b/xen/include/asm-x86/hvm/nestedhvm.h
@@ -60,4 +60,9 @@ unsigned long *nestedhvm_vcpu_iomap_get(
#define nestedhvm_vmswitch_in_progress(v) \
(!!vcpu_nestedhvm((v)).nv_vmswitch_in_progress)
+void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m);
+void nestedhvm_vmcx_flushtlbdomain(struct domain *d);
+
+bool_t nestedhvm_is_n2(struct vcpu *v);
+
#endif /* _HVM_NESTEDHVM_H */
diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -199,7 +199,15 @@ struct p2m_domain {
/* Shadow translated domain: p2m mapping */
pagetable_t phys_table;
+ /* Same as domain_dirty_cpumask but limited to
+ * this p2m and those physical cpus whose vcpu's are in
+ * guestmode.
+ */
+ cpumask_t p2m_dirty_cpumask;
+
struct domain *domain; /* back pointer to domain */
+#define CR3_EADDR (~0ULL)
+ uint64_t cr3; /* to identify this p2m for re-use */
/* Pages used to construct the p2m */
struct page_list_head pages;
@@ -223,6 +231,11 @@ struct p2m_domain {
p2m_type_t ot,
p2m_type_t nt);
+ void (*write_p2m_entry)(struct p2m_domain *p2m,
+ unsigned long gfn, l1_pgentry_t *p,
+ mfn_t table_mfn, l1_pgentry_t new,
+ unsigned int level);
+
/* Default P2M access type for each page in the the domain: new pages,
* swapped in pages, cleared pages, and pages that are ambiquously
* retyped get this access type. See definition of p2m_access_t. */
@@ -264,8 +277,26 @@ struct p2m_domain {
/* get host p2m table */
#define p2m_get_hostp2m(d) ((d)->arch.p2m)
+/* Get p2m table (re)usable for specified cr3.
+ * Automatically destroys and re-initializes a p2m if none found.
+ * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used.
+ */
+struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3);
+
+/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m().
+ * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m().
+ */
+struct p2m_domain *p2m_get_p2m(struct vcpu *v);
+
+#define p2m_is_nestedp2m(p2m) ((p2m) != p2m_get_hostp2m((p2m->domain)))
+
#define p2m_get_pagetable(p2m) ((p2m)->phys_table)
+/* Flushes specified p2m table */
+void p2m_flush(struct vcpu *v, struct p2m_domain *p2m);
+/* Flushes all nested p2m tables */
+void p2m_flush_nestedp2m(struct domain *d);
+
/*
* The P2M lock. This protects all updates to the p2m table.
* Updates are expected to be safe against concurrent reads,
@@ -307,6 +338,38 @@ struct p2m_domain {
(current->processor == (_p2m)->locker)
+#define nestedp2m_lock_init(_domain) \
+ do { \
+ spin_lock_init(&(_domain)->arch.nested_p2m_lock); \
+ (_domain)->arch.nested_p2m_locker = -1; \
+ (_domain)->arch.nested_p2m_function = "nobody"; \
+ } while (0)
+
+#define nestedp2m_locked_by_me(_domain) \
+ (current->processor == (_domain)->arch.nested_p2m_locker)
+
+#define nestedp2m_lock(_domain) \
+ do { \
+ if ( nestedp2m_locked_by_me(_domain) ) \
+ { \
+ printk("Error: p2m lock held by %s\n", \
+ (_domain)->arch.nested_p2m_function); \
+ BUG(); \
+ } \
+ spin_lock(&(_domain)->arch.nested_p2m_lock); \
+ ASSERT((_domain)->arch.nested_p2m_locker == -1); \
+ (_domain)->arch.nested_p2m_locker = current->processor; \
+ (_domain)->arch.nested_p2m_function = __func__; \
+ } while (0)
+
+#define nestedp2m_unlock(_domain) \
+ do { \
+ ASSERT(nestedp2m_locked_by_me(_domain)); \
+ (_domain)->arch.nested_p2m_locker = -1; \
+ (_domain)->arch.nested_p2m_function = "nobody"; \
+ spin_unlock(&(_domain)->arch.nested_p2m_lock); \
+ } while (0)
+
/* Extract the type from the PTE flags that store it */
static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
{
@@ -424,11 +487,21 @@ static inline unsigned long mfn_to_gfn(s
/* Init the datastructures for later use by the p2m code */
int p2m_init(struct domain *d);
+/* PTE flags for various types of p2m entry */
+unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn);
+
/* Allocate a new p2m table for a domain.
*
* Returns 0 for success or -errno. */
int p2m_alloc_table(struct p2m_domain *p2m);
+/* Find the next level's P2M entry, checking for out-of-range gfn's...
+ * Returns NULL on error.
+ */
+l1_pgentry_t *
+p2m_find_entry(void *table, unsigned long *gfn_remainder,
+ unsigned long gfn, uint32_t shift, uint32_t max);
+
/* Return all the p2m resources to Xen. */
void p2m_teardown(struct p2m_domain *p2m);
void p2m_final_teardown(struct domain *d);
@@ -502,6 +575,8 @@ p2m_type_t p2m_change_type(struct p2m_do
int set_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn);
int clear_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn);
+void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level);
#ifdef __x86_64__
/* Modify p2m table for shared gfn */
diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/page.h
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -391,6 +391,23 @@ static inline uint32_t cacheattr_to_pte_
return ((cacheattr & 4) << 5) | ((cacheattr & 3) << 3);
}
+/* return true if permission increased */
+static inline bool_t
+perms_strictly_increased(uint32_t old_flags, uint32_t new_flags)
+/* Given the flags of two entries, are the new flags a strict
+ * increase in rights over the old ones? */
+{
+ uint32_t of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
+ uint32_t nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT);
+ /* Flip the NX bit, since it's the only one that decreases rights;
+ * we calculate as if it were an "X" bit. */
+ of ^= _PAGE_NX_BIT;
+ nf ^= _PAGE_NX_BIT;
+ /* If the changed bits are all set in the new flags, then rights strictly
+ * increased between old and new. */
+ return ((of | (of ^ nf)) == nf);
+}
+
#endif /* !__ASSEMBLY__ */
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK)
diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/paging.h
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -108,8 +108,14 @@ struct paging_mode {
int (*page_fault )(struct vcpu *v, unsigned long va,
struct cpu_user_regs *regs);
int (*invlpg )(struct vcpu *v, unsigned long va);
- unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va,
+ unsigned long (*gva_to_gfn )(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long va,
uint32_t *pfec);
+ unsigned long (*p2m_ga_to_gfn )(struct vcpu *v,
+ struct p2m_domain *p2m,
+ unsigned long cr3,
+ paddr_t ga, uint32_t *pfec);
void (*update_cr3 )(struct vcpu *v, int do_locking);
void (*update_paging_modes )(struct vcpu *v);
void (*write_p2m_entry )(struct vcpu *v, unsigned long gfn,
@@ -219,6 +225,10 @@ void paging_final_teardown(struct domain
* creation. */
int paging_enable(struct domain *d, u32 mode);
+#define paging_get_hostmode(v) ((v)->arch.paging.mode)
+#define paging_get_nestedmode(v) ((v)->arch.paging.nestedmode)
+const struct paging_mode *paging_get_mode(struct vcpu *v);
+void paging_update_nestedmode(struct vcpu *v);
/* Page fault handler
* Called from pagefault handler in Xen, and from the HVM trap handlers
@@ -233,7 +243,7 @@ static inline int
paging_fault(unsigned long va, struct cpu_user_regs *regs)
{
struct vcpu *v = current;
- return v->arch.paging.mode->page_fault(v, va, regs);
+ return paging_get_hostmode(v)->page_fault(v, va, regs);
}
/* Handle invlpg requests on vcpus.
@@ -241,7 +251,7 @@ paging_fault(unsigned long va, struct cp
* or 0 if it's safe not to do so. */
static inline int paging_invlpg(struct vcpu *v, unsigned long va)
{
- return v->arch.paging.mode->invlpg(v, va);
+ return paging_get_hostmode(v)->invlpg(v, va);
}
/* Translate a guest virtual address to the frame number that the
@@ -251,11 +261,30 @@ static inline int paging_invlpg(struct v
* walking the tables. The caller should set the PFEC_page_present bit
* in pfec[0]; in the failure case, that bit will be cleared if appropriate. */
#define INVALID_GFN (-1UL)
-static inline unsigned long paging_gva_to_gfn(struct vcpu *v,
- unsigned long va,
- uint32_t *pfec)
+unsigned long paging_gva_to_gfn(struct vcpu *v,
+ unsigned long va,
+ uint32_t *pfec);
+
+/* Translates a guest virtual address to guest physical address
+ * where the specified cr3 is translated to host physical address
+ * using the specified p2m table.
+ * This allows to do page walks in the guest or even in the nested guest.
+ * It returns the guest's gfn or the nested guest's gfn.
+ * Use 'paddr_t' for the guest address so it won't overflow when
+ * guest or nested guest is in 32bit PAE mode.
+ */
+static inline unsigned long paging_p2m_ga_to_gfn(struct vcpu *v,
+ struct p2m_domain *p2m,
+ const struct paging_mode *mode,
+ unsigned long cr3,
+ paddr_t ga,
+ uint32_t *pfec)
{
- return v->arch.paging.mode->gva_to_gfn(v, va, pfec);
+ if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) )
+ return mode->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec);
+
+ /* shadow paging */
+ return paging_gva_to_gfn(v, ga, pfec);
}
/* Update all the things that are derived from the guest's CR3.
@@ -263,7 +292,7 @@ static inline unsigned long paging_gva_t
* as the value to load into the host CR3 to schedule this vcpu */
static inline void paging_update_cr3(struct vcpu *v)
{
- v->arch.paging.mode->update_cr3(v, 1);
+ paging_get_hostmode(v)->update_cr3(v, 1);
}
/* Update all the things that are derived from the guest's CR0/CR3/CR4.
@@ -271,7 +300,7 @@ static inline void paging_update_cr3(str
* has changed, and when bringing up a VCPU for the first time. */
static inline void paging_update_paging_modes(struct vcpu *v)
{
- v->arch.paging.mode->update_paging_modes(v);
+ paging_get_hostmode(v)->update_paging_modes(v);
}
@@ -283,7 +312,7 @@ static inline int paging_write_guest_ent
{
if ( unlikely(paging_mode_enabled(v->domain)
&& v->arch.paging.mode != NULL) )
- return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn);
+ return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn);
else
return (!__copy_to_user(p, &new, sizeof(new)));
}
@@ -299,7 +328,7 @@ static inline int paging_cmpxchg_guest_e
{
if ( unlikely(paging_mode_enabled(v->domain)
&& v->arch.paging.mode != NULL) )
- return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn);
+ return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn);
else
return (!cmpxchg_user(p, *old, new));
}
@@ -327,21 +356,11 @@ static inline void safe_write_pte(l1_pge
* a pointer to the entry to be written, the MFN in which the entry resides,
* the new contents of the entry, and the level in the p2m tree at which
* we are writing. */
-static inline void paging_write_p2m_entry(struct domain *d, unsigned long gfn,
- l1_pgentry_t *p, mfn_t table_mfn,
- l1_pgentry_t new, unsigned int level)
-{
- struct vcpu *v = current;
- if ( v->domain != d )
- v = d->vcpu ? d->vcpu[0] : NULL;
- if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) )
- {
- return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn,
- new, level);
- }
- else
- safe_write_pte(p, new);
-}
+struct p2m_domain;
+
+void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn,
+ l1_pgentry_t *p, mfn_t table_mfn,
+ l1_pgentry_t new, unsigned int level);
/* Called from the guest to indicate that the a process is being
* torn down and its pagetables will soon be discarded */
@@ -362,7 +381,7 @@ guest_map_l1e(struct vcpu *v, unsigned l
l2_pgentry_t l2e;
if ( unlikely(paging_mode_translate(v->domain)) )
- return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn);
+ return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn);
/* Find this l1e and its enclosing l1mfn in the linear map */
if ( __copy_from_user(&l2e,
@@ -398,7 +417,7 @@ guest_get_eff_l1e(struct vcpu *v, unsign
return;
}
- v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e);
+ paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e);
}
/* Read the guest's l1e that maps this address, from the kernel-mode
[-- Attachment #3: Type: text/plain, Size: 138 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2011-04-05 15:48 ` Christoph Egger
@ 2011-04-06 10:29 ` Tim Deegan
2011-04-06 14:42 ` Christoph Egger
2011-04-29 9:03 ` Jan Beulich
1 sibling, 1 reply; 11+ messages in thread
From: Tim Deegan @ 2011-04-06 10:29 UTC (permalink / raw)
To: Christoph Egger; +Cc: xen-devel@lists.xensource.com, eddie.dong
At 16:48 +0100 on 05 Apr (1302022090), Christoph Egger wrote:
> On 03/31/11 17:25, Christoph Egger wrote:
> >
> > This is the new version. I fixed the open items from Tim's last review.
>
> Sorry, I mistakenly resent an older version and noticed it just now.
> This time this is the latest version.
Thank you. I have applied the full series, which should appear as
23157--23168 in the staging tree soon. I had to forward-port a few
small things, and I also fixed up one last race condition in the remote
shootdowns, as 23170:86f87da1445a, so please check that I havent broken
anything in your tests.
Cheers,
Tim.
--
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, Xen Platform Team
Citrix Systems UK Ltd. (Company #02937203, SL9 0BG)
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2011-04-06 10:29 ` Tim Deegan
@ 2011-04-06 14:42 ` Christoph Egger
0 siblings, 0 replies; 11+ messages in thread
From: Christoph Egger @ 2011-04-06 14:42 UTC (permalink / raw)
To: xen-devel
On 04/06/11 12:29, Tim Deegan wrote:
> At 16:48 +0100 on 05 Apr (1302022090), Christoph Egger wrote:
>> On 03/31/11 17:25, Christoph Egger wrote:
>>>
>>> This is the new version. I fixed the open items from Tim's last review.
>>
>> Sorry, I mistakenly resent an older version and noticed it just now.
>> This time this is the latest version.
>
> Thank you. I have applied the full series, which should appear as
> 23157--23168 in the staging tree soon. I had to forward-port a few
> small things, and I also fixed up one last race condition in the remote
> shootdowns, as 23170:86f87da1445a, so please check that I havent broken
> anything in your tests.
Thank you for applying the patch series. xentrace was broken again but
that wasn't you. I already submitted a fix for this.
Christoph
--
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85689 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2011-04-05 15:48 ` Christoph Egger
2011-04-06 10:29 ` Tim Deegan
@ 2011-04-29 9:03 ` Jan Beulich
2011-04-29 9:09 ` Christoph Egger
1 sibling, 1 reply; 11+ messages in thread
From: Jan Beulich @ 2011-04-29 9:03 UTC (permalink / raw)
To: Christoph Egger; +Cc: xen-devel@lists.xensource.com, Tim Deegan
>>> On 05.04.11 at 17:48, Christoph Egger <Christoph.Egger@amd.com> wrote:
>diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/domain.h
>--- a/xen/include/asm-x86/domain.h
>+++ b/xen/include/asm-x86/domain.h
>...
>@@ -225,6 +227,7 @@ struct paging_vcpu {
> #define MAX_CPUID_INPUT 40
> typedef xen_domctl_cpuid_t cpuid_input_t;
>
>+#define MAX_NESTEDP2M 10
> struct p2m_domain;
> struct time_scale {
> int shift;
>@@ -258,6 +261,12 @@ struct arch_domain
> struct paging_domain paging;
> struct p2m_domain *p2m;
>
>+ /* nestedhvm: translate l2 guest physical to host physical */
>+ struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
>+ spinlock_t nested_p2m_lock;
>+ int nested_p2m_locker;
>+ const char *nested_p2m_function;
>+
> /* NB. protected by d->event_lock and by irq_desc[irq].lock */
> int *irq_pirq;
> int *pirq_irq;
Was there a specific reason to add this to struct arch_domain
instead of struct hvm_domain? I.e. can any pf these fields be
used on pv (or idle) domains?
Thanks, Jan
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2011-04-29 9:03 ` Jan Beulich
@ 2011-04-29 9:09 ` Christoph Egger
2011-04-29 9:19 ` Jan Beulich
0 siblings, 1 reply; 11+ messages in thread
From: Christoph Egger @ 2011-04-29 9:09 UTC (permalink / raw)
To: Jan Beulich; +Cc: xen-devel@lists.xensource.com, Tim Deegan
On 04/29/11 11:03, Jan Beulich wrote:
>>>> On 05.04.11 at 17:48, Christoph Egger<Christoph.Egger@amd.com> wrote:
>> diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/domain.h
>> --- a/xen/include/asm-x86/domain.h
>> +++ b/xen/include/asm-x86/domain.h
>> ...
>> @@ -225,6 +227,7 @@ struct paging_vcpu {
>> #define MAX_CPUID_INPUT 40
>> typedef xen_domctl_cpuid_t cpuid_input_t;
>>
>> +#define MAX_NESTEDP2M 10
>> struct p2m_domain;
>> struct time_scale {
>> int shift;
>> @@ -258,6 +261,12 @@ struct arch_domain
>> struct paging_domain paging;
>> struct p2m_domain *p2m;
>>
>> + /* nestedhvm: translate l2 guest physical to host physical */
>> + struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
>> + spinlock_t nested_p2m_lock;
>> + int nested_p2m_locker;
>> + const char *nested_p2m_function;
>> +
>> /* NB. protected by d->event_lock and by irq_desc[irq].lock */
>> int *irq_pirq;
>> int *pirq_irq;
>
> Was there a specific reason to add this to struct arch_domain
> instead of struct hvm_domain? I.e. can any pf these fields be
> used on pv (or idle) domains?
The reason is that there is already a 'struct p2m_domain *p2m' field.
If that can be moved to struct hvm_domain then nested_p2m can
definitely move over to there, too.
Christoph
--
---to satisfy European Law for business letters:
Advanced Micro Devices GmbH
Einsteinring 24, 85689 Dornach b. Muenchen
Geschaeftsfuehrer: Alberto Bozzo, Andrew Bowd
Sitz: Dornach, Gemeinde Aschheim, Landkreis Muenchen
Registergericht Muenchen, HRB Nr. 43632
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 12/12] Nested Virtualization: hap-on-hap
2011-04-29 9:09 ` Christoph Egger
@ 2011-04-29 9:19 ` Jan Beulich
0 siblings, 0 replies; 11+ messages in thread
From: Jan Beulich @ 2011-04-29 9:19 UTC (permalink / raw)
To: Christoph Egger; +Cc: xen-devel@lists.xensource.com, Tim Deegan
>>> On 29.04.11 at 11:09, Christoph Egger <Christoph.Egger@amd.com> wrote:
> On 04/29/11 11:03, Jan Beulich wrote:
>>>>> On 05.04.11 at 17:48, Christoph Egger<Christoph.Egger@amd.com> wrote:
>>> diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/domain.h
>>> --- a/xen/include/asm-x86/domain.h
>>> +++ b/xen/include/asm-x86/domain.h
>>> ...
>>> @@ -225,6 +227,7 @@ struct paging_vcpu {
>>> #define MAX_CPUID_INPUT 40
>>> typedef xen_domctl_cpuid_t cpuid_input_t;
>>>
>>> +#define MAX_NESTEDP2M 10
>>> struct p2m_domain;
>>> struct time_scale {
>>> int shift;
>>> @@ -258,6 +261,12 @@ struct arch_domain
>>> struct paging_domain paging;
>>> struct p2m_domain *p2m;
>>>
>>> + /* nestedhvm: translate l2 guest physical to host physical */
>>> + struct p2m_domain *nested_p2m[MAX_NESTEDP2M];
>>> + spinlock_t nested_p2m_lock;
>>> + int nested_p2m_locker;
>>> + const char *nested_p2m_function;
>>> +
>>> /* NB. protected by d->event_lock and by irq_desc[irq].lock */
>>> int *irq_pirq;
>>> int *pirq_irq;
>>
>> Was there a specific reason to add this to struct arch_domain
>> instead of struct hvm_domain? I.e. can any pf these fields be
>> used on pv (or idle) domains?
>
> The reason is that there is already a 'struct p2m_domain *p2m' field.
> If that can be moved to struct hvm_domain then nested_p2m can
> definitely move over to there, too.
No, I don't think these are connected - a pv domain can still require
a p2m (e.g. for the iommu), but I would have thought that the
nesting stuff doesn't apply there.
Jan
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2011-04-29 9:19 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-03-09 14:31 [PATCH 12/12] Nested Virtualization: hap-on-hap Christoph Egger
2011-03-22 14:59 ` Tim Deegan
2011-03-31 15:25 ` Christoph Egger
2011-04-05 15:48 ` Christoph Egger
2011-04-06 10:29 ` Tim Deegan
2011-04-06 14:42 ` Christoph Egger
2011-04-29 9:03 ` Jan Beulich
2011-04-29 9:09 ` Christoph Egger
2011-04-29 9:19 ` Jan Beulich
-- strict thread matches above, loose matches on Subject: below --
2010-12-20 16:13 Christoph Egger
2011-01-07 15:55 ` Tim Deegan
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).