Re: [PATCH 15/17] vmx: nest: virtual ept for nested

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Tim Deegan <Tim.Deegan@citrix.com>
To: Qing He <qing.he@intel.com>
Cc: "xen-devel@lists.xensource.com" <xen-devel@lists.xensource.com>
Subject: Re: [PATCH 15/17] vmx: nest: virtual ept for nested
Date: Thu, 20 May 2010 13:21:51 +0100	[thread overview]
Message-ID: <20100520122151.GT4164@whitby.uk.xensource.com> (raw)
In-Reply-To: <1271929289-18572-16-git-send-email-qing.he@intel.com>

At 10:41 +0100 on 22 Apr (1271932887), Qing He wrote:
> This patch adds virtual ept capability to L1.
> It's implemented as a simple per vCPU vTLB like component
> independent to domain wide p2m.
> 
> Signed-off-by: Qing He <qing.he@intel.com>

> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/nest.c
> --- a/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/nest.c       Thu Apr 22 22:30:10 2010 +0800
> @@ -26,6 +26,7 @@
>  #include <asm/hvm/vmx/vmx.h>
>  #include <asm/hvm/vmx/vvmcs.h>
>  #include <asm/hvm/vmx/nest.h>
> +#include <asm/hvm/vmx/vept.h>
> 
>  /*
>   * VMX instructions support functions
> @@ -295,6 +296,9 @@
>      __vmptrld(virt_to_maddr(nest->hvmcs));
>      v->arch.hvm_vmx.launched = 0;
> 
> +    nest->geptp = 0;
> +    nest->vept = vept_init(v);
> +
>      vmreturn(regs, VMSUCCEED);
> 
>  out:
> @@ -313,6 +317,9 @@
>      if ( unlikely(!nest->guest_vmxon_pa) )
>          goto invalid_op;
> 
> +    vept_teardown(nest->vept);
> +    nest->vept = 0;
> +
>      nest->guest_vmxon_pa = 0;
>      __vmpclear(virt_to_maddr(nest->svmcs));
> 
> @@ -529,6 +536,67 @@
>      return vmx_nest_handle_vmresume(regs);
>  }
> 
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmx_inst_decoded decode;
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    mfn_t mfn;
> +    u64 eptp;
> +    int type;
> +
> +    if ( unlikely(!nest->guest_vmxon_pa) )
> +        goto invalid_op;
> +
> +    decode_vmx_inst(regs, &decode);
> +
> +    hvm_copy_from_guest_virt(&eptp, decode.mem, sizeof(eptp), 0);
> +    type = reg_read(regs, decode.reg2);

This needs error handling, like the other new instructions: the result
of hvm_copy_from_guest_virt() is not checked.

> +    /* TODO: physical invept on other cpus */

? (What is the plan for this TODO?)

> +    switch ( type )
> +    {
> +    case 1:
> +        mfn = vept_invalidate(nest->vept, eptp);
> +        if ( eptp == nest->geptp )
> +            nest->geptp = 0;
> +
> +        if ( __mfn_valid(mfn_x(mfn)) )
> +            __invept(1, mfn_x(mfn) << PAGE_SHIFT | (eptp & 0xfff), 0);
> +        break;
> +    case 2:
> +        vept_invalidate_all(nest->vept);
> +        nest->geptp = 0;
> +        break;
> +    default:
> +        gdprintk(XENLOG_ERR, "nest: unsupported invept type %d\n", type);
> +        break;
> +    }
> +
> +    vmreturn(regs, VMSUCCEED);
> +
> +    return X86EMUL_OKAY;
> +
> +invalid_op:
> +    hvm_inject_exception(TRAP_invalid_op, 0, 0);
> +    return X86EMUL_EXCEPTION;
> +}
> +
> +int vmx_nest_vept(struct vcpu *v)
> +{
> +    struct vmx_nest_struct *nest = &v->arch.hvm_vmx.nest;
> +    int r = 0;
> +
> +    if ( paging_mode_hap(v->domain) &&
> +         (__get_vvmcs(nest->vvmcs, CPU_BASED_VM_EXEC_CONTROL) &
> +          CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
> +         (__get_vvmcs(nest->vvmcs, SECONDARY_VM_EXEC_CONTROL) &
> +          SECONDARY_EXEC_ENABLE_EPT) )
> +        r = 1;
> +
> +    return r;
> +}
> +
>  /*
>   * Nested VMX context switch
>   */
> @@ -739,7 +807,14 @@
>      vvmcs_to_shadow(nest->vvmcs, CR0_GUEST_HOST_MASK);
>      vvmcs_to_shadow(nest->vvmcs, CR4_GUEST_HOST_MASK);
> 
> -    /* TODO: PDPTRs for nested ept */
> +    if ( vmx_nest_vept(v) )
> +    {
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR0);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR1);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR2);
> +        vvmcs_to_shadow(nest->vvmcs, GUEST_PDPTR3);
> +    }
> +
>      /* TODO: CR3 target control */
>  }
> 
> @@ -787,14 +862,32 @@
>      }
>  #endif
> 
> +
> +    /* loading EPT_POINTER for L2 */
> +    if ( vmx_nest_vept(v) )
> +    {
> +        u64 geptp;
> +        mfn_t mfn;
> +
> +        geptp = __get_vvmcs(nest->vvmcs, EPT_POINTER);
> +        if ( geptp != nest->geptp )
> +        {
> +            mfn = vept_load_eptp(nest->vept, geptp);

What if vept_load_eptp() returns INVALID_MFN?

> +            nest->geptp = geptp;
> +
> +            __vmwrite(EPT_POINTER, (mfn_x(mfn) << PAGE_SHIFT) | 0x1e);
> +#ifdef __i386__
> +            __vmwrite(EPT_POINTER_HIGH, (mfn_x(mfn) << PAGE_SHIFT) >> 32);
> +#endif
> +        }
> +    }
> +
>      regs->rip = __get_vvmcs(nest->vvmcs, GUEST_RIP);
>      regs->rsp = __get_vvmcs(nest->vvmcs, GUEST_RSP);
>      regs->rflags = __get_vvmcs(nest->vvmcs, GUEST_RFLAGS);
> 
>      /* updating host cr0 to sync TS bit */
>      __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
> -
> -    /* TODO: EPT_POINTER */
>  }
> 
>  static void sync_vvmcs_guest_state(struct vmx_nest_struct *nest)
> @@ -1064,8 +1157,26 @@
>          break;
>      }
> 
> +    case EXIT_REASON_EPT_VIOLATION:
> +    {
> +        unsigned long exit_qualification = __vmread(EXIT_QUALIFICATION);
> +        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
> +#ifdef __i386__
> +        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
> +#endif
> +        if ( vmx_nest_vept(v) )
> +        {
> +            if ( !vept_ept_violation(nest->vept, nest->geptp,
> +                     exit_qualification, gpa) )
> +                bypass_l0 = 1;
> +            else
> +                nest->vmexit_pending = 1;

Since bypass_l0 is set from vmexit_pending() here it looks like it's
always going to be set.  Does that mean we never handle a real EPT
violation at L0?  I would expect there to be three possible outcomes
here: give the violation to L1, give it to L0, or fix it in the vept and
discard it.

> +        }
> +
> +        break;
> +    }
> +
>      case EXIT_REASON_WBINVD:
> -    case EXIT_REASON_EPT_VIOLATION:
>      case EXIT_REASON_EPT_MISCONFIG:
>      case EXIT_REASON_EXTERNAL_INTERRUPT:
>          /* pass to L0 handler */
> @@ -1229,11 +1340,14 @@
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_PROCBASED_CTLS:
> +        mask = paging_mode_hap(current->domain)?
> +                   0: CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
> +
>          rdmsr(regs->ecx, eax, edx);
>  #define REMOVED_EXEC_CONTROL_CAP (CPU_BASED_TPR_SHADOW \
> -            | CPU_BASED_ACTIVATE_MSR_BITMAP            \
> -            | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
> +            | CPU_BASED_ACTIVATE_MSR_BITMAP)
>          data = edx & ~REMOVED_EXEC_CONTROL_CAP;
> +        data = edx & ~mask;
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_EXIT_CTLS:
> @@ -1254,12 +1368,20 @@
>          data = (data << 32) | eax;
>          break;
>      case MSR_IA32_VMX_PROCBASED_CTLS2:
> -        mask = 0;
> +        mask = paging_mode_hap(current->domain)?
> +                   SECONDARY_EXEC_ENABLE_EPT : 0;
> 
>          rdmsr(regs->ecx, eax, edx);
>          data = edx & mask;
>          data = (data << 32) | eax;
>          break;
> +    case MSR_IA32_VMX_EPT_VPID_CAP:
> +        rdmsr(regs->ecx, eax, edx);
> +#define REMOVED_EPT_VPID_CAP_HIGH   ( 1 | 1<<8 | 1<<9 | 1<<10 | 1<<11 )
> +#define REMOVED_EPT_VPID_CAP_LOW    ( 1<<16 | 1<<17 | 1<<26 )
> +        data = edx & ~REMOVED_EPT_VPID_CAP_HIGH;
> +        data = (data << 32) | (eax & ~REMOVED_EPT_VPID_CAP_LOW);
> +        break;
> 
>      /* pass through MSRs */
>      case IA32_FEATURE_CONTROL_MSR:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vept.c
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/arch/x86/hvm/vmx/vept.c       Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,574 @@
> +/*
> + * vept.c: virtual EPT for nested virtualization
> + *
> + * Copyright (c) 2010, Intel Corporation.
> + * Author: Qing He <qing.he@intel.com>
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
> + * Place - Suite 330, Boston, MA 02111-1307 USA.
> + *
> + */
> +
> +#include <xen/config.h>
> +#include <xen/types.h>
> +#include <xen/list.h>
> +#include <xen/mm.h>
> +#include <xen/paging.h>
> +#include <xen/domain_page.h>
> +#include <xen/sched.h>
> +#include <asm/page.h>
> +#include <xen/numa.h>
> +#include <asm/hvm/vmx/vmx.h>
> +#include <asm/hvm/vmx/vept.h>
> +
> +#undef mfn_to_page
> +#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
> +#undef mfn_valid
> +#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn))
> +#undef page_to_mfn
> +#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
> +
> +/*
> + * This virtual EPT implementation is independent to p2m facility
> + * and has some different characteristics. It works in a similar
> + * way as shadow page table (guest table and host table composition),
> + * but is per-vcpu, and of vTLB style
> + *   - per vCPU so no lock is required

What happens when dom0 changes domU's p2m table?  Don't you need to
shoot down existing vEPT tables from a foreign CPU?

> + *   - vTLB style signifies honoring all invalidations, and not
> + * write protection. Unlike ordinary page table, since EPT updates
> + * and invalidations are minimal in a well written VMM, overhead
> + * is also minimized.
> + *
> + * The physical root is loaded directly to L2 sVMCS, without entering
> + * any other host controls. Multiple `cache slots' are maintained
> + * for multiple guest EPTPs, with simple LRU replacement.
> + *
> + * One of the limitations so far, is that it doesn't work with
> + * L0 emulation code, so L1 p2m_mmio_direct on top of L0 p2m_mmio_dm
> + * is not supported as for now.

Is this something you intend to fix before we check it in?

> + */
> +
> +#define VEPT_MAX_SLOTS 8
> +#define VEPT_ALLOCATION_SIZE 512
> +
> +struct vept_slot {
> +    u64               eptp;   /* guest eptp */
> +    mfn_t             root;   /* root of phys table */
> +    struct list_head  list;
> +
> +    struct page_list_head page_list;
> +};
> +
> +struct vept {
> +    struct list_head   used_slots; /* lru: new->tail, old->head */
> +    struct list_head   free_slots;
> +
> +    int                total_pages;
> +    int                free_pages;
> +    struct page_list_head freelist;
> +
> +    struct vcpu       *vcpu;
> +};
> +
> +
> +static struct vept_slot *__get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot, *tmp;
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> +        if ( slot->eptp == geptp )
> +            return slot;
> +
> +    return NULL;
> +}
> +
> +static struct vept_slot *get_eptp_slot(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot;
> +
> +    slot = __get_eptp_slot(vept, geptp);
> +    if ( slot != NULL )
> +        list_del(&slot->list);
> +
> +    return slot;
> +}
> +
> +static void __clear_slot(struct vept *vept, struct vept_slot *slot)
> +{
> +    struct page_info *pg;
> +
> +    slot->eptp = 0;
> +
> +    while ( !page_list_empty(&slot->page_list) )
> +    {
> +        pg = page_list_remove_head(&slot->page_list);
> +        page_list_add_tail(pg, &vept->freelist);
> +
> +        vept->free_pages++;
> +    }
> +}
> +
> +static struct vept_slot *get_free_slot(struct vept *vept)
> +{
> +    struct vept_slot *slot = NULL;
> +
> +    if ( !list_empty(&vept->free_slots) )
> +    {
> +        slot = list_entry(vept->free_slots.next, struct vept_slot, list);
> +        list_del(&slot->list);
> +    }
> +    else if ( !list_empty(&vept->used_slots) )
> +    {
> +        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> +        list_del(&slot->list);
> +        __clear_slot(vept, slot);
> +    }
> +
> +    return slot;
> +}
> +
> +static void clear_all_slots(struct vept *vept)
> +{
> +    struct vept_slot *slot, *tmp;
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->used_slots, list )
> +    {
> +        list_del(&slot->list);
> +        __clear_slot(vept, slot);
> +        list_add_tail(&slot->list, &vept->free_slots);
> +    }
> +}
> +
> +static int free_some_pages(struct vept *vept, struct vept_slot *curr)
> +{
> +    struct vept_slot *slot;
> +    int r = 0;
> +
> +    if ( !list_empty(&vept->used_slots) )
> +    {
> +        slot = list_entry(vept->used_slots.next, struct vept_slot, list);
> +        if ( slot != curr )
> +        {
> +            list_del(&slot->list);
> +            __clear_slot(vept, slot);
> +            list_add_tail(&slot->list, &vept->free_slots);
> +
> +            r = 1;
> +        }
> +    }
> +
> +    return r;
> +}
> +
> +struct vept *vept_init(struct vcpu *v)
> +{
> +    struct vept *vept;
> +    struct vept_slot *slot;
> +    struct page_info *pg;
> +    int i;
> +
> +    vept = xmalloc(struct vept);
> +    if ( vept == NULL )
> +        goto out;
> +
> +    memset(vept, 0, sizeof(*vept));
> +    vept->vcpu = v;
> +
> +    INIT_PAGE_LIST_HEAD(&vept->freelist);
> +    INIT_LIST_HEAD(&vept->used_slots);
> +    INIT_LIST_HEAD(&vept->free_slots);
> +
> +    for ( i = 0; i < VEPT_MAX_SLOTS; i++ )
> +    {
> +        slot = xmalloc(struct vept_slot);
> +        if ( slot == NULL )
> +            break;
> +
> +        memset(slot, 0, sizeof(*slot));
> +
> +        INIT_LIST_HEAD(&slot->list);
> +        INIT_PAGE_LIST_HEAD(&slot->page_list);
> +
> +        list_add(&slot->list, &vept->free_slots);
> +    }
> +
> +    for ( i = 0; i < VEPT_ALLOCATION_SIZE; i++ )

Why a fixed 2MB allocation?  What if your nested domains are very large?

> +    {
> +        pg = alloc_domheap_page(NULL, MEMF_node(domain_to_node(v->domain)));

Shouldn't this be allocated from the paging pool like other EPT memory?

> +        if ( pg == NULL )
> +            break;

Return an error?

> +        page_list_add_tail(pg, &vept->freelist);
> +        vept->total_pages++;
> +        vept->free_pages++;
> +    }
> +
> + out:
> +    return vept;
> +}
> +
> +void vept_teardown(struct vept *vept)
> +{
> +    struct page_info *pg;
> +    struct vept_slot *slot, *tmp;
> +
> +    clear_all_slots(vept);
> +
> +    while ( !page_list_empty(&vept->freelist) )
> +    {
> +        pg = page_list_remove_head(&vept->freelist);
> +        free_domheap_page(pg);
> +        vept->free_pages++;
> +        vept->total_pages++;
> +    }
> +
> +    list_for_each_entry_safe( slot, tmp, &vept->free_slots, list )
> +        xfree(slot);
> +
> +    xfree(vept);
> +}
> +
> +mfn_t vept_load_eptp(struct vept *vept, u64 geptp)
> +{
> +    struct page_info *pg;
> +    struct vept_slot *slot;
> +    mfn_t mfn = _mfn(INVALID_MFN);
> +    void *addr;
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = get_eptp_slot(vept, geptp);
> +    if ( slot == NULL )
> +    {
> +        slot = get_free_slot(vept);
> +        if ( unlikely(slot == NULL) )
> +        {
> +            gdprintk(XENLOG_ERR, "nest: can't get free slot\n");
> +            return mfn;
> +        }
> +
> +        while ( !vept->free_pages )
> +            if ( !free_some_pages(vept, slot) )
> +            {
> +                slot->eptp = 0;
> +                list_add_tail(&slot->list, &vept->free_slots);
> +                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +
> +                return mfn;
> +            }
> +
> +        vept->free_pages--;
> +        pg = page_list_remove_head(&vept->freelist);
> +
> +        mfn = page_to_mfn(pg);
> +        addr = map_domain_page(mfn_x(mfn));
> +        clear_page(addr);
> +        unmap_domain_page(addr);
> +        page_list_add_tail(pg, &slot->page_list);
> +        slot->eptp = geptp;
> +        slot->root = mfn;
> +    }
> +
> +    mfn = slot->root;
> +    list_add_tail(&slot->list, &vept->used_slots);
> +
> +    return mfn;
> +}
> +
> +mfn_t vept_invalidate(struct vept *vept, u64 geptp)
> +{
> +    struct vept_slot *slot;
> +    mfn_t mfn = _mfn(INVALID_MFN);
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = get_eptp_slot(vept, geptp);
> +    if ( slot != NULL )
> +    {
> +        mfn = slot->root;
> +        __clear_slot(vept, slot);
> +        list_add_tail(&slot->list, &vept->free_slots);
> +    }
> +
> +    return mfn;
> +}
> +
> +void vept_invalidate_all(struct vept *vept)
> +{
> +    ASSERT(vept->vcpu == current);
> +
> +    clear_all_slots(vept);
> +}
> +
> +/*
> + * guest EPT walk and EPT violation
> + */
> +struct ept_walk {
> +    unsigned long gfn;
> +    unsigned long gfn_remainder;
> +    ept_entry_t l4e, l3e, l2e, l1e;
> +    mfn_t l4mfn, l3mfn, l2mfn, l1mfn;
> +    int sp;
> +};
> +typedef struct ept_walk ept_walk_t;
> +
> +#define GEPT_NORMAL_PAGE  0
> +#define GEPT_SUPER_PAGE   1
> +#define GEPT_NOT_PRESENT  2
> +static int guest_ept_next_level(struct vcpu *v, ept_entry_t **table,
> +               unsigned long *gfn_remainder, int level, u32 *ar,
> +               ept_entry_t *entry, mfn_t *next_mfn)
> +{
> +    int index;
> +    ept_entry_t *ept_entry;
> +    ept_entry_t *next;
> +    p2m_type_t p2mt;
> +    int rc = GEPT_NORMAL_PAGE;
> +    mfn_t mfn;
> +
> +    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> +    ept_entry = (*table) + index;
> +    *entry = *ept_entry;
> +    *ar &= entry->epte & 0x7;
> +
> +    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> +    if ( !(ept_entry->epte & 0x7) )
> +        rc = GEPT_NOT_PRESENT;
> +    else if ( ept_entry->sp_avail )
> +        rc = GEPT_SUPER_PAGE;
> +    else
> +    {
> +        mfn = gfn_to_mfn(v->domain, ept_entry->mfn, &p2mt);
> +        if ( !p2m_is_ram(p2mt) )
> +            return GEPT_NOT_PRESENT;
> +
> +        if ( next_mfn )
> +        {
> +            next = map_domain_page(mfn_x(mfn));
> +            unmap_domain_page(*table);
> +
> +            *table = next;
> +            *next_mfn = mfn;
> +        }
> +    }
> +
> +    return rc;
> +}
> +
> +static u32 guest_walk_ept(struct vcpu *v, ept_walk_t *gw,
> +                          u64 geptp, u64 ggpa)
> +{
> +    ept_entry_t *table;
> +    p2m_type_t p2mt;
> +    int rc;
> +    u32 ar = 0x7;
> +
> +    unsigned long gfn = (unsigned long) (ggpa >> PAGE_SHIFT);
> +    unsigned long gfn_remainder = gfn;
> +
> +    memset(gw, 0, sizeof(*gw));
> +    gw->gfn = gfn;
> +    gw->sp = 0;
> +
> +    gw->l4mfn = gfn_to_mfn(v->domain, geptp >> PAGE_SHIFT, &p2mt);
> +    if ( !p2m_is_ram(p2mt) )
> +        return 0;
> +
> +    table = map_domain_page(mfn_x(gw->l4mfn));
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 3, &ar,
> +                              &gw->l4e, &gw->l3mfn);
> +
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 2, &ar,
> +                              &gw->l3e, &gw->l2mfn);
> +
> +    if ( rc == GEPT_SUPER_PAGE )
> +        gw->sp = 2;
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 1, &ar,
> +                              &gw->l2e, &gw->l1mfn);
> +
> +    if ( rc == GEPT_SUPER_PAGE )
> +        gw->sp = 1;
> +    if ( rc )
> +        goto out;
> +
> +    rc = guest_ept_next_level(v, &table, &gfn_remainder, 0, &ar,
> +                              &gw->l1e, NULL);
> +
> + out:
> +    gw->gfn_remainder = gfn_remainder;
> +    unmap_domain_page(*table);
> +    return ar;
> +}
> +
> +static void epte_set_ar_bits(ept_entry_t *entry, unsigned long ar)
> +{
> +    entry->epte &= ~0x7f;
> +    entry->epte |= ar & 0x7f;
> +}
> +
> +static int shadow_ept_next_level(struct vept *vept, struct vept_slot *slot,
> +                       ept_entry_t **table, unsigned long *gfn_remainder,
> +                       int level, u32 *ar, ept_entry_t gentry)
> +{
> +    int index;
> +    ept_entry_t *sentry;
> +    ept_entry_t *next;
> +    mfn_t mfn;
> +    struct page_info *pg;
> +
> +    index = *gfn_remainder >> (level * EPT_TABLE_ORDER);
> +
> +    sentry = (*table) + index;
> +    *ar = sentry->epte & 0x7;
> +
> +    *gfn_remainder &= (1UL << (level * EPT_TABLE_ORDER)) - 1;
> +
> +    if ( !(sentry->epte & 0x7) )
> +    {
> +        while ( !vept->free_pages )
> +            if ( !free_some_pages(vept, slot) )
> +            {
> +                gdprintk(XENLOG_ERR, "nest: vept no free pages\n");
> +                return 0;
> +            }
> +
> +        vept->free_pages--;
> +        pg = page_list_remove_head(&vept->freelist);
> +        page_list_add_tail(pg, &slot->page_list);
> +        mfn = page_to_mfn(pg);
> +        next = map_domain_page(mfn_x(mfn));
> +        clear_page(next);
> +
> +        sentry->mfn = mfn_x(mfn);
> +    }
> +    else
> +    {
> +        next = map_domain_page(sentry->mfn);
> +    }
> +
> +    epte_set_ar_bits(sentry, gentry.epte);
> +
> +    unmap_domain_page(*table);
> +    *table = next;
> +
> +    return 1;
> +}
> +
> +int vept_ept_violation(struct vept *vept, u64 geptp,
> +                       unsigned long qualification, paddr_t addr)
> +{
> +    ept_walk_t gw;
> +    struct vept_slot *slot;
> +    ept_entry_t *table, *gept;
> +    ept_entry_t *sentry, *gentry;
> +    u32 old_entry, sp_ar = 0;
> +    p2m_type_t p2mt;
> +    unsigned long mfn_start = 0;
> +    unsigned long gfn_remainder;
> +    int rc, i;
> +
> +    ASSERT(vept->vcpu == current);
> +
> +    slot = __get_eptp_slot(vept, geptp);
> +    if ( unlikely(slot == NULL) )
> +        return 0;
> +
> +    rc = guest_walk_ept(vept->vcpu, &gw, geptp, addr);
> +
> +    if ( !(rc & (qualification & 0x7)) )    /* inject to guest */
> +        return 1;
> +
> +    if ( gw.sp == 2 )  /* 1G */
> +    {
> +        sp_ar = gw.l3e.epte & 0x7;
> +        mfn_start = gw.l3e.mfn +
> +                    (gw.gfn_remainder & (~(1 << EPT_TABLE_ORDER) - 1));
> +    }
> +    if ( gw.sp == 1 )  /* 2M */
> +    {
> +        sp_ar = gw.l2e.epte & 0x7;
> +        mfn_start = gw.l2e.mfn;
> +    }
> +    else
> +        mfn_start = 0;
> +
> +    table = map_domain_page(mfn_x(slot->root));
> +    gfn_remainder = gw.gfn;
> +
> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 3,
> +                          &old_entry, gw.l4e);

What if shadow_ept_next_level() returns 0 ?

> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 2,
> +                          &old_entry, gw.l3e);

Ditto -- the return value is not checked here either.

> +    shadow_ept_next_level(vept, slot, &table, &gfn_remainder, 1,
> +                          &old_entry, (gw.sp == 2) ? gw.l3e : gw.l2e);

Ditto -- the return value is not checked here either.

> +    /* if l1p is just allocated, do a full prefetch */
> +    if ( !old_entry && !gw.sp )
> +    {
> +        gept = map_domain_page(mfn_x(gw.l1mfn));
> +        for ( i = 0; i < 512; i++ )
> +        {
> +            gentry = gept + i;
> +            sentry = table + i;
> +            if ( gentry->epte & 0x7 )
> +            {
> +                sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                        gentry->mfn, &p2mt));
> +                epte_set_ar_bits(sentry, gentry->epte);
> +            }
> +            else
> +                sentry->epte = 0;
> +        }
> +        unmap_domain_page(gept);
> +    }
> +    else if ( !old_entry && gw.sp )
> +    {
> +        for ( i = 0; i < 512; i++ )
> +        {
> +            sentry = table + i;
> +            sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                    mfn_start + i, &p2mt));
> +            epte_set_ar_bits(sentry, sp_ar);
> +        }
> +    }
> +    else if ( old_entry && !gw.sp )
> +    {
> +        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> +        sentry = table + i;
> +        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                gw.l1e.mfn, &p2mt));
> +        epte_set_ar_bits(sentry, gw.l1e.epte);
> +    }
> +    else    // old_entry && gw.sp
> +    {
> +        i = gw.gfn & ((1 << EPT_TABLE_ORDER) - 1);
> +        sentry = table + i;
> +        sentry->mfn = mfn_x(gfn_to_mfn_guest(vept->vcpu->domain,
> +                                mfn_start + i, &p2mt));
> +        epte_set_ar_bits(sentry, sp_ar);
> +    }
> +
> +    unmap_domain_page(table);
> +    return 0;
> +}
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/arch/x86/hvm/vmx/vmx.c
> --- a/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/arch/x86/hvm/vmx/vmx.c        Thu Apr 22 22:30:10 2010 +0800
> @@ -1032,6 +1032,14 @@
>      p2m_type_t p2mt;
>      char *p;
> 
> +    /*
> +     * If in nesting EPT operation, L0 doesn't have the knowledge on
> +     * how to interpret CR3, it's L1's responsibility to provide
> +     * GUEST_PDPTRn, we rely solely on them.
> +     */
> +    if ( v->arch.hvm_vcpu.in_nesting && vmx_nest_vept(v) )
> +        return;
> +
>      /* EPT needs to load PDPTRS into VMCS for PAE. */
>      if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
>          return;
> @@ -2705,6 +2713,11 @@
>          if ( vmx_nest_handle_vmxon(regs) == X86EMUL_OKAY )
>              __update_guest_eip(inst_len);
>          break;
> +    case EXIT_REASON_INVEPT:
> +        inst_len = __get_instruction_length();
> +        if ( vmx_nest_handle_invept(regs) == X86EMUL_OKAY )
> +            __update_guest_eip(inst_len);
> +        break;
> 
>      case EXIT_REASON_MWAIT_INSTRUCTION:
>      case EXIT_REASON_MONITOR_INSTRUCTION:
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/nest.h
> --- a/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:09 2010 +0800
> +++ b/xen/include/asm-x86/hvm/vmx/nest.h        Thu Apr 22 22:30:10 2010 +0800
> @@ -47,6 +47,9 @@
> 
>      unsigned long        intr_info;
>      unsigned long        error_code;
> +
> +    u64                  geptp;
> +    struct vept         *vept;
>  };
> 
>  asmlinkage void vmx_nest_switch_mode(void);
> @@ -64,6 +67,8 @@
>  int vmx_nest_handle_vmresume(struct cpu_user_regs *regs);
>  int vmx_nest_handle_vmlaunch(struct cpu_user_regs *regs);
> 
> +int vmx_nest_handle_invept(struct cpu_user_regs *regs);
> +
>  void vmx_nest_update_exec_control(struct vcpu *v, unsigned long value);
>  void vmx_nest_update_secondary_exec_control(struct vcpu *v,
>                                              unsigned long value);
> @@ -81,4 +86,6 @@
>  int vmx_nest_msr_write_intercept(struct cpu_user_regs *regs,
>                                   u64 msr_content);
> 
> +int vmx_nest_vept(struct vcpu *v);
> +
>  #endif /* __ASM_X86_HVM_NEST_H__ */
> diff -r 22df5f7ec6d3 -r 7f54e6615e1e xen/include/asm-x86/hvm/vmx/vept.h
> --- /dev/null   Thu Jan 01 00:00:00 1970 +0000
> +++ b/xen/include/asm-x86/hvm/vmx/vept.h        Thu Apr 22 22:30:10 2010 +0800
> @@ -0,0 +1,10 @@
> +#include <asm/hvm/vmx/vmx.h>
> +
> +
> +struct vept *vept_init(struct vcpu *v);
> +void vept_teardown(struct vept *vept);
> +mfn_t vept_load_eptp(struct vept *vept, u64 eptp);
> +mfn_t vept_invalidate(struct vept *vept, u64 eptp);
> +void vept_invalidate_all(struct vept *vept);
> +int vept_ept_violation(struct vept *vept, u64 eptp,
> +                       unsigned long qualification, paddr_t addr);
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

-- 
Tim Deegan <Tim.Deegan@citrix.com>
Principal Software Engineer, XenServer Engineering
Citrix Systems UK Ltd.  (Company #02937203, SL9 0BG)

next prev parent reply	other threads:[~2010-05-20 12:21 UTC|newest]

Thread overview: 51+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2010-04-22  9:41 [PATCH 00/17][RFC] Nested virtualization for VMX Qing He
2010-04-22  9:41 ` [PATCH 01/17] vmx: nest: fix CR4.VME in update_guest_cr Qing He
2010-05-20  9:26   ` Tim Deegan
2010-05-20  9:36     ` Qing He
2010-04-22  9:41 ` [PATCH 02/17] vmx: nest: rename host_vmcs Qing He
2010-04-22  9:41 ` [PATCH 03/17] vmx: nest: wrapper for control update Qing He
2010-05-20  9:34   ` Tim Deegan
2010-05-20  9:46     ` Qing He
2010-05-20 12:57       ` Keir Fraser
2010-04-22  9:41 ` [PATCH 04/17] vmx: nest: domain and vcpu flags Qing He
2010-05-20  9:37   ` Tim Deegan
2010-05-20  9:51     ` Christoph Egger
2010-05-20  9:54     ` Qing He
2010-05-20 10:55       ` Tim Deegan
2010-05-20 12:53         ` Qing He
2010-05-20 14:06           ` Christoph Egger
2010-04-22  9:41 ` [PATCH 05/17] vmx: nest: nested control structure Qing He
2010-04-22  9:41 ` [PATCH 06/17] vmx: nest: virtual vmcs layout Qing He
2010-04-22  9:41 ` [PATCH 07/17] vmx: nest: handling VMX instruction exits Qing He
2010-05-20 10:53   ` Tim Deegan
2010-05-20 13:28     ` Qing He
2010-04-22  9:41 ` [PATCH 08/17] vmx: nest: L1 <-> L2 context switch Qing He
2010-05-20 11:11   ` Tim Deegan
2010-05-20 13:49     ` Qing He
2010-05-21  9:19       ` Tim Deegan
2010-05-21 10:31         ` Qing He
2010-05-25 15:27           ` Tim Deegan
2010-04-22  9:41 ` [PATCH 09/17] vmx: nest: interrupt Qing He
2010-05-20 11:21   ` Tim Deegan
2010-05-20 15:55     ` Qing He
2010-04-22  9:41 ` [PATCH 10/17] vmx: nest: VMExit handler in L2 Qing He
2010-05-20 11:44   ` Tim Deegan
2010-05-20 16:06     ` Qing He
2010-05-21  8:42       ` Tim Deegan
2010-05-21 10:35         ` Qing He
2010-05-25 15:34           ` Tim Deegan
2010-04-22  9:41 ` [PATCH 11/17] vmx: nest: L2 tsc Qing He
2010-05-20 11:47   ` Tim Deegan
2010-05-20 16:07     ` Qing He
2010-04-22  9:41 ` [PATCH 12/17] vmx: nest: CR0.TS and #NM Qing He
2010-04-22  9:41 ` [PATCH 13/17] vmx: nest: capability reporting MSRs Qing He
2010-05-20 11:52   ` Tim Deegan
2010-04-22  9:41 ` [PATCH 14/17] vmx: nest: enable virtual VMX Qing He
2010-04-22  9:41 ` [PATCH 15/17] vmx: nest: virtual ept for nested Qing He
2010-05-20 12:21   ` Tim Deegan [this message]
2010-05-21 10:24     ` Qing He
2010-05-25 16:02       ` Tim Deegan
2010-04-22  9:41 ` [PATCH 16/17] vmx: nest: hvmtrace " Qing He
2010-04-22  9:41 ` [PATCH 17/17] tools: nest: allow enabling nesting Qing He
2010-04-22 10:15 ` [PATCH 00/17][RFC] Nested virtualization for VMX Christoph Egger
2010-04-23 10:10   ` He, Qing

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20100520122151.GT4164@whitby.uk.xensource.com \
    --to=tim.deegan@citrix.com \
    --cc=qing.he@intel.com \
    --cc=xen-devel@lists.xensource.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.