From: Wei Wang
Subject: Re: RFC: AMD support for paging
Date: Wed, 15 Feb 2012 13:09:11 +0100
Message-ID: <4F3BA067.4010303@amd.com>
In-Reply-To: <91f45eaceac6f38f9df39ed7d60c47a7.squirrel@webmail.lagarcavilla.org>
References: <91f45eaceac6f38f9df39ed7d60c47a7.squirrel@webmail.lagarcavilla.org>
Mime-Version: 1.0
Content-Type: text/plain; charset="us-ascii"; Format="flowed"
Content-Transfer-Encoding: 7bit
Sender: xen-devel-bounces@lists.xensource.com
Errors-To: xen-devel-bounces@lists.xensource.com
To: andres@lagarcavilla.org
Cc: olaf@aepfle.de, xen-devel@lists.xensource.com, tim@xen.org,
    keir.xen@gmail.com, JBeulich@suse.com, adin@gridcentric.ca
List-Id: xen-devel@lists.xenproject.org

On 02/14/2012 08:05 PM, Andres Lagar-Cavilla wrote:
> We started hashing out some AMD support for mem_paging and mem_access.
> Right now my VMs boot, page out a bit, and then die on an HVM triple
> fault.
>
> Most importantly, I want to get somebody from AMD to comment/help out on
> this. It feels like we're inches away from enabling support for this very
> nice feature. I'm not sure who exactly on AMD monitors the list for these
> kinds of things. It'd be great to have you on board!
>
> For starters, the changes to the p2m code are relatively mild, but it'd be
> great if somebody spots a red flag.
>
> Another issue: comments indicate that bits 59-62 in NPT entries are used
> for IOMMU flags but effectively bits 61-62 are. Repossessing one bit (59)
> would give us enough space to support mem_access. Right now we only have 7
> bits available for Xen flags and that is not enough for paging and access.
> Is bit 59 effectively reserved?

Hi,

Bit 59 is used by the IOMMU hardware for the ATS response. In most cases,
for p2m_ram_rw pages, the U bit must be 0. But for other page types that
are not potential DMA targets, this bit could perhaps be non-zero. I can
test it on my IOMMU machines if you have some patches that use the U bit.
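To make the U bit concrete, here is a rough standalone sketch of what I
mean (the macro and helper names below are made up for illustration, they
are not taken from the Xen or IOMMU headers):

#include <stdint.h>

/* Bit 59 of a shared NPT/IOMMU entry: the ATS response ("U") bit.
 * Hypothetical names, illustration only. */
#define IOMMU_PTE_U_BIT   59
#define IOMMU_PTE_U_MASK  (1ULL << IOMMU_PTE_U_BIT)

/* p2m_ram_rw pages can be DMA targets, so their U bit must stay 0. */
static inline int u_bit_is_clear(uint64_t pte)
{
    return (pte & IOMMU_PTE_U_MASK) == 0;
}

/* Page types that can never be DMA targets could, in principle, reuse
 * bit 59 as a software flag, provided it is masked out again before
 * the entry is handed to the IOMMU. */
static inline uint64_t strip_u_bit(uint64_t pte)
{
    return pte & ~IOMMU_PTE_U_MASK;
}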
Thanks,
Wei

> Finally, the triple fault. Maybe I'm missing something obvious. Comments
> welcome.
>
> Patch inline below, thanks!
> Andres
>
> Enable AMD support for paging.
>
> Signed-off-by: Andres Lagar-Cavilla
> Signed-off-by: Adin Scannell
>
> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/mem_event.c
> --- a/xen/arch/x86/mm/mem_event.c
> +++ b/xen/arch/x86/mm/mem_event.c
> @@ -537,10 +537,6 @@ int mem_event_domctl(struct domain *d, x
>              if ( !hap_enabled(d) )
>                  break;
>
> -            /* Currently only EPT is supported */
> -            if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
> -                break;
> -
>              rc = -EXDEV;
>              /* Disallow paging in a PoD guest */
>              if ( p2m->pod.entry_count )
> diff -r 25ca78889ed4 -r 10ca4e4293ce xen/arch/x86/mm/p2m-pt.c
> --- a/xen/arch/x86/mm/p2m-pt.c
> +++ b/xen/arch/x86/mm/p2m-pt.c
> @@ -53,6 +53,20 @@
>  #define P2M_BASE_FLAGS \
>          (_PAGE_PRESENT | _PAGE_USER | _PAGE_DIRTY | _PAGE_ACCESSED)
>
> +#ifdef __x86_64__
> +/* l1e_from_pfn is not designed to have INVALID_MFN stored. The 0xff..ff
> + * value tramples over the higher-order bits used for flags (NX, p2mt,
> + * etc.) This happens for paging entries. Thus we do this clip/unclip
> + * juggle for l1 entries only (no paging superpages!) */
> +#define EFF_MFN_WIDTH    (PADDR_BITS-PAGE_SHIFT) /* 40 bits */
> +#define clipped_mfn(mfn) ((mfn) & ((1UL << EFF_MFN_WIDTH) - 1))
> +#define unclip_mfn(mfn)  ((clipped_mfn((mfn)) == INVALID_MFN) ? \
> +                          INVALID_MFN : (mfn))
> +#else
> +#define clipped_mfn(mfn) (mfn)
> +#define unclip_mfn(mfn)  (mfn)
> +#endif /* __x86_64__ */
> +
>  static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn)
>  {
>      unsigned long flags;
> @@ -77,6 +91,9 @@ static unsigned long p2m_type_to_flags(p
>      case p2m_invalid:
>      case p2m_mmio_dm:
>      case p2m_populate_on_demand:
> +    case p2m_ram_paging_out:
> +    case p2m_ram_paged:
> +    case p2m_ram_paging_in:
>      default:
>          return flags;
>      case p2m_ram_ro:
> @@ -168,7 +185,7 @@ p2m_next_level(struct p2m_domain *p2m, m
>                                                   shift, max)) )
>          return 0;
>
> -    /* PoD: Not present doesn't imply empty. */
> +    /* PoD/paging: Not present doesn't imply empty. */
>      if ( !l1e_get_flags(*p2m_entry) )
>      {
>          struct page_info *pg;
> @@ -384,8 +401,9 @@ p2m_set_entry(struct p2m_domain *p2m, un
>                                     0, L1_PAGETABLE_ENTRIES);
>          ASSERT(p2m_entry);
>
> -        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
> -            entry_content = l1e_from_pfn(mfn_x(mfn),
> +        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) ||
> +             (p2mt == p2m_ram_paged) || (p2mt == p2m_ram_paging_in) )
> +            entry_content = l1e_from_pfn(clipped_mfn(mfn_x(mfn)),
>                                           p2m_type_to_flags(p2mt, mfn));
>          else
>              entry_content = l1e_empty();
> @@ -393,7 +411,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
>          if ( entry_content.l1 != 0 )
>          {
>              p2m_add_iommu_flags(&entry_content, 0, iommu_pte_flags);
> -            old_mfn = l1e_get_pfn(*p2m_entry);
> +            old_mfn = unclip_mfn(l1e_get_pfn(*p2m_entry));
>          }
>          /* level 1 entry */
>          p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1);
> @@ -615,11 +633,12 @@ pod_retry_l1:
>                                sizeof(l1e));
>
>          if ( ret == 0 ) {
> +            unsigned long l1e_mfn = unclip_mfn(l1e_get_pfn(l1e));
>              p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
> -            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
> +            ASSERT( (l1e_mfn != INVALID_MFN || !p2m_is_ram(p2mt)) ||
> +                    (l1e_mfn == INVALID_MFN && p2m_is_paging(p2mt)) );
>
> -            if ( p2m_flags_to_type(l1e_get_flags(l1e))
> -                 == p2m_populate_on_demand )
> +            if ( p2mt == p2m_populate_on_demand )
>              {
>                  /* The read has succeeded, so we know that the mapping
>                   * exits at this point. */
> @@ -641,7 +660,7 @@ pod_retry_l1:
>              }
>
>              if ( p2m_is_valid(p2mt) || p2m_is_grant(p2mt) )
> -                mfn = _mfn(l1e_get_pfn(l1e));
> +                mfn = _mfn(l1e_mfn);
>              else
>                  /* XXX see above */
>                  p2mt = p2m_mmio_dm;
> @@ -783,18 +802,26 @@ pod_retry_l2:
>  pod_retry_l1:
>      if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
>      {
> +        p2m_type_t l1t = p2m_flags_to_type(l1e_get_flags(*l1e));
>          /* PoD: Try to populate */
> -        if ( p2m_flags_to_type(l1e_get_flags(*l1e)) == p2m_populate_on_demand )
> +        if ( l1t == p2m_populate_on_demand )
>          {
>              if ( q != p2m_query ) {
>                  if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_4K, q) )
>                      goto pod_retry_l1;
>              } else
>                  *t = p2m_populate_on_demand;
> +        } else {
> +            if ( p2m_is_paging(l1t) )
> +            {
> +                *t = l1t;
> +                /* No need to unclip due to check below */
> +                mfn = _mfn(l1e_get_pfn(*l1e));
> +            }
>          }
>
>          unmap_domain_page(l1e);
> -        return _mfn(INVALID_MFN);
> +        return (l1t == p2m_ram_paging_out) ? mfn : _mfn(INVALID_MFN);
>      }
>      mfn = _mfn(l1e_get_pfn(*l1e));
>      *t = p2m_flags_to_type(l1e_get_flags(*l1e));
> @@ -914,7 +941,7 @@ static void p2m_change_type_global(struc
>                      flags = l1e_get_flags(l1e[i1]);
>                      if ( p2m_flags_to_type(flags) != ot )
>                          continue;
> -                    mfn = l1e_get_pfn(l1e[i1]);
> +                    mfn = unclip_mfn(l1e_get_pfn(l1e[i1]));
>                      gfn = i1 + (i2 + (i3
>  #if CONFIG_PAGING_LEVELS >= 4
>                                         + (i4 * L3_PAGETABLE_ENTRIES)
> @@ -923,7 +950,7 @@ static void p2m_change_type_global(struc
>                             * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES;
>                      /* create a new 1le entry with the new type */
>                      flags = p2m_type_to_flags(nt, _mfn(mfn));
> -                    l1e_content = l1e_from_pfn(mfn, flags);
> +                    l1e_content = l1e_from_pfn(clipped_mfn(mfn), flags);
>                      p2m->write_p2m_entry(p2m, gfn, &l1e[i1],
>                                           l1mfn, l1e_content, 1);
>                  }
> @@ -1073,7 +1100,7 @@ long p2m_pt_audit_p2m(struct p2m_domain
>                          entry_count++;
>                          continue;
>                      }
> -                    mfn = l1e_get_pfn(l1e[i1]);
> +                    mfn = unclip_mfn(l1e_get_pfn(l1e[i1]));
>                      ASSERT(mfn_valid(_mfn(mfn)));
>                      m2pfn = get_gpfn_from_mfn(mfn);
>                      if ( m2pfn != gfn &&
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>
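For reference, the clip/unclip juggle described in the patch comment above
boils down to something like the following standalone sketch. It assumes
PADDR_BITS = 52, PAGE_SHIFT = 12 and INVALID_MFN = ~0UL, as in the 64-bit
tree; it is only an illustration of the arithmetic, not Xen code:

#include <assert.h>
#include <stdio.h>

#define PADDR_BITS     52
#define PAGE_SHIFT     12
#define INVALID_MFN    (~0UL)

#define EFF_MFN_WIDTH  (PADDR_BITS - PAGE_SHIFT)            /* 40 bits */
#define clipped_mfn(m) ((m) & ((1UL << EFF_MFN_WIDTH) - 1))

int main(void)
{
    /* A real MFN fits in 40 bits and survives clipping unchanged. */
    unsigned long mfn = 0x12345UL;
    assert(clipped_mfn(mfn) == mfn);

    /* INVALID_MFN is all ones; shifted left by PAGE_SHIFT it would spill
     * into the PTE bits above bit 51 that hold the software p2m type and
     * NX, which is why the patch clips it to 40 bits before calling
     * l1e_from_pfn(). */
    printf("raw:     %#lx\n", (unsigned long)INVALID_MFN);
    printf("clipped: %#lx\n", clipped_mfn(INVALID_MFN));
    return 0;
}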