[PATCH] patch to support super page (2M) with EPT

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] patch to support super page (2M) with EPT
@ 2008-05-09  9:10 Xin, Xiaohui
  2008-05-11 20:33 ` Huang2, Wei
  0 siblings, 1 reply; 14+ messages in thread
From: Xin, Xiaohui @ 2008-05-09  9:10 UTC (permalink / raw)
  To: xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 835 bytes --]

Attached are the patches to support super pages with EPT. Only the 2M
size is supported, and shadow paging may still work fine with 4K pages.

The patches can be split into 3 parts. Apply order is as attached.

 

tool.diff 

Allocates 2M physically contiguous memory for the guest, except for the
first 2M and the last 2M.

The first 2M covers special memory, and Xen uses the last few pages of
guest memory to do special things.

We leave those two regions as normal 4K pages.

super_page_common.patch 

Modifies the p2m interfaces by adding an order parameter, e.g. to
guest_physmap_add_page(), p2m_set_entry(), etc.

p2m-ept-file.patch

Handles the EPT tables to support super pages.

 

 

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Li Xin, B <xin.b.li@intel.com>

 

 

 


[-- Attachment #1.2: Type: text/html, Size: 5406 bytes --]

[-- Attachment #2: tool.diff --]
[-- Type: application/octet-stream, Size: 2945 bytes --]

diff -r ccbbe6fe5827 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c	Mon May 05 10:16:58 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c	Fri May 09 01:07:36 2008 +0800
@@ -165,7 +165,7 @@ static int setup_guest(int xc_handle,
     uint32_t *ident_pt;
     struct elf_binary elf;
     uint64_t v_start, v_end;
-    int rc;
+    int rc, left;
     xen_capabilities_info_t caps;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
@@ -213,19 +213,64 @@ static int setup_guest(int xc_handle,
      * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
      * We allocate pages in batches of no more than 2048 to ensure that
      * we can be preempted and hence dom0 remains responsive.
-     */
+     * 1) Allocate 4K pages for the first 2M of guest memory;
+     * 2) try to allocate 2M contiguous pages for the remaining guest memory,
+     *    or fall back to 4K pages;
+     * 3) Since the last page of the guest memory will be dereserved later,
+     *    we just allocate 4K pages for the last 2M of guest memory.
+     */
+
     rc = xc_domain_memory_populate_physmap(
         xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]);
     cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
+
+    if ( rc == 0 )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, 0x200-0xc0, 0, 0, &page_array[0xc0]);
+
+    cur_pages = 0x200;
+
+    left = nr_pages - ((nr_pages >> 9 ) << 9 );
+
+    while ( (rc == 0) && ( (left ? nr_pages : (nr_pages - 0x200))  > cur_pages) )
     {
         unsigned long count = nr_pages - cur_pages;
         if ( count > 2048 )
+        {
             count = 2048;
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, 4, 9, 0, &page_array[cur_pages]);
+            if ( rc != 0 )
+            {
+                PERROR("Cannot allocate more 2M pages for HVM guest.\n");
+                rc = xc_domain_memory_populate_physmap(
+                    xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
+                if ( rc != 0 )
+                {
+                    PERROR("Could not allocate memory for HVM guest.\n");
+                    goto error_out;
+                }
+            }
+        }
+        else
+        {
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
+
+            if ( rc != 0 )
+            {
+                PERROR("Could not allocate memory for HVM guest.\n");
+                goto error_out;
+            }
+        }
+
+        cur_pages += count;
+    }
+
+    if ( !left )
         rc = xc_domain_memory_populate_physmap(
-            xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
-        cur_pages += count;
-    }
+            xc_handle, dom, nr_pages - cur_pages, 0, 0, &page_array[cur_pages]);
+
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");

[-- Attachment #3: super_page_common.patch --]
[-- Type: application/octet-stream, Size: 15687 bytes --]

diff -r 26b88953b0c8 xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c	Wed May 07 23:13:22 2008 +0800
+++ b/xen/arch/ia64/xen/mm.c	Thu May 08 00:35:05 2008 +0800
@@ -2415,7 +2415,7 @@ steal_page(struct domain *d, struct page
 
 int
 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
-                       unsigned long mfn)
+                       unsigned long mfn, int order)
 {
     BUG_ON(!mfn_valid(mfn));
     BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
@@ -2432,7 +2432,7 @@ guest_physmap_add_page(struct domain *d,
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
-                          unsigned long mfn)
+                          unsigned long mfn, int order)
 {
     BUG_ON(mfn == 0);//XXX
     zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
@@ -2838,7 +2838,7 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         if (prev_mfn && mfn_valid(prev_mfn)) {
             if (is_xen_heap_mfn(prev_mfn))
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -2847,10 +2847,10 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if (gpfn != INVALID_M2P_ENTRY)
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
     out:
         domain_unlock(d);
diff -r 26b88953b0c8 xen/arch/powerpc/mm.c
--- a/xen/arch/powerpc/mm.c	Wed May 07 23:13:22 2008 +0800
+++ b/xen/arch/powerpc/mm.c	Thu May 08 00:35:05 2008 +0800
@@ -591,7 +591,7 @@ void guest_physmap_add_page(
 }
 
 void guest_physmap_remove_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn)
+    struct domain *d, unsigned long gpfn, unsigned long mfn, int order)
 {
     if (page_get_owner(mfn_to_page(mfn)) != d) {
         printk("Won't unmap foreign MFN 0x%lx for DOM%d\n", mfn, d->domain_id);
diff -r 26b88953b0c8 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Wed May 07 23:13:22 2008 +0800
+++ b/xen/arch/x86/mm.c	Thu May 08 00:35:05 2008 +0800
@@ -3310,7 +3310,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
         {
             if ( is_xen_heap_mfn(prev_mfn) )
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -3319,10 +3319,10 @@ long arch_memory_op(int op, XEN_GUEST_HA
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if ( gpfn != INVALID_M2P_ENTRY )
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
         domain_unlock(d);
 
diff -r 26b88953b0c8 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Wed May 07 23:13:22 2008 +0800
+++ b/xen/arch/x86/mm/p2m.c	Thu May 08 01:17:02 2008 +0800
@@ -204,7 +204,7 @@ p2m_next_level(struct domain *d, mfn_t *
 
 // Returns 0 on error (out of memory)
 static int
-p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, int order, p2m_type_t p2mt)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -433,9 +433,9 @@ void p2m_change_entry_type_global(struct
 }
 
 static inline
-int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, int order, p2m_type_t p2mt)
+{
+    return d->arch.p2m->set_entry(d, gfn, mfn, order, p2mt);
 }
 
 // Allocate a new p2m table for a domain.
@@ -498,7 +498,7 @@ int p2m_alloc_table(struct domain *d,
     P2M_PRINTK("populating p2m table\n");
 
     /* Initialise physmap tables for slot zero. Other code assumes this. */
-    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), p2m_invalid) )
+    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), 0, p2m_invalid) )
         goto error;
 
     /* Copy all existing mappings from the page list and m2p */
@@ -517,7 +517,7 @@ int p2m_alloc_table(struct domain *d,
             (gfn != 0x55555555L)
 #endif
              && gfn != INVALID_M2P_ENTRY
-            && !set_p2m_entry(d, gfn, mfn, p2m_ram_rw) )
+            && !set_p2m_entry(d, gfn, mfn, 0, p2m_ram_rw) )
             goto error;
     }
 
@@ -750,30 +750,32 @@ static void audit_p2m(struct domain *d)
 
 
 static void
-p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
+p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn,
+                    int order )
 {
     if ( !paging_mode_translate(d) )
         return;
     P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
 
-    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid);
+    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), order, p2m_invalid);
     set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
 }
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                          unsigned long mfn)
+                          unsigned long mfn, int order )
 {
     p2m_lock(d->arch.p2m);
     audit_p2m(d);
-    p2m_remove_page(d, gfn, mfn);
+    for ( int i = 0; i < ( 1<< order); i++ )
+        p2m_remove_page(d, gfn+i, mfn+i, order);
     audit_p2m(d);
     p2m_unlock(d->arch.p2m);
 }
 
 int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                        unsigned long mfn, p2m_type_t t)
+                        unsigned long mfn, int order, p2m_type_t t)
 {
     unsigned long ogfn;
     p2m_type_t ot;
@@ -831,13 +833,13 @@ guest_physmap_add_entry(struct domain *d
             P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                       ogfn , mfn_x(omfn));
             if ( mfn_x(omfn) == mfn )
-                p2m_remove_page(d, ogfn, mfn);
+                p2m_remove_page(d, ogfn, mfn, order);
         }
     }
 
     if ( mfn_valid(_mfn(mfn)) ) 
     {
-        if ( !set_p2m_entry(d, gfn, _mfn(mfn), t) )
+        if ( !set_p2m_entry(d, gfn, _mfn(mfn), order, t) )
             rc = -EINVAL;
         set_gpfn_from_mfn(mfn, gfn);
     }
@@ -845,7 +847,7 @@ guest_physmap_add_entry(struct domain *d
     {
         gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n",
                  gfn, mfn);
-        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid) )
+        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), order, p2m_invalid) )
             rc = -EINVAL;
     }
 
@@ -967,7 +969,7 @@ p2m_type_t p2m_change_type(struct domain
 
     mfn = gfn_to_mfn(d, gfn, &pt);
     if ( pt == ot )
-        set_p2m_entry(d, gfn, mfn, nt);
+        set_p2m_entry(d, gfn, mfn, 0, nt);
 
     p2m_unlock(d->arch.p2m);
 
@@ -991,7 +993,7 @@ set_mmio_p2m_entry(struct domain *d, uns
         set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
     }
 
-    rc = set_p2m_entry(d, gfn, mfn, p2m_mmio_direct);
+    rc = set_p2m_entry(d, gfn, mfn, 0, p2m_mmio_direct);
     if ( 0 == rc )
         gdprintk(XENLOG_ERR,
             "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
@@ -1015,7 +1017,7 @@ clear_mmio_p2m_entry(struct domain *d, u
             "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn);
         return 0;
     }
-    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0);
+    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0, 0);
 
     return rc;
 }
diff -r 26b88953b0c8 xen/common/grant_table.c
--- a/xen/common/grant_table.c	Wed May 07 23:13:22 2008 +0800
+++ b/xen/common/grant_table.c	Thu May 08 00:35:05 2008 +0800
@@ -1159,7 +1159,7 @@ gnttab_transfer(
         spin_lock(&e->grant_table->lock);
 
         sha = &shared_entry(e->grant_table, gop.ref);
-        guest_physmap_add_page(e, sha->frame, mfn);
+        guest_physmap_add_page(e, sha->frame, mfn, 0);
         sha->frame = mfn;
         wmb();
         sha->flags |= GTF_transfer_completed;
diff -r 26b88953b0c8 xen/common/memory.c
--- a/xen/common/memory.c	Wed May 07 23:13:22 2008 +0800
+++ b/xen/common/memory.c	Thu May 08 00:35:05 2008 +0800
@@ -109,8 +109,11 @@ static void populate_physmap(struct memo
             goto out;
         }
 
-        if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
-            goto out;
+        if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i << a->extent_order, 1)) )
+        {
+            printk("copy_from_guest failed.\n");
+            goto out;
+        }
 
         page = alloc_domheap_pages(
             d, a->extent_order, a->memflags | MEMF_node(node));
@@ -126,11 +129,7 @@ static void populate_physmap(struct memo
         mfn = page_to_mfn(page);
 
         if ( unlikely(paging_mode_translate(d)) )
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                if ( guest_physmap_add_page(d, gpfn + j, mfn + j) )
-                    goto out;
-        }
+            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
         else
         {
             for ( j = 0; j < (1 << a->extent_order); j++ )
@@ -172,7 +171,7 @@ int guest_remove_page(struct domain *d, 
     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
         put_page(page);
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    guest_physmap_remove_page(d, gmfn, mfn, 0);
 
     put_page(page);
 
@@ -419,7 +418,7 @@ static long memory_exchange(XEN_GUEST_HA
             if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 BUG();
             mfn = page_to_mfn(page);
-            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn);
+            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, 0);
             put_page(page);
         }
 
@@ -441,8 +440,7 @@ static long memory_exchange(XEN_GUEST_HA
             if ( unlikely(paging_mode_translate(d)) )
             {
                 /* Ignore failure here. There's nothing we can do. */
-                for ( k = 0; k < (1UL << exch.out.extent_order); k++ )
-                    (void)guest_physmap_add_page(d, gpfn + k, mfn + k);
+                (void)guest_physmap_add_page(d, gpfn, mfn, exch.out.extent_order);
             }
             else
             {
diff -r 26b88953b0c8 xen/include/asm-ia64/grant_table.h
--- a/xen/include/asm-ia64/grant_table.h	Wed May 07 23:13:22 2008 +0800
+++ b/xen/include/asm-ia64/grant_table.h	Thu May 08 00:35:05 2008 +0800
@@ -13,7 +13,7 @@ int replace_grant_host_mapping(unsigned 
 int replace_grant_host_mapping(unsigned long gpaddr, unsigned long mfn, unsigned long new_gpaddr, unsigned int flags);
 
 // for grant transfer
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order);
 
 /* XXX
  * somewhere appropriate
diff -r 26b88953b0c8 xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h	Wed May 07 23:13:22 2008 +0800
+++ b/xen/include/asm-ia64/shadow.h	Thu May 08 00:35:05 2008 +0800
@@ -40,8 +40,10 @@
  * Utilities to change relationship of gpfn->mfn for designated domain,
  * which is required by gnttab transfer, balloon, device model and etc.
  */
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
-void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, 
+                        unsigned long mfn, int order);
+void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, 
+                        unsigned long mfn, int order);
 
 static inline int
 shadow_mode_enabled(struct domain *d)
diff -r 26b88953b0c8 xen/include/asm-powerpc/mm.h
--- a/xen/include/asm-powerpc/mm.h	Wed May 07 23:13:22 2008 +0800
+++ b/xen/include/asm-powerpc/mm.h	Thu May 08 00:35:05 2008 +0800
@@ -278,9 +278,9 @@ extern int guest_physmap_max_mem_pages(s
 extern int guest_physmap_max_mem_pages(struct domain *d, unsigned long new_max);
 
 extern void guest_physmap_add_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn);
+    struct domain *d, unsigned long gpfn, unsigned long mfn, int order);
 
 extern void guest_physmap_remove_page(
-    struct domain *d, unsigned long gpfn, unsigned long mfn);
+    struct domain *d, unsigned long gpfn, unsigned long mfn, int order);
 
 #endif
diff -r 26b88953b0c8 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h	Wed May 07 23:13:22 2008 +0800
+++ b/xen/include/asm-x86/p2m.h	Thu May 08 19:52:57 2008 +0800
@@ -102,7 +102,7 @@ struct p2m_domain {
     void               (*free_page   )(struct domain *d,
                                        struct page_info *pg);
     int                (*set_entry   )(struct domain *d, unsigned long gfn,
-                                       mfn_t mfn, p2m_type_t p2mt);
+                                       mfn_t mfn, int order, p2m_type_t p2mt);
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
                                        p2m_type_t *p2mt);
     mfn_t              (*get_entry_current)(unsigned long gfn,
@@ -203,21 +203,32 @@ void p2m_final_teardown(struct domain *d
 
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                             unsigned long mfn, p2m_type_t t);
+                             unsigned long mfn, int order, p2m_type_t t);
 
 /* Untyped version for RAM only, for compatibility 
  *
  * Return 0 for success
  */
 static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn,
-                                         unsigned long mfn)
-{
-    return guest_physmap_add_entry(d, gfn, mfn, p2m_ram_rw);
+                                         unsigned long mfn, int order)
+{
+    int ret;
+
+    for ( int i = 0; i < (1 << order); i++ )
+    {
+        ret = guest_physmap_add_entry(d, gfn+i, mfn+i, order, p2m_ram_rw);
+        if ( ret != 0 )
+            break;
+    }
+
+    /* TODO: fix exit path when failure */
+
+    return ret;
 }
 
 /* Remove a page from a domain's p2m table */
 void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                               unsigned long mfn);
+                               unsigned long mfn, int order);
 
 /* Change types across all p2m entries in a domain */
 void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt);
diff -r 26b88953b0c8 xen/include/xen/paging.h
--- a/xen/include/xen/paging.h	Wed May 07 23:13:22 2008 +0800
+++ b/xen/include/xen/paging.h	Thu May 08 00:35:05 2008 +0800
@@ -18,8 +18,8 @@
 #else
 
 #define paging_mode_translate(d)              (0)
-#define guest_physmap_add_page(d, p, m)       (0)
-#define guest_physmap_remove_page(d, p, m)    ((void)0)
+#define guest_physmap_add_page(d, p, m, order)       (0)
+#define guest_physmap_remove_page(d, p, m, order)    ((void)0)
 
 #endif
 

[-- Attachment #4: p2m-ept-file.patch --]
[-- Type: application/octet-stream, Size: 9705 bytes --]

diff -r bc9cf015d722 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Fri May 09 01:46:33 2008 +0800
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Sat May 10 01:16:31 2008 +0800
@@ -20,6 +20,7 @@
 #include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <asm/paging.h>
 #include <asm/types.h>
 #include <asm/domain.h>
 #include <asm/p2m.h>
@@ -46,6 +47,9 @@ static void ept_p2m_type_to_flags(ept_en
     }
 }
 
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+
 static int ept_next_level(struct domain *d, bool_t read_only,
                           ept_entry_t **table, unsigned long *gfn_remainder,
                           u32 shift)
@@ -54,7 +58,6 @@ static int ept_next_level(struct domain 
     u32 index;
 
     index = *gfn_remainder >> shift;
-    *gfn_remainder &= (1UL << shift) - 1;
 
     ept_entry = (*table) + index;
 
@@ -83,31 +86,53 @@ static int ept_next_level(struct domain 
         ept_entry->r = ept_entry->w = ept_entry->x = 1;
     }
 
-    next = map_domain_page(ept_entry->mfn);
-    unmap_domain_page(*table);
-    *table = next;
-
-    return 1;
+    if ( !ept_entry->sp_avail )
+    {
+        *gfn_remainder &= (1UL << shift) - 1;
+        next = map_domain_page(ept_entry->mfn);
+        unmap_domain_page(*table);
+        *table = next;
+        return GUEST_TABLE_NORMAL_PAGE;
+    }
+    else
+        return GUEST_TABLE_SUPER_PAGE;
 }
 
 static int
-ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    ept_entry_t *table =
-        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
-    unsigned long gfn_remainder = gfn;
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, int order,
+                p2m_type_t p2mt)
+{
+    ept_entry_t *table = NULL;
+    unsigned long gfn_remainder = gfn, offset = 0;
     ept_entry_t *ept_entry = NULL;
     u32 index;
-    int i, rv = 0;
+    int i, rv = 0, ret = 0;
+    int walk_level = order / EPT_TABLE_ORDER;
 
     /* Should check if gfn obeys GAW here */
 
-    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 0, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    if (  order != 0 )
+        if ( (gfn & ((1UL << order) - 1)) )
+            return 1;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+
+    ASSERT(table != NULL);
+
+    for ( i = EPT_DEFAULT_GAW; i > walk_level; i-- )
+    {
+        ret = ept_next_level(d, 0, &table, &gfn_remainder,
+          i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
+    walk_level = ( i ? ( i * EPT_TABLE_ORDER) : order) / EPT_TABLE_ORDER;
+    offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
     ept_entry = table + index;
 
     if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
@@ -117,9 +142,20 @@ ept_set_entry(struct domain *d, unsigned
             d->arch.p2m->max_mapped_pfn = gfn;
 
         ept_entry->emt = EPT_DEFAULT_MT;
-        ept_entry->sp_avail = 0;
+        ept_entry->sp_avail = walk_level ? 1 : 0;
+
+        if ( ret == GUEST_TABLE_SUPER_PAGE )
+        {
+            ept_entry->mfn = mfn_x(mfn) - offset;
+            if ( ept_entry->avail1 == p2m_ram_logdirty &&
+              p2mt == p2m_ram_rw )
+                for ( i = 0; i < 512; i++ )
+                    paging_mark_dirty(d, mfn_x(mfn)-offset+i);
+        }
+        else
+            ept_entry->mfn = mfn_x(mfn);
+
         ept_entry->avail1 = p2mt;
-        ept_entry->mfn = mfn_x(mfn);
         ept_entry->rsvd = 0;
         ept_entry->avail2 = 0;
         /* last step */
@@ -132,14 +168,42 @@ ept_set_entry(struct domain *d, unsigned
     /* Success */
     rv = 1;
 
- out:
+out:
     unmap_domain_page(table);
 
     ept_sync_domain(d);
 
+    /* Now the p2m table is not shared with vt-d page table */
+
+    if ( iommu_enabled && is_hvm_domain(d) )
+    {
+        if ( p2mt == p2m_ram_rw )
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_map_page(d, gfn-offset+i, mfn_x(mfn)-offset+i);
+            }
+            else if ( ret )
+                iommu_map_page(d, gfn, mfn_x(mfn));
+        }
+        else
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_unmap_page(d, gfn-offset+i);
+            }
+            else if ( ret )
+                iommu_unmap_page(d, gfn);
+        }
+    }
+
+#ifdef P2M_SHARE_WITH_VTD_PAGE_TABLE
     /* If p2m table is shared with vtd page-table. */
     if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
         iommu_flush(d, gfn, (u64*)ept_entry);
+#endif
 
     return rv;
 }
@@ -152,7 +216,7 @@ static mfn_t ept_get_entry(struct domain
     unsigned long gfn_remainder = gfn;
     ept_entry_t *ept_entry;
     u32 index;
-    int i;
+    int i, ret=0;
     mfn_t mfn = _mfn(INVALID_MFN);
 
     *t = p2m_mmio_dm;
@@ -164,17 +228,31 @@ static mfn_t ept_get_entry(struct domain
     /* Should check if gfn obeys GAW here. */
 
     for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 1, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    {
+        ret = ept_next_level(d, 1, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
     if ( ept_entry->avail1 != p2m_invalid )
     {
         *t = ept_entry->avail1;
         mfn = _mfn(ept_entry->mfn);
+        if ( i )
+        {
+            /* we may meet super pages, and to split into 4k pages
+             * to emulate p2m table
+             */
+            unsigned long split_mfn = 
+              mfn_x(mfn) + (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1 )));
+            mfn = _mfn(split_mfn);
+        }
     }
 
  out:
@@ -205,33 +283,63 @@ static void ept_change_entry_type_global
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
     for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
     {
-        if ( !l4e[i4].epte || l4e[i4].sp_avail )
+        if ( !l4e[i4].epte )
             continue;
-        l3e = map_domain_page(l4e[i4].mfn);
-        for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
-        {
-            if ( !l3e[i3].epte || l3e[i3].sp_avail )
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
                 continue;
-            l2e = map_domain_page(l3e[i3].mfn);
-            for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !l2e[i2].epte || l2e[i2].sp_avail )
-                    continue;
-                l1e = map_domain_page(l2e[i2].mfn);
-                for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
-                {
-                    if ( !l1e[i1].epte )
-                        continue;
-                    if ( l1e[i1].avail1 != ot )
-                        continue;
-                    l1e[i1].avail1 = nt;
-                    ept_p2m_type_to_flags(l1e+i1, nt);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
-        }
-        unmap_domain_page(l3e);
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
     }
     unmap_domain_page(l4e);
 

[-- Attachment #5: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
  2008-05-09  9:10 Xin, Xiaohui
@ 2008-05-11 20:33 ` Huang2, Wei
  2008-05-12  4:36   ` Huang2, Wei
  0 siblings, 1 reply; 14+ messages in thread
From: Huang2, Wei @ 2008-05-11 20:33 UTC (permalink / raw)
  To: Xin, Xiaohui, xen-devel

[-- Attachment #1.1: Type: text/plain, Size: 1292 bytes --]

Could we work together for a common solution? As far as I can see, it
largely overlaps with my super page patch. The major difference is
between p2m.c and p2m-ept.c.

-Wei

From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Xin, Xiaohui
Sent: Friday, May 09, 2008 4:11 AM
To: xen-devel@lists.xensource.com
Subject: [Xen-devel][PATCH] patch to support super page (2M) with EPT

Attached are the patches to support super page with EPT. We only support
2M size. And shadow may still work fine with 4K pages.

The patches can be split into 3 parts. Apply order is as attached.

tool.diff 

To allocate 2M physical contiguous memory in guest except the first 2M
and the last 2M.

The first 2M covers special memory, and Xen use the last few pages in
guest memory to do special things.

We let them to be 4K pages as normal.

super_page_common.patch 

To modify the p2m interfaces by adding an order parameter, such as
guest_physmap_add_page(), p2m_set_entry(), etc.

p2m-ept-file.patch

            To handle the EPT tables to support super page.            

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Li Xin, B <xin.b.li@intel.com>

[-- Attachment #1.2: Type: text/html, Size: 6329 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
  2008-05-11 20:33 ` Huang2, Wei
@ 2008-05-12  4:36   ` Huang2, Wei
  2008-05-12  5:04     ` Xin, Xiaohui
  0 siblings, 1 reply; 14+ messages in thread
From: Huang2, Wei @ 2008-05-12  4:36 UTC (permalink / raw)
  To: Xin, Xiaohui, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 2084 bytes --]

This is the latest one I created. Please review it and I will re-submit.

 

1.      It includes the patch for p2m-ept.c, directly from your previous
patch. 

2.      xc_hvm_build.c is based on my original approach. It includes
support for both 2MB and 4MB pages. It also handles guest memory sizes
that are not a multiple of the super page size (such as 255MB). However,
I did not allocate the last 2MB area using 4KB pages. Let me know if
that is a big issue.

3.      The rest are pretty similar.

 

Thanks,

 

-Wei

 

From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
Sent: Sunday, May 11, 2008 3:34 PM
To: Xin, Xiaohui; xen-devel@lists.xensource.com
Subject: RE: [Xen-devel][PATCH] patch to support super page (2M) with
EPT

 

Could we work together for a common solution? As far as I can see, it
largely overlaps with my super page patch. The major difference is
between p2m.c and p2m-ept.c.

 

-Wei

 

From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Xin, Xiaohui
Sent: Friday, May 09, 2008 4:11 AM
To: xen-devel@lists.xensource.com
Subject: [Xen-devel][PATCH] patch to support super page (2M) with EPT

 

Attached are the patches to support super page with EPT. We only support
2M size. And shadow may still work fine with 4K pages.

The patches can be split into 3 parts. Apply order is as attached.

 

tool.diff 

To allocate 2M physical contiguous memory in guest except the first 2M
and the last 2M.

The first 2M covers special memory, and Xen uses the last few pages in
guest memory for special purposes.

We leave these two regions mapped with normal 4K pages.

super_page_common.patch 

To modify the p2m interfaces by adding an order parameter, such as
guest_physmap_add_page(), p2m_set_entry(), etc.

p2m-ept-file.patch

            To handle the EPT tables to support super page.            

 

 

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Li Xin, B <xin.b.li@intel.com>

 

 

 


[-- Attachment #1.2: Type: text/html, Size: 10340 bytes --]

[-- Attachment #2: super_page_patch.txt --]
[-- Type: text/plain, Size: 44486 bytes --]

diff -r 810d8c3ac992 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c	Thu May 08 16:58:33 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c	Sun May 11 17:21:52 2008 -0500
@@ -157,8 +157,10 @@ static int setup_guest(int xc_handle,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
+    xen_pfn_t *super_page_array = NULL;
     unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
-    unsigned long special_page_nr, entry_eip, cur_pages;
+    unsigned long nr_super_pages;
+    unsigned long special_page_nr, entry_eip, cur_pages, limit;
     struct xen_add_to_physmap xatp;
     struct shared_info *shared_info;
     void *e820_page;
@@ -167,6 +169,8 @@ static int setup_guest(int xc_handle,
     uint64_t v_start, v_end;
     int rc;
     xen_capabilities_info_t caps;
+    int super_page_shift;
+    int super_page_order;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
     if ( memsize < 2 )
@@ -189,6 +193,15 @@ static int setup_guest(int xc_handle,
         PERROR("Guest OS must load to a page boundary.\n");
         goto error_out;
     }
+
+    /* check for PAE support and setup page size shift appropriately */
+    if ( strstr(caps, "x86_32p") )
+        super_page_shift = 1; 
+    else
+        super_page_shift = 2; 
+
+    nr_super_pages = (unsigned long)memsize >> super_page_shift;
+    super_page_order = 9 + (super_page_shift - 1);
 
     IPRINTF("VIRTUAL MEMORY ARRANGEMENT:\n"
             "  Loader:        %016"PRIx64"->%016"PRIx64"\n"
@@ -198,7 +211,9 @@ static int setup_guest(int xc_handle,
             v_start, v_end,
             elf_uval(&elf, elf.ehdr, e_entry));
 
-    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL ||
+         (super_page_array = 
+          malloc(nr_super_pages * sizeof(xen_pfn_t))) == NULL )
     {
         PERROR("Could not allocate memory.\n");
         goto error_out;
@@ -206,26 +221,45 @@ static int setup_guest(int xc_handle,
 
     for ( i = 0; i < nr_pages; i++ )
         page_array[i] = i;
+    for ( i = 0; i < nr_super_pages; i++ )
+        super_page_array[i] = i << super_page_order;
     for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ )
         page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
-
-    /*
-     * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
-     * We allocate pages in batches of no more than 2048 to ensure that
-     * we can be preempted and hence dom0 remains responsive.
-     */
+    for ( i = HVM_BELOW_4G_RAM_END >> (PAGE_SHIFT + super_page_order); 
+          i < nr_super_pages; i++ )
+        super_page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+
+    /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */
     rc = xc_domain_memory_populate_physmap(
         xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]);
-    cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
-    {
-        unsigned long count = nr_pages - cur_pages;
-        if ( count > 2048 )
-            count = 2048;
+    if ( rc == 0 )
         rc = xc_domain_memory_populate_physmap(
-            xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
+            xc_handle, dom, (0x200<<(super_page_shift-1)) - 0xc0, 0, 0,
+            &page_array[0xc0]);
+    
+    /* We allocate pages in batches of no more than 8MB to ensure that
+     * we can be preempted and hence dom0 remains responsive.
+     */
+    limit = 4 / super_page_shift;
+    cur_pages = 1;
+    while ( (rc == 0) && (nr_super_pages > cur_pages) )
+    {
+        unsigned long count = nr_super_pages - cur_pages;
+        if ( count > limit )
+            count = limit;
+        rc = xc_domain_memory_populate_physmap(xc_handle, dom, count, 
+                                               super_page_order, 0, 
+                                               &super_page_array[cur_pages]);
         cur_pages += count;
     }
+
+    /* handle the case of odd number physical memory size (such as 255MB) */
+    if ( rc == 0 )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, nr_pages - (nr_super_pages << super_page_order), 
+            0, 0, &page_array[nr_super_pages << super_page_order]);
+
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
@@ -314,10 +348,12 @@ static int setup_guest(int xc_handle,
     }
 
     free(page_array);
+    free(super_page_array);
     return 0;
 
  error_out:
     free(page_array);
+    free(super_page_array);
     return -1;
 }
 
diff -r 810d8c3ac992 xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/ia64/xen/mm.c	Sun May 11 17:29:52 2008 -0500
@@ -2415,7 +2415,7 @@ steal_page(struct domain *d, struct page
 
 int
 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
-                       unsigned long mfn)
+                       unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(!mfn_valid(mfn));
     BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
@@ -2432,7 +2432,7 @@ guest_physmap_add_page(struct domain *d,
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(mfn == 0);//XXX
     zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
@@ -2838,7 +2838,7 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         if (prev_mfn && mfn_valid(prev_mfn)) {
             if (is_xen_heap_mfn(prev_mfn))
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -2847,10 +2847,10 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if (gpfn != INVALID_M2P_ENTRY)
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
     out:
         domain_unlock(d);
diff -r 810d8c3ac992 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm.c	Sun May 11 17:21:52 2008 -0500
@@ -3287,7 +3287,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
         {
             if ( is_xen_heap_mfn(prev_mfn) )
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn,
+                                          NORMAL_PAGE_ORDER);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -3296,10 +3297,10 @@ long arch_memory_op(int op, XEN_GUEST_HA
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if ( gpfn != INVALID_M2P_ENTRY )
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, NORMAL_PAGE_ORDER);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, NORMAL_PAGE_ORDER);
 
         domain_unlock(d);
 
diff -r 810d8c3ac992 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Sun May 11 17:21:52 2008 -0500
@@ -20,6 +20,7 @@
 #include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <asm/paging.h>
 #include <asm/types.h>
 #include <asm/domain.h>
 #include <asm/p2m.h>
@@ -46,6 +47,9 @@ static void ept_p2m_type_to_flags(ept_en
     }
 }
 
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+
 static int ept_next_level(struct domain *d, bool_t read_only,
                           ept_entry_t **table, unsigned long *gfn_remainder,
                           u32 shift)
@@ -54,7 +58,6 @@ static int ept_next_level(struct domain 
     u32 index;
 
     index = *gfn_remainder >> shift;
-    *gfn_remainder &= (1UL << shift) - 1;
 
     ept_entry = (*table) + index;
 
@@ -83,31 +86,53 @@ static int ept_next_level(struct domain 
         ept_entry->r = ept_entry->w = ept_entry->x = 1;
     }
 
-    next = map_domain_page(ept_entry->mfn);
-    unmap_domain_page(*table);
-    *table = next;
-
-    return 1;
+    if ( !ept_entry->sp_avail )
+    {
+        *gfn_remainder &= (1UL << shift) - 1;
+        next = map_domain_page(ept_entry->mfn);
+        unmap_domain_page(*table);
+        *table = next;
+        return GUEST_TABLE_NORMAL_PAGE;
+    }
+    else
+        return GUEST_TABLE_SUPER_PAGE;
 }
 
 static int
-ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    ept_entry_t *table =
-        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
-    unsigned long gfn_remainder = gfn;
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int order, p2m_type_t p2mt)
+{
+    ept_entry_t *table = NULL;
+    unsigned long gfn_remainder = gfn, offset = 0;
     ept_entry_t *ept_entry = NULL;
     u32 index;
-    int i, rv = 0;
+    int i, rv = 0, ret = 0;
+    int walk_level = order / EPT_TABLE_ORDER;
 
     /* Should check if gfn obeys GAW here */
 
-    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 0, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    if (  order != 0 )
+        if ( (gfn & ((1UL << order) - 1)) )
+            return 1;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+
+    ASSERT(table != NULL);
+
+    for ( i = EPT_DEFAULT_GAW; i > walk_level; i-- )
+    {
+        ret = ept_next_level(d, 0, &table, &gfn_remainder,
+          i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
+    walk_level = ( i ? ( i * EPT_TABLE_ORDER) : order) / EPT_TABLE_ORDER;
+    offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
     ept_entry = table + index;
 
     if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
@@ -117,9 +142,20 @@ ept_set_entry(struct domain *d, unsigned
             d->arch.p2m->max_mapped_pfn = gfn;
 
         ept_entry->emt = EPT_DEFAULT_MT;
-        ept_entry->sp_avail = 0;
+        ept_entry->sp_avail = walk_level ? 1 : 0;
+
+        if ( ret == GUEST_TABLE_SUPER_PAGE )
+        {
+            ept_entry->mfn = mfn_x(mfn) - offset;
+            if ( ept_entry->avail1 == p2m_ram_logdirty &&
+              p2mt == p2m_ram_rw )
+                for ( i = 0; i < 512; i++ )
+                    paging_mark_dirty(d, mfn_x(mfn)-offset+i);
+        }
+        else
+            ept_entry->mfn = mfn_x(mfn);
+
         ept_entry->avail1 = p2mt;
-        ept_entry->mfn = mfn_x(mfn);
         ept_entry->rsvd = 0;
         ept_entry->avail2 = 0;
         /* last step */
@@ -132,14 +168,42 @@ ept_set_entry(struct domain *d, unsigned
     /* Success */
     rv = 1;
 
- out:
+out:
     unmap_domain_page(table);
 
     ept_sync_domain(d);
 
+    /* Now the p2m table is not shared with vt-d page table */
+
+    if ( iommu_enabled && is_hvm_domain(d) )
+    {
+        if ( p2mt == p2m_ram_rw )
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_map_page(d, gfn-offset+i, mfn_x(mfn)-offset+i);
+            }
+            else if ( ret )
+                iommu_map_page(d, gfn, mfn_x(mfn));
+        }
+        else
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_unmap_page(d, gfn-offset+i);
+            }
+            else if ( ret )
+                iommu_unmap_page(d, gfn);
+        }
+    }
+
+#ifdef P2M_SHARE_WITH_VTD_PAGE_TABLE
     /* If p2m table is shared with vtd page-table. */
     if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
         iommu_flush(d, gfn, (u64*)ept_entry);
+#endif
 
     return rv;
 }
@@ -152,7 +216,7 @@ static mfn_t ept_get_entry(struct domain
     unsigned long gfn_remainder = gfn;
     ept_entry_t *ept_entry;
     u32 index;
-    int i;
+    int i, ret=0;
     mfn_t mfn = _mfn(INVALID_MFN);
 
     *t = p2m_mmio_dm;
@@ -164,17 +228,31 @@ static mfn_t ept_get_entry(struct domain
     /* Should check if gfn obeys GAW here. */
 
     for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 1, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    {
+        ret = ept_next_level(d, 1, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
     if ( ept_entry->avail1 != p2m_invalid )
     {
         *t = ept_entry->avail1;
         mfn = _mfn(ept_entry->mfn);
+        if ( i )
+        {
+            /* we may meet super pages, and to split into 4k pages
+             * to emulate p2m table
+             */
+            unsigned long split_mfn = 
+              mfn_x(mfn) + (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1 )));
+            mfn = _mfn(split_mfn);
+        }
     }
 
  out:
@@ -205,33 +283,63 @@ static void ept_change_entry_type_global
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
     for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
     {
-        if ( !l4e[i4].epte || l4e[i4].sp_avail )
+        if ( !l4e[i4].epte )
             continue;
-        l3e = map_domain_page(l4e[i4].mfn);
-        for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
-        {
-            if ( !l3e[i3].epte || l3e[i3].sp_avail )
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
                 continue;
-            l2e = map_domain_page(l3e[i3].mfn);
-            for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !l2e[i2].epte || l2e[i2].sp_avail )
-                    continue;
-                l1e = map_domain_page(l2e[i2].mfn);
-                for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
-                {
-                    if ( !l1e[i1].epte )
-                        continue;
-                    if ( l1e[i1].avail1 != ot )
-                        continue;
-                    l1e[i1].avail1 = nt;
-                    ept_p2m_type_to_flags(l1e+i1, nt);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
-        }
-        unmap_domain_page(l3e);
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
     }
     unmap_domain_page(l4e);
 
diff -r 810d8c3ac992 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm/p2m.c	Sun May 11 17:21:52 2008 -0500
@@ -151,9 +151,11 @@ p2m_next_level(struct domain *d, mfn_t *
                unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
                u32 max, unsigned long type)
 {
+    l1_pgentry_t *l1_entry;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
     void *next;
+    int i;
     ASSERT(d->arch.p2m->alloc_page);
 
     if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
@@ -194,6 +196,44 @@ p2m_next_level(struct domain *d, mfn_t *
             break;
         }
     }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+
+    /* split single large page into 4KB page in P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg = d->arch.p2m->alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
+        pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated;
+        pg->count_info = 1;
+        
+        /* New splintered mappings inherit the flags of the old superpage, 
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            paging_write_p2m_entry(d, gfn,
+                                   l1_entry+i, *table_mfn, new_entry, 1);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        paging_write_p2m_entry(d, gfn,
+                               p2m_entry, *table_mfn, new_entry, 2);
+    }
+
     *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
     next = map_domain_page(mfn_x(*table_mfn));
     unmap_domain_page(*table);
@@ -204,7 +244,8 @@ p2m_next_level(struct domain *d, mfn_t *
 
 // Returns 0 on error (out of memory)
 static int
-p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int page_order, p2m_type_t p2mt)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -212,6 +253,7 @@ p2m_set_entry(struct domain *d, unsigned
     unsigned long gfn_remainder = gfn;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
     int rv=0;
 
 #if CONFIG_PAGING_LEVELS >= 4
@@ -235,26 +277,53 @@ p2m_set_entry(struct domain *d, unsigned
                          PGT_l2_page_table) )
         goto out;
 
-    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
-                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
-        goto out;
-
-    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                               0, L1_PAGETABLE_ENTRIES);
-    ASSERT(p2m_entry);
+    if ( page_order == NORMAL_PAGE_ORDER )
+    {
+        if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
+            entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
+        else
+            entry_content = l1e_empty();
+        
+        /* level 1 entry */
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
+    }
+    else 
+    {
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            P2M_ERROR("configure P2M table 4KB L2 entry with large page\n");
+            domain_crash(d);
+            goto out;
+        }
+        
+        if ( mfn_valid(mfn) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                       p2m_type_to_flags(p2mt) | _PAGE_PSE);
+        else
+            l2e_content = l2e_empty();
+        
+        entry_content.l1 = l2e_content.l2;
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2);
+    }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
     if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
         d->arch.p2m->max_mapped_pfn = gfn;
-
-    if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-        entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
-    else
-        entry_content = l1e_empty();
-
-    /* level 1 entry */
-    paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
 
     if ( iommu_enabled && is_hvm_domain(d) )
     {
@@ -335,6 +404,16 @@ p2m_gfn_to_mfn(struct domain *d, unsigne
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
+    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    {
+        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        unmap_domain_page(l2e);
+        
+        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+    }
+
     mfn = _mfn(l2e_get_pfn(*l2e));
     unmap_domain_page(l2e);
 
@@ -358,6 +437,7 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
 {
     mfn_t mfn = _mfn(INVALID_MFN);
     p2m_type_t p2mt = p2m_mmio_dm;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     /* XXX This is for compatibility with the old model, where anything not 
      * XXX marked as RAM was considered to be emulated MMIO space.
      * XXX Once we start explicitly registering MMIO regions in the p2m 
@@ -366,25 +446,44 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
     if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
     {
         l1_pgentry_t l1e = l1e_empty();
+        l2_pgentry_t l2e = l2e_empty();
         int ret;
 
         ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
                / sizeof(l1_pgentry_t));
 
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-        ret = __copy_from_user(&l1e,
-                               &phys_to_machine_mapping[gfn],
-                               sizeof(l1e));
-
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+        ret = __copy_from_user(&l2e,
+                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)],
+                               sizeof(l2e));
+        
+        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
+             (l2e_get_flags(l2e) & _PAGE_PSE) ) 
+        {
+            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
+            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
             if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
+                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+            else
                 p2mt = p2m_mmio_dm;
+        }
+        else
+        {
+        
+            /* Need to __copy_from_user because the p2m is sparse and this
+             * part might not exist */
+            ret = __copy_from_user(&l1e,
+                                   &phys_to_machine_mapping[gfn],
+                                   sizeof(l1e));
+            
+            if ( ret == 0 ) {
+                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+                if ( p2m_is_valid(p2mt) )
+                    mfn = _mfn(l1e_get_pfn(l1e));
+                else 
+                    /* XXX see above */
+                    p2mt = p2m_mmio_dm;
+            }
         }
     }
 
@@ -430,9 +529,10 @@ void p2m_change_entry_type_global(struct
 }
 
 static inline
-int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+                  unsigned int page_order, p2m_type_t p2mt)
+{
+    return d->arch.p2m->set_entry(d, gfn, mfn, page_order, p2mt);
 }
 
 // Allocate a new p2m table for a domain.
@@ -493,7 +593,8 @@ int p2m_alloc_table(struct domain *d,
     P2M_PRINTK("populating p2m table\n");
 
     /* Initialise physmap tables for slot zero. Other code assumes this. */
-    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), p2m_invalid) )
+    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), NORMAL_PAGE_ORDER,
+                        p2m_invalid) )
         goto error;
 
     /* Copy all existing mappings from the page list and m2p */
@@ -512,7 +613,7 @@ int p2m_alloc_table(struct domain *d,
             (gfn != 0x55555555L)
 #endif
              && gfn != INVALID_M2P_ENTRY
-            && !set_p2m_entry(d, gfn, mfn, p2m_ram_rw) )
+            && !set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, p2m_ram_rw) )
             goto error;
     }
 
@@ -688,6 +789,28 @@ static void audit_p2m(struct domain *d)
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
+                    
+                    /* check for super page */
+                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+                    {
+                        mfn = l2e_get_pfn(l2e[i2]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+                        {
+                            m2pfn = get_gpfn_from_mfn(mfn+i1);
+                            if ( m2pfn != (gfn + i) )
+                            {
+                                pmbad++;
+                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn+i, mfn+i,
+                                           m2pfn);
+                                BUG();
+                            }
+                        }
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+
                     l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
 
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
@@ -737,35 +860,40 @@ static void audit_p2m(struct domain *d)
 
 
 static void
-p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
-{
+p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn,
+                unsigned int page_order)
+{
+    int i;
     if ( !paging_mode_translate(d) )
         return;
     P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
 
-    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, p2m_invalid);
+    for ( i = 0; i < (1UL << page_order); i++ )
+        set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
 }
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     p2m_lock(d->arch.p2m);
     audit_p2m(d);
-    p2m_remove_page(d, gfn, mfn);
+    p2m_remove_page(d, gfn, mfn, page_order);
     audit_p2m(d);
     p2m_unlock(d->arch.p2m);
 }
 
 int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                        unsigned long mfn, p2m_type_t t)
+                        unsigned long mfn, unsigned int page_order, 
+                        p2m_type_t t)
 {
     unsigned long ogfn;
     p2m_type_t ot;
     mfn_t omfn;
     int rc = 0;
+    int i;
 
     if ( !paging_mode_translate(d) )
         return -EINVAL;
@@ -795,7 +923,8 @@ guest_physmap_add_entry(struct domain *d
     if ( p2m_is_ram(ot) )
     {
         ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
     }
 
     ogfn = mfn_to_gfn(d, _mfn(mfn));
@@ -818,21 +947,23 @@ guest_physmap_add_entry(struct domain *d
             P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                       ogfn , mfn_x(omfn));
             if ( mfn_x(omfn) == mfn )
-                p2m_remove_page(d, ogfn, mfn);
+                p2m_remove_page(d, ogfn, mfn, page_order);
         }
     }
 
     if ( mfn_valid(_mfn(mfn)) ) 
     {
-        if ( !set_p2m_entry(d, gfn, _mfn(mfn), t) )
+        if ( !set_p2m_entry(d, gfn, _mfn(mfn), page_order, t) )
             rc = -EINVAL;
-        set_gpfn_from_mfn(mfn, gfn);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn+i, gfn+i);
     }
     else
     {
         gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n",
                  gfn, mfn);
-        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid) )
+        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, 
+                            p2m_invalid) )
             rc = -EINVAL;
     }
 
@@ -851,7 +982,7 @@ void p2m_change_type_global(struct domai
     l1_pgentry_t l1e_content;
     l1_pgentry_t *l1e;
     l2_pgentry_t *l2e;
-    mfn_t l1mfn;
+    mfn_t l1mfn, l2mfn;
     int i1, i2;
     l3_pgentry_t *l3e;
     int i3;
@@ -891,11 +1022,26 @@ void p2m_change_type_global(struct domai
             {
                 continue;
             }
+            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
             l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
             for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
             {
                 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
                 {
+                    continue;
+                }
+
+                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+                {
+                    flags = l2e_get_flags(l2e[i2]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l2e_get_pfn(l2e[i2]);
+                    gfn = get_gpfn_from_mfn(mfn);
+                    flags = p2m_flags_to_type(nt);
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2],
+                                           l2mfn, l1e_content, 2);
                     continue;
                 }
 
@@ -944,7 +1090,7 @@ p2m_type_t p2m_change_type(struct domain
 
     mfn = gfn_to_mfn(d, gfn, &pt);
     if ( pt == ot )
-        set_p2m_entry(d, gfn, mfn, nt);
+        set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, nt);
 
     p2m_unlock(d->arch.p2m);
 
@@ -968,7 +1114,7 @@ set_mmio_p2m_entry(struct domain *d, uns
         set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
     }
 
-    rc = set_p2m_entry(d, gfn, mfn, p2m_mmio_direct);
+    rc = set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, p2m_mmio_direct);
     if ( 0 == rc )
         gdprintk(XENLOG_ERR,
             "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
@@ -992,7 +1138,7 @@ clear_mmio_p2m_entry(struct domain *d, u
             "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn);
         return 0;
     }
-    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0);
+    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), NORMAL_PAGE_ORDER, 0);
 
     return rc;
 }
diff -r 810d8c3ac992 xen/common/grant_table.c
--- a/xen/common/grant_table.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/common/grant_table.c	Sun May 11 17:21:52 2008 -0500
@@ -1159,7 +1159,7 @@ gnttab_transfer(
         spin_lock(&e->grant_table->lock);
 
         sha = &shared_entry(e->grant_table, gop.ref);
-        guest_physmap_add_page(e, sha->frame, mfn);
+        guest_physmap_add_page(e, sha->frame, mfn, NORMAL_PAGE_ORDER);
         sha->frame = mfn;
         wmb();
         sha->flags |= GTF_transfer_completed;
diff -r 810d8c3ac992 xen/common/memory.c
--- a/xen/common/memory.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/common/memory.c	Sun May 11 17:21:52 2008 -0500
@@ -114,34 +114,60 @@ static void populate_physmap(struct memo
 
         page = alloc_domheap_pages(
             d, a->extent_order, a->memflags | MEMF_node(node));
-        if ( unlikely(page == NULL) ) 
-        {
-            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
-                     "id=%d memflags=%x (%ld of %d)\n",
-                     a->extent_order, d->domain_id, a->memflags,
-                     i, a->nr_extents);
-            goto out;
-        }
-
-        mfn = page_to_mfn(page);
-
-        if ( unlikely(paging_mode_translate(d)) )
-        {
+
+        if ( unlikely(page == NULL) )
+        {
+            /* fail if it is not under translate mode */
+            if ( !paging_mode_translate(d) )
+            {
+                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                         "id=%d memflags=%x (%ld of %d)\n",
+                         a->extent_order, d->domain_id, a->memflags,
+                         i, a->nr_extents);
+                goto out;
+            }
+            
+            /* try to allocate using 4KB page instead */
             for ( j = 0; j < (1 << a->extent_order); j++ )
-                if ( guest_physmap_add_page(d, gpfn + j, mfn + j) )
+            {
+                page = alloc_domheap_pages(d, 0, 
+                                           a->memflags | MEMF_node(node));
+                if ( page == NULL )
+                {
+                    gdprintk(XENLOG_INFO, "Could not allocate order=%d extent:"
+                             "id=%d memflags=%x (%ld of %d)\n",
+                             0, d->domain_id, a->memflags, i, a->nr_extents);
                     goto out;
-        }
-        else
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                set_gpfn_from_mfn(mfn + j, gpfn + j);
-
-            /* Inform the domain of the new page's machine address. */ 
-            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
-                goto out;
-        }
-    }
-
+                }
+                
+                mfn = page_to_mfn(page);
+                
+                if ( guest_physmap_add_page(d, gpfn+j, mfn, 
+                                            NORMAL_PAGE_ORDER) )
+                    goto out;
+            }
+        }
+        else /* successful in allocating page of extent_order */
+        {
+            mfn = page_to_mfn(page);
+            
+            if ( unlikely(paging_mode_translate(d)) )
+            {
+                if ( guest_physmap_add_page(d, gpfn, mfn, a->extent_order) )
+                    goto out;
+            }
+            else
+            {
+                for ( j = 0; j < (1 << a->extent_order); j++ )
+                    set_gpfn_from_mfn(mfn + j, gpfn + j);
+                
+                /* Inform the domain of the new page's machine address. */ 
+                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 
+                                                     1)) )
+                    goto out;
+            }
+        }
+    }
  out:
     a->nr_done = i;
 }
@@ -172,7 +198,7 @@ int guest_remove_page(struct domain *d, 
     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
         put_page(page);
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    guest_physmap_remove_page(d, gmfn, mfn, NORMAL_PAGE_ORDER);
 
     put_page(page);
 
@@ -419,7 +445,8 @@ static long memory_exchange(XEN_GUEST_HA
             if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 BUG();
             mfn = page_to_mfn(page);
-            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn);
+            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, 
+                                      NORMAL_PAGE_ORDER);
             put_page(page);
         }
 
@@ -441,8 +468,8 @@ static long memory_exchange(XEN_GUEST_HA
             if ( unlikely(paging_mode_translate(d)) )
             {
                 /* Ignore failure here. There's nothing we can do. */
-                for ( k = 0; k < (1UL << exch.out.extent_order); k++ )
-                    (void)guest_physmap_add_page(d, gpfn + k, mfn + k);
+                (void)guest_physmap_add_page(d, gpfn, mfn, 
+                                             exch.out.extent_order);
             }
             else
             {
diff -r 810d8c3ac992 xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-ia64/shadow.h	Sun May 11 17:21:52 2008 -0500
@@ -40,8 +40,10 @@
  * Utilities to change relationship of gpfn->mfn for designated domain,
  * which is required by gnttab transfer, balloon, device model and etc.
  */
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
-void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, 
+                           unsigned long mfn, unsigned int page_order);
+void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, 
+                               unsigned long mfn, unsigned int page_order);
 
 static inline int
 shadow_mode_enabled(struct domain *d)
diff -r 810d8c3ac992 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-x86/mm.h	Sun May 11 17:21:52 2008 -0500
@@ -124,6 +124,14 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* The order of contiguously allocated super page frames */
+#define NORMAL_PAGE_ORDER 0  /* 4KB page */
+#if CONFIG_PAGING_LEVELS == 2
+#define SUPER_PAGE_ORDER  10 /* 4MB page */
+#else
+#define SUPER_PAGE_ORDER  9  /* 2MB page */
+#endif
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
diff -r 810d8c3ac992 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-x86/p2m.h	Sun May 11 17:21:52 2008 -0500
@@ -102,7 +102,8 @@ struct p2m_domain {
     void               (*free_page   )(struct domain *d,
                                        struct page_info *pg);
     int                (*set_entry   )(struct domain *d, unsigned long gfn,
-                                       mfn_t mfn, p2m_type_t p2mt);
+                                       mfn_t mfn, unsigned int page_order,
+                                       p2m_type_t p2mt);
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
                                        p2m_type_t *p2mt);
     mfn_t              (*get_entry_current)(unsigned long gfn,
@@ -203,21 +204,23 @@ void p2m_final_teardown(struct domain *d
 
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                             unsigned long mfn, p2m_type_t t);
+                            unsigned long mfn, unsigned int page_order, 
+                            p2m_type_t t);
 
 /* Untyped version for RAM only, for compatibility 
  *
  * Return 0 for success
  */
 static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn,
-                                         unsigned long mfn)
-{
-    return guest_physmap_add_entry(d, gfn, mfn, p2m_ram_rw);
+                                         unsigned long mfn,
+                                         unsigned int page_order)
+{
+    return guest_physmap_add_entry(d, gfn, mfn, page_order, p2m_ram_rw);
 }
 
 /* Remove a page from a domain's p2m table */
 void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                               unsigned long mfn);
+                               unsigned long mfn, unsigned int page_order);
 
 /* Change types across all p2m entries in a domain */
 void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt);
diff -r 810d8c3ac992 xen/include/xen/paging.h
--- a/xen/include/xen/paging.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/xen/paging.h	Sun May 11 17:21:52 2008 -0500
@@ -18,8 +18,8 @@
 #else
 
 #define paging_mode_translate(d)              (0)
-#define guest_physmap_add_page(d, p, m)       (0)
-#define guest_physmap_remove_page(d, p, m)    ((void)0)
+#define guest_physmap_add_page(d, p, m, o)       (0)
+#define guest_physmap_remove_page(d, p, m, o)    ((void)0)
 
 #endif
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
  2008-05-12  4:36   ` Huang2, Wei
@ 2008-05-12  5:04     ` Xin, Xiaohui
  2008-05-12  7:03       ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Xin, Xiaohui @ 2008-05-12  5:04 UTC (permalink / raw)
  To: Huang2, Wei, xen-devel

[-- Attachment #1.1: Type: text/plain, Size: 3075 bytes --]

Some comments here:

1) Basically, 4M page allocation is not natively supported by the EPT
hardware, so we only use 2M super pages now. 

I remember Keir saying that 2M page allocation is sufficient, and that
he has already removed all the pure 32-bit support.

2)  If we don't allocate the last 2M area with 4KB pages, EPT will
run into a problem. Xen sets one of the 4KB pages

     there to be invalid; logically, that means we would have to invalidate
the whole 2M page if it was allocated as a 2M page, and then the 

     special pages Xen uses at the high end of guest memory could no
longer be used. May we know how you cope with that?

Thanks

Xiaohui

________________________________

From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
Sent: Monday, May 12, 2008 12:36 PM
To: Xin, Xiaohui; xen-devel@lists.xensource.com
Subject: RE: [Xen-devel][PATCH] patch to support super page (2M) with
EPT

This is the latest one I created. Please review it and I will re-submit.

1.       It includes the patch for p2m-ept.c, directly from your
previous patch. 

2.       Xc_hvm_create.c is based on my original approach. It includes
support for both 2MB and 4MB pages. Also it considers the case of odd
page size (such as 255MB). But I did not allocate the last 2MB area
using 4KB  pages. Let me know if it is a big issue.

3.       The rest are pretty similar.

Thanks,

-Wei

From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
Sent: Sunday, May 11, 2008 3:34 PM
To: Xin, Xiaohui; xen-devel@lists.xensource.com
Subject: RE: [Xen-devel][PATCH] patch to support super page (2M) with
EPT

Could we work together for a common solution? As far as I can see, it
largely overlaps with my super page patch. The major difference is
between p2m.c and p2m-ept.c.

-Wei

From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Xin, Xiaohui
Sent: Friday, May 09, 2008 4:11 AM
To: xen-devel@lists.xensource.com
Subject: [Xen-devel][PATCH] patch to support super page (2M) with EPT

Attached are the patches to support super page with EPT. We only support
2M size. And shadow may still work fine with 4K pages.

The patches can be split into 3 parts. Apply order is as attached.

tool.diff 

To allocate 2M physically contiguous memory in the guest, except for the
first 2M and the last 2M.

The first 2M covers special memory, and Xen uses the last few pages in
guest memory to do special things.

We leave them as normal 4K pages.

super_page_common.patch 

To modify the p2m interfaces by adding an order parameter, such as
guest_physmap_add_page(), p2m_set_entry(), etc.

p2m-ept-file.patch

            To handle the EPT tables to support super page.            

Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>

Signed-off-by: Li Xin, B <xin.b.li@intel.com>

[-- Attachment #1.2: Type: text/html, Size: 16792 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] patch to support super page (2M) with EPT
  2008-05-12  5:04     ` Xin, Xiaohui
@ 2008-05-12  7:03       ` Keir Fraser
  2008-05-12 17:28         ` Huang2, Wei
  0 siblings, 1 reply; 14+ messages in thread
From: Keir Fraser @ 2008-05-12  7:03 UTC (permalink / raw)
  To: Xin, Xiaohui, Huang2, Wei, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 3450 bytes --]

Yes, absolutely no need for 4M page support. We do not support the 32-bit
non-PAE build target any more.

 -- Keir

On 12/5/08 06:04, "Xin, Xiaohui" <xiaohui.xin@intel.com> wrote:

> Some comments here:
> 1) Basically 4M pages allocations is not hardware naturally for EPT, we only
> use 2M super pages now.
> I remembered that Keir said that 2M pages allocation is sufficient, and he
> removed all the pure 32bit support already.
> 2)  If we don't allocate the last 2M area with 4kb pages, the EPT will meet
> some problem. Xen will set one of the 4k page
>      there to be invalid, logically that means we should invalid the all the
> 2M page if we allocate it with 2M, and then the
>      special pages Xen used in the high end of the guest memory can not be
> used then. May we know how you cope with that?
>  
> Thanks
> Xiaohui
>  
> 
> 
> From: xen-devel-bounces@lists.xensource.com
> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
> Sent: Monday, May 12, 2008 12:36 PM
> To: Xin, Xiaohui; xen-devel@lists.xensource.com
> Subject: RE: [Xen-devel][PATCH] patch to support super page (2M) with EPT
>  
> This is the latest one I created. Please review it and I will re-submit.
>  
> 1.      It includes the patch for p2m-ept.c, directly from your previous
> patch. 
> 
> 2.      Xc_hvm_create.c is based on my original approach. It includes support
> for both 2MB and 4MB pages. Also it considers the case of odd page size (such
> as 255MB). But I did not allocate the last 2MB area using 4KB  pages. Let me
> know if it is a big issue.
> 
> 3.      The rest are pretty similar.
> 
>  
> Thanks,
>  
> -Wei
>  
> 
> From: xen-devel-bounces@lists.xensource.com
> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
> Sent: Sunday, May 11, 2008 3:34 PM
> To: Xin, Xiaohui; xen-devel@lists.xensource.com
> Subject: RE: [Xen-devel][PATCH] patch to support super page (2M) with EPT
>  
> Could we work together for a common solution? As far as I can see, it largely
> overlaps with my super page patch. The major difference is between p2m.c and
> p2m-ept.c.
>  
> -Wei
>  
> 
> From: xen-devel-bounces@lists.xensource.com
> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Xin, Xiaohui
> Sent: Friday, May 09, 2008 4:11 AM
> To: xen-devel@lists.xensource.com
> Subject: [Xen-devel][PATCH] patch to support super page (2M) with EPT
>  
> Attached are the patches to support super page with EPT. We only support 2M
> size. And shadow may still work fine with 4K pages.
> The patches can be split into 3 parts. Apply order is as attached.
>  
> tool.diff 
> To allocate 2M physical contiguous memory in guest except the first 2M and the
> last 2M.
> The first 2M covers special memory, and Xen use the last few pages in guest
> memory to do special things.
> We let them to be 4K pages as normal.
> super_page_common.patch
> To modify the p2m interfaces by adding an order parameter, such as
> guest_physmap_add_page(), p2m_set_entry(), etc.
> p2m-ept-file.patch
>            To handle the EPT tables to support super page.
>  
>  
> Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
> Signed-off-by: Li Xin, B <xin.b.li@intel.com>
>  
>  
>  
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel



[-- Attachment #1.2: Type: text/html, Size: 7488 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
@ 2008-05-12 12:14 Huang2, Wei
  0 siblings, 0 replies; 14+ messages in thread
From: Huang2, Wei @ 2008-05-12 12:14 UTC (permalink / raw)
  To: Keir Fraser, Xin, Xiaohui, xen-devel

I will re-submit another one today.

-----Original Message-----
From: Keir Fraser <keir.fraser@eu.citrix.com>
Sent: Monday, May 12, 2008 2:03 AM
To: Xin, Xiaohui <xiaohui.xin@intel.com>; Huang2, Wei <Wei.Huang2@amd.com>; xen-devel@lists.xensource.com <xen-devel@lists.xensource.com>
Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with EPT

Yes, absolutely no need for 4M page support. We do not support the 32-bit non-PAE build target any more.

 -- Keir

On 12/5/08 06:04, "Xin, Xiaohui" <xiaohui.xin@intel.com> wrote:



	Some comments here:
	1) Basically 4M pages allocations is not hardware naturally for EPT, we only use 2M super pages now. 
	I remembered that Keir said that 2M pages allocation is sufficient, and he removed all the pure 32bit support already.
	2)  If we don’t allocate the last 2M area with 4kb pages, the EPT will meet some problem. Xen will set one of the 4k page
	     there to be invalid, logically that means we should invalid the all the 2M page if we allocate it with 2M, and then the 
	     special pages Xen used in the high end of the guest memory can not be used then. May we know how you cope with that?
	 
	Thanks
	Xiaohui
	 
	

	________________________________

	From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
	Sent: Monday, May 12, 2008 12:36 PM
	To: Xin, Xiaohui; xen-devel@lists.xensource.com
	Subject: RE: [Xen-devel][PATCH] patch to support super page (2M) with EPT
	
	This is the latest one I created. Please review it and I will re-submit.
	 
	1.      It includes the patch for p2m-ept.c, directly from your previous patch. 
	
	2.      Xc_hvm_create.c is based on my original approach. It includes support for both 2MB and 4MB pages. Also it considers the case of odd page size (such as 255MB). But I did not allocate the last 2MB area using 4KB  pages. Let me know if it is a big issue.
	
	3.      The rest are pretty similar.
	
	
	Thanks,
	 
	-Wei
	 
	
	From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
	Sent: Sunday, May 11, 2008 3:34 PM
	To: Xin, Xiaohui; xen-devel@lists.xensource.com
	Subject: RE: [Xen-devel][PATCH] patch to support super page (2M) with EPT
	
	Could we work together for a common solution? As far as I can see, it largely overlaps with my super page patch. The major difference is between p2m.c and p2m-ept.c.
	 
	-Wei
	 
	
	From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Xin, Xiaohui
	Sent: Friday, May 09, 2008 4:11 AM
	To: xen-devel@lists.xensource.com
	Subject: [Xen-devel][PATCH] patch to support super page (2M) with EPT
	
	Attached are the patches to support super page with EPT. We only support 2M size. And shadow may still work fine with 4K pages.
	The patches can be split into 3 parts. Apply order is as attached.
	 
	tool.diff 
	To allocate 2M physical contiguous memory in guest except the first 2M and the last 2M.
	The first 2M covers special memory, and Xen use the last few pages in guest memory to do special things.
	We let them to be 4K pages as normal.
	super_page_common.patch 
	To modify the p2m interfaces by adding an order parameter, such as guest_physmap_add_page(), p2m_set_entry(), etc.
	p2m-ept-file.patch
	           To handle the EPT tables to support super page.            
	 
	 
	Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
	Signed-off-by: Li Xin, B <xin.b.li@intel.com>
	 
	
	 
	
	
________________________________

	_______________________________________________
	Xen-devel mailing list
	Xen-devel@lists.xensource.com
	http://lists.xensource.com/xen-devel
	

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
  2008-05-12  7:03       ` Keir Fraser
@ 2008-05-12 17:28         ` Huang2, Wei
  2008-05-13  8:46           ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Huang2, Wei @ 2008-05-12 17:28 UTC (permalink / raw)
  To: Keir Fraser, Xin, Xiaohui, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 3984 bytes --]

Here is a revised version. I got rid of 4MB support, as suggested. I did
not see the issue mentioned by Xiaohui related to splitting the last 2M
into 4KB pages. But anyway, I have attached two versions for your
reference. Keir, please let me know if you have comments.
 
 
-Wei

________________________________

From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
Sent: Monday, May 12, 2008 2:03 AM
To: Xin, Xiaohui; Huang2, Wei; xen-devel@lists.xensource.com
Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with
EPT


Yes, absolutely no need for 4M page support. We do not support the
32-bit non-PAE build target any more.

 -- Keir

On 12/5/08 06:04, "Xin, Xiaohui" <xiaohui.xin@intel.com> wrote:



	Some comments here:
	1) Basically 4M pages allocations is not hardware naturally for
EPT, we only use 2M super pages now. 
	I remembered that Keir said that 2M pages allocation is
sufficient, and he removed all the pure 32bit support already.
	2)  If we don't allocate the last 2M area with 4kb pages, the
EPT will meet some problem. Xen will set one of the 4k page
	     there to be invalid, logically that means we should invalid
the all the 2M page if we allocate it with 2M, and then the 
	     special pages Xen used in the high end of the guest memory
can not be used then. May we know how you cope with that?
	 
	Thanks
	Xiaohui
	 
	

	
________________________________


	From: xen-devel-bounces@lists.xensource.com [
mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
	Sent: Monday, May 12, 2008 12:36 PM
	To: Xin, Xiaohui; xen-devel@lists.xensource.com
	Subject: RE: [Xen-devel][PATCH] patch to support super page (2M)
with EPT
	
	This is the latest one I created. Please review it and I will
re-submit.
	 
	1.      It includes the patch for p2m-ept.c, directly from your
previous patch. 
	
	2.      Xc_hvm_create.c is based on my original approach. It
includes support for both 2MB and 4MB pages. Also it considers the case
of odd page size (such as 255MB). But I did not allocate the last 2MB
area using 4KB  pages. Let me know if it is a big issue.
	
	3.      The rest are pretty similar.
	
	
	Thanks,
	 
	-Wei
	 
	
	From: xen-devel-bounces@lists.xensource.com [
mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
	Sent: Sunday, May 11, 2008 3:34 PM
	To: Xin, Xiaohui; xen-devel@lists.xensource.com
	Subject: RE: [Xen-devel][PATCH] patch to support super page (2M)
with EPT
	
	Could we work together for a common solution? As far as I can
see, it largely overlaps with my super page patch. The major difference
is between p2m.c and p2m-ept.c.
	 
	-Wei
	 
	
	From: xen-devel-bounces@lists.xensource.com [
mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Xin, Xiaohui
	Sent: Friday, May 09, 2008 4:11 AM
	To: xen-devel@lists.xensource.com
	Subject: [Xen-devel][PATCH] patch to support super page (2M)
with EPT
	
	Attached are the patches to support super page with EPT. We only
support 2M size. And shadow may still work fine with 4K pages.
	The patches can be split into 3 parts. Apply order is as
attached.
	 
	tool.diff 
	To allocate 2M physical contiguous memory in guest except the
first 2M and the last 2M.
	The first 2M covers special memory, and Xen use the last few
pages in guest memory to do special things.
	We let them to be 4K pages as normal.
	super_page_common.patch 
	To modify the p2m interfaces by adding an order parameter, such
as guest_physmap_add_page(), p2m_set_entry(), etc.
	p2m-ept-file.patch
	           To handle the EPT tables to support super page.

	 
	 
	Signed-off-by: Xin Xiaohui <xiaohui.xin@intel.com>
	Signed-off-by: Li Xin, B <xin.b.li@intel.com>
	 
	
	 
	
	
________________________________

	_______________________________________________
	Xen-devel mailing list
	Xen-devel@lists.xensource.com
	http://lists.xensource.com/xen-devel
	




[-- Attachment #1.2: Type: text/html, Size: 8934 bytes --]

[-- Attachment #2: super_page_patch_fix_last_2MB.txt --]
[-- Type: text/plain, Size: 44600 bytes --]

diff -r 810d8c3ac992 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c	Thu May 08 16:58:33 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c	Mon May 12 06:05:39 2008 -0500
@@ -157,8 +157,10 @@ static int setup_guest(int xc_handle,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
+    xen_pfn_t *super_page_array = NULL;
     unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
-    unsigned long special_page_nr, entry_eip, cur_pages;
+    unsigned long nr_super_pages;
+    unsigned long special_page_nr, entry_eip, cur_pages, limit;
     struct xen_add_to_physmap xatp;
     struct shared_info *shared_info;
     void *e820_page;
@@ -167,6 +169,7 @@ static int setup_guest(int xc_handle,
     uint64_t v_start, v_end;
     int rc;
     xen_capabilities_info_t caps;
+    int super_page_order;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
     if ( memsize < 2 )
@@ -189,6 +192,17 @@ static int setup_guest(int xc_handle,
         PERROR("Guest OS must load to a page boundary.\n");
         goto error_out;
     }
+
+    /* We only support 2MB super pages since 32bit non-PAE is not officially
+     * supported by Xen any more. Plus, we try to allocate the last memory
+     * area using 4KB pages. For this reason, if memsize is an even number, 
+     * we have to decrease nr_super_pages by 1. As for an odd memsize, this is
+     * enforced automatically (see below).
+     */
+    nr_super_pages = (unsigned long)memsize >> 1;
+    if ( (memsize % 2) == 0 )
+        nr_super_pages -= 1;
+    super_page_order = 9;
 
     IPRINTF("VIRTUAL MEMORY ARRANGEMENT:\n"
             "  Loader:        %016"PRIx64"->%016"PRIx64"\n"
@@ -198,7 +212,9 @@ static int setup_guest(int xc_handle,
             v_start, v_end,
             elf_uval(&elf, elf.ehdr, e_entry));
 
-    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL ||
+         (super_page_array = 
+          malloc(nr_super_pages * sizeof(xen_pfn_t))) == NULL )
     {
         PERROR("Could not allocate memory.\n");
         goto error_out;
@@ -206,26 +222,44 @@ static int setup_guest(int xc_handle,
 
     for ( i = 0; i < nr_pages; i++ )
         page_array[i] = i;
+    for ( i = 0; i < nr_super_pages; i++ )
+        super_page_array[i] = i << super_page_order;
     for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ )
         page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
-
-    /*
-     * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
-     * We allocate pages in batches of no more than 2048 to ensure that
-     * we can be preempted and hence dom0 remains responsive.
-     */
+    for ( i = HVM_BELOW_4G_RAM_END >> (PAGE_SHIFT + super_page_order); 
+          i < nr_super_pages; i++ )
+        super_page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+
+    /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */
     rc = xc_domain_memory_populate_physmap(
         xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]);
-    cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
-    {
-        unsigned long count = nr_pages - cur_pages;
-        if ( count > 2048 )
-            count = 2048;
+    if ( rc == 0 )
         rc = xc_domain_memory_populate_physmap(
-            xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
+            xc_handle, dom, 0x200-0xc0, 0, 0, &page_array[0xc0]);
+    
+    /* We allocate pages in batches of no more than 8MB to ensure that
+     * we can be preempted and hence dom0 remains responsive.
+     */
+    limit = 4;
+    cur_pages = 1;
+    while ( (rc == 0) && (nr_super_pages > cur_pages) )
+    {
+        unsigned long count = nr_super_pages - cur_pages;
+        if ( count > limit )
+            count = limit;
+        rc = xc_domain_memory_populate_physmap(xc_handle, dom, count, 
+                                               super_page_order, 0, 
+                                               &super_page_array[cur_pages]);
         cur_pages += count;
     }
+
+    /* handle the case of odd number physical memory size, such as 255MB */
+    if ( rc == 0 )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, nr_pages - (nr_super_pages << super_page_order), 
+            0, 0, &page_array[nr_super_pages << super_page_order]);
+
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
@@ -314,10 +348,12 @@ static int setup_guest(int xc_handle,
     }
 
     free(page_array);
+    free(super_page_array);
     return 0;
 
  error_out:
     free(page_array);
+    free(super_page_array);
     return -1;
 }
 
diff -r 810d8c3ac992 xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/ia64/xen/mm.c	Sun May 11 16:58:19 2008 -0500
@@ -2415,7 +2415,7 @@ steal_page(struct domain *d, struct page
 
 int
 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
-                       unsigned long mfn)
+                       unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(!mfn_valid(mfn));
     BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
@@ -2432,7 +2432,7 @@ guest_physmap_add_page(struct domain *d,
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(mfn == 0);//XXX
     zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
@@ -2838,7 +2838,7 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         if (prev_mfn && mfn_valid(prev_mfn)) {
             if (is_xen_heap_mfn(prev_mfn))
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -2847,10 +2847,10 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if (gpfn != INVALID_M2P_ENTRY)
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
     out:
         domain_unlock(d);
diff -r 810d8c3ac992 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm.c	Sun May 11 10:53:29 2008 -0500
@@ -3287,7 +3287,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
         {
             if ( is_xen_heap_mfn(prev_mfn) )
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn,
+                                          NORMAL_PAGE_ORDER);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -3296,10 +3297,10 @@ long arch_memory_op(int op, XEN_GUEST_HA
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if ( gpfn != INVALID_M2P_ENTRY )
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, NORMAL_PAGE_ORDER);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, NORMAL_PAGE_ORDER);
 
         domain_unlock(d);
 
diff -r 810d8c3ac992 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Sun May 11 16:45:36 2008 -0500
@@ -20,6 +20,7 @@
 #include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <asm/paging.h>
 #include <asm/types.h>
 #include <asm/domain.h>
 #include <asm/p2m.h>
@@ -46,6 +47,9 @@ static void ept_p2m_type_to_flags(ept_en
     }
 }
 
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+
 static int ept_next_level(struct domain *d, bool_t read_only,
                           ept_entry_t **table, unsigned long *gfn_remainder,
                           u32 shift)
@@ -54,7 +58,6 @@ static int ept_next_level(struct domain 
     u32 index;
 
     index = *gfn_remainder >> shift;
-    *gfn_remainder &= (1UL << shift) - 1;
 
     ept_entry = (*table) + index;
 
@@ -83,31 +86,53 @@ static int ept_next_level(struct domain 
         ept_entry->r = ept_entry->w = ept_entry->x = 1;
     }
 
-    next = map_domain_page(ept_entry->mfn);
-    unmap_domain_page(*table);
-    *table = next;
-
-    return 1;
+    if ( !ept_entry->sp_avail )
+    {
+        *gfn_remainder &= (1UL << shift) - 1;
+        next = map_domain_page(ept_entry->mfn);
+        unmap_domain_page(*table);
+        *table = next;
+        return GUEST_TABLE_NORMAL_PAGE;
+    }
+    else
+        return GUEST_TABLE_SUPER_PAGE;
 }
 
 static int
-ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    ept_entry_t *table =
-        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
-    unsigned long gfn_remainder = gfn;
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int order, p2m_type_t p2mt)
+{
+    ept_entry_t *table = NULL;
+    unsigned long gfn_remainder = gfn, offset = 0;
     ept_entry_t *ept_entry = NULL;
     u32 index;
-    int i, rv = 0;
+    int i, rv = 0, ret = 0;
+    int walk_level = order / EPT_TABLE_ORDER;
 
     /* Should check if gfn obeys GAW here */
 
-    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 0, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    if (  order != 0 )
+        if ( (gfn & ((1UL << order) - 1)) )
+            return 1;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+
+    ASSERT(table != NULL);
+
+    for ( i = EPT_DEFAULT_GAW; i > walk_level; i-- )
+    {
+        ret = ept_next_level(d, 0, &table, &gfn_remainder,
+          i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
+    walk_level = ( i ? ( i * EPT_TABLE_ORDER) : order) / EPT_TABLE_ORDER;
+    offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
     ept_entry = table + index;
 
     if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
@@ -117,9 +142,20 @@ ept_set_entry(struct domain *d, unsigned
             d->arch.p2m->max_mapped_pfn = gfn;
 
         ept_entry->emt = EPT_DEFAULT_MT;
-        ept_entry->sp_avail = 0;
+        ept_entry->sp_avail = walk_level ? 1 : 0;
+
+        if ( ret == GUEST_TABLE_SUPER_PAGE )
+        {
+            ept_entry->mfn = mfn_x(mfn) - offset;
+            if ( ept_entry->avail1 == p2m_ram_logdirty &&
+              p2mt == p2m_ram_rw )
+                for ( i = 0; i < 512; i++ )
+                    paging_mark_dirty(d, mfn_x(mfn)-offset+i);
+        }
+        else
+            ept_entry->mfn = mfn_x(mfn);
+
         ept_entry->avail1 = p2mt;
-        ept_entry->mfn = mfn_x(mfn);
         ept_entry->rsvd = 0;
         ept_entry->avail2 = 0;
         /* last step */
@@ -132,14 +168,42 @@ ept_set_entry(struct domain *d, unsigned
     /* Success */
     rv = 1;
 
- out:
+out:
     unmap_domain_page(table);
 
     ept_sync_domain(d);
 
+    /* Now the p2m table is not shared with vt-d page table */
+
+    if ( iommu_enabled && is_hvm_domain(d) )
+    {
+        if ( p2mt == p2m_ram_rw )
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_map_page(d, gfn-offset+i, mfn_x(mfn)-offset+i);
+            }
+            else if ( ret )
+                iommu_map_page(d, gfn, mfn_x(mfn));
+        }
+        else
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_unmap_page(d, gfn-offset+i);
+            }
+            else if ( ret )
+                iommu_unmap_page(d, gfn);
+        }
+    }
+
+#ifdef P2M_SHARE_WITH_VTD_PAGE_TABLE
     /* If p2m table is shared with vtd page-table. */
     if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
         iommu_flush(d, gfn, (u64*)ept_entry);
+#endif
 
     return rv;
 }
@@ -152,7 +216,7 @@ static mfn_t ept_get_entry(struct domain
     unsigned long gfn_remainder = gfn;
     ept_entry_t *ept_entry;
     u32 index;
-    int i;
+    int i, ret=0;
     mfn_t mfn = _mfn(INVALID_MFN);
 
     *t = p2m_mmio_dm;
@@ -164,17 +228,31 @@ static mfn_t ept_get_entry(struct domain
     /* Should check if gfn obeys GAW here. */
 
     for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 1, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    {
+        ret = ept_next_level(d, 1, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
     if ( ept_entry->avail1 != p2m_invalid )
     {
         *t = ept_entry->avail1;
         mfn = _mfn(ept_entry->mfn);
+        if ( i )
+        {
+            /* We may have hit a super page; add the 4K offset within it
+             * so the lookup behaves like a 4K-granular p2m table.
+             */
+            unsigned long split_mfn = 
+              mfn_x(mfn) + (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1 )));
+            mfn = _mfn(split_mfn);
+        }
     }
 
  out:
@@ -205,33 +283,63 @@ static void ept_change_entry_type_global
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
     for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
     {
-        if ( !l4e[i4].epte || l4e[i4].sp_avail )
+        if ( !l4e[i4].epte )
             continue;
-        l3e = map_domain_page(l4e[i4].mfn);
-        for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
-        {
-            if ( !l3e[i3].epte || l3e[i3].sp_avail )
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
                 continue;
-            l2e = map_domain_page(l3e[i3].mfn);
-            for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !l2e[i2].epte || l2e[i2].sp_avail )
-                    continue;
-                l1e = map_domain_page(l2e[i2].mfn);
-                for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
-                {
-                    if ( !l1e[i1].epte )
-                        continue;
-                    if ( l1e[i1].avail1 != ot )
-                        continue;
-                    l1e[i1].avail1 = nt;
-                    ept_p2m_type_to_flags(l1e+i1, nt);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
-        }
-        unmap_domain_page(l3e);
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
     }
     unmap_domain_page(l4e);
 
diff -r 810d8c3ac992 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm/p2m.c	Mon May 12 06:07:03 2008 -0500
@@ -151,9 +151,11 @@ p2m_next_level(struct domain *d, mfn_t *
                unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
                u32 max, unsigned long type)
 {
+    l1_pgentry_t *l1_entry;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
     void *next;
+    int i;
     ASSERT(d->arch.p2m->alloc_page);
 
     if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
@@ -194,6 +196,44 @@ p2m_next_level(struct domain *d, mfn_t *
             break;
         }
     }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+
+    /* split single large page into 4KB page in P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg = d->arch.p2m->alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
+        pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated;
+        pg->count_info = 1;
+        
+        /* New splintered mappings inherit the flags of the old superpage, 
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            paging_write_p2m_entry(d, gfn,
+                                   l1_entry+i, *table_mfn, new_entry, 1);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        paging_write_p2m_entry(d, gfn,
+                               p2m_entry, *table_mfn, new_entry, 2);
+    }
+
     *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
     next = map_domain_page(mfn_x(*table_mfn));
     unmap_domain_page(*table);
@@ -204,7 +244,8 @@ p2m_next_level(struct domain *d, mfn_t *
 
 // Returns 0 on error (out of memory)
 static int
-p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int page_order, p2m_type_t p2mt)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -212,6 +253,7 @@ p2m_set_entry(struct domain *d, unsigned
     unsigned long gfn_remainder = gfn;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
     int rv=0;
 
 #if CONFIG_PAGING_LEVELS >= 4
@@ -235,26 +277,53 @@ p2m_set_entry(struct domain *d, unsigned
                          PGT_l2_page_table) )
         goto out;
 
-    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
-                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
-        goto out;
-
-    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                               0, L1_PAGETABLE_ENTRIES);
-    ASSERT(p2m_entry);
+    if ( page_order == NORMAL_PAGE_ORDER )
+    {
+        if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
+            entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
+        else
+            entry_content = l1e_empty();
+        
+        /* level 1 entry */
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
+    }
+    else 
+    {
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            P2M_ERROR("configure P2M table 4KB L2 entry with large page\n");
+            domain_crash(d);
+            goto out;
+        }
+        
+        if ( mfn_valid(mfn) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                       p2m_type_to_flags(p2mt) | _PAGE_PSE);
+        else
+            l2e_content = l2e_empty();
+        
+        entry_content.l1 = l2e_content.l2;
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2);
+    }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
     if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
         d->arch.p2m->max_mapped_pfn = gfn;
-
-    if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-        entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
-    else
-        entry_content = l1e_empty();
-
-    /* level 1 entry */
-    paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
 
     if ( iommu_enabled && is_hvm_domain(d) )
     {
@@ -335,6 +404,16 @@ p2m_gfn_to_mfn(struct domain *d, unsigne
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
+    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    {
+        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        unmap_domain_page(l2e);
+        
+        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+    }
+
     mfn = _mfn(l2e_get_pfn(*l2e));
     unmap_domain_page(l2e);
 
@@ -358,6 +437,7 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
 {
     mfn_t mfn = _mfn(INVALID_MFN);
     p2m_type_t p2mt = p2m_mmio_dm;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     /* XXX This is for compatibility with the old model, where anything not 
      * XXX marked as RAM was considered to be emulated MMIO space.
      * XXX Once we start explicitly registering MMIO regions in the p2m 
@@ -366,25 +446,44 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
     if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
     {
         l1_pgentry_t l1e = l1e_empty();
+        l2_pgentry_t l2e = l2e_empty();
         int ret;
 
         ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
                / sizeof(l1_pgentry_t));
 
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-        ret = __copy_from_user(&l1e,
-                               &phys_to_machine_mapping[gfn],
-                               sizeof(l1e));
-
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+        ret = __copy_from_user(&l2e,
+                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)],
+                               sizeof(l2e));
+        
+        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
+             (l2e_get_flags(l2e) & _PAGE_PSE) ) 
+        {
+            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
+            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
             if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
+                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+            else
                 p2mt = p2m_mmio_dm;
+        }
+        else
+        {
+        
+            /* Need to __copy_from_user because the p2m is sparse and this
+             * part might not exist */
+            ret = __copy_from_user(&l1e,
+                                   &phys_to_machine_mapping[gfn],
+                                   sizeof(l1e));
+            
+            if ( ret == 0 ) {
+                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+                if ( p2m_is_valid(p2mt) )
+                    mfn = _mfn(l1e_get_pfn(l1e));
+                else 
+                    /* XXX see above */
+                    p2mt = p2m_mmio_dm;
+            }
         }
     }
 
@@ -430,9 +529,10 @@ void p2m_change_entry_type_global(struct
 }
 
 static inline
-int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+                  unsigned int page_order, p2m_type_t p2mt)
+{
+    return d->arch.p2m->set_entry(d, gfn, mfn, page_order, p2mt);
 }
 
 // Allocate a new p2m table for a domain.
@@ -493,7 +593,8 @@ int p2m_alloc_table(struct domain *d,
     P2M_PRINTK("populating p2m table\n");
 
     /* Initialise physmap tables for slot zero. Other code assumes this. */
-    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), p2m_invalid) )
+    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), NORMAL_PAGE_ORDER,
+                        p2m_invalid) )
         goto error;
 
     /* Copy all existing mappings from the page list and m2p */
@@ -512,7 +613,7 @@ int p2m_alloc_table(struct domain *d,
             (gfn != 0x55555555L)
 #endif
              && gfn != INVALID_M2P_ENTRY
-            && !set_p2m_entry(d, gfn, mfn, p2m_ram_rw) )
+            && !set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, p2m_ram_rw) )
             goto error;
     }
 
@@ -688,6 +789,28 @@ static void audit_p2m(struct domain *d)
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
+                    
+                    /* check for super page */
+                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+                    {
+                        mfn = l2e_get_pfn(l2e[i2]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+                        {
+                            m2pfn = get_gpfn_from_mfn(mfn+i1);
+                            if ( m2pfn != (gfn + i) )
+                            {
+                                pmbad++;
+                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn+i, mfn+i,
+                                           m2pfn);
+                                BUG();
+                            }
+                        }
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+
                     l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
 
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
@@ -737,35 +860,40 @@ static void audit_p2m(struct domain *d)
 
 
 static void
-p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
-{
+p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn,
+                unsigned int page_order)
+{
+    int i;
     if ( !paging_mode_translate(d) )
         return;
     P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
 
-    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, p2m_invalid);
+    for ( i = 0; i < (1UL << page_order); i++ )
+        set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
 }
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     p2m_lock(d->arch.p2m);
     audit_p2m(d);
-    p2m_remove_page(d, gfn, mfn);
+    p2m_remove_page(d, gfn, mfn, page_order);
     audit_p2m(d);
     p2m_unlock(d->arch.p2m);
 }
 
 int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                        unsigned long mfn, p2m_type_t t)
+                        unsigned long mfn, unsigned int page_order, 
+                        p2m_type_t t)
 {
     unsigned long ogfn;
     p2m_type_t ot;
     mfn_t omfn;
     int rc = 0;
+    int i;
 
     if ( !paging_mode_translate(d) )
         return -EINVAL;
@@ -795,7 +923,8 @@ guest_physmap_add_entry(struct domain *d
     if ( p2m_is_ram(ot) )
     {
         ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
     }
 
     ogfn = mfn_to_gfn(d, _mfn(mfn));
@@ -818,21 +947,23 @@ guest_physmap_add_entry(struct domain *d
             P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                       ogfn , mfn_x(omfn));
             if ( mfn_x(omfn) == mfn )
-                p2m_remove_page(d, ogfn, mfn);
+                p2m_remove_page(d, ogfn, mfn, page_order);
         }
     }
 
     if ( mfn_valid(_mfn(mfn)) ) 
     {
-        if ( !set_p2m_entry(d, gfn, _mfn(mfn), t) )
+        if ( !set_p2m_entry(d, gfn, _mfn(mfn), page_order, t) )
             rc = -EINVAL;
-        set_gpfn_from_mfn(mfn, gfn);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn+i, gfn+i);
     }
     else
     {
         gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n",
                  gfn, mfn);
-        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid) )
+        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, 
+                            p2m_invalid) )
             rc = -EINVAL;
     }
 
@@ -851,7 +982,7 @@ void p2m_change_type_global(struct domai
     l1_pgentry_t l1e_content;
     l1_pgentry_t *l1e;
     l2_pgentry_t *l2e;
-    mfn_t l1mfn;
+    mfn_t l1mfn, l2mfn;
     int i1, i2;
     l3_pgentry_t *l3e;
     int i3;
@@ -891,11 +1022,26 @@ void p2m_change_type_global(struct domai
             {
                 continue;
             }
+            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
             l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
             for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
             {
                 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
                 {
+                    continue;
+                }
+
+                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+                {
+                    flags = l2e_get_flags(l2e[i2]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l2e_get_pfn(l2e[i2]);
+                    gfn = get_gpfn_from_mfn(mfn);
+                    flags = p2m_flags_to_type(nt);
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2],
+                                           l2mfn, l1e_content, 2);
                     continue;
                 }
 
@@ -944,7 +1090,7 @@ p2m_type_t p2m_change_type(struct domain
 
     mfn = gfn_to_mfn(d, gfn, &pt);
     if ( pt == ot )
-        set_p2m_entry(d, gfn, mfn, nt);
+        set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, nt);
 
     p2m_unlock(d->arch.p2m);
 
@@ -968,7 +1114,7 @@ set_mmio_p2m_entry(struct domain *d, uns
         set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
     }
 
-    rc = set_p2m_entry(d, gfn, mfn, p2m_mmio_direct);
+    rc = set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, p2m_mmio_direct);
     if ( 0 == rc )
         gdprintk(XENLOG_ERR,
             "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
@@ -992,7 +1138,7 @@ clear_mmio_p2m_entry(struct domain *d, u
             "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn);
         return 0;
     }
-    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0);
+    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), NORMAL_PAGE_ORDER, 0);
 
     return rc;
 }
diff -r 810d8c3ac992 xen/common/grant_table.c
--- a/xen/common/grant_table.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/common/grant_table.c	Sun May 11 11:00:12 2008 -0500
@@ -1159,7 +1159,7 @@ gnttab_transfer(
         spin_lock(&e->grant_table->lock);
 
         sha = &shared_entry(e->grant_table, gop.ref);
-        guest_physmap_add_page(e, sha->frame, mfn);
+        guest_physmap_add_page(e, sha->frame, mfn, NORMAL_PAGE_ORDER);
         sha->frame = mfn;
         wmb();
         sha->flags |= GTF_transfer_completed;
diff -r 810d8c3ac992 xen/common/memory.c
--- a/xen/common/memory.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/common/memory.c	Sun May 11 11:03:13 2008 -0500
@@ -114,34 +114,60 @@ static void populate_physmap(struct memo
 
         page = alloc_domheap_pages(
             d, a->extent_order, a->memflags | MEMF_node(node));
-        if ( unlikely(page == NULL) ) 
-        {
-            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
-                     "id=%d memflags=%x (%ld of %d)\n",
-                     a->extent_order, d->domain_id, a->memflags,
-                     i, a->nr_extents);
-            goto out;
-        }
-
-        mfn = page_to_mfn(page);
-
-        if ( unlikely(paging_mode_translate(d)) )
-        {
+
+        if ( unlikely(page == NULL) )
+        {
+            /* fail if it is not under translate mode */
+            if ( !paging_mode_translate(d) )
+            {
+                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                         "id=%d memflags=%x (%ld of %d)\n",
+                         a->extent_order, d->domain_id, a->memflags,
+                         i, a->nr_extents);
+                goto out;
+            }
+            
+            /* try to allocate using 4KB page instead */
             for ( j = 0; j < (1 << a->extent_order); j++ )
-                if ( guest_physmap_add_page(d, gpfn + j, mfn + j) )
+            {
+                page = alloc_domheap_pages(d, 0, 
+                                           a->memflags | MEMF_node(node));
+                if ( page == NULL )
+                {
+                    gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                             "id=%d memflags=%x (%ld of %d)\n",
+                             0, d->domain_id, a->memflags, i, a->nr_extents);
                     goto out;
-        }
-        else
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                set_gpfn_from_mfn(mfn + j, gpfn + j);
-
-            /* Inform the domain of the new page's machine address. */ 
-            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
-                goto out;
-        }
-    }
-
+                }
+                
+                mfn = page_to_mfn(page);
+                
+                if ( guest_physmap_add_page(d, gpfn+j, mfn, 
+                                            NORMAL_PAGE_ORDER) )
+                    goto out;
+            }
+        }
+        else /* successful in allocating page of extent_order */
+        {
+            mfn = page_to_mfn(page);
+            
+            if ( unlikely(paging_mode_translate(d)) )
+            {
+                if ( guest_physmap_add_page(d, gpfn, mfn, a->extent_order) )
+                    goto out;
+            }
+            else
+            {
+                for ( j = 0; j < (1 << a->extent_order); j++ )
+                    set_gpfn_from_mfn(mfn + j, gpfn + j);
+                
+                /* Inform the domain of the new page's machine address. */ 
+                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 
+                                                     1)) )
+                    goto out;
+            }
+        }
+    }
  out:
     a->nr_done = i;
 }
@@ -172,7 +198,7 @@ int guest_remove_page(struct domain *d, 
     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
         put_page(page);
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    guest_physmap_remove_page(d, gmfn, mfn, NORMAL_PAGE_ORDER);
 
     put_page(page);
 
@@ -419,7 +445,8 @@ static long memory_exchange(XEN_GUEST_HA
             if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 BUG();
             mfn = page_to_mfn(page);
-            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn);
+            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, 
+                                      NORMAL_PAGE_ORDER);
             put_page(page);
         }
 
@@ -441,8 +468,8 @@ static long memory_exchange(XEN_GUEST_HA
             if ( unlikely(paging_mode_translate(d)) )
             {
                 /* Ignore failure here. There's nothing we can do. */
-                for ( k = 0; k < (1UL << exch.out.extent_order); k++ )
-                    (void)guest_physmap_add_page(d, gpfn + k, mfn + k);
+                (void)guest_physmap_add_page(d, gpfn, mfn, 
+                                             exch.out.extent_order);
             }
             else
             {
diff -r 810d8c3ac992 xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-ia64/shadow.h	Sun May 11 10:57:28 2008 -0500
@@ -40,8 +40,10 @@
  * Utilities to change relationship of gpfn->mfn for designated domain,
  * which is required by gnttab transfer, balloon, device model and etc.
  */
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
-void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, 
+                           unsigned long mfn, unsigned int page_order);
+void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, 
+                               unsigned long mfn, unsigned int page_order);
 
 static inline int
 shadow_mode_enabled(struct domain *d)
diff -r 810d8c3ac992 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-x86/mm.h	Sun May 11 10:43:50 2008 -0500
@@ -124,6 +124,14 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* Page allocation orders for normal pages and contiguous super pages */
+#define NORMAL_PAGE_ORDER 0  /* 4KB page */
+#if CONFIG_PAGING_LEVELS == 2
+#define SUPER_PAGE_ORDER  10 /* 4MB page */
+#else
+#define SUPER_PAGE_ORDER  9  /* 2MB page */
+#endif
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
diff -r 810d8c3ac992 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-x86/p2m.h	Sun May 11 10:57:28 2008 -0500
@@ -102,7 +102,8 @@ struct p2m_domain {
     void               (*free_page   )(struct domain *d,
                                        struct page_info *pg);
     int                (*set_entry   )(struct domain *d, unsigned long gfn,
-                                       mfn_t mfn, p2m_type_t p2mt);
+                                       mfn_t mfn, unsigned int page_order,
+                                       p2m_type_t p2mt);
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
                                        p2m_type_t *p2mt);
     mfn_t              (*get_entry_current)(unsigned long gfn,
@@ -203,21 +204,23 @@ void p2m_final_teardown(struct domain *d
 
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                             unsigned long mfn, p2m_type_t t);
+                            unsigned long mfn, unsigned int page_order, 
+                            p2m_type_t t);
 
 /* Untyped version for RAM only, for compatibility 
  *
  * Return 0 for success
  */
 static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn,
-                                         unsigned long mfn)
-{
-    return guest_physmap_add_entry(d, gfn, mfn, p2m_ram_rw);
+                                         unsigned long mfn,
+                                         unsigned int page_order)
+{
+    return guest_physmap_add_entry(d, gfn, mfn, page_order, p2m_ram_rw);
 }
 
 /* Remove a page from a domain's p2m table */
 void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                               unsigned long mfn);
+                               unsigned long mfn, unsigned int page_order);
 
 /* Change types across all p2m entries in a domain */
 void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt);
diff -r 810d8c3ac992 xen/include/xen/paging.h
--- a/xen/include/xen/paging.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/xen/paging.h	Sun May 11 16:38:05 2008 -0500
@@ -18,8 +18,8 @@
 #else
 
 #define paging_mode_translate(d)              (0)
-#define guest_physmap_add_page(d, p, m)       (0)
-#define guest_physmap_remove_page(d, p, m)    ((void)0)
+#define guest_physmap_add_page(d, p, m, o)       (0)
+#define guest_physmap_remove_page(d, p, m, o)    ((void)0)
 
 #endif
 

[-- Attachment #3: super_page_patch.txt --]
[-- Type: text/plain, Size: 44468 bytes --]

diff -r 810d8c3ac992 -r 583dca746efb tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c	Thu May 08 16:58:33 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c	Mon May 12 04:06:34 2008 -0500
@@ -157,8 +157,10 @@ static int setup_guest(int xc_handle,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
+    xen_pfn_t *super_page_array = NULL;
     unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
-    unsigned long special_page_nr, entry_eip, cur_pages;
+    unsigned long nr_super_pages;
+    unsigned long special_page_nr, entry_eip, cur_pages, limit;
     struct xen_add_to_physmap xatp;
     struct shared_info *shared_info;
     void *e820_page;
@@ -167,6 +169,7 @@ static int setup_guest(int xc_handle,
     uint64_t v_start, v_end;
     int rc;
     xen_capabilities_info_t caps;
+    int super_page_order;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
     if ( memsize < 2 )
@@ -189,6 +192,12 @@ static int setup_guest(int xc_handle,
         PERROR("Guest OS must load to a page boundary.\n");
         goto error_out;
     }
+
+    /* We only support 2MB super pages since 32bit non-PAE is not officially
+     * supported by Xen any more.
+     */
+    nr_super_pages = (unsigned long)memsize >> 1;
+    super_page_order = 9;
 
     IPRINTF("VIRTUAL MEMORY ARRANGEMENT:\n"
             "  Loader:        %016"PRIx64"->%016"PRIx64"\n"
@@ -198,7 +207,9 @@ static int setup_guest(int xc_handle,
             v_start, v_end,
             elf_uval(&elf, elf.ehdr, e_entry));
 
-    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL ||
+         (super_page_array = 
+          malloc(nr_super_pages * sizeof(xen_pfn_t))) == NULL )
     {
         PERROR("Could not allocate memory.\n");
         goto error_out;
@@ -206,26 +217,44 @@ static int setup_guest(int xc_handle,
 
     for ( i = 0; i < nr_pages; i++ )
         page_array[i] = i;
+    for ( i = 0; i < nr_super_pages; i++ )
+        super_page_array[i] = i << super_page_order;
     for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ )
         page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
-
-    /*
-     * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
-     * We allocate pages in batches of no more than 2048 to ensure that
-     * we can be preempted and hence dom0 remains responsive.
-     */
+    for ( i = HVM_BELOW_4G_RAM_END >> (PAGE_SHIFT + super_page_order); 
+          i < nr_super_pages; i++ )
+        super_page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+
+
+    /* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000. */
     rc = xc_domain_memory_populate_physmap(
         xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]);
-    cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
-    {
-        unsigned long count = nr_pages - cur_pages;
-        if ( count > 2048 )
-            count = 2048;
+    if ( rc == 0 )
         rc = xc_domain_memory_populate_physmap(
-            xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
+            xc_handle, dom, 0x200-0xc0, 0, 0, &page_array[0xc0]);
+    
+    /* We allocate pages in batches of no more than 8MB to ensure that
+     * we can be preempted and hence dom0 remains responsive.
+     */
+    limit = 4;
+    cur_pages = 1;
+    while ( (rc == 0) && (nr_super_pages > cur_pages) )
+    {
+        unsigned long count = nr_super_pages - cur_pages;
+        if ( count > limit )
+            count = limit;
+        rc = xc_domain_memory_populate_physmap(xc_handle, dom, count, 
+                                               super_page_order, 0, 
+                                               &super_page_array[cur_pages]);
         cur_pages += count;
     }
+
+    /* Allocate any trailing odd megabyte (e.g. a 255MB guest) as 4KB pages. */
+    if ( rc == 0 )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, nr_pages - (nr_super_pages << super_page_order), 
+            0, 0, &page_array[nr_super_pages << super_page_order]);
+
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
@@ -314,10 +343,12 @@ static int setup_guest(int xc_handle,
     }
 
     free(page_array);
+    free(super_page_array);
     return 0;
 
  error_out:
     free(page_array);
+    free(super_page_array);
     return -1;
 }
 
diff -r 810d8c3ac992 -r 583dca746efb xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/ia64/xen/mm.c	Mon May 12 04:06:34 2008 -0500
@@ -2415,7 +2415,7 @@ steal_page(struct domain *d, struct page
 
 int
 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
-                       unsigned long mfn)
+                       unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(!mfn_valid(mfn));
     BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
@@ -2432,7 +2432,7 @@ guest_physmap_add_page(struct domain *d,
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(mfn == 0);//XXX
     zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
@@ -2838,7 +2838,7 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         if (prev_mfn && mfn_valid(prev_mfn)) {
             if (is_xen_heap_mfn(prev_mfn))
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -2847,10 +2847,10 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if (gpfn != INVALID_M2P_ENTRY)
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
     out:
         domain_unlock(d);
diff -r 810d8c3ac992 -r 583dca746efb xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm.c	Mon May 12 04:06:34 2008 -0500
@@ -3287,7 +3287,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
         {
             if ( is_xen_heap_mfn(prev_mfn) )
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn,
+                                          NORMAL_PAGE_ORDER);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -3296,10 +3297,10 @@ long arch_memory_op(int op, XEN_GUEST_HA
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if ( gpfn != INVALID_M2P_ENTRY )
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, NORMAL_PAGE_ORDER);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, NORMAL_PAGE_ORDER);
 
         domain_unlock(d);
 
diff -r 810d8c3ac992 -r 583dca746efb xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Mon May 12 04:06:34 2008 -0500
@@ -20,6 +20,7 @@
 #include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <asm/paging.h>
 #include <asm/types.h>
 #include <asm/domain.h>
 #include <asm/p2m.h>
@@ -46,6 +47,9 @@ static void ept_p2m_type_to_flags(ept_en
     }
 }
 
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+
 static int ept_next_level(struct domain *d, bool_t read_only,
                           ept_entry_t **table, unsigned long *gfn_remainder,
                           u32 shift)
@@ -54,7 +58,6 @@ static int ept_next_level(struct domain 
     u32 index;
 
     index = *gfn_remainder >> shift;
-    *gfn_remainder &= (1UL << shift) - 1;
 
     ept_entry = (*table) + index;
 
@@ -83,31 +86,53 @@ static int ept_next_level(struct domain 
         ept_entry->r = ept_entry->w = ept_entry->x = 1;
     }
 
-    next = map_domain_page(ept_entry->mfn);
-    unmap_domain_page(*table);
-    *table = next;
-
-    return 1;
+    if ( !ept_entry->sp_avail )
+    {
+        *gfn_remainder &= (1UL << shift) - 1;
+        next = map_domain_page(ept_entry->mfn);
+        unmap_domain_page(*table);
+        *table = next;
+        return GUEST_TABLE_NORMAL_PAGE;
+    }
+    else
+        return GUEST_TABLE_SUPER_PAGE;
 }
 
 static int
-ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    ept_entry_t *table =
-        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
-    unsigned long gfn_remainder = gfn;
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int order, p2m_type_t p2mt)
+{
+    ept_entry_t *table = NULL;
+    unsigned long gfn_remainder = gfn, offset = 0;
     ept_entry_t *ept_entry = NULL;
     u32 index;
-    int i, rv = 0;
+    int i, rv = 0, ret = 0;
+    int walk_level = order / EPT_TABLE_ORDER;
 
     /* Should check if gfn obeys GAW here */
 
-    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 0, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    if (  order != 0 )
+        if ( (gfn & ((1UL << order) - 1)) )
+            return 1;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+
+    ASSERT(table != NULL);
+
+    for ( i = EPT_DEFAULT_GAW; i > walk_level; i-- )
+    {
+        ret = ept_next_level(d, 0, &table, &gfn_remainder,
+          i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
+    walk_level = ( i ? ( i * EPT_TABLE_ORDER) : order) / EPT_TABLE_ORDER;
+    offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
     ept_entry = table + index;
 
     if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
@@ -117,9 +142,20 @@ ept_set_entry(struct domain *d, unsigned
             d->arch.p2m->max_mapped_pfn = gfn;
 
         ept_entry->emt = EPT_DEFAULT_MT;
-        ept_entry->sp_avail = 0;
+        ept_entry->sp_avail = walk_level ? 1 : 0;
+
+        if ( ret == GUEST_TABLE_SUPER_PAGE )
+        {
+            ept_entry->mfn = mfn_x(mfn) - offset;
+            if ( ept_entry->avail1 == p2m_ram_logdirty &&
+              p2mt == p2m_ram_rw )
+                for ( i = 0; i < 512; i++ )
+                    paging_mark_dirty(d, mfn_x(mfn)-offset+i);
+        }
+        else
+            ept_entry->mfn = mfn_x(mfn);
+
         ept_entry->avail1 = p2mt;
-        ept_entry->mfn = mfn_x(mfn);
         ept_entry->rsvd = 0;
         ept_entry->avail2 = 0;
         /* last step */
@@ -132,14 +168,42 @@ ept_set_entry(struct domain *d, unsigned
     /* Success */
     rv = 1;
 
- out:
+out:
     unmap_domain_page(table);
 
     ept_sync_domain(d);
 
+    /* Now the p2m table is not shared with vt-d page table */
+
+    if ( iommu_enabled && is_hvm_domain(d) )
+    {
+        if ( p2mt == p2m_ram_rw )
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_map_page(d, gfn-offset+i, mfn_x(mfn)-offset+i);
+            }
+            else if ( ret )
+                iommu_map_page(d, gfn, mfn_x(mfn));
+        }
+        else
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_unmap_page(d, gfn-offset+i);
+            }
+            else if ( ret )
+                iommu_unmap_page(d, gfn);
+        }
+    }
+
+#ifdef P2M_SHARE_WITH_VTD_PAGE_TABLE
     /* If p2m table is shared with vtd page-table. */
     if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
         iommu_flush(d, gfn, (u64*)ept_entry);
+#endif
 
     return rv;
 }
@@ -152,7 +216,7 @@ static mfn_t ept_get_entry(struct domain
     unsigned long gfn_remainder = gfn;
     ept_entry_t *ept_entry;
     u32 index;
-    int i;
+    int i, ret=0;
     mfn_t mfn = _mfn(INVALID_MFN);
 
     *t = p2m_mmio_dm;
@@ -164,17 +228,31 @@ static mfn_t ept_get_entry(struct domain
     /* Should check if gfn obeys GAW here. */
 
     for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 1, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    {
+        ret = ept_next_level(d, 1, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
     if ( ept_entry->avail1 != p2m_invalid )
     {
         *t = ept_entry->avail1;
         mfn = _mfn(ept_entry->mfn);
+        if ( i )
+        {
+            /* The walk stopped at a super page entry: add the 4K offset
+             * within it so the caller gets the mfn of the requested page.
+             */
+            unsigned long split_mfn = 
+              mfn_x(mfn) + (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1 )));
+            mfn = _mfn(split_mfn);
+        }
     }
 
  out:
@@ -205,33 +283,63 @@ static void ept_change_entry_type_global
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
     for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
     {
-        if ( !l4e[i4].epte || l4e[i4].sp_avail )
+        if ( !l4e[i4].epte )
             continue;
-        l3e = map_domain_page(l4e[i4].mfn);
-        for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
-        {
-            if ( !l3e[i3].epte || l3e[i3].sp_avail )
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
                 continue;
-            l2e = map_domain_page(l3e[i3].mfn);
-            for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !l2e[i2].epte || l2e[i2].sp_avail )
-                    continue;
-                l1e = map_domain_page(l2e[i2].mfn);
-                for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
-                {
-                    if ( !l1e[i1].epte )
-                        continue;
-                    if ( l1e[i1].avail1 != ot )
-                        continue;
-                    l1e[i1].avail1 = nt;
-                    ept_p2m_type_to_flags(l1e+i1, nt);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
-        }
-        unmap_domain_page(l3e);
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
     }
     unmap_domain_page(l4e);
 
diff -r 810d8c3ac992 -r 583dca746efb xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm/p2m.c	Mon May 12 04:06:34 2008 -0500
@@ -151,9 +151,11 @@ p2m_next_level(struct domain *d, mfn_t *
                unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
                u32 max, unsigned long type)
 {
+    l1_pgentry_t *l1_entry;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
     void *next;
+    int i;
     ASSERT(d->arch.p2m->alloc_page);
 
     if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
@@ -194,6 +196,44 @@ p2m_next_level(struct domain *d, mfn_t *
             break;
         }
     }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+
+    /* split single large page into 4KB page in P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg = d->arch.p2m->alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
+        pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated;
+        pg->count_info = 1;
+        
+        /* New splintered mappings inherit the flags of the old superpage, 
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            paging_write_p2m_entry(d, gfn,
+                                   l1_entry+i, *table_mfn, new_entry, 1);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        paging_write_p2m_entry(d, gfn,
+                               p2m_entry, *table_mfn, new_entry, 2);
+    }
+
     *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
     next = map_domain_page(mfn_x(*table_mfn));
     unmap_domain_page(*table);
@@ -204,7 +244,8 @@ p2m_next_level(struct domain *d, mfn_t *
 
 // Returns 0 on error (out of memory)
 static int
-p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int page_order, p2m_type_t p2mt)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -212,6 +253,7 @@ p2m_set_entry(struct domain *d, unsigned
     unsigned long gfn_remainder = gfn;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
     int rv=0;
 
 #if CONFIG_PAGING_LEVELS >= 4
@@ -235,26 +277,53 @@ p2m_set_entry(struct domain *d, unsigned
                          PGT_l2_page_table) )
         goto out;
 
-    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
-                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
-        goto out;
-
-    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                               0, L1_PAGETABLE_ENTRIES);
-    ASSERT(p2m_entry);
+    if ( page_order == NORMAL_PAGE_ORDER )
+    {
+        if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
+            entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
+        else
+            entry_content = l1e_empty();
+        
+        /* level 1 entry */
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
+    }
+    else 
+    {
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            P2M_ERROR("configure P2M table 4KB L2 entry with large page\n");
+            domain_crash(d);
+            goto out;
+        }
+        
+        if ( mfn_valid(mfn) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                       p2m_type_to_flags(p2mt) | _PAGE_PSE);
+        else
+            l2e_content = l2e_empty();
+        
+        entry_content.l1 = l2e_content.l2;
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2);
+    }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
     if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
         d->arch.p2m->max_mapped_pfn = gfn;
-
-    if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-        entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
-    else
-        entry_content = l1e_empty();
-
-    /* level 1 entry */
-    paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
 
     if ( iommu_enabled && is_hvm_domain(d) )
     {
@@ -335,6 +404,16 @@ p2m_gfn_to_mfn(struct domain *d, unsigne
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
+    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    {
+        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        unmap_domain_page(l2e);
+        
+        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+    }
+
     mfn = _mfn(l2e_get_pfn(*l2e));
     unmap_domain_page(l2e);
 
@@ -358,6 +437,7 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
 {
     mfn_t mfn = _mfn(INVALID_MFN);
     p2m_type_t p2mt = p2m_mmio_dm;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     /* XXX This is for compatibility with the old model, where anything not 
      * XXX marked as RAM was considered to be emulated MMIO space.
      * XXX Once we start explicitly registering MMIO regions in the p2m 
@@ -366,25 +446,44 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
     if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
     {
         l1_pgentry_t l1e = l1e_empty();
+        l2_pgentry_t l2e = l2e_empty();
         int ret;
 
         ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
                / sizeof(l1_pgentry_t));
 
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-        ret = __copy_from_user(&l1e,
-                               &phys_to_machine_mapping[gfn],
-                               sizeof(l1e));
-
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+        ret = __copy_from_user(&l2e,
+                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)],
+                               sizeof(l2e));
+        
+        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
+             (l2e_get_flags(l2e) & _PAGE_PSE) ) 
+        {
+            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
+            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
             if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
+                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+            else
                 p2mt = p2m_mmio_dm;
+        }
+        else
+        {
+        
+            /* Need to __copy_from_user because the p2m is sparse and this
+             * part might not exist */
+            ret = __copy_from_user(&l1e,
+                                   &phys_to_machine_mapping[gfn],
+                                   sizeof(l1e));
+            
+            if ( ret == 0 ) {
+                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+                if ( p2m_is_valid(p2mt) )
+                    mfn = _mfn(l1e_get_pfn(l1e));
+                else 
+                    /* XXX see above */
+                    p2mt = p2m_mmio_dm;
+            }
         }
     }
 
@@ -430,9 +529,10 @@ void p2m_change_entry_type_global(struct
 }
 
 static inline
-int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+                  unsigned int page_order, p2m_type_t p2mt)
+{
+    return d->arch.p2m->set_entry(d, gfn, mfn, page_order, p2mt);
 }
 
 // Allocate a new p2m table for a domain.
@@ -493,7 +593,8 @@ int p2m_alloc_table(struct domain *d,
     P2M_PRINTK("populating p2m table\n");
 
     /* Initialise physmap tables for slot zero. Other code assumes this. */
-    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), p2m_invalid) )
+    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), NORMAL_PAGE_ORDER,
+                        p2m_invalid) )
         goto error;
 
     /* Copy all existing mappings from the page list and m2p */
@@ -512,7 +613,7 @@ int p2m_alloc_table(struct domain *d,
             (gfn != 0x55555555L)
 #endif
              && gfn != INVALID_M2P_ENTRY
-            && !set_p2m_entry(d, gfn, mfn, p2m_ram_rw) )
+            && !set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, p2m_ram_rw) )
             goto error;
     }
 
@@ -688,6 +789,28 @@ static void audit_p2m(struct domain *d)
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
+                    
+                    /* check for super page */
+                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+                    {
+                        mfn = l2e_get_pfn(l2e[i2]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+                        {
+                            m2pfn = get_gpfn_from_mfn(mfn+i1);
+                            if ( m2pfn != (gfn + i) )
+                            {
+                                pmbad++;
+                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn+i, mfn+i,
+                                           m2pfn);
+                                BUG();
+                            }
+                        }
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+
                     l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
 
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
@@ -737,35 +860,40 @@ static void audit_p2m(struct domain *d)
 
 
 static void
-p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
-{
+p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn,
+                unsigned int page_order)
+{
+    int i;
     if ( !paging_mode_translate(d) )
         return;
     P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
 
-    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, p2m_invalid);
+    for ( i = 0; i < (1UL << page_order); i++ )
+        set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
 }
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     p2m_lock(d->arch.p2m);
     audit_p2m(d);
-    p2m_remove_page(d, gfn, mfn);
+    p2m_remove_page(d, gfn, mfn, page_order);
     audit_p2m(d);
     p2m_unlock(d->arch.p2m);
 }
 
 int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                        unsigned long mfn, p2m_type_t t)
+                        unsigned long mfn, unsigned int page_order, 
+                        p2m_type_t t)
 {
     unsigned long ogfn;
     p2m_type_t ot;
     mfn_t omfn;
     int rc = 0;
+    int i;
 
     if ( !paging_mode_translate(d) )
         return -EINVAL;
@@ -795,7 +923,8 @@ guest_physmap_add_entry(struct domain *d
     if ( p2m_is_ram(ot) )
     {
         ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
     }
 
     ogfn = mfn_to_gfn(d, _mfn(mfn));
@@ -818,21 +947,23 @@ guest_physmap_add_entry(struct domain *d
             P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                       ogfn , mfn_x(omfn));
             if ( mfn_x(omfn) == mfn )
-                p2m_remove_page(d, ogfn, mfn);
+                p2m_remove_page(d, ogfn, mfn, page_order);
         }
     }
 
     if ( mfn_valid(_mfn(mfn)) ) 
     {
-        if ( !set_p2m_entry(d, gfn, _mfn(mfn), t) )
+        if ( !set_p2m_entry(d, gfn, _mfn(mfn), page_order, t) )
             rc = -EINVAL;
-        set_gpfn_from_mfn(mfn, gfn);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn+i, gfn+i);
     }
     else
     {
         gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n",
                  gfn, mfn);
-        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid) )
+        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, 
+                            p2m_invalid) )
             rc = -EINVAL;
     }
 
@@ -851,7 +982,7 @@ void p2m_change_type_global(struct domai
     l1_pgentry_t l1e_content;
     l1_pgentry_t *l1e;
     l2_pgentry_t *l2e;
-    mfn_t l1mfn;
+    mfn_t l1mfn, l2mfn;
     int i1, i2;
     l3_pgentry_t *l3e;
     int i3;
@@ -891,11 +1022,26 @@ void p2m_change_type_global(struct domai
             {
                 continue;
             }
+            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
             l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
             for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
             {
                 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
                 {
+                    continue;
+                }
+
+                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+                {
+                    flags = l2e_get_flags(l2e[i2]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l2e_get_pfn(l2e[i2]);
+                    gfn = get_gpfn_from_mfn(mfn);
+                    flags = p2m_flags_to_type(nt);
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2],
+                                           l2mfn, l1e_content, 2);
                     continue;
                 }
 
@@ -944,7 +1090,7 @@ p2m_type_t p2m_change_type(struct domain
 
     mfn = gfn_to_mfn(d, gfn, &pt);
     if ( pt == ot )
-        set_p2m_entry(d, gfn, mfn, nt);
+        set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, nt);
 
     p2m_unlock(d->arch.p2m);
 
@@ -968,7 +1114,7 @@ set_mmio_p2m_entry(struct domain *d, uns
         set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
     }
 
-    rc = set_p2m_entry(d, gfn, mfn, p2m_mmio_direct);
+    rc = set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, p2m_mmio_direct);
     if ( 0 == rc )
         gdprintk(XENLOG_ERR,
             "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
@@ -992,7 +1138,7 @@ clear_mmio_p2m_entry(struct domain *d, u
             "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn);
         return 0;
     }
-    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0);
+    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), NORMAL_PAGE_ORDER, 0);
 
     return rc;
 }
diff -r 810d8c3ac992 -r 583dca746efb xen/common/grant_table.c
--- a/xen/common/grant_table.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/common/grant_table.c	Mon May 12 04:06:34 2008 -0500
@@ -1159,7 +1159,7 @@ gnttab_transfer(
         spin_lock(&e->grant_table->lock);
 
         sha = &shared_entry(e->grant_table, gop.ref);
-        guest_physmap_add_page(e, sha->frame, mfn);
+        guest_physmap_add_page(e, sha->frame, mfn, NORMAL_PAGE_ORDER);
         sha->frame = mfn;
         wmb();
         sha->flags |= GTF_transfer_completed;
diff -r 810d8c3ac992 -r 583dca746efb xen/common/memory.c
--- a/xen/common/memory.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/common/memory.c	Mon May 12 04:06:34 2008 -0500
@@ -114,34 +114,60 @@ static void populate_physmap(struct memo
 
         page = alloc_domheap_pages(
             d, a->extent_order, a->memflags | MEMF_node(node));
-        if ( unlikely(page == NULL) ) 
-        {
-            gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
-                     "id=%d memflags=%x (%ld of %d)\n",
-                     a->extent_order, d->domain_id, a->memflags,
-                     i, a->nr_extents);
-            goto out;
-        }
-
-        mfn = page_to_mfn(page);
-
-        if ( unlikely(paging_mode_translate(d)) )
-        {
+
+        if ( unlikely(page == NULL) )
+        {
+            /* fail if it is not under translate mode */
+            if ( !paging_mode_translate(d) )
+            {
+                gdprintk(XENLOG_INFO, "Could not allocate order=%d extent: "
+                         "id=%d memflags=%x (%ld of %d)\n",
+                         a->extent_order, d->domain_id, a->memflags,
+                         i, a->nr_extents);
+                goto out;
+            }
+            
+            /* try to allocate using 4KB page instead */
             for ( j = 0; j < (1 << a->extent_order); j++ )
-                if ( guest_physmap_add_page(d, gpfn + j, mfn + j) )
+            {
+                page = alloc_domheap_pages(d, 0, 
+                                           a->memflags | MEMF_node(node));
+                if ( page == NULL )
+                {
+                    gdprintk(XENLOG_INFO, "Could not allocate order=%d extent:"
+                             "id=%d memflags=%x (%ld of %d)\n",
+                             0, d->domain_id, a->memflags, i, a->nr_extents);
                     goto out;
-        }
-        else
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                set_gpfn_from_mfn(mfn + j, gpfn + j);
-
-            /* Inform the domain of the new page's machine address. */ 
-            if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 1)) )
-                goto out;
-        }
-    }
-
+                }
+                
+                mfn = page_to_mfn(page);
+                
+                if ( guest_physmap_add_page(d, gpfn+j, mfn, 
+                                            NORMAL_PAGE_ORDER) )
+                    goto out;
+            }
+        }
+        else /* successful in allocating page of extent_order */
+        {
+            mfn = page_to_mfn(page);
+            
+            if ( unlikely(paging_mode_translate(d)) )
+            {
+                if ( guest_physmap_add_page(d, gpfn, mfn, a->extent_order) )
+                    goto out;
+            }
+            else
+            {
+                for ( j = 0; j < (1 << a->extent_order); j++ )
+                    set_gpfn_from_mfn(mfn + j, gpfn + j);
+                
+                /* Inform the domain of the new page's machine address. */ 
+                if ( unlikely(__copy_to_guest_offset(a->extent_list, i, &mfn, 
+                                                     1)) )
+                    goto out;
+            }
+        }
+    }
  out:
     a->nr_done = i;
 }
@@ -172,7 +198,7 @@ int guest_remove_page(struct domain *d, 
     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
         put_page(page);
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    guest_physmap_remove_page(d, gmfn, mfn, NORMAL_PAGE_ORDER);
 
     put_page(page);
 
@@ -419,7 +445,8 @@ static long memory_exchange(XEN_GUEST_HA
             if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 BUG();
             mfn = page_to_mfn(page);
-            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn);
+            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, 
+                                      NORMAL_PAGE_ORDER);
             put_page(page);
         }
 
@@ -441,8 +468,8 @@ static long memory_exchange(XEN_GUEST_HA
             if ( unlikely(paging_mode_translate(d)) )
             {
                 /* Ignore failure here. There's nothing we can do. */
-                for ( k = 0; k < (1UL << exch.out.extent_order); k++ )
-                    (void)guest_physmap_add_page(d, gpfn + k, mfn + k);
+                (void)guest_physmap_add_page(d, gpfn, mfn, 
+                                             exch.out.extent_order);
             }
             else
             {
diff -r 810d8c3ac992 -r 583dca746efb xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-ia64/shadow.h	Mon May 12 04:06:34 2008 -0500
@@ -40,8 +40,10 @@
  * Utilities to change relationship of gpfn->mfn for designated domain,
  * which is required by gnttab transfer, balloon, device model and etc.
  */
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
-void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, 
+                           unsigned long mfn, unsigned int page_order);
+void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, 
+                               unsigned long mfn, unsigned int page_order);
 
 static inline int
 shadow_mode_enabled(struct domain *d)
diff -r 810d8c3ac992 -r 583dca746efb xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-x86/mm.h	Mon May 12 04:06:34 2008 -0500
@@ -124,6 +124,14 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* The order of contiguously allocated super page frames */
+#define NORMAL_PAGE_ORDER 0  /* 4KB page */
+#if CONFIG_PAGING_LEVELS == 2
+#define SUPER_PAGE_ORDER  10 /* 4MB page */
+#else
+#define SUPER_PAGE_ORDER  9  /* 2MB page */
+#endif
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
diff -r 810d8c3ac992 -r 583dca746efb xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-x86/p2m.h	Mon May 12 04:06:34 2008 -0500
@@ -102,7 +102,8 @@ struct p2m_domain {
     void               (*free_page   )(struct domain *d,
                                        struct page_info *pg);
     int                (*set_entry   )(struct domain *d, unsigned long gfn,
-                                       mfn_t mfn, p2m_type_t p2mt);
+                                       mfn_t mfn, unsigned int page_order,
+                                       p2m_type_t p2mt);
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
                                        p2m_type_t *p2mt);
     mfn_t              (*get_entry_current)(unsigned long gfn,
@@ -203,21 +204,23 @@ void p2m_final_teardown(struct domain *d
 
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                             unsigned long mfn, p2m_type_t t);
+                            unsigned long mfn, unsigned int page_order, 
+                            p2m_type_t t);
 
 /* Untyped version for RAM only, for compatibility 
  *
  * Return 0 for success
  */
 static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn,
-                                         unsigned long mfn)
-{
-    return guest_physmap_add_entry(d, gfn, mfn, p2m_ram_rw);
+                                         unsigned long mfn,
+                                         unsigned int page_order)
+{
+    return guest_physmap_add_entry(d, gfn, mfn, page_order, p2m_ram_rw);
 }
 
 /* Remove a page from a domain's p2m table */
 void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                               unsigned long mfn);
+                               unsigned long mfn, unsigned int page_order);
 
 /* Change types across all p2m entries in a domain */
 void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt);
diff -r 810d8c3ac992 -r 583dca746efb xen/include/xen/paging.h
--- a/xen/include/xen/paging.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/xen/paging.h	Mon May 12 04:06:34 2008 -0500
@@ -18,8 +18,8 @@
 #else
 
 #define paging_mode_translate(d)              (0)
-#define guest_physmap_add_page(d, p, m)       (0)
-#define guest_physmap_remove_page(d, p, m)    ((void)0)
+#define guest_physmap_add_page(d, p, m, o)       (0)
+#define guest_physmap_remove_page(d, p, m, o)    ((void)0)
 
 #endif
 

[-- Attachment #4: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] patch to support super page (2M) with EPT
  2008-05-12 17:28         ` Huang2, Wei
@ 2008-05-13  8:46           ` Keir Fraser
  2008-05-13 13:36             ` Huang2, Wei
  0 siblings, 1 reply; 14+ messages in thread
From: Keir Fraser @ 2008-05-13  8:46 UTC (permalink / raw)
  To: Huang2, Wei, Xin, Xiaohui, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 4862 bytes --]

Overall your changes to common code seem a bit more invasive than in the
Intel patch. In particular I don't understand why you made such changes to
common/memory.c. The other patch makes far fewer changes (and even some of
those would go away since they have erroneously changed the populate_physmap
interface). So my feeling is that the Intel patch is a slightly more elegant
base to start with: extra changes that your patch makes really need to be
accounted for.

 -- Keir

On 12/5/08 18:28, "Huang2, Wei" <Wei.Huang2@amd.com> wrote:

> Here is a revised version. I get rid of 4MB support, as suggested. I did not
> see the issue mentioned by Xiaohui related to splitting last 2M into 4KB
> pages. But anyway, I attached two versions for your reference. Keir, please
> let me know if you have comments.
>  
>  
> -Wei
> 
> 
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Monday, May 12, 2008 2:03 AM
> To: Xin, Xiaohui; Huang2, Wei; xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with EPT
> 
> Yes, absolutely no need for 4M page support. We do not support the 32-bit
> non-PAE build target any more.
> 
>  -- Keir
> 
> On 12/5/08 06:04, "Xin, Xiaohui" <xiaohui.xin@intel.com> wrote:
> 
>> Some comments here:
>> 1) Basically 4M pages  allocations is not hardware naturally for EPT, we only
>> use 2M super pages now.
>> I remembered that Keir said that 2M pages allocation is sufficient, and he
>> removed all the pure 32bit support already.
>> 2)  If we don't allocate  the last 2M area with 4kb pages, the EPT will meet
>> some problem. Xen will set  one of the 4k page
>>      there to be invalid,  logically that means we should invalid the all the
>> 2M page if we allocate it  with 2M, and then the
>>      special pages Xen used  in the high end of the guest memory can not be
>> used then. May we know how you  cope with  that?
>>  
>> Thanks
>> Xiaohui
>>  
>>  
>>  
>> 
>>   
>> From:  xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Huang2, Wei
>> Sent: Monday, May 12, 2008 12:36  PM
>> To: Xin, Xiaohui;  xen-devel@lists.xensource.com
>> Subject: RE: [Xen-devel][PATCH] patch  to support super page (2M) with EPT
>> 
>> This is the latest one I created. Please review it and  I will re-submit.
>>  
>> 1.       It includes the  patch for p2m-ept.c, directly from your previous
>> patch.  
>> 
>> 2.       Xc_hvm_create.c  is based on my original approach. It includes
>> support for both 2MB and 4MB  pages. Also it considers the case of odd page
>> size (such as 255MB). But I did  not allocate the last 2MB area using 4KB
>> pages. Let me know if it is a  big issue.
>> 
>> 3.       The rest are  pretty similar.
>> 
>> 
>> Thanks,
>>  
>> -Wei
>>  
>> 
>> From: xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Huang2, Wei
>> Sent: Sunday, May 11, 2008 3:34  PM
>> To: Xin, Xiaohui;  xen-devel@lists.xensource.com
>> Subject: RE: [Xen-devel][PATCH] patch  to support super page (2M) with EPT
>> 
>> Could we work together for a common solution? As far  as I can see, it
>> largely overlaps with my super page patch. The major  difference is between
>> p2m.c and  p2m-ept.c.
>>  
>> -Wei
>>  
>> 
>> From: xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Xin, Xiaohui
>> Sent: Friday, May 09, 2008 4:11  AM
>> To: xen-devel@lists.xensource.com
>> Subject:  [Xen-devel][PATCH] patch to support super page (2M) with  EPT
>> 
>> Attached are the patches to support  super page with EPT. We only support 2M
>> size. And shadow may still work fine  with 4K pages.
>> The patches can be split into 3 parts. Apply order is as  attached.
>>  
>> tool.diff 
>> To allocate 2M physical contiguous memory  in guest except the first 2M and
>> the last 2M.
>> The first 2M covers special  memory, and Xen use the last few pages in guest
>> memory to do special  things.
>> We let them to be 4K pages as normal.
>> super_page_common.patch
>> To modify the p2m interfaces by adding an order parameter, such as
>> guest_physmap_add_page(), p2m_set_entry(),  etc.
>> p2m-ept-file.patch
>>            To  handle the EPT tables to support super page.
>>  
>>  
>> Signed-off-by:  Xin Xiaohui <xiaohui.xin@intel.com>
>> Signed-off-by: Li Xin, B  <xin.b.li@intel.com>
>>  
>> 
>>  
>> 
>>  
>> 
>>  _______________________________________________
>> Xen-devel  mailing list
>> Xen-devel@lists.xensource.com
>> http://lists.xensource.com/xen-devel
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel



[-- Attachment #1.2: Type: text/html, Size: 10039 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
  2008-05-13  8:46           ` Keir Fraser
@ 2008-05-13 13:36             ` Huang2, Wei
  2008-05-13 13:39               ` Li, Xin B
  0 siblings, 1 reply; 14+ messages in thread
From: Huang2, Wei @ 2008-05-13 13:36 UTC (permalink / raw)
  To: Keir Fraser, Xin, Xiaohui, xen-devel

[-- Attachment #1.1: Type: text/plain, Size: 5184 bytes --]

Memory.c looks more invasive because it takes care of failure cases
using 4KB pages. Xiaohui's patch tries to allocate pages using
extent_order. But if this request fails for any reason, the guest cannot
be started anymore. 

-Wei

From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
Sent: Tuesday, May 13, 2008 3:47 AM
To: Huang2, Wei; Xin, Xiaohui; xen-devel@lists.xensource.com
Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with
EPT

Overall your changes to common code seem a bit more invasive than in the
Intel patch. In particular I don't understand why you made such changes
to common/memory.c. The other patch makes far fewer changes (and even
some of those would go away since they have erroneously changed the
populate_physmap interface). So my feeling is that the Intel patch is a
slightly more elegant base to start with: extra changes that your patch
makes really need to be accounted for.

 -- Keir

On 12/5/08 18:28, "Huang2, Wei" <Wei.Huang2@amd.com> wrote:

Here is a revised version. I get rid of 4MB support, as suggested. I did
not see the issue mentioned by Xiaohui related to splitting last 2M into
4KB pages. But anyway, I attached two versions for your reference. Keir,
please let me know if you have comments.

-Wei

________________________________

From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
Sent: Monday, May 12, 2008 2:03 AM
To: Xin, Xiaohui; Huang2, Wei; xen-devel@lists.xensource.com
Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with
EPT

Yes, absolutely no need for 4M page support. We do not support the
32-bit non-PAE build target any more.

 -- Keir

On 12/5/08 06:04, "Xin, Xiaohui" <xiaohui.xin@intel.com> wrote:

Some comments here:
1) Basically 4M pages  allocations is not hardware naturally for EPT, we
only use 2M super pages now.  
I remembered that Keir said that 2M pages allocation is sufficient, and
he  removed all the pure 32bit support already.
2)  If we don't allocate  the last 2M area with 4kb pages, the EPT will
meet some problem. Xen will set  one of the 4k page
     there to be invalid,  logically that means we should invalid the
all the 2M page if we allocate it  with 2M, and then the 
     special pages Xen used  in the high end of the guest memory can not
be used then. May we know how you  cope with  that?

Thanks
Xiaohui

________________________________

From:  xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Huang2, Wei
Sent: Monday, May 12, 2008 12:36  PM
To: Xin, Xiaohui;  xen-devel@lists.xensource.com
Subject: RE: [Xen-devel][PATCH] patch  to support super page (2M) with
EPT

This is the latest one I created. Please review it and  I will
re-submit.

1.       It includes the  patch for p2m-ept.c, directly from your
previous patch.  

2.       Xc_hvm_create.c  is based on my original approach. It includes
support for both 2MB and 4MB  pages. Also it considers the case of odd
page size (such as 255MB). But I did  not allocate the last 2MB area
using 4KB  pages. Let me know if it is a  big issue.

3.       The rest are  pretty similar.

Thanks,

-Wei

From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Huang2, Wei
Sent: Sunday, May 11, 2008 3:34  PM
To: Xin, Xiaohui;  xen-devel@lists.xensource.com
Subject: RE: [Xen-devel][PATCH] patch  to support super page (2M) with
EPT

Could we work together for a common solution? As far  as I can see, it
largely overlaps with my super page patch. The major  difference is
between p2m.c and  p2m-ept.c.

-Wei

From: xen-devel-bounces@lists.xensource.com
[mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Xin,
Xiaohui
Sent: Friday, May 09, 2008 4:11  AM
To: xen-devel@lists.xensource.com
Subject:  [Xen-devel][PATCH] patch to support super page (2M) with  EPT

Attached are the patches to support  super page with EPT. We only
support 2M size. And shadow may still work fine  with 4K pages.
The patches can be split into 3 parts. Apply order is as  attached.

tool.diff 
To allocate 2M physical contiguous memory  in guest except the first 2M
and the last 2M.
The first 2M covers special  memory, and Xen use the last few pages in
guest memory to do special  things.
We let them to be 4K pages as normal.
super_page_common.patch  
To modify the p2m interfaces by adding an order parameter, such as
guest_physmap_add_page(), p2m_set_entry(),  etc.
p2m-ept-file.patch
           To  handle the EPT tables to support super page.             

Signed-off-by:  Xin Xiaohui <xiaohui.xin@intel.com>
Signed-off-by: Li Xin, B  <xin.b.li@intel.com>

________________________________

_______________________________________________
Xen-devel  mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

________________________________

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

[-- Attachment #1.2: Type: text/html, Size: 13865 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
  2008-05-13 13:36             ` Huang2, Wei
@ 2008-05-13 13:39               ` Li, Xin B
  2008-05-13 13:51                 ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Li, Xin B @ 2008-05-13 13:39 UTC (permalink / raw)
  To: Huang2, Wei, Keir Fraser, Xin, Xiaohui, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 5700 bytes --]

If 2M page allocation fails, the domain builder will try to use 4K allocation instead.
-Xin


________________________________

	From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2, Wei
	Sent: 2008年5月13日 21:37
	To: Keir Fraser; Xin, Xiaohui; xen-devel@lists.xensource.com
	Subject: RE: [Xen-devel][PATCH] patch to support super page (2M) with EPT
	
	

	Memory.c looks more invasive because it takes care of failure cases using 4KB pages. Xiaohui’s patch tries to allocate pages using extend_order. But if this request fails for any reason, the guest cannot be started anymore. 

	 

	-Wei

	 

	From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
	Sent: Tuesday, May 13, 2008 3:47 AM
	To: Huang2, Wei; Xin, Xiaohui; xen-devel@lists.xensource.com
	Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with EPT

	 

	Overall your changes to common code seem a bit more invasive than in the Intel patch. In particular I don’t understand why you made such changes to common/memory.c. The other patch makes far fewer changes (and even some of those would go away since they have erroneously changed the populate_physmap interface). So my feeling is that the Intel patch is a slightly more elegant base to start with: extra changes that your patch makes really need to be accounted for.
	
	 -- Keir
	
	On 12/5/08 18:28, "Huang2, Wei" <Wei.Huang2@amd.com> wrote:

	Here is a revised version. I get rid of 4MB support, as suggested. I did not see the issue mentioned by Xiaohui related to splitting last 2M into 4KB pages. But anyway, I attached two versions for your reference. Keir, please let me know if you have comments.
	
	 
	-Wei

	
________________________________


	From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
	Sent: Monday, May 12, 2008 2:03 AM
	To: Xin, Xiaohui; Huang2, Wei; xen-devel@lists.xensource.com
	Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with EPT
	
	Yes, absolutely no need for 4M page support. We do not support the 32-bit non-PAE build target any more.
	
	 -- Keir
	
	On 12/5/08 06:04, "Xin, Xiaohui" <xiaohui.xin@intel.com> wrote:

	Some comments here:
	1) Basically 4M pages  allocations is not hardware naturally for EPT, we only use 2M super pages now.  
	I remembered that Keir said that 2M pages allocation is sufficient, and he  removed all the pure 32bit support already.
	2)  If we don’t allocate  the last 2M area with 4kb pages, the EPT will meet some problem. Xen will set  one of the 4k page
	     there to be invalid,  logically that means we should invalid the all the 2M page if we allocate it  with 2M, and then the 
	     special pages Xen used  in the high end of the guest memory can not be used then. May we know how you  cope with  that?
	 
	Thanks
	Xiaohui
	 

	 

	
________________________________


	From:  xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Huang2, Wei
	Sent: Monday, May 12, 2008 12:36  PM
	To: Xin, Xiaohui;  xen-devel@lists.xensource.com
	Subject: RE: [Xen-devel][PATCH] patch  to support super page (2M) with EPT
	
	This is the latest one I created. Please review it and  I will re-submit.
	 
	1.       It includes the  patch for p2m-ept.c, directly from your previous patch.  
	
	2.       Xc_hvm_create.c  is based on my original approach. It includes support for both 2MB and 4MB  pages. Also it considers the case of odd page size (such as 255MB). But I did  not allocate the last 2MB area using 4KB  pages. Let me know if it is a  big issue.
	
	3.       The rest are  pretty similar.
	
	
	Thanks,
	 
	-Wei
	 
	
	From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Huang2, Wei
	Sent: Sunday, May 11, 2008 3:34  PM
	To: Xin, Xiaohui;  xen-devel@lists.xensource.com
	Subject: RE: [Xen-devel][PATCH] patch  to support super page (2M) with EPT
	
	Could we work together for a common solution? As far  as I can see, it largely overlaps with my super page patch. The major  difference is between p2m.c and  p2m-ept.c.
	 
	-Wei
	 
	
	From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Xin, Xiaohui
	Sent: Friday, May 09, 2008 4:11  AM
	To: xen-devel@lists.xensource.com
	Subject:  [Xen-devel][PATCH] patch to support super page (2M) with  EPT
	
	Attached are the patches to support  super page with EPT. We only support 2M size. And shadow may still work fine  with 4K pages.
	The patches can be split into 3 parts. Apply order is as  attached.
	 
	tool.diff 
	To allocate 2M physical contiguous memory  in guest except the first 2M and the last 2M.
	The first 2M covers special  memory, and Xen use the last few pages in guest memory to do special  things.
	We let them to be 4K pages as normal.
	super_page_common.patch  
	To modify the p2m interfaces by adding an order parameter, such as  guest_physmap_add_page(), p2m_set_entry(),  etc.
	p2m-ept-file.patch
	           To  handle the EPT tables to support super page.             
	 
	 
	Signed-off-by:  Xin Xiaohui <xiaohui.xin@intel.com>
	Signed-off-by: Li Xin, B  <xin.b.li@intel.com>
	 
	
	 
	
	 

	
________________________________


	_______________________________________________
	Xen-devel  mailing list
	Xen-devel@lists.xensource.com
	http://lists.xensource.com/xen-devel

	 

	
________________________________


	_______________________________________________
	Xen-devel mailing list
	Xen-devel@lists.xensource.com
	http://lists.xensource.com/xen-devel

	 


[-- Attachment #1.2: Type: text/html, Size: 15381 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] patch to support super page (2M) with EPT
  2008-05-13 13:39               ` Li, Xin B
@ 2008-05-13 13:51                 ` Keir Fraser
  2008-05-13 15:49                   ` Huang2, Wei
  0 siblings, 1 reply; 14+ messages in thread
From: Keir Fraser @ 2008-05-13 13:51 UTC (permalink / raw)
  To: Li, Xin B, Huang2, Wei, Xin, Xiaohui, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 6468 bytes --]

Exactly. The interface for populate_physmap() is clear — if you ask for
order-9 allocations then that is what you must get. Otherwise the allocation
fails. It is up to the caller to retry with order-0 allocations _if_ that is
a suitable fallback.

 -- Keir

On 13/5/08 14:39, "Li, Xin B" <xin.b.li@intel.com> wrote:

> if 2M page allocation fails, the domain builder will try to use 4K allocation
> instead.
> -Xin
> 
>>  
>>  
>> 
>>  From: xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2,  Wei
>> Sent: 2008年5月13日 21:37
>> To: Keir Fraser; Xin, Xiaohui;  xen-devel@lists.xensource.com
>> Subject: RE: [Xen-devel][PATCH] patch  to support super page (2M) with EPT
>> 
>>  
>>  
>>  
>> 
>> Memory.c  looks more invasive because it takes care of failure cases using
>> 4KB pages.  Xiaohui’s patch tries to allocate pages using extend_order. But
>> if this  request fails for any reason, the guest cannot be started anymore.
>>  
>>  
>>  
>> -Wei
>>  
>>  
>>  
>>  
>>  
>> 
>> From: Keir Fraser  [mailto:keir.fraser@eu.citrix.com]
>> Sent: Tuesday, May 13, 2008 3:47  AM
>> To: Huang2, Wei; Xin, Xiaohui;  xen-devel@lists.xensource.com
>> Subject: Re: [Xen-devel][PATCH] patch  to support super page (2M) with EPT
>>  
>>  
>>  
>> Overall your  changes to common code seem a bit more invasive than in the
>> Intel patch. In  particular I don’t understand why you made such changes to
>> common/memory.c.  The other patch makes far fewer changes (and even some of
>> those would go away  since they have erroneously changed the populate_physmap
>> interface). So my  feeling is that the Intel patch is a slightly more elegant
>> base to start with:  extra changes that your patch makes really need to be
>> accounted  for.
>> 
>>  -- Keir
>> 
>> On 12/5/08 18:28, "Huang2, Wei"  <Wei.Huang2@amd.com> wrote:
>>  
>> Here is  a revised version. I get rid of 4MB support, as suggested. I did not
>> see the  issue mentioned by Xiaohui related to splitting last 2M into 4KB
>> pages. But  anyway, I attached two versions for your reference. Keir, please
>> let me know  if you have comments.
>> 
>>  
>> -Wei
>>  
>>  
>> 
>>  
>>  
>> 
>> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>> Sent: Monday, May 12, 2008 2:03 AM
>> To: Xin, Xiaohui;  Huang2, Wei; xen-devel@lists.xensource.com
>> Subject: Re:  [Xen-devel][PATCH] patch to support super page (2M) with EPT
>> 
>> Yes,  absolutely no need for 4M page support. We do not support the 32-bit
>> non-PAE  build target any more.
>> 
>>  -- Keir
>> 
>> On 12/5/08 06:04, "Xin,  Xiaohui" <xiaohui.xin@intel.com> wrote:
>>  
>> Some  comments here:
>> 1) Basically 4M pages  allocations is not hardware  naturally for EPT, we
>> only use 2M super pages now.
>> I remembered that  Keir said that 2M pages allocation is sufficient, and he
>> removed all the  pure 32bit support already.
>> 2)  If we don’t allocate  the last 2M  area with 4kb pages, the EPT will meet
>> some problem. Xen will set  one of  the 4k page
>>      there to be invalid,   logically that means we should invalid the all
>> the 2M page if we  allocate it  with 2M, and then the
>>      special pages Xen used  in the high end  of the guest memory can not be
>> used then. May we know how you  cope with   that?
>>  
>> Thanks
>> Xiaohui
>>  
>>  
>>  
>>  
>>  
>> 
>>  
>>  
>> 
>> From:   xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com]   On Behalf Of Huang2, Wei
>> Sent: Monday, May 12, 2008  12:36  PM
>> To: Xin, Xiaohui;   xen-devel@lists.xensource.com
>> Subject: RE: [Xen-devel][PATCH]  patch  to support super page (2M) with EPT
>> 
>> This  is the latest one I created. Please review it and  I will  re-submit.
>>  
>> 1.        It  includes the  patch for p2m-ept.c, directly from your previous
>> patch.   
>> 
>> 2.        Xc_hvm_create.c   is based on my original approach. It includes
>> support for both 2MB and  4MB  pages. Also it considers the case of odd page
>> size (such as 255MB).  But I did  not allocate the last 2MB area using 4KB
>> pages. Let me  know if it is a  big issue.
>> 
>> 3.        The  rest are  pretty similar.
>> 
>> 
>> Thanks,
>>  
>> -Wei
>>  
>> 
>> From:  xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com]   On Behalf Of Huang2, Wei
>> Sent: Sunday, May 11, 2008  3:34  PM
>> To: Xin, Xiaohui;   xen-devel@lists.xensource.com
>> Subject: RE: [Xen-devel][PATCH]  patch  to support super page (2M) with EPT
>> 
>> Could  we work together for a common solution? As far  as I can see, it
>> largely  overlaps with my super page patch. The major  difference is between
>> p2m.c  and  p2m-ept.c.
>>  
>> -Wei
>>  
>> 
>> From:  xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com]   On Behalf Of Xin, Xiaohui
>> Sent: Friday, May 09, 2008  4:11  AM
>> To: xen-devel@lists.xensource.com
>> Subject:   [Xen-devel][PATCH] patch to support super page (2M) with   EPT
>> 
>> Attached are the  patches to support  super page with EPT. We only support 2M
>> size. And  shadow may still work fine  with 4K pages.
>> The patches can be split  into 3 parts. Apply order is as  attached.
>>  
>> tool.diff 
>> To  allocate 2M physical contiguous memory  in guest except the first 2M and
>> the last 2M.
>> The first 2M covers special  memory, and Xen use the last  few pages in guest
>> memory to do special  things.
>> We let them to be 4K  pages as normal.
>> super_page_common.patch
>> To modify the p2m  interfaces by adding an order parameter, such as
>> guest_physmap_add_page(), p2m_set_entry(),   etc.
>> p2m-ept-file.patch
>>            To   handle the EPT tables to support super page.
>>  
>>  
>> Signed-off-by:   Xin Xiaohui <xiaohui.xin@intel.com>
>> Signed-off-by: Li Xin, B   <xin.b.li@intel.com>
>>  
>> 
>>  
>> 
>>  
>>  
>>  
>> 
>>  
>>  
>> 
>> _______________________________________________
>> Xen-devel   mailing list
>> Xen-devel@lists.xensource.com
>> http://lists.xensource.com/xen-devel
>>  
>>  
>>  
>>  
>> 
>>  
>>  
>> 
>> _______________________________________________
>> Xen-devel  mailing list
>> Xen-devel@lists.xensource.com
>> http://lists.xensource.com/xen-devel
>>  
>>  
> 



[-- Attachment #1.2: Type: text/html, Size: 13253 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
  2008-05-13 13:51                 ` Keir Fraser
@ 2008-05-13 15:49                   ` Huang2, Wei
  2008-05-14  8:40                     ` Keir Fraser
  0 siblings, 1 reply; 14+ messages in thread
From: Huang2, Wei @ 2008-05-13 15:49 UTC (permalink / raw)
  To: Keir Fraser, Li, Xin B, Xin, Xiaohui, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 6560 bytes --]

Re-submit. It fixes the issues based on your comments.
 
Thanks,
 
-Wei

________________________________

From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
Sent: Tuesday, May 13, 2008 8:51 AM
To: Li, Xin B; Huang2, Wei; Xin, Xiaohui; xen-devel@lists.xensource.com
Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with EPT


Exactly. The interface for populate_physmap() is clear - if you ask for order-9 allocations then that is what you must get. Otherwise the allocation fails. It is up to the caller to retry with order-0 allocations _if_ that is a suitable fallback.

 -- Keir

On 13/5/08 14:39, "Li, Xin B" <xin.b.li@intel.com> wrote:



	if 2M page allocation fails, the domain builder will try to use 4K allocation instead.
	-Xin
	
	

		
		 
		
________________________________

		From: xen-devel-bounces@lists.xensource.com  [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Huang2,  Wei
		Sent: 2008年5月13日 21:37
		To: Keir Fraser; Xin, Xiaohui;  xen-devel@lists.xensource.com
		Subject: RE: [Xen-devel][PATCH] patch  to support super page (2M) with EPT
		
		 
		 
		 
		
		Memory.c  looks more invasive because it takes care of failure cases using 4KB pages.  Xiaohui’s patch tries to allocate pages using extend_order. But if this  request fails for any reason, the guest cannot be started anymore.  
		
		
		
		-Wei
		
		
		
		 
		 
		
		From: Keir Fraser  [mailto:keir.fraser@eu.citrix.com] 
		Sent: Tuesday, May 13, 2008 3:47  AM
		To: Huang2, Wei; Xin, Xiaohui;  xen-devel@lists.xensource.com
		Subject: Re: [Xen-devel][PATCH] patch  to support super page (2M) with EPT
		
		 
		 
		Overall your  changes to common code seem a bit more invasive than in the Intel patch. In  particular I don’t understand why you made such changes to common/memory.c.  The other patch makes far fewer changes (and even some of those would go away  since they have erroneously changed the populate_physmap interface). So my  feeling is that the Intel patch is a slightly more elegant base to start with:  extra changes that your patch makes really need to be accounted  for.
		
		 -- Keir
		
		On 12/5/08 18:28, "Huang2, Wei"  <Wei.Huang2@amd.com> wrote:
		
		Here is  a revised version. I get rid of 4MB support, as suggested. I did not see the  issue mentioned by Xiaohui related to splitting last 2M into 4KB pages. But  anyway, I attached two versions for your reference. Keir, please let me know  if you have comments.
		
		 
		-Wei
		

		
		
________________________________


		
		
		From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]  
		Sent: Monday, May 12, 2008 2:03 AM
		To: Xin, Xiaohui;  Huang2, Wei; xen-devel@lists.xensource.com
		Subject: Re:  [Xen-devel][PATCH] patch to support super page (2M) with EPT
		
		Yes,  absolutely no need for 4M page support. We do not support the 32-bit non-PAE  build target any more.
		
		 -- Keir
		
		On 12/5/08 06:04, "Xin,  Xiaohui" <xiaohui.xin@intel.com> wrote:
		
		Some  comments here:
		1) Basically 4M pages  allocations is not hardware  naturally for EPT, we only use 2M super pages now.  
		I remembered that  Keir said that 2M pages allocation is sufficient, and he  removed all the  pure 32bit support already.
		2)  If we don’t allocate  the last 2M  area with 4kb pages, the EPT will meet some problem. Xen will set  one of  the 4k page
		     there to be invalid,   logically that means we should invalid the all the 2M page if we  allocate it  with 2M, and then the  
		     special pages Xen used  in the high end  of the guest memory can not be used then. May we know how you  cope with   that?
		 
		Thanks
		Xiaohui
		 
		

		
		
________________________________


		
		
		From:   xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]   On Behalf Of Huang2, Wei
		Sent: Monday, May 12, 2008  12:36  PM
		To: Xin, Xiaohui;   xen-devel@lists.xensource.com
		Subject: RE: [Xen-devel][PATCH]  patch  to support super page (2M) with EPT
		
		This  is the latest one I created. Please review it and  I will  re-submit.
		 
		1.        It  includes the  patch for p2m-ept.c, directly from your previous patch.   
		
		2.        Xc_hvm_create.c   is based on my original approach. It includes support for both 2MB and  4MB  pages. Also it considers the case of odd page size (such as 255MB).  But I did  not allocate the last 2MB area using 4KB  pages. Let me  know if it is a  big issue.
		
		3.        The  rest are  pretty similar.
		
		
		Thanks,
		 
		-Wei
		 
		
		From:  xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]   On Behalf Of Huang2, Wei
		Sent: Sunday, May 11, 2008  3:34  PM
		To: Xin, Xiaohui;   xen-devel@lists.xensource.com
		Subject: RE: [Xen-devel][PATCH]  patch  to support super page (2M) with EPT
		
		Could  we work together for a common solution? As far  as I can see, it largely  overlaps with my super page patch. The major  difference is between p2m.c  and  p2m-ept.c.
		 
		-Wei
		 
		
		From:  xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]   On Behalf Of Xin, Xiaohui
		Sent: Friday, May 09, 2008  4:11  AM
		To: xen-devel@lists.xensource.com
		Subject:   [Xen-devel][PATCH] patch to support super page (2M) with   EPT
		
		Attached are the  patches to support  super page with EPT. We only support 2M size. And  shadow may still work fine  with 4K pages.
		The patches can be split  into 3 parts. Apply order is as  attached.
		 
		tool.diff 
		To  allocate 2M physical contiguous memory  in guest except the first 2M and  the last 2M.
		The first 2M covers special  memory, and Xen use the last  few pages in guest memory to do special  things.
		We let them to be 4K  pages as normal.
		super_page_common.patch  
		To modify the p2m  interfaces by adding an order parameter, such as   guest_physmap_add_page(), p2m_set_entry(),   etc.
		p2m-ept-file.patch
		           To   handle the EPT tables to support super page.              
		 
		 
		Signed-off-by:   Xin Xiaohui <xiaohui.xin@intel.com>
		Signed-off-by: Li Xin, B   <xin.b.li@intel.com>
		 
		
		 
		
		 
		

		
		
________________________________


		
		
		_______________________________________________
		Xen-devel   mailing list
		Xen-devel@lists.xensource.com
		http://lists.xensource.com/xen-devel
		
		
		

		
		
________________________________


		
		
		_______________________________________________
		Xen-devel  mailing list
		Xen-devel@lists.xensource.com
		http://lists.xensource.com/xen-devel
		
		 
		

	
	




[-- Attachment #1.2: Type: text/html, Size: 14876 bytes --]

[-- Attachment #2: super_page_patch_new.txt --]
[-- Type: text/plain, Size: 41054 bytes --]

diff -r 810d8c3ac992 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c	Thu May 08 16:58:33 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c	Tue May 13 03:39:27 2008 -0500
@@ -165,7 +165,7 @@ static int setup_guest(int xc_handle,
     uint32_t *ident_pt;
     struct elf_binary elf;
     uint64_t v_start, v_end;
-    int rc;
+    int rc, left;
     xen_capabilities_info_t caps;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
@@ -213,19 +213,64 @@ static int setup_guest(int xc_handle,
      * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
      * We allocate pages in batches of no more than 2048 to ensure that
      * we can be preempted and hence dom0 remains responsive.
-     */
+     * 1) Allocate 4K pages for the first 2M guest memory;
+     * 2) try to allocate 2M continous pages for the left guest memory
+     *    or use 4K pages;
+     * 3) Since the last page of the guest memory will be dereserved at last,
+     *    we try just allocate 4K pages for the last 2M guest memory.
+     */
+
     rc = xc_domain_memory_populate_physmap(
         xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]);
     cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
+
+    if ( rc == 0 )
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, 0x200-0xc0, 0, 0, &page_array[0xc0]);
+
+    cur_pages = 0x200;
+
+    left = nr_pages - ((nr_pages >> 9 ) << 9 );
+
+    while ( (rc == 0) && ( (left ? nr_pages : (nr_pages - 0x200))  > cur_pages) )
     {
         unsigned long count = nr_pages - cur_pages;
         if ( count > 2048 )
+        {
             count = 2048;
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, 4, 9, 0, &page_array[cur_pages]);
+            if ( rc != 0 )
+            {
+                PERROR("Cannot allocate more 2M pages for HVM guest.\n");
+                rc = xc_domain_memory_populate_physmap(
+                    xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
+                if ( rc != 0 )
+                {
+                    PERROR("Could not allocate memory for HVM guest.\n");
+                    goto error_out;
+                }
+            }
+        }
+        else
+        {
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
+
+            if ( rc != 0 )
+            {
+                PERROR("Could not allocate memory for HVM guest.\n");
+                goto error_out;
+            }
+        }
+
+        cur_pages += count;
+    }
+
+    if ( !left )
         rc = xc_domain_memory_populate_physmap(
-            xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
-        cur_pages += count;
-    }
+            xc_handle, dom, nr_pages - cur_pages, 0, 0, &page_array[cur_pages]);
+
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
diff -r 810d8c3ac992 xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/ia64/xen/mm.c	Tue May 13 03:39:03 2008 -0500
@@ -2415,7 +2415,7 @@ steal_page(struct domain *d, struct page
 
 int
 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
-                       unsigned long mfn)
+                       unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(!mfn_valid(mfn));
     BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
@@ -2432,7 +2432,7 @@ guest_physmap_add_page(struct domain *d,
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(mfn == 0);//XXX
     zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
@@ -2838,7 +2838,7 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         if (prev_mfn && mfn_valid(prev_mfn)) {
             if (is_xen_heap_mfn(prev_mfn))
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -2847,10 +2847,10 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if (gpfn != INVALID_M2P_ENTRY)
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
     out:
         domain_unlock(d);
diff -r 810d8c3ac992 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm.c	Tue May 13 03:39:03 2008 -0500
@@ -3287,7 +3287,8 @@ long arch_memory_op(int op, XEN_GUEST_HA
         {
             if ( is_xen_heap_mfn(prev_mfn) )
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn,
+                                          NORMAL_PAGE_ORDER);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -3296,10 +3297,10 @@ long arch_memory_op(int op, XEN_GUEST_HA
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if ( gpfn != INVALID_M2P_ENTRY )
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, NORMAL_PAGE_ORDER);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, NORMAL_PAGE_ORDER);
 
         domain_unlock(d);
 
diff -r 810d8c3ac992 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Tue May 13 03:39:03 2008 -0500
@@ -20,6 +20,7 @@
 #include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <asm/paging.h>
 #include <asm/types.h>
 #include <asm/domain.h>
 #include <asm/p2m.h>
@@ -46,6 +47,9 @@ static void ept_p2m_type_to_flags(ept_en
     }
 }
 
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+
 static int ept_next_level(struct domain *d, bool_t read_only,
                           ept_entry_t **table, unsigned long *gfn_remainder,
                           u32 shift)
@@ -54,7 +58,6 @@ static int ept_next_level(struct domain 
     u32 index;
 
     index = *gfn_remainder >> shift;
-    *gfn_remainder &= (1UL << shift) - 1;
 
     ept_entry = (*table) + index;
 
@@ -83,31 +86,53 @@ static int ept_next_level(struct domain 
         ept_entry->r = ept_entry->w = ept_entry->x = 1;
     }
 
-    next = map_domain_page(ept_entry->mfn);
-    unmap_domain_page(*table);
-    *table = next;
-
-    return 1;
+    if ( !ept_entry->sp_avail )
+    {
+        *gfn_remainder &= (1UL << shift) - 1;
+        next = map_domain_page(ept_entry->mfn);
+        unmap_domain_page(*table);
+        *table = next;
+        return GUEST_TABLE_NORMAL_PAGE;
+    }
+    else
+        return GUEST_TABLE_SUPER_PAGE;
 }
 
 static int
-ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    ept_entry_t *table =
-        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
-    unsigned long gfn_remainder = gfn;
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int order, p2m_type_t p2mt)
+{
+    ept_entry_t *table = NULL;
+    unsigned long gfn_remainder = gfn, offset = 0;
     ept_entry_t *ept_entry = NULL;
     u32 index;
-    int i, rv = 0;
+    int i, rv = 0, ret = 0;
+    int walk_level = order / EPT_TABLE_ORDER;
 
     /* Should check if gfn obeys GAW here */
 
-    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 0, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    if (  order != 0 )
+        if ( (gfn & ((1UL << order) - 1)) )
+            return 1;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+
+    ASSERT(table != NULL);
+
+    for ( i = EPT_DEFAULT_GAW; i > walk_level; i-- )
+    {
+        ret = ept_next_level(d, 0, &table, &gfn_remainder,
+          i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
+    walk_level = ( i ? ( i * EPT_TABLE_ORDER) : order) / EPT_TABLE_ORDER;
+    offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
     ept_entry = table + index;
 
     if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
@@ -117,9 +142,20 @@ ept_set_entry(struct domain *d, unsigned
             d->arch.p2m->max_mapped_pfn = gfn;
 
         ept_entry->emt = EPT_DEFAULT_MT;
-        ept_entry->sp_avail = 0;
+        ept_entry->sp_avail = walk_level ? 1 : 0;
+
+        if ( ret == GUEST_TABLE_SUPER_PAGE )
+        {
+            ept_entry->mfn = mfn_x(mfn) - offset;
+            if ( ept_entry->avail1 == p2m_ram_logdirty &&
+              p2mt == p2m_ram_rw )
+                for ( i = 0; i < 512; i++ )
+                    paging_mark_dirty(d, mfn_x(mfn)-offset+i);
+        }
+        else
+            ept_entry->mfn = mfn_x(mfn);
+
         ept_entry->avail1 = p2mt;
-        ept_entry->mfn = mfn_x(mfn);
         ept_entry->rsvd = 0;
         ept_entry->avail2 = 0;
         /* last step */
@@ -132,14 +168,42 @@ ept_set_entry(struct domain *d, unsigned
     /* Success */
     rv = 1;
 
- out:
+out:
     unmap_domain_page(table);
 
     ept_sync_domain(d);
 
+    /* Now the p2m table is not shared with vt-d page table */
+
+    if ( iommu_enabled && is_hvm_domain(d) )
+    {
+        if ( p2mt == p2m_ram_rw )
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_map_page(d, gfn-offset+i, mfn_x(mfn)-offset+i);
+            }
+            else if ( ret )
+                iommu_map_page(d, gfn, mfn_x(mfn));
+        }
+        else
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_unmap_page(d, gfn-offset+i);
+            }
+            else if ( ret )
+                iommu_unmap_page(d, gfn);
+        }
+    }
+
+#ifdef P2M_SHARE_WITH_VTD_PAGE_TABLE
     /* If p2m table is shared with vtd page-table. */
     if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
         iommu_flush(d, gfn, (u64*)ept_entry);
+#endif
 
     return rv;
 }
@@ -152,7 +216,7 @@ static mfn_t ept_get_entry(struct domain
     unsigned long gfn_remainder = gfn;
     ept_entry_t *ept_entry;
     u32 index;
-    int i;
+    int i, ret=0;
     mfn_t mfn = _mfn(INVALID_MFN);
 
     *t = p2m_mmio_dm;
@@ -164,17 +228,31 @@ static mfn_t ept_get_entry(struct domain
     /* Should check if gfn obeys GAW here. */
 
     for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 1, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    {
+        ret = ept_next_level(d, 1, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
     if ( ept_entry->avail1 != p2m_invalid )
     {
         *t = ept_entry->avail1;
         mfn = _mfn(ept_entry->mfn);
+        if ( i )
+        {
+            /* we may meet super pages, and to split into 4k pages
+             * to emulate p2m table
+             */
+            unsigned long split_mfn = 
+              mfn_x(mfn) + (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1 )));
+            mfn = _mfn(split_mfn);
+        }
     }
 
  out:
@@ -205,33 +283,63 @@ static void ept_change_entry_type_global
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
     for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
     {
-        if ( !l4e[i4].epte || l4e[i4].sp_avail )
+        if ( !l4e[i4].epte )
             continue;
-        l3e = map_domain_page(l4e[i4].mfn);
-        for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
-        {
-            if ( !l3e[i3].epte || l3e[i3].sp_avail )
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
                 continue;
-            l2e = map_domain_page(l3e[i3].mfn);
-            for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !l2e[i2].epte || l2e[i2].sp_avail )
-                    continue;
-                l1e = map_domain_page(l2e[i2].mfn);
-                for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
-                {
-                    if ( !l1e[i1].epte )
-                        continue;
-                    if ( l1e[i1].avail1 != ot )
-                        continue;
-                    l1e[i1].avail1 = nt;
-                    ept_p2m_type_to_flags(l1e+i1, nt);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
-        }
-        unmap_domain_page(l3e);
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
     }
     unmap_domain_page(l4e);
 
diff -r 810d8c3ac992 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/arch/x86/mm/p2m.c	Tue May 13 04:28:16 2008 -0500
@@ -151,9 +151,11 @@ p2m_next_level(struct domain *d, mfn_t *
                unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
                u32 max, unsigned long type)
 {
+    l1_pgentry_t *l1_entry;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
     void *next;
+    int i;
     ASSERT(d->arch.p2m->alloc_page);
 
     if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
@@ -194,6 +196,44 @@ p2m_next_level(struct domain *d, mfn_t *
             break;
         }
     }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+
+    /* split single large page into 4KB page in P2M table */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg = d->arch.p2m->alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
+        pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated;
+        pg->count_info = 1;
+        
+        /* New splintered mappings inherit the flags of the old superpage, 
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            paging_write_p2m_entry(d, gfn,
+                                   l1_entry+i, *table_mfn, new_entry, 1);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        paging_write_p2m_entry(d, gfn,
+                               p2m_entry, *table_mfn, new_entry, 2);
+    }
+
     *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
     next = map_domain_page(mfn_x(*table_mfn));
     unmap_domain_page(*table);
@@ -204,7 +244,8 @@ p2m_next_level(struct domain *d, mfn_t *
 
 // Returns 0 on error (out of memory)
 static int
-p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int page_order, p2m_type_t p2mt)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -212,6 +253,7 @@ p2m_set_entry(struct domain *d, unsigned
     unsigned long gfn_remainder = gfn;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
     int rv=0;
 
 #if CONFIG_PAGING_LEVELS >= 4
@@ -235,26 +277,53 @@ p2m_set_entry(struct domain *d, unsigned
                          PGT_l2_page_table) )
         goto out;
 
-    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
-                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
-        goto out;
-
-    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                               0, L1_PAGETABLE_ENTRIES);
-    ASSERT(p2m_entry);
+    if ( page_order == NORMAL_PAGE_ORDER )
+    {
+        if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
+            entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
+        else
+            entry_content = l1e_empty();
+        
+        /* level 1 entry */
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
+    }
+    else 
+    {
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            P2M_ERROR("configure P2M table 4KB L2 entry with large page\n");
+            domain_crash(d);
+            goto out;
+        }
+        
+        if ( mfn_valid(mfn) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                       p2m_type_to_flags(p2mt) | _PAGE_PSE);
+        else
+            l2e_content = l2e_empty();
+        
+        entry_content.l1 = l2e_content.l2;
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2);
+    }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
     if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
         d->arch.p2m->max_mapped_pfn = gfn;
-
-    if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-        entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
-    else
-        entry_content = l1e_empty();
-
-    /* level 1 entry */
-    paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
 
     if ( iommu_enabled && is_hvm_domain(d) )
     {
@@ -335,6 +404,16 @@ p2m_gfn_to_mfn(struct domain *d, unsigne
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
+    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    {
+        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        unmap_domain_page(l2e);
+        
+        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+    }
+
     mfn = _mfn(l2e_get_pfn(*l2e));
     unmap_domain_page(l2e);
 
@@ -358,6 +437,7 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
 {
     mfn_t mfn = _mfn(INVALID_MFN);
     p2m_type_t p2mt = p2m_mmio_dm;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     /* XXX This is for compatibility with the old model, where anything not 
      * XXX marked as RAM was considered to be emulated MMIO space.
      * XXX Once we start explicitly registering MMIO regions in the p2m 
@@ -366,25 +446,44 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
     if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
     {
         l1_pgentry_t l1e = l1e_empty();
+        l2_pgentry_t l2e = l2e_empty();
         int ret;
 
         ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
                / sizeof(l1_pgentry_t));
 
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-        ret = __copy_from_user(&l1e,
-                               &phys_to_machine_mapping[gfn],
-                               sizeof(l1e));
-
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+        ret = __copy_from_user(&l2e,
+                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)],
+                               sizeof(l2e));
+        
+        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
+             (l2e_get_flags(l2e) & _PAGE_PSE) ) 
+        {
+            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
+            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
             if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
+                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+            else
                 p2mt = p2m_mmio_dm;
+        }
+        else
+        {
+        
+            /* Need to __copy_from_user because the p2m is sparse and this
+             * part might not exist */
+            ret = __copy_from_user(&l1e,
+                                   &phys_to_machine_mapping[gfn],
+                                   sizeof(l1e));
+            
+            if ( ret == 0 ) {
+                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+                if ( p2m_is_valid(p2mt) )
+                    mfn = _mfn(l1e_get_pfn(l1e));
+                else 
+                    /* XXX see above */
+                    p2mt = p2m_mmio_dm;
+            }
         }
     }
 
@@ -430,9 +529,10 @@ void p2m_change_entry_type_global(struct
 }
 
 static inline
-int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+                  unsigned int page_order, p2m_type_t p2mt)
+{
+    return d->arch.p2m->set_entry(d, gfn, mfn, page_order, p2mt);
 }
 
 // Allocate a new p2m table for a domain.
@@ -493,7 +593,8 @@ int p2m_alloc_table(struct domain *d,
     P2M_PRINTK("populating p2m table\n");
 
     /* Initialise physmap tables for slot zero. Other code assumes this. */
-    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), p2m_invalid) )
+    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), NORMAL_PAGE_ORDER,
+                        p2m_invalid) )
         goto error;
 
     /* Copy all existing mappings from the page list and m2p */
@@ -512,7 +613,7 @@ int p2m_alloc_table(struct domain *d,
             (gfn != 0x55555555L)
 #endif
              && gfn != INVALID_M2P_ENTRY
-            && !set_p2m_entry(d, gfn, mfn, p2m_ram_rw) )
+            && !set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, p2m_ram_rw) )
             goto error;
     }
 
@@ -688,6 +789,28 @@ static void audit_p2m(struct domain *d)
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
+                    
+                    /* check for super page */
+                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+                    {
+                        mfn = l2e_get_pfn(l2e[i2]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+                        {
+                            m2pfn = get_gpfn_from_mfn(mfn+i1);
+                            if ( m2pfn != (gfn + i) )
+                            {
+                                pmbad++;
+                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn+i, mfn+i,
+                                           m2pfn);
+                                BUG();
+                            }
+                        }
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+
                     l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
 
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
@@ -737,35 +860,40 @@ static void audit_p2m(struct domain *d)
 
 
 static void
-p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
-{
+p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn,
+                unsigned int page_order)
+{
+    int i;
     if ( !paging_mode_translate(d) )
         return;
     P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
 
-    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, p2m_invalid);
+    for ( i = 0; i < (1UL << page_order); i++ )
+        set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
 }
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     p2m_lock(d->arch.p2m);
     audit_p2m(d);
-    p2m_remove_page(d, gfn, mfn);
+    p2m_remove_page(d, gfn, mfn, page_order);
     audit_p2m(d);
     p2m_unlock(d->arch.p2m);
 }
 
 int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                        unsigned long mfn, p2m_type_t t)
+                        unsigned long mfn, unsigned int page_order, 
+                        p2m_type_t t)
 {
     unsigned long ogfn;
     p2m_type_t ot;
     mfn_t omfn;
     int rc = 0;
+    int i;
 
     if ( !paging_mode_translate(d) )
         return -EINVAL;
@@ -795,7 +923,8 @@ guest_physmap_add_entry(struct domain *d
     if ( p2m_is_ram(ot) )
     {
         ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
     }
 
     ogfn = mfn_to_gfn(d, _mfn(mfn));
@@ -818,21 +947,23 @@ guest_physmap_add_entry(struct domain *d
             P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                       ogfn , mfn_x(omfn));
             if ( mfn_x(omfn) == mfn )
-                p2m_remove_page(d, ogfn, mfn);
+                p2m_remove_page(d, ogfn, mfn, page_order);
         }
     }
 
     if ( mfn_valid(_mfn(mfn)) ) 
     {
-        if ( !set_p2m_entry(d, gfn, _mfn(mfn), t) )
+        if ( !set_p2m_entry(d, gfn, _mfn(mfn), page_order, t) )
             rc = -EINVAL;
-        set_gpfn_from_mfn(mfn, gfn);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn+i, gfn+i);
     }
     else
     {
         gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n",
                  gfn, mfn);
-        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid) )
+        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, 
+                            p2m_invalid) )
             rc = -EINVAL;
     }
 
@@ -851,7 +982,7 @@ void p2m_change_type_global(struct domai
     l1_pgentry_t l1e_content;
     l1_pgentry_t *l1e;
     l2_pgentry_t *l2e;
-    mfn_t l1mfn;
+    mfn_t l1mfn, l2mfn;
     int i1, i2;
     l3_pgentry_t *l3e;
     int i3;
@@ -891,11 +1022,26 @@ void p2m_change_type_global(struct domai
             {
                 continue;
             }
+            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
             l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
             for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
             {
                 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
                 {
+                    continue;
+                }
+
+                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+                {
+                    flags = l2e_get_flags(l2e[i2]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l2e_get_pfn(l2e[i2]);
+                    gfn = get_gpfn_from_mfn(mfn);
+                    flags = p2m_flags_to_type(nt);
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2],
+                                           l2mfn, l1e_content, 2);
                     continue;
                 }
 
@@ -944,7 +1090,7 @@ p2m_type_t p2m_change_type(struct domain
 
     mfn = gfn_to_mfn(d, gfn, &pt);
     if ( pt == ot )
-        set_p2m_entry(d, gfn, mfn, nt);
+        set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, nt);
 
     p2m_unlock(d->arch.p2m);
 
@@ -968,7 +1114,7 @@ set_mmio_p2m_entry(struct domain *d, uns
         set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
     }
 
-    rc = set_p2m_entry(d, gfn, mfn, p2m_mmio_direct);
+    rc = set_p2m_entry(d, gfn, mfn, NORMAL_PAGE_ORDER, p2m_mmio_direct);
     if ( 0 == rc )
         gdprintk(XENLOG_ERR,
             "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
@@ -992,7 +1138,7 @@ clear_mmio_p2m_entry(struct domain *d, u
             "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn);
         return 0;
     }
-    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0);
+    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), NORMAL_PAGE_ORDER, 0);
 
     return rc;
 }
diff -r 810d8c3ac992 xen/common/grant_table.c
--- a/xen/common/grant_table.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/common/grant_table.c	Tue May 13 03:39:03 2008 -0500
@@ -1159,7 +1159,7 @@ gnttab_transfer(
         spin_lock(&e->grant_table->lock);
 
         sha = &shared_entry(e->grant_table, gop.ref);
-        guest_physmap_add_page(e, sha->frame, mfn);
+        guest_physmap_add_page(e, sha->frame, mfn, NORMAL_PAGE_ORDER);
         sha->frame = mfn;
         wmb();
         sha->flags |= GTF_transfer_completed;
diff -r 810d8c3ac992 xen/common/memory.c
--- a/xen/common/memory.c	Thu May 08 16:58:33 2008 +0100
+++ b/xen/common/memory.c	Tue May 13 03:49:48 2008 -0500
@@ -109,8 +109,12 @@ static void populate_physmap(struct memo
             goto out;
         }
 
-        if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, i, 1)) )
-            goto out;
+        if ( unlikely(__copy_from_guest_offset(&gpfn, a->extent_list, 
+                                               i << a->extent_order, 1)) )
+        {
+            printk("copy_from_guest failed.\n");
+            goto out;
+        }
 
         page = alloc_domheap_pages(
             d, a->extent_order, a->memflags | MEMF_node(node));
@@ -126,11 +130,7 @@ static void populate_physmap(struct memo
         mfn = page_to_mfn(page);
 
         if ( unlikely(paging_mode_translate(d)) )
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                if ( guest_physmap_add_page(d, gpfn + j, mfn + j) )
-                    goto out;
-        }
+            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
         else
         {
             for ( j = 0; j < (1 << a->extent_order); j++ )
@@ -172,7 +172,7 @@ int guest_remove_page(struct domain *d, 
     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
         put_page(page);
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    guest_physmap_remove_page(d, gmfn, mfn, 0);
 
     put_page(page);
 
@@ -419,7 +419,7 @@ static long memory_exchange(XEN_GUEST_HA
             if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 BUG();
             mfn = page_to_mfn(page);
-            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn);
+            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, 0);
             put_page(page);
         }
 
@@ -441,8 +441,7 @@ static long memory_exchange(XEN_GUEST_HA
             if ( unlikely(paging_mode_translate(d)) )
             {
                 /* Ignore failure here. There's nothing we can do. */
-                for ( k = 0; k < (1UL << exch.out.extent_order); k++ )
-                    (void)guest_physmap_add_page(d, gpfn + k, mfn + k);
+                (void)guest_physmap_add_page(d, gpfn, mfn, exch.out.extent_order);
             }
             else
             {
diff -r 810d8c3ac992 xen/include/asm-ia64/grant_table.h
--- a/xen/include/asm-ia64/grant_table.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-ia64/grant_table.h	Tue May 13 04:38:40 2008 -0500
@@ -13,7 +13,7 @@ int replace_grant_host_mapping(unsigned 
 int replace_grant_host_mapping(unsigned long gpaddr, unsigned long mfn, unsigned long new_gpaddr, unsigned int flags);
 
 // for grant transfer
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order);
 
 /* XXX
  * somewhere appropriate
diff -r 810d8c3ac992 xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-ia64/shadow.h	Tue May 13 03:39:03 2008 -0500
@@ -40,8 +40,10 @@
  * Utilities to change relationship of gpfn->mfn for designated domain,
  * which is required by gnttab transfer, balloon, device model and etc.
  */
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
-void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, 
+                           unsigned long mfn, unsigned int page_order);
+void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, 
+                               unsigned long mfn, unsigned int page_order);
 
 static inline int
 shadow_mode_enabled(struct domain *d)
diff -r 810d8c3ac992 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-x86/mm.h	Tue May 13 03:39:03 2008 -0500
@@ -124,6 +124,14 @@ static inline u32 pickle_domptr(struct d
 /* The order of the largest allocation unit we use for shadow pages */
 #define SHADOW_MAX_ORDER 2 /* Need up to 16k allocs for 32-bit on PAE/64 */
 
+/* The order of continuously allocated super page frames */
+#define NORMAL_PAGE_ORDER 0  /* 4KB page */
+#if CONFIG_PAGING_LEVELS == 2
+#define SUPER_PAGE_ORDER  10 /* 4MB page */
+#else
+#define SUPER_PAGE_ORDER  9  /* 2MB page */
+#endif
+
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))
 
diff -r 810d8c3ac992 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/asm-x86/p2m.h	Tue May 13 03:39:03 2008 -0500
@@ -102,7 +102,8 @@ struct p2m_domain {
     void               (*free_page   )(struct domain *d,
                                        struct page_info *pg);
     int                (*set_entry   )(struct domain *d, unsigned long gfn,
-                                       mfn_t mfn, p2m_type_t p2mt);
+                                       mfn_t mfn, unsigned int page_order,
+                                       p2m_type_t p2mt);
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
                                        p2m_type_t *p2mt);
     mfn_t              (*get_entry_current)(unsigned long gfn,
@@ -203,21 +204,23 @@ void p2m_final_teardown(struct domain *d
 
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                             unsigned long mfn, p2m_type_t t);
+                            unsigned long mfn, unsigned int page_order, 
+                            p2m_type_t t);
 
 /* Untyped version for RAM only, for compatibility 
  *
  * Return 0 for success
  */
 static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn,
-                                         unsigned long mfn)
-{
-    return guest_physmap_add_entry(d, gfn, mfn, p2m_ram_rw);
+                                         unsigned long mfn,
+                                         unsigned int page_order)
+{
+    return guest_physmap_add_entry(d, gfn, mfn, page_order, p2m_ram_rw);
 }
 
 /* Remove a page from a domain's p2m table */
 void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                               unsigned long mfn);
+                               unsigned long mfn, unsigned int page_order);
 
 /* Change types across all p2m entries in a domain */
 void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt);
diff -r 810d8c3ac992 xen/include/xen/paging.h
--- a/xen/include/xen/paging.h	Thu May 08 16:58:33 2008 +0100
+++ b/xen/include/xen/paging.h	Tue May 13 03:39:03 2008 -0500
@@ -18,8 +18,8 @@
 #else
 
 #define paging_mode_translate(d)              (0)
-#define guest_physmap_add_page(d, p, m)       (0)
-#define guest_physmap_remove_page(d, p, m)    ((void)0)
+#define guest_physmap_add_page(d, p, m, o)       (0)
+#define guest_physmap_remove_page(d, p, m, o)    ((void)0)
 
 #endif
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] patch to support super page (2M) with EPT
  2008-05-13 15:49                   ` Huang2, Wei
@ 2008-05-14  8:40                     ` Keir Fraser
  2008-05-14 21:22                       ` Huang2, Wei
  0 siblings, 1 reply; 14+ messages in thread
From: Keir Fraser @ 2008-05-14  8:40 UTC (permalink / raw)
  To: Huang2, Wei, Li, Xin B, Xin, Xiaohui, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 7568 bytes --]

You now break the populate_physmap interface like the original Intel patch
did. You cannot change the index argument to __copy_from_guest_offset().
Also you add in a bogus printk in the same chunk of the patch.

You do not use SUPER_PAGE_ORDER anywhere, and NORMAL_PAGE_ORDER only in some
places. You may as well remove both macros.

 -- Keir

On 13/5/08 16:49, "Huang2, Wei" <Wei.Huang2@amd.com> wrote:

> Re-submit. It fixes the issues based on your comments.
>  
> Thanks,
>  
> -Wei
> 
> 
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Tuesday, May 13, 2008 8:51 AM
> To: Li, Xin B; Huang2, Wei; Xin, Xiaohui; xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with EPT
> 
> Exactly. The interface for populate_physmap() is clear — if you ask for
> order-9 allocations then that is what you must get. Otherwise the allocation
> fails. It is up to the caller to retry with order-0 allocations _if_ that is a
> suitable fallback.
> 
>  -- Keir
> 
> On 13/5/08 14:39, "Li, Xin B" <xin.b.li@intel.com> wrote:
> 
>> If 2M page allocation fails, the domain builder will try to use 4K allocation
>> instead.
>> -Xin
>> 
>>  
>>> 
>>>  
>>>  
>>> 
>>>  From: xen-devel-bounces@lists.xensource.com
>>> [mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Huang2,  Wei
>>> Sent: 2008年5月13日  21:37
>>> To: Keir Fraser; Xin, Xiaohui;   xen-devel@lists.xensource.com
>>> Subject: RE:  [Xen-devel][PATCH] patch  to support super page (2M) with  EPT
>>> 
>>>  
>>>  
>>>  
>>> 
>>> Memory.c  looks more invasive because it takes  care of failure cases using
>>> 4KB pages.  Xiaohui’s patch tries to  allocate pages using extend_order. But
>>> if this  request fails for any  reason, the guest cannot be started anymore.
>>> 
>>> 
>>> 
>>> -Wei
>>> 
>>> 
>>> 
>>>  
>>>  
>>> 
>>> From: Keir Fraser  [mailto:keir.fraser@eu.citrix.com]
>>> Sent: Tuesday, May 13, 2008 3:47  AM
>>> To: Huang2,  Wei; Xin, Xiaohui;  xen-devel@lists.xensource.com
>>> Subject:  Re: [Xen-devel][PATCH] patch  to support super page (2M) with  EPT
>>> 
>>>  
>>>  
>>> Overall your  changes to common code seem a bit  more invasive than in the
>>> Intel patch. In  particular I don’t  understand why you made such changes to
>>> common/memory.c.  The other  patch makes far fewer changes (and even some of
>>> those would go away   since they have erroneously changed the
>>> populate_physmap interface).  So my  feeling is that the Intel patch is a
>>> slightly more elegant base  to start with:  extra changes that your patch
>>> makes really need to be  accounted  for.
>>> 
>>>  -- Keir
>>> 
>>> On 12/5/08 18:28, "Huang2,  Wei"  <Wei.Huang2@amd.com> wrote:
>>> 
>>> Here is  a revised version. I get rid of 4MB  support, as suggested. I did
>>> not see the  issue mentioned by Xiaohui  related to splitting last 2M into
>>> 4KB pages. But  anyway, I attached  two versions for your reference. Keir,
>>> please let me know  if you have  comments.
>>> 
>>>  
>>> -Wei
>>>  
>>> 
>>>  
>>> 
>>>   
>>> 
>>> 
>>> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
>>> Sent: Monday, May 12, 2008 2:03 AM
>>> To: Xin,  Xiaohui;  Huang2, Wei; xen-devel@lists.xensource.com
>>> Subject:  Re:  [Xen-devel][PATCH] patch to support super page (2M) with  EPT
>>> 
>>> Yes,  absolutely no need for 4M page support. We do not  support the 32-bit
>>> non-PAE  build target any more.
>>> 
>>>  --  Keir
>>> 
>>> On 12/5/08 06:04, "Xin,  Xiaohui"  <xiaohui.xin@intel.com> wrote:
>>> 
>>> Some  comments here:
>>> 1) Basically 4M pages   allocations is not hardware  naturally for EPT, we
>>> only use 2M  super pages now.
>>> I remembered that  Keir said that 2M pages  allocation is sufficient, and he
>>> removed all the  pure 32bit  support already.
>>> 2)  If we don’t allocate  the last 2M   area with 4kb pages, the EPT will
>>> meet some problem. Xen will set   one of  the 4k page
>>>      there to be  invalid,   logically that means we should invalid the all
>>> the 2M  page if we  allocate it  with 2M, and then the
>>>      special pages Xen used  in the  high end  of the guest memory can not
>>> be used then. May we know how you   cope with    that?
>>>  
>>> Thanks
>>> Xiaohui
>>>  
>>>  
>>>  
>>>  
>>> 
>>>  
>>> 
>>>   
>>> 
>>> 
>>> From:    xen-devel-bounces@lists.xensource.com
>>> [mailto:xen-devel-bounces@lists.xensource.com]    On Behalf Of Huang2, Wei
>>> Sent: Monday, May 12,  2008  12:36  PM
>>> To: Xin, Xiaohui;    xen-devel@lists.xensource.com
>>> Subject: RE:  [Xen-devel][PATCH]  patch  to support super page (2M) with
>>> EPT
>>> 
>>> This  is the  latest one I created. Please review it and  I will
>>> re-submit.
>>>  
>>> 1.         It  includes the  patch for p2m-ept.c,  directly from your
>>> previous patch.
>>> 
>>> 2.         Xc_hvm_create.c   is based on my original  approach. It includes
>>> support for both 2MB and  4MB  pages. Also  it considers the case of odd
>>> page size (such as 255MB).  But I did   not allocate the last 2MB area using
>>> 4KB  pages. Let me   know if it is a  big issue.
>>> 
>>> 3.         The  rest are  pretty  similar.
>>> 
>>> 
>>> Thanks,
>>>  
>>> -Wei
>>>  
>>> 
>>> From:   xen-devel-bounces@lists.xensource.com
>>> [mailto:xen-devel-bounces@lists.xensource.com]    On Behalf Of Huang2, Wei
>>> Sent: Sunday, May 11,  2008  3:34  PM
>>> To: Xin, Xiaohui;    xen-devel@lists.xensource.com
>>> Subject: RE:  [Xen-devel][PATCH]  patch  to support super page (2M) with
>>> EPT
>>> 
>>> Could  we work  together for a common solution? As far  as I can see, it
>>> largely   overlaps with my super page patch. The major  difference is
>>> between p2m.c  and   p2m-ept.c.
>>>  
>>> -Wei
>>>  
>>> 
>>> From:   xen-devel-bounces@lists.xensource.com
>>> [mailto:xen-devel-bounces@lists.xensource.com]    On Behalf Of Xin, Xiaohui
>>> Sent: Friday, May 09,  2008  4:11  AM
>>> To:  xen-devel@lists.xensource.com
>>> Subject:    [Xen-devel][PATCH] patch to support super page (2M) with    EPT
>>> 
>>> Attached are the  patches to support   super page with EPT. We only support
>>> 2M size. And  shadow may  still work fine  with 4K pages.
>>> The patches can be split  into  3 parts. Apply order is as  attached.
>>>  
>>> tool.diff 
>>> To   allocate 2M physical contiguous memory  in guest except the first  2M
>>> and  the last 2M.
>>> The first 2M covers special  memory, and  Xen use the last  few pages in
>>> guest memory to do special   things.
>>> We let them to be 4K  pages as  normal.
>>> super_page_common.patch
>>> To modify the p2m   interfaces by adding an order parameter, such as
>>> guest_physmap_add_page(), p2m_set_entry(),    etc.
>>> p2m-ept-file.patch
>>>            To    handle the EPT tables to support super page.
>>>  
>>>  
>>> Signed-off-by:    Xin Xiaohui <xiaohui.xin@intel.com>
>>> Signed-off-by: Li  Xin, B    <xin.b.li@intel.com>
>>>  
>>> 
>>>  
>>> 
>>>  
>>>  
>>> 
>>>  
>>> 
>>>   
>>> 
>>> 
>>> _______________________________________________
>>> Xen-devel    mailing list
>>> Xen-devel@lists.xensource.com
>>> http://lists.xensource.com/xen-devel
>>> 
>>> 
>>>  
>>> 
>>>  
>>> 
>>>   
>>> 
>>> 
>>> _______________________________________________
>>> Xen-devel   mailing list
>>> Xen-devel@lists.xensource.com
>>> http://lists.xensource.com/xen-devel
>>> 
>>>  
>> 
> 
> 



[-- Attachment #1.2: Type: text/html, Size: 15502 bytes --]

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

* RE: [PATCH] patch to support super page (2M) with EPT
  2008-05-14  8:40                     ` Keir Fraser
@ 2008-05-14 21:22                       ` Huang2, Wei
  0 siblings, 0 replies; 14+ messages in thread
From: Huang2, Wei @ 2008-05-14 21:22 UTC (permalink / raw)
  To: Keir Fraser, Li, Xin B, Xin, Xiaohui, xen-devel


[-- Attachment #1.1: Type: text/plain, Size: 7962 bytes --]

Keir,
 
Here is the latest version. The changes include:
 
1. NORMAL_PAGE_ORDER and SUPER_PAGE_ORDER are removed
2. Changes to __copy_from_guest_offset() index are removed from populate_physmap() function
3. Because of (2), a super_page_array is created and passed to xc_domain_memory_populate_physmap() for allocating 2M pages
4. When 2M requests can not be satisfied, use 4K pages instead.
 
 
Thanks,
 
-Wei

________________________________

From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
Sent: Wednesday, May 14, 2008 3:40 AM
To: Huang2, Wei; Li, Xin B; Xin, Xiaohui; xen-devel@lists.xensource.com
Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with EPT


You now break the populate_physmap interface like the original Intel patch did. You cannot change the index argument to __copy_from_guest_offset(). Also you add in a bogus printk in the same chunk of the patch.

You do not use SUPER_PAGE_ORDER anywhere, and NORMAL_PAGE_ORDER only in some places. You may as well remove both macros.

 -- Keir

On 13/5/08 16:49, "Huang2, Wei" <Wei.Huang2@amd.com> wrote:



	Re-submit. It fixes the issues based on your comments.
	
	Thanks,
	
	-Wei
	
	
________________________________

	From: Keir Fraser [mailto:keir.fraser@eu.citrix.com] 
	Sent: Tuesday, May 13, 2008 8:51 AM
	To: Li, Xin B; Huang2, Wei; Xin, Xiaohui; xen-devel@lists.xensource.com
	Subject: Re: [Xen-devel][PATCH] patch to support super page (2M) with EPT
	
	Exactly. The interface for populate_physmap() is clear - if you ask for order-9 allocations then that is what you must get. Otherwise the allocation fails. It is up to the caller to retry with order-0 allocations _if_ that is a suitable fallback.
	
	 -- Keir
	
	On 13/5/08 14:39, "Li, Xin B" <xin.b.li@intel.com> wrote:
	
	

		If 2M page allocation fails, the domain builder will try to use 4K allocation instead.
		-Xin
		
		 
		

			
			 
			 
			
________________________________

			From: xen-devel-bounces@lists.xensource.com   [mailto:xen-devel-bounces@lists.xensource.com]  On Behalf Of Huang2,  Wei
			Sent: 2008年5月13日  21:37
			To: Keir Fraser; Xin, Xiaohui;   xen-devel@lists.xensource.com
			Subject: RE:  [Xen-devel][PATCH] patch  to support super page (2M) with  EPT
			
			 
			 
			 
			
			Memory.c  looks more invasive because it takes  care of failure cases using 4KB pages.  Xiaohui’s patch tries to  allocate pages using extend_order. But if this  request fails for any  reason, the guest cannot be started anymore.   
			
			
			
			-Wei
			
			
			
			 
			 
			
			From: Keir Fraser  [mailto:keir.fraser@eu.citrix.com]  
			Sent: Tuesday, May 13, 2008 3:47  AM
			To: Huang2,  Wei; Xin, Xiaohui;  xen-devel@lists.xensource.com
			Subject:  Re: [Xen-devel][PATCH] patch  to support super page (2M) with  EPT
			
			 
			 
			Overall your  changes to common code seem a bit  more invasive than in the Intel patch. In  particular I don’t  understand why you made such changes to common/memory.c.  The other  patch makes far fewer changes (and even some of those would go away   since they have erroneously changed the populate_physmap interface).  So my  feeling is that the Intel patch is a slightly more elegant base  to start with:  extra changes that your patch makes really need to be  accounted  for.
			
			 -- Keir
			
			On 12/5/08 18:28, "Huang2,  Wei"  <Wei.Huang2@amd.com> wrote:
			
			Here is  a revised version. I get rid of 4MB  support, as suggested. I did not see the  issue mentioned by Xiaohui  related to splitting last 2M into 4KB pages. But  anyway, I attached  two versions for your reference. Keir, please let me know  if you have  comments.
			
			 
			-Wei
			

			
			 
			
________________________________

			
			
			From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]   
			Sent: Monday, May 12, 2008 2:03 AM
			To: Xin,  Xiaohui;  Huang2, Wei; xen-devel@lists.xensource.com
			Subject:  Re:  [Xen-devel][PATCH] patch to support super page (2M) with  EPT
			
			Yes,  absolutely no need for 4M page support. We do not  support the 32-bit non-PAE  build target any more.
			
			 --  Keir
			
			On 12/5/08 06:04, "Xin,  Xiaohui"  <xiaohui.xin@intel.com> wrote:
			
			Some  comments here:
			1) Basically 4M pages   allocations is not hardware  naturally for EPT, we only use 2M  super pages now.  
			I remembered that  Keir said that 2M pages  allocation is sufficient, and he  removed all the  pure 32bit  support already.
			2)  If we don’t allocate  the last 2M   area with 4kb pages, the EPT will meet some problem. Xen will set   one of  the 4k page
			     there to be  invalid,   logically that means we should invalid the all the 2M  page if we  allocate it  with 2M, and then the   
			     special pages Xen used  in the  high end  of the guest memory can not be used then. May we know how you   cope with    that?
			 
			Thanks
			Xiaohui
			 
			

			
			 
			
________________________________

			  

			
			
			From:    xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]    On Behalf Of Huang2, Wei
			Sent: Monday, May 12,  2008  12:36  PM
			To: Xin, Xiaohui;    xen-devel@lists.xensource.com
			Subject: RE:  [Xen-devel][PATCH]  patch  to support super page (2M) with  EPT
			
			This  is the  latest one I created. Please review it and  I will   re-submit.
			 
			1.         It  includes the  patch for p2m-ept.c,  directly from your previous patch.    
			
			2.         Xc_hvm_create.c   is based on my original  approach. It includes support for both 2MB and  4MB  pages. Also  it considers the case of odd page size (such as 255MB).  But I did   not allocate the last 2MB area using 4KB  pages. Let me   know if it is a  big issue.
			
			3.         The  rest are  pretty  similar.
			
			
			Thanks,
			 
			-Wei
			 
			
			From:   xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]    On Behalf Of Huang2, Wei
			Sent: Sunday, May 11,  2008  3:34  PM
			To: Xin, Xiaohui;    xen-devel@lists.xensource.com
			Subject: RE:  [Xen-devel][PATCH]  patch  to support super page (2M) with  EPT
			
			Could  we work  together for a common solution? As far  as I can see, it largely   overlaps with my super page patch. The major  difference is  between p2m.c  and   p2m-ept.c.
			 
			-Wei
			 
			
			From:   xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com]    On Behalf Of Xin, Xiaohui
			Sent: Friday, May 09,  2008  4:11  AM
			To:  xen-devel@lists.xensource.com
			Subject:    [Xen-devel][PATCH] patch to support super page (2M) with    EPT
			
			Attached are the  patches to support   super page with EPT. We only support 2M size. And  shadow may  still work fine  with 4K pages.
			The patches can be split  into  3 parts. Apply order is as  attached.
			 
			tool.diff 
			To   allocate 2M physical contiguous memory  in guest except the first  2M and  the last 2M.
			The first 2M covers special  memory, and  Xen use the last  few pages in guest memory to do special   things.
			We let them to be 4K  pages as  normal.
			super_page_common.patch  
			To modify the p2m   interfaces by adding an order parameter, such as    guest_physmap_add_page(), p2m_set_entry(),    etc.
			p2m-ept-file.patch
			           To    handle the EPT tables to support super page.               
			 
			 
			Signed-off-by:    Xin Xiaohui <xiaohui.xin@intel.com>
			Signed-off-by: Li  Xin, B    <xin.b.li@intel.com>
			 
			
			 
			
			 
			

			
			 
			
________________________________

			
			
			_______________________________________________
			Xen-devel    mailing list
			Xen-devel@lists.xensource.com
			http://lists.xensource.com/xen-devel
			
			
			

			
			 
			
________________________________

			
			
			_______________________________________________
			Xen-devel   mailing list
			Xen-devel@lists.xensource.com
			http://lists.xensource.com/xen-devel
			
			 
			

		
		

	
	
	




[-- Attachment #1.2: Type: text/html, Size: 18772 bytes --]

[-- Attachment #2: super_page_patch.txt --]
[-- Type: text/plain, Size: 41142 bytes --]

diff -r 53195719f762 tools/libxc/xc_hvm_build.c
--- a/tools/libxc/xc_hvm_build.c	Tue May 13 15:08:17 2008 +0100
+++ b/tools/libxc/xc_hvm_build.c	Wed May 14 10:37:02 2008 -0500
@@ -157,15 +157,17 @@ static int setup_guest(int xc_handle,
                        char *image, unsigned long image_size)
 {
     xen_pfn_t *page_array = NULL;
+    xen_pfn_t *super_array = NULL;
     unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
-    unsigned long special_page_nr, entry_eip, cur_pages;
+    unsigned long nr_super_pages;
+    unsigned long special_page_nr, entry_eip, cur_super_pages;
     struct xen_add_to_physmap xatp;
     struct shared_info *shared_info;
     void *e820_page;
     uint32_t *ident_pt;
     struct elf_binary elf;
     uint64_t v_start, v_end;
-    int rc;
+    int rc, left;
     xen_capabilities_info_t caps;
 
     /* An HVM guest must be initialised with at least 2MB memory. */
@@ -198,7 +200,13 @@ static int setup_guest(int xc_handle,
             v_start, v_end,
             elf_uval(&elf, elf.ehdr, e_entry));
 
-    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL )
+    /* Since Xen only supports PAE and 64bit, the super page size is 2MB 
+     * and its order is 9. 
+     */
+    nr_super_pages = (unsigned long)memsize >> 1;
+    
+    if ( (page_array = malloc(nr_pages * sizeof(xen_pfn_t))) == NULL ||
+         (super_array = malloc(nr_super_pages * sizeof(xen_pfn_t))) == NULL )
     {
         PERROR("Could not allocate memory.\n");
         goto error_out;
@@ -206,26 +214,64 @@ static int setup_guest(int xc_handle,
 
     for ( i = 0; i < nr_pages; i++ )
         page_array[i] = i;
+    for ( i = 0; i < nr_super_pages; i++ )
+        super_array[i] = i << 9;
     for ( i = HVM_BELOW_4G_RAM_END >> PAGE_SHIFT; i < nr_pages; i++ )
         page_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
+    for ( i = HVM_BELOW_4G_RAM_END >> (PAGE_SHIFT + 9); i < nr_super_pages; 
+          i++ )
+        super_array[i] += HVM_BELOW_4G_MMIO_LENGTH >> PAGE_SHIFT;
 
     /*
      * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
-     * We allocate pages in batches of no more than 2048 to ensure that
+     * We allocate pages in batches of no more than 8MB to ensure that
      * we can be preempted and hence dom0 remains responsive.
+     * 
+     * 1) Allocate 4K pages for the first 2M guest memory;
+     * 2) Try to allocate 2M contiguous pages for the remaining guest memory,
+     *    or use 4K pages if that fails;
+     * 3) Allocate the rest of the memory using 4K pages.
      */
     rc = xc_domain_memory_populate_physmap(
         xc_handle, dom, 0xa0, 0, 0, &page_array[0x00]);
-    cur_pages = 0xc0;
-    while ( (rc == 0) && (nr_pages > cur_pages) )
-    {
-        unsigned long count = nr_pages - cur_pages;
-        if ( count > 2048 )
-            count = 2048;
+    if ( rc == 0 )
         rc = xc_domain_memory_populate_physmap(
-            xc_handle, dom, count, 0, 0, &page_array[cur_pages]);
-        cur_pages += count;
-    }
+            xc_handle, dom, 0x200-0xc0, 0, 0, &page_array[0xc0]);
+
+    /* The number of 4K pages left over after super page allocation */
+    left = nr_pages - ((nr_pages >> 9 ) << 9 );
+    cur_super_pages = 1;
+
+    /* Start to allocate super pages */
+    while ( (rc == 0) && (nr_super_pages > cur_super_pages) )
+    {
+        unsigned long count = nr_super_pages - cur_super_pages;
+        if ( count > 4 )
+            count = 4;
+
+        rc = xc_domain_memory_populate_physmap(
+            xc_handle, dom, count, 9, 0, &super_array[cur_super_pages]);
+
+        if ( rc != 0 )
+        {
+            PERROR("Cannot allocate any more 2M pages for HVM guest.\n");
+            rc = xc_domain_memory_populate_physmap(
+                xc_handle, dom, count << 9, 0, 0, 
+                &page_array[cur_super_pages << 9]);
+            if ( rc != 0 )
+            {
+                PERROR("Could not allocate memory for HVM guest.\n");
+                goto error_out;
+            }
+        }
+
+        cur_super_pages += count;
+    }
+
+    if ( rc == 0 && left )
+        rc = xc_domain_memory_populate_physmap(xc_handle, dom, left, 0, 0, 
+            &page_array[cur_super_pages << 9]);
+
     if ( rc != 0 )
     {
         PERROR("Could not allocate memory for HVM guest.\n");
@@ -314,10 +360,12 @@ static int setup_guest(int xc_handle,
     }
 
     free(page_array);
+    free(super_array);
     return 0;
 
  error_out:
     free(page_array);
+    free(super_array);
     return -1;
 }
 
diff -r 53195719f762 xen/arch/ia64/xen/mm.c
--- a/xen/arch/ia64/xen/mm.c	Tue May 13 15:08:17 2008 +0100
+++ b/xen/arch/ia64/xen/mm.c	Wed May 14 09:47:50 2008 -0500
@@ -2415,7 +2415,7 @@ steal_page(struct domain *d, struct page
 
 int
 guest_physmap_add_page(struct domain *d, unsigned long gpfn,
-                       unsigned long mfn)
+                       unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(!mfn_valid(mfn));
     BUG_ON(mfn_to_page(mfn)->count_info != (PGC_allocated | 1));
@@ -2432,7 +2432,7 @@ guest_physmap_add_page(struct domain *d,
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gpfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     BUG_ON(mfn == 0);//XXX
     zap_domain_page_one(d, gpfn << PAGE_SHIFT, 0, mfn);
@@ -2838,7 +2838,7 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         if (prev_mfn && mfn_valid(prev_mfn)) {
             if (is_xen_heap_mfn(prev_mfn))
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -2847,10 +2847,10 @@ arch_memory_op(int op, XEN_GUEST_HANDLE(
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if (gpfn != INVALID_M2P_ENTRY)
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
     out:
         domain_unlock(d);
diff -r 53195719f762 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Tue May 13 15:08:17 2008 +0100
+++ b/xen/arch/x86/mm.c	Wed May 14 09:54:38 2008 -0500
@@ -3297,7 +3297,7 @@ long arch_memory_op(int op, XEN_GUEST_HA
         {
             if ( is_xen_heap_mfn(prev_mfn) )
                 /* Xen heap frames are simply unhooked from this phys slot. */
-                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn);
+                guest_physmap_remove_page(d, xatp.gpfn, prev_mfn, 0);
             else
                 /* Normal domain memory is freed, to avoid leaking memory. */
                 guest_remove_page(d, xatp.gpfn);
@@ -3306,10 +3306,10 @@ long arch_memory_op(int op, XEN_GUEST_HA
         /* Unmap from old location, if any. */
         gpfn = get_gpfn_from_mfn(mfn);
         if ( gpfn != INVALID_M2P_ENTRY )
-            guest_physmap_remove_page(d, gpfn, mfn);
+            guest_physmap_remove_page(d, gpfn, mfn, 0);
 
         /* Map at new location. */
-        guest_physmap_add_page(d, xatp.gpfn, mfn);
+        guest_physmap_add_page(d, xatp.gpfn, mfn, 0);
 
         domain_unlock(d);
 
diff -r 53195719f762 xen/arch/x86/mm/hap/p2m-ept.c
--- a/xen/arch/x86/mm/hap/p2m-ept.c	Tue May 13 15:08:17 2008 +0100
+++ b/xen/arch/x86/mm/hap/p2m-ept.c	Wed May 14 09:47:50 2008 -0500
@@ -20,6 +20,7 @@
 #include <xen/domain_page.h>
 #include <xen/sched.h>
 #include <asm/current.h>
+#include <asm/paging.h>
 #include <asm/types.h>
 #include <asm/domain.h>
 #include <asm/p2m.h>
@@ -46,6 +47,9 @@ static void ept_p2m_type_to_flags(ept_en
     }
 }
 
+#define GUEST_TABLE_NORMAL_PAGE 1
+#define GUEST_TABLE_SUPER_PAGE  2
+
 static int ept_next_level(struct domain *d, bool_t read_only,
                           ept_entry_t **table, unsigned long *gfn_remainder,
                           u32 shift)
@@ -54,7 +58,6 @@ static int ept_next_level(struct domain 
     u32 index;
 
     index = *gfn_remainder >> shift;
-    *gfn_remainder &= (1UL << shift) - 1;
 
     ept_entry = (*table) + index;
 
@@ -83,31 +86,53 @@ static int ept_next_level(struct domain 
         ept_entry->r = ept_entry->w = ept_entry->x = 1;
     }
 
-    next = map_domain_page(ept_entry->mfn);
-    unmap_domain_page(*table);
-    *table = next;
-
-    return 1;
+    if ( !ept_entry->sp_avail )
+    {
+        *gfn_remainder &= (1UL << shift) - 1;
+        next = map_domain_page(ept_entry->mfn);
+        unmap_domain_page(*table);
+        *table = next;
+        return GUEST_TABLE_NORMAL_PAGE;
+    }
+    else
+        return GUEST_TABLE_SUPER_PAGE;
 }
 
 static int
-ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    ept_entry_t *table =
-        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
-    unsigned long gfn_remainder = gfn;
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int order, p2m_type_t p2mt)
+{
+    ept_entry_t *table = NULL;
+    unsigned long gfn_remainder = gfn, offset = 0;
     ept_entry_t *ept_entry = NULL;
     u32 index;
-    int i, rv = 0;
+    int i, rv = 0, ret = 0;
+    int walk_level = order / EPT_TABLE_ORDER;
 
     /* Should check if gfn obeys GAW here */
 
-    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 0, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    if (  order != 0 )
+        if ( (gfn & ((1UL << order) - 1)) )
+            return 1;
+
+    table = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+
+    ASSERT(table != NULL);
+
+    for ( i = EPT_DEFAULT_GAW; i > walk_level; i-- )
+    {
+        ret = ept_next_level(d, 0, &table, &gfn_remainder,
+          i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i ?  (i * EPT_TABLE_ORDER): order);
+    walk_level = ( i ? ( i * EPT_TABLE_ORDER) : order) / EPT_TABLE_ORDER;
+    offset = (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1)));
+
     ept_entry = table + index;
 
     if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
@@ -117,9 +142,20 @@ ept_set_entry(struct domain *d, unsigned
             d->arch.p2m->max_mapped_pfn = gfn;
 
         ept_entry->emt = EPT_DEFAULT_MT;
-        ept_entry->sp_avail = 0;
+        ept_entry->sp_avail = walk_level ? 1 : 0;
+
+        if ( ret == GUEST_TABLE_SUPER_PAGE )
+        {
+            ept_entry->mfn = mfn_x(mfn) - offset;
+            if ( ept_entry->avail1 == p2m_ram_logdirty &&
+              p2mt == p2m_ram_rw )
+                for ( i = 0; i < 512; i++ )
+                    paging_mark_dirty(d, mfn_x(mfn)-offset+i);
+        }
+        else
+            ept_entry->mfn = mfn_x(mfn);
+
         ept_entry->avail1 = p2mt;
-        ept_entry->mfn = mfn_x(mfn);
         ept_entry->rsvd = 0;
         ept_entry->avail2 = 0;
         /* last step */
@@ -132,14 +168,42 @@ ept_set_entry(struct domain *d, unsigned
     /* Success */
     rv = 1;
 
- out:
+out:
     unmap_domain_page(table);
 
     ept_sync_domain(d);
 
+    /* Now the p2m table is not shared with vt-d page table */
+
+    if ( iommu_enabled && is_hvm_domain(d) )
+    {
+        if ( p2mt == p2m_ram_rw )
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_map_page(d, gfn-offset+i, mfn_x(mfn)-offset+i);
+            }
+            else if ( ret )
+                iommu_map_page(d, gfn, mfn_x(mfn));
+        }
+        else
+        {
+            if ( ret == GUEST_TABLE_SUPER_PAGE )
+            {
+                for ( i = 0; i < 512; i++ )
+                    iommu_unmap_page(d, gfn-offset+i);
+            }
+            else if ( ret )
+                iommu_unmap_page(d, gfn);
+        }
+    }
+
+#ifdef P2M_SHARE_WITH_VTD_PAGE_TABLE
     /* If p2m table is shared with vtd page-table. */
     if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
         iommu_flush(d, gfn, (u64*)ept_entry);
+#endif
 
     return rv;
 }
@@ -152,7 +216,7 @@ static mfn_t ept_get_entry(struct domain
     unsigned long gfn_remainder = gfn;
     ept_entry_t *ept_entry;
     u32 index;
-    int i;
+    int i, ret=0;
     mfn_t mfn = _mfn(INVALID_MFN);
 
     *t = p2m_mmio_dm;
@@ -164,17 +228,31 @@ static mfn_t ept_get_entry(struct domain
     /* Should check if gfn obeys GAW here. */
 
     for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
-        if ( !ept_next_level(d, 1, &table, &gfn_remainder,
-                             i * EPT_TABLE_ORDER) )
+    {
+        ret = ept_next_level(d, 1, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER);
+        if ( !ret )
             goto out;
-
-    index = gfn_remainder;
+        else if ( ret == GUEST_TABLE_SUPER_PAGE )
+            break;
+    }
+
+    index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
     ept_entry = table + index;
 
     if ( ept_entry->avail1 != p2m_invalid )
     {
         *t = ept_entry->avail1;
         mfn = _mfn(ept_entry->mfn);
+        if ( i )
+        {
+            /* We may have stopped at a super page entry; add the 4K offset
+             * within it so the result looks like a 4K p2m table lookup.
+             */
+            unsigned long split_mfn = 
+              mfn_x(mfn) + (gfn_remainder & ( ((1 << (i*EPT_TABLE_ORDER)) - 1 )));
+            mfn = _mfn(split_mfn);
+        }
     }
 
  out:
@@ -205,33 +283,63 @@ static void ept_change_entry_type_global
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
     for (i4 = 0; i4 < EPT_PAGETABLE_ENTRIES; i4++ )
     {
-        if ( !l4e[i4].epte || l4e[i4].sp_avail )
+        if ( !l4e[i4].epte )
             continue;
-        l3e = map_domain_page(l4e[i4].mfn);
-        for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
-        {
-            if ( !l3e[i3].epte || l3e[i3].sp_avail )
+        if ( !l4e[i4].sp_avail )
+        {
+            l3e = map_domain_page(l4e[i4].mfn);
+            for ( i3 = 0; i3 < EPT_PAGETABLE_ENTRIES; i3++ )
+            {
+                if ( !l3e[i3].epte )
+                    continue;
+                if ( !l3e[i3].sp_avail )
+                {
+                    l2e = map_domain_page(l3e[i3].mfn);
+                    for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
+                    {
+                        if ( !l2e[i2].epte )
+                            continue;
+                        if ( !l2e[i2].sp_avail )
+                        {
+                            l1e = map_domain_page(l2e[i2].mfn);
+                            for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
+                            {
+                                if ( !l1e[i1].epte )
+                                    continue;
+                                if ( l1e[i1].avail1 != ot )
+                                    continue;
+                                l1e[i1].avail1 = nt;
+                                ept_p2m_type_to_flags(l1e+i1, nt);
+                            }
+                            unmap_domain_page(l1e);
+                        }
+                        else
+                        {
+                            if ( l2e[i2].avail1 != ot )
+                                continue;
+                            l2e[i2].avail1 = nt;
+                            ept_p2m_type_to_flags(l2e+i2, nt);
+                        }
+                    }
+                    unmap_domain_page(l2e);
+                }
+                else
+                {
+                    if ( l3e[i3].avail1 != ot )
+                        continue;
+                    l3e[i3].avail1 = nt;
+                    ept_p2m_type_to_flags(l3e+i3, nt);
+                }
+            }
+            unmap_domain_page(l3e);
+        }
+        else
+        {
+            if ( l4e[i4].avail1 != ot )
                 continue;
-            l2e = map_domain_page(l3e[i3].mfn);
-            for ( i2 = 0; i2 < EPT_PAGETABLE_ENTRIES; i2++ )
-            {
-                if ( !l2e[i2].epte || l2e[i2].sp_avail )
-                    continue;
-                l1e = map_domain_page(l2e[i2].mfn);
-                for ( i1  = 0; i1 < EPT_PAGETABLE_ENTRIES; i1++ )
-                {
-                    if ( !l1e[i1].epte )
-                        continue;
-                    if ( l1e[i1].avail1 != ot )
-                        continue;
-                    l1e[i1].avail1 = nt;
-                    ept_p2m_type_to_flags(l1e+i1, nt);
-                }
-                unmap_domain_page(l1e);
-            }
-            unmap_domain_page(l2e);
-        }
-        unmap_domain_page(l3e);
+            l4e[i4].avail1 = nt;
+            ept_p2m_type_to_flags(l4e+i4, nt);
+        }
     }
     unmap_domain_page(l4e);
 
diff -r 53195719f762 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c	Tue May 13 15:08:17 2008 +0100
+++ b/xen/arch/x86/mm/p2m.c	Wed May 14 09:54:38 2008 -0500
@@ -151,9 +151,11 @@ p2m_next_level(struct domain *d, mfn_t *
                unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
                u32 max, unsigned long type)
 {
+    l1_pgentry_t *l1_entry;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
     void *next;
+    int i;
     ASSERT(d->arch.p2m->alloc_page);
 
     if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
@@ -194,6 +196,44 @@ p2m_next_level(struct domain *d, mfn_t *
             break;
         }
     }
+
+    ASSERT(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT);
+
+    /* Split a single large page into 4KB pages in the P2M table. */
+    if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+    {
+        unsigned long flags, pfn;
+        struct page_info *pg = d->arch.p2m->alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
+        pg->u.inuse.type_info = PGT_l1_page_table | 1 | PGT_validated;
+        pg->count_info = 1;
+        
+        /* New splintered mappings inherit the flags of the old superpage, 
+         * with a little reorganisation for the _PAGE_PSE_PAT bit. */
+        flags = l1e_get_flags(*p2m_entry);
+        pfn = l1e_get_pfn(*p2m_entry);
+        if ( pfn & 1 )           /* ==> _PAGE_PSE_PAT was set */
+            pfn -= 1;            /* Clear it; _PAGE_PSE becomes _PAGE_PAT */
+        else
+            flags &= ~_PAGE_PSE; /* Clear _PAGE_PSE (== _PAGE_PAT) */
+        
+        l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+        {
+            new_entry = l1e_from_pfn(pfn + i, flags);
+            paging_write_p2m_entry(d, gfn,
+                                   l1_entry+i, *table_mfn, new_entry, 1);
+        }
+        unmap_domain_page(l1_entry);
+        
+        new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+                                 __PAGE_HYPERVISOR|_PAGE_USER);
+        paging_write_p2m_entry(d, gfn,
+                               p2m_entry, *table_mfn, new_entry, 2);
+    }
+
     *table_mfn = _mfn(l1e_get_pfn(*p2m_entry));
     next = map_domain_page(mfn_x(*table_mfn));
     unmap_domain_page(*table);
@@ -204,7 +244,8 @@ p2m_next_level(struct domain *d, mfn_t *
 
 // Returns 0 on error (out of memory)
 static int
-p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+              unsigned int page_order, p2m_type_t p2mt)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -212,6 +253,7 @@ p2m_set_entry(struct domain *d, unsigned
     unsigned long gfn_remainder = gfn;
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t entry_content;
+    l2_pgentry_t l2e_content;
     int rv=0;
 
 #if CONFIG_PAGING_LEVELS >= 4
@@ -235,26 +277,53 @@ p2m_set_entry(struct domain *d, unsigned
                          PGT_l2_page_table) )
         goto out;
 
-    if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
-                         L2_PAGETABLE_SHIFT - PAGE_SHIFT,
-                         L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
-        goto out;
-
-    p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
-                               0, L1_PAGETABLE_ENTRIES);
-    ASSERT(p2m_entry);
+    if ( page_order == 0 )
+    {
+        if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+                             L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                             L2_PAGETABLE_ENTRIES, PGT_l1_page_table) )
+            goto out;
+
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   0, L1_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
+            entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
+        else
+            entry_content = l1e_empty();
+        
+        /* level 1 entry */
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
+    }
+    else 
+    {
+        p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+                                   L2_PAGETABLE_SHIFT - PAGE_SHIFT,
+                                   L2_PAGETABLE_ENTRIES);
+        ASSERT(p2m_entry);
+        
+        if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+             !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+        {
+            P2M_ERROR("configure P2M table 4KB L2 entry with large page\n");
+            domain_crash(d);
+            goto out;
+        }
+        
+        if ( mfn_valid(mfn) )
+            l2e_content = l2e_from_pfn(mfn_x(mfn),
+                                       p2m_type_to_flags(p2mt) | _PAGE_PSE);
+        else
+            l2e_content = l2e_empty();
+        
+        entry_content.l1 = l2e_content.l2;
+        paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 2);
+    }
 
     /* Track the highest gfn for which we have ever had a valid mapping */
     if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
         d->arch.p2m->max_mapped_pfn = gfn;
-
-    if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
-        entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
-    else
-        entry_content = l1e_empty();
-
-    /* level 1 entry */
-    paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
 
     if ( iommu_enabled && is_hvm_domain(d) )
     {
@@ -335,6 +404,16 @@ p2m_gfn_to_mfn(struct domain *d, unsigne
         unmap_domain_page(l2e);
         return _mfn(INVALID_MFN);
     }
+    else if ( (l2e_get_flags(*l2e) & _PAGE_PSE) )
+    {
+        mfn = _mfn(l2e_get_pfn(*l2e) + l1_table_offset(addr));
+        *t = p2m_flags_to_type(l2e_get_flags(*l2e));
+        unmap_domain_page(l2e);
+        
+        ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+        return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+    }
+
     mfn = _mfn(l2e_get_pfn(*l2e));
     unmap_domain_page(l2e);
 
@@ -358,6 +437,7 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
 {
     mfn_t mfn = _mfn(INVALID_MFN);
     p2m_type_t p2mt = p2m_mmio_dm;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
     /* XXX This is for compatibility with the old model, where anything not 
      * XXX marked as RAM was considered to be emulated MMIO space.
      * XXX Once we start explicitly registering MMIO regions in the p2m 
@@ -366,25 +446,44 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
     if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
     {
         l1_pgentry_t l1e = l1e_empty();
+        l2_pgentry_t l2e = l2e_empty();
         int ret;
 
         ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
                / sizeof(l1_pgentry_t));
 
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-        ret = __copy_from_user(&l1e,
-                               &phys_to_machine_mapping[gfn],
-                               sizeof(l1e));
-
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+        ret = __copy_from_user(&l2e,
+                               &__linear_l1_table[l1_linear_offset(RO_MPT_VIRT_START) + l2_linear_offset(addr)],
+                               sizeof(l2e));
+        
+        if ( (ret == 0) && (l2e_get_flags(l2e) & _PAGE_PRESENT) && 
+             (l2e_get_flags(l2e) & _PAGE_PSE) ) 
+        {
+            p2mt = p2m_flags_to_type(l2e_get_flags(l2e));
+            ASSERT(l2e_get_pfn(l2e) != INVALID_MFN || !p2m_is_ram(p2mt));
             if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
+                mfn = _mfn(l2e_get_pfn(l2e) + l1_table_offset(addr));
+            else
                 p2mt = p2m_mmio_dm;
+        }
+        else
+        {
+        
+            /* Need to __copy_from_user because the p2m is sparse and this
+             * part might not exist */
+            ret = __copy_from_user(&l1e,
+                                   &phys_to_machine_mapping[gfn],
+                                   sizeof(l1e));
+            
+            if ( ret == 0 ) {
+                p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+                ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+                if ( p2m_is_valid(p2mt) )
+                    mfn = _mfn(l1e_get_pfn(l1e));
+                else 
+                    /* XXX see above */
+                    p2mt = p2m_mmio_dm;
+            }
         }
     }
 
@@ -430,9 +529,10 @@ void p2m_change_entry_type_global(struct
 }
 
 static inline
-int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
-{
-    return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, 
+                  unsigned int page_order, p2m_type_t p2mt)
+{
+    return d->arch.p2m->set_entry(d, gfn, mfn, page_order, p2mt);
 }
 
 // Allocate a new p2m table for a domain.
@@ -493,7 +593,8 @@ int p2m_alloc_table(struct domain *d,
     P2M_PRINTK("populating p2m table\n");
 
     /* Initialise physmap tables for slot zero. Other code assumes this. */
-    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), p2m_invalid) )
+    if ( !set_p2m_entry(d, 0, _mfn(INVALID_MFN), 0,
+                        p2m_invalid) )
         goto error;
 
     /* Copy all existing mappings from the page list and m2p */
@@ -512,7 +613,7 @@ int p2m_alloc_table(struct domain *d,
             (gfn != 0x55555555L)
 #endif
              && gfn != INVALID_M2P_ENTRY
-            && !set_p2m_entry(d, gfn, mfn, p2m_ram_rw) )
+            && !set_p2m_entry(d, gfn, mfn, 0, p2m_ram_rw) )
             goto error;
     }
 
@@ -688,6 +789,28 @@ static void audit_p2m(struct domain *d)
                         gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
                         continue;
                     }
+                    
+                    /* check for super page */
+                    if ( l2e_get_flags(l2e[i2]) & _PAGE_PSE )
+                    {
+                        mfn = l2e_get_pfn(l2e[i2]);
+                        ASSERT(mfn_valid(_mfn(mfn)));
+                        for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++)
+                        {
+                            m2pfn = get_gpfn_from_mfn(mfn+i1);
+                            if ( m2pfn != (gfn + i) )
+                            {
+                                pmbad++;
+                                P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+                                           " -> gfn %#lx\n", gfn+i, mfn+i,
+                                           m2pfn);
+                                BUG();
+                            }
+                        }
+                        gfn += 1 << (L2_PAGETABLE_SHIFT - PAGE_SHIFT);
+                        continue;
+                    }
+
                     l1e = map_domain_page(mfn_x(_mfn(l2e_get_pfn(l2e[i2]))));
 
                     for ( i1 = 0; i1 < L1_PAGETABLE_ENTRIES; i1++, gfn++ )
@@ -737,35 +860,40 @@ static void audit_p2m(struct domain *d)
 
 
 static void
-p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn)
-{
+p2m_remove_page(struct domain *d, unsigned long gfn, unsigned long mfn,
+                unsigned int page_order)
+{
+    int i;
     if ( !paging_mode_translate(d) )
         return;
     P2M_DEBUG("removing gfn=%#lx mfn=%#lx\n", gfn, mfn);
 
-    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid);
-    set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
+    set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, p2m_invalid);
+    for ( i = 0; i < (1UL << page_order); i++ )
+        set_gpfn_from_mfn(mfn+i, INVALID_M2P_ENTRY);
 }
 
 void
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                          unsigned long mfn)
+                          unsigned long mfn, unsigned int page_order)
 {
     p2m_lock(d->arch.p2m);
     audit_p2m(d);
-    p2m_remove_page(d, gfn, mfn);
+    p2m_remove_page(d, gfn, mfn, page_order);
     audit_p2m(d);
     p2m_unlock(d->arch.p2m);
 }
 
 int
 guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                        unsigned long mfn, p2m_type_t t)
+                        unsigned long mfn, unsigned int page_order, 
+                        p2m_type_t t)
 {
     unsigned long ogfn;
     p2m_type_t ot;
     mfn_t omfn;
     int rc = 0;
+    int i;
 
     if ( !paging_mode_translate(d) )
         return -EINVAL;
@@ -795,7 +923,8 @@ guest_physmap_add_entry(struct domain *d
     if ( p2m_is_ram(ot) )
     {
         ASSERT(mfn_valid(omfn));
-        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn_x(omfn)+i, INVALID_M2P_ENTRY);
     }
 
     ogfn = mfn_to_gfn(d, _mfn(mfn));
@@ -818,21 +947,23 @@ guest_physmap_add_entry(struct domain *d
             P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                       ogfn , mfn_x(omfn));
             if ( mfn_x(omfn) == mfn )
-                p2m_remove_page(d, ogfn, mfn);
+                p2m_remove_page(d, ogfn, mfn, page_order);
         }
     }
 
     if ( mfn_valid(_mfn(mfn)) ) 
     {
-        if ( !set_p2m_entry(d, gfn, _mfn(mfn), t) )
+        if ( !set_p2m_entry(d, gfn, _mfn(mfn), page_order, t) )
             rc = -EINVAL;
-        set_gpfn_from_mfn(mfn, gfn);
+        for ( i = 0; i < (1UL << page_order); i++ )
+            set_gpfn_from_mfn(mfn+i, gfn+i);
     }
     else
     {
         gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n",
                  gfn, mfn);
-        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), p2m_invalid) )
+        if ( !set_p2m_entry(d, gfn, _mfn(INVALID_MFN), page_order, 
+                            p2m_invalid) )
             rc = -EINVAL;
     }
 
@@ -851,7 +982,7 @@ void p2m_change_type_global(struct domai
     l1_pgentry_t l1e_content;
     l1_pgentry_t *l1e;
     l2_pgentry_t *l2e;
-    mfn_t l1mfn;
+    mfn_t l1mfn, l2mfn;
     int i1, i2;
     l3_pgentry_t *l3e;
     int i3;
@@ -891,11 +1022,26 @@ void p2m_change_type_global(struct domai
             {
                 continue;
             }
+            l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
             l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
             for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
             {
                 if ( !(l2e_get_flags(l2e[i2]) & _PAGE_PRESENT) )
                 {
+                    continue;
+                }
+
+                if ( (l2e_get_flags(l2e[i2]) & _PAGE_PSE) )
+                {
+                    flags = l2e_get_flags(l2e[i2]);
+                    if ( p2m_flags_to_type(flags) != ot )
+                        continue;
+                    mfn = l2e_get_pfn(l2e[i2]);
+                    gfn = get_gpfn_from_mfn(mfn);
+                    flags = p2m_flags_to_type(nt);
+                    l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+                    paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l2e[i2],
+                                           l2mfn, l1e_content, 2);
                     continue;
                 }
 
@@ -944,7 +1090,7 @@ p2m_type_t p2m_change_type(struct domain
 
     mfn = gfn_to_mfn(d, gfn, &pt);
     if ( pt == ot )
-        set_p2m_entry(d, gfn, mfn, nt);
+        set_p2m_entry(d, gfn, mfn, 0, nt);
 
     p2m_unlock(d->arch.p2m);
 
@@ -968,7 +1114,7 @@ set_mmio_p2m_entry(struct domain *d, uns
         set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
     }
 
-    rc = set_p2m_entry(d, gfn, mfn, p2m_mmio_direct);
+    rc = set_p2m_entry(d, gfn, mfn, 0, p2m_mmio_direct);
     if ( 0 == rc )
         gdprintk(XENLOG_ERR,
             "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
@@ -992,7 +1138,7 @@ clear_mmio_p2m_entry(struct domain *d, u
             "clear_mmio_p2m_entry: gfn_to_mfn failed! gfn=%08lx\n", gfn);
         return 0;
     }
-    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0);
+    rc = set_p2m_entry(d, gfn, _mfn(INVALID_MFN), 0, 0);
 
     return rc;
 }
diff -r 53195719f762 xen/common/grant_table.c
--- a/xen/common/grant_table.c	Tue May 13 15:08:17 2008 +0100
+++ b/xen/common/grant_table.c	Wed May 14 09:54:38 2008 -0500
@@ -1159,7 +1159,7 @@ gnttab_transfer(
         spin_lock(&e->grant_table->lock);
 
         sha = &shared_entry(e->grant_table, gop.ref);
-        guest_physmap_add_page(e, sha->frame, mfn);
+        guest_physmap_add_page(e, sha->frame, mfn, 0);
         sha->frame = mfn;
         wmb();
         sha->flags |= GTF_transfer_completed;
diff -r 53195719f762 xen/common/memory.c
--- a/xen/common/memory.c	Tue May 13 15:08:17 2008 +0100
+++ b/xen/common/memory.c	Wed May 14 09:54:38 2008 -0500
@@ -126,11 +126,7 @@ static void populate_physmap(struct memo
         mfn = page_to_mfn(page);
 
         if ( unlikely(paging_mode_translate(d)) )
-        {
-            for ( j = 0; j < (1 << a->extent_order); j++ )
-                if ( guest_physmap_add_page(d, gpfn + j, mfn + j) )
-                    goto out;
-        }
+            guest_physmap_add_page(d, gpfn, mfn, a->extent_order);
         else
         {
             for ( j = 0; j < (1 << a->extent_order); j++ )
@@ -172,7 +168,7 @@ int guest_remove_page(struct domain *d, 
     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
         put_page(page);
 
-    guest_physmap_remove_page(d, gmfn, mfn);
+    guest_physmap_remove_page(d, gmfn, mfn, 0);
 
     put_page(page);
 
@@ -419,7 +415,7 @@ static long memory_exchange(XEN_GUEST_HA
             if ( !test_and_clear_bit(_PGC_allocated, &page->count_info) )
                 BUG();
             mfn = page_to_mfn(page);
-            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn);
+            guest_physmap_remove_page(d, mfn_to_gmfn(d, mfn), mfn, 0);
             put_page(page);
         }
 
@@ -441,8 +437,7 @@ static long memory_exchange(XEN_GUEST_HA
             if ( unlikely(paging_mode_translate(d)) )
             {
                 /* Ignore failure here. There's nothing we can do. */
-                for ( k = 0; k < (1UL << exch.out.extent_order); k++ )
-                    (void)guest_physmap_add_page(d, gpfn + k, mfn + k);
+                (void)guest_physmap_add_page(d, gpfn, mfn, exch.out.extent_order);
             }
             else
             {
diff -r 53195719f762 xen/include/asm-ia64/grant_table.h
--- a/xen/include/asm-ia64/grant_table.h	Tue May 13 15:08:17 2008 +0100
+++ b/xen/include/asm-ia64/grant_table.h	Wed May 14 09:47:50 2008 -0500
@@ -13,7 +13,7 @@ int replace_grant_host_mapping(unsigned 
 int replace_grant_host_mapping(unsigned long gpaddr, unsigned long mfn, unsigned long new_gpaddr, unsigned int flags);
 
 // for grant transfer
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn, int order);
 
 /* XXX
  * somewhere appropriate
diff -r 53195719f762 xen/include/asm-ia64/shadow.h
--- a/xen/include/asm-ia64/shadow.h	Tue May 13 15:08:17 2008 +0100
+++ b/xen/include/asm-ia64/shadow.h	Wed May 14 09:47:50 2008 -0500
@@ -40,8 +40,10 @@
  * Utilities to change relationship of gpfn->mfn for designated domain,
  * which is required by gnttab transfer, balloon, device model and etc.
  */
-int guest_physmap_add_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
-void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, unsigned long mfn);
+int guest_physmap_add_page(struct domain *d, unsigned long gpfn, 
+                           unsigned long mfn, unsigned int page_order);
+void guest_physmap_remove_page(struct domain *d, unsigned long gpfn, 
+                               unsigned long mfn, unsigned int page_order);
 
 static inline int
 shadow_mode_enabled(struct domain *d)
diff -r 53195719f762 xen/include/asm-x86/p2m.h
--- a/xen/include/asm-x86/p2m.h	Tue May 13 15:08:17 2008 +0100
+++ b/xen/include/asm-x86/p2m.h	Wed May 14 09:47:50 2008 -0500
@@ -102,7 +102,8 @@ struct p2m_domain {
     void               (*free_page   )(struct domain *d,
                                        struct page_info *pg);
     int                (*set_entry   )(struct domain *d, unsigned long gfn,
-                                       mfn_t mfn, p2m_type_t p2mt);
+                                       mfn_t mfn, unsigned int page_order,
+                                       p2m_type_t p2mt);
     mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
                                        p2m_type_t *p2mt);
     mfn_t              (*get_entry_current)(unsigned long gfn,
@@ -203,21 +204,23 @@ void p2m_final_teardown(struct domain *d
 
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
-                             unsigned long mfn, p2m_type_t t);
+                            unsigned long mfn, unsigned int page_order, 
+                            p2m_type_t t);
 
 /* Untyped version for RAM only, for compatibility 
  *
  * Return 0 for success
  */
 static inline int guest_physmap_add_page(struct domain *d, unsigned long gfn,
-                                         unsigned long mfn)
-{
-    return guest_physmap_add_entry(d, gfn, mfn, p2m_ram_rw);
+                                         unsigned long mfn,
+                                         unsigned int page_order)
+{
+    return guest_physmap_add_entry(d, gfn, mfn, page_order, p2m_ram_rw);
 }
 
 /* Remove a page from a domain's p2m table */
 void guest_physmap_remove_page(struct domain *d, unsigned long gfn,
-                               unsigned long mfn);
+                               unsigned long mfn, unsigned int page_order);
 
 /* Change types across all p2m entries in a domain */
 void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt);
diff -r 53195719f762 xen/include/xen/paging.h
--- a/xen/include/xen/paging.h	Tue May 13 15:08:17 2008 +0100
+++ b/xen/include/xen/paging.h	Wed May 14 09:47:50 2008 -0500
@@ -18,8 +18,8 @@
 #else
 
 #define paging_mode_translate(d)              (0)
-#define guest_physmap_add_page(d, p, m)       (0)
-#define guest_physmap_remove_page(d, p, m)    ((void)0)
+#define guest_physmap_add_page(d, p, m, o)       (0)
+#define guest_physmap_remove_page(d, p, m, o)    ((void)0)
 
 #endif
 

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2008-05-14 21:22 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-12 12:14 [PATCH] patch to support super page (2M) with EPT Huang2, Wei
  -- strict thread matches above, loose matches on Subject: below --
2008-05-09  9:10 Xin, Xiaohui
2008-05-11 20:33 ` Huang2, Wei
2008-05-12  4:36   ` Huang2, Wei
2008-05-12  5:04     ` Xin, Xiaohui
2008-05-12  7:03       ` Keir Fraser
2008-05-12 17:28         ` Huang2, Wei
2008-05-13  8:46           ` Keir Fraser
2008-05-13 13:36             ` Huang2, Wei
2008-05-13 13:39               ` Li, Xin B
2008-05-13 13:51                 ` Keir Fraser
2008-05-13 15:49                   ` Huang2, Wei
2008-05-14  8:40                     ` Keir Fraser
2008-05-14 21:22                       ` Huang2, Wei

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.