All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Optimizing x86-64 xenlinux using global pages for user mode
@ 2006-08-18 17:19 Nakajima, Jun
  2006-08-18 20:53 ` Aron Griffis
  0 siblings, 1 reply; 2+ messages in thread
From: Nakajima, Jun @ 2006-08-18 17:19 UTC (permalink / raw)
  To: xen-devel; +Cc: Keir Fraser

[-- Attachment #1: Type: text/plain, Size: 1239 bytes --]

This is based on Ian's idea. The attached patch offers measurable
performance improvements by retaining TLB entries for user processes
across transitions between user and kernel mode, which can occur very
frequently in various workloads.

I would like to thank Andrew Theurer for conducting thorough benchmarks
and collecting profile data from them. That was very helpful in making
this patch.

 linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c                |    3 
 linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c                 |    6 
 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h |   21 +--
 xen/arch/x86/flushtlb.c                                        |    9 +
 xen/arch/x86/mm.c                                              |   69 +++++++++-
 xen/arch/x86/x86_64/traps.c                                    |    5 
 xen/include/asm-x86/flushtlb.h                                 |    7 -
 xen/include/asm-x86/shadow.h                                   |   19 +-
 xen/include/asm-x86/x86_64/page.h                              |    2 
 9 files changed, 106 insertions(+), 35 deletions(-)

Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>

Jun
---
Intel Open Source Technology Center 

[-- Attachment #2: global_64.patch --]
[-- Type: application/octet-stream, Size: 14264 bytes --]

diff -r 0e32095a7b46 linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c
--- a/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c	Wed Aug  9 20:34:27 2006
+++ b/linux-2.6-xen-sparse/arch/i386/mm/ioremap-xen.c	Fri Aug 18 09:39:44 2006
@@ -282,9 +282,6 @@
 	area->phys_addr = phys_addr;
 	addr = (void __iomem *) area->addr;
 	flags |= _PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
-#ifdef __x86_64__
-	flags |= _PAGE_USER;
-#endif
 	if (__direct_remap_pfn_range(&init_mm, (unsigned long)addr,
 				     phys_addr>>PAGE_SHIFT,
 				     size, __pgprot(flags), domid)) {
diff -r 0e32095a7b46 linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c
--- a/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c	Wed Aug  9 20:34:27 2006
+++ b/linux-2.6-xen-sparse/arch/x86_64/mm/init-xen.c	Fri Aug 18 09:39:44 2006
@@ -529,7 +529,7 @@
 		mk_kernel_pgd(__pa_symbol(level3_kernel_pgt));
 	level3_kernel_pgt[pud_index(__START_KERNEL_map)] = 
 		__pud(__pa_symbol(level2_kernel_pgt) |
-		      _KERNPG_TABLE | _PAGE_USER);
+		      _KERNPG_TABLE);
 	memcpy((void *)level2_kernel_pgt, page, PAGE_SIZE);
 
 	early_make_page_readonly(init_level4_pgt,
@@ -578,7 +578,7 @@
 			pte_page = alloc_static_page(&phys);
 			early_make_page_readonly(
 				pte_page, XENFEAT_writable_page_tables);
-			set_pmd(pmd, __pmd(phys | _KERNPG_TABLE | _PAGE_USER));
+			set_pmd(pmd, __pmd(phys | _KERNPG_TABLE));
 		} else {
 			addr = page[pmd_index(va)];
 			addr_to_page(addr, pte_page);
@@ -587,7 +587,7 @@
 		if (pte_none(*pte)) {
 			new_pte = pfn_pte(
 				(va - __START_KERNEL_map) >> PAGE_SHIFT, 
-				__pgprot(_KERNPG_TABLE | _PAGE_USER));
+				__pgprot(_KERNPG_TABLE));
 			xen_l1_entry_update(pte, new_pte);
 		}
 		va += PAGE_SIZE;
diff -r 0e32095a7b46 linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h
--- a/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h	Wed Aug  9 20:34:27 2006
+++ b/linux-2.6-xen-sparse/include/asm-x86_64/mach-xen/asm/pgtable.h	Fri Aug 18 09:39:44 2006
@@ -206,7 +206,7 @@
 #define _PAGE_NX        (1UL<<_PAGE_BIT_NX)
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED | _PAGE_DIRTY)
-#define _KERNPG_TABLE	_PAGE_TABLE
+#define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 
 #define _PAGE_CHG_MASK	(PTE_MASK | _PAGE_ACCESSED | _PAGE_DIRTY)
 
@@ -219,22 +219,21 @@
 #define PAGE_READONLY	__pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_NX)
 #define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
 #define __PAGE_KERNEL \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
+	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
 #define __PAGE_KERNEL_EXEC \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER )
+	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED)
 #define __PAGE_KERNEL_NOCACHE \
-	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
+	(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_PCD | _PAGE_ACCESSED | _PAGE_NX)
 #define __PAGE_KERNEL_RO \
-	(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX | _PAGE_USER )
+	(_PAGE_PRESENT | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_NX)
 #define __PAGE_KERNEL_VSYSCALL \
-	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_USER )
+	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED)
 #define __PAGE_KERNEL_VSYSCALL_NOCACHE \
-	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD | _PAGE_USER )
+	(_PAGE_PRESENT | _PAGE_USER | _PAGE_ACCESSED | _PAGE_PCD)
 #define __PAGE_KERNEL_LARGE \
-	(__PAGE_KERNEL | _PAGE_PSE | _PAGE_USER )
+	(__PAGE_KERNEL | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC \
-	(__PAGE_KERNEL_EXEC | _PAGE_PSE | _PAGE_USER )
-
+	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
 
 /*
  * We don't support GLOBAL page in xenolinux64
@@ -424,7 +423,7 @@
    can temporarily clear it. */
 #define pmd_present(x)	(pmd_val(x))
 #define pmd_clear(xp)	do { set_pmd(xp, __pmd(0)); } while (0)
-#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
+#define	pmd_bad(x)	((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT))
 #define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
 #define pmd_pfn(x)  ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)
 
diff -r 0e32095a7b46 xen/arch/x86/flushtlb.c
--- a/xen/arch/x86/flushtlb.c	Wed Aug  9 20:34:27 2006
+++ b/xen/arch/x86/flushtlb.c	Fri Aug 18 09:39:44 2006
@@ -22,7 +22,7 @@
 u32 tlbflush_clock = 1U;
 DEFINE_PER_CPU(u32, tlbflush_time);
 
-void write_cr3(unsigned long cr3)
+void write_cr3(unsigned long cr3, const int flush_only)
 {
     u32 t, t1, t2;
     unsigned long flags;
@@ -57,7 +57,14 @@
      */
 
  skip_clocktick:
+#ifdef __x86_64__
+    __pge_off();
+    if ( flush_only != TLB_FLUSH_ONLY )
+	__asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
+    __pge_on();
+#else
     __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" (cr3) : "memory" );
+#endif
 
     /*
      * STEP 3. Update this CPU's timestamp. Note that this happens *after*
diff -r 0e32095a7b46 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c	Wed Aug  9 20:34:27 2006
+++ b/xen/arch/x86/mm.c	Fri Aug 18 09:39:44 2006
@@ -305,7 +305,7 @@
     /* Fast path 1: does this mfn need a shadow at all? */
     if ( !l3tab_needs_shadow(mfn) )
     {
-        write_cr3(mfn << PAGE_SHIFT);
+        write_cr3(mfn << PAGE_SHIFT, TLB_UPDATE_CR3);
         /* Cache is no longer in use or valid (/after/ write to %cr3). */
         cache->high_mfn = 0;
         return;
@@ -317,7 +317,7 @@
     /* Fast path 2: is this mfn already cached? */
     if ( cache->high_mfn == mfn )
     {
-        write_cr3(__pa(cache->table[cache->inuse_idx]));
+        write_cr3(__pa(cache->table[cache->inuse_idx]), TLB_UPDATE_CR3);
         return;
     }
 
@@ -335,7 +335,7 @@
     *(fix_pae_highmem_pl1e - cpu) = l1e_empty();
 
     /* Install the low-memory L3 table in CR3. */
-    write_cr3(__pa(lowmem_l3tab));
+    write_cr3(__pa(lowmem_l3tab), TLB_UPDATE_CR3);
 
     spin_unlock(&cache->lock);
 }
@@ -344,7 +344,7 @@
 
 static void __write_ptbase(unsigned long mfn)
 {
-    write_cr3(mfn << PAGE_SHIFT);
+    write_cr3(mfn << PAGE_SHIFT, TLB_UPDATE_CR3);
 }
 
 #endif /* !CONFIG_X86_PAE */
@@ -692,6 +692,43 @@
 }
 #endif /* 4 level */
 
+#ifdef __x86_64__
+#define adjust_l1e(pl1e)                                            \
+    do  {                                                           \
+        if ( likely(l1e_get_flags((pl1e)) & _PAGE_PRESENT) )        \
+        {                                                           \
+            if (l1e_get_flags((pl1e)) & _PAGE_USER)                 \
+                l1e_add_flags((pl1e), _PAGE_GLOBAL);                \
+            else                                                    \
+            {                                                       \
+                l1e_remove_flags((pl1e), _PAGE_GLOBAL);             \
+                l1e_add_flags((pl1e), _PAGE_USER);                  \
+            }                                                       \
+        }                                                           \
+    } while ( 0 )
+
+#define adjust_l2e(pl2e)                                        \
+    do {                                                        \
+        if ( likely(l2e_get_flags((pl2e)) & _PAGE_PRESENT) )    \
+            l2e_add_flags((pl2e), _PAGE_USER);                  \
+    } while ( 0 )
+
+#define adjust_l3e(pl3e)                                        \
+    do {                                                        \
+        if ( likely(l3e_get_flags((pl3e)) & _PAGE_PRESENT) )    \
+            l3e_add_flags((pl3e), _PAGE_USER);                  \
+    } while ( 0 )
+
+#define adjust_l4e(pl4e)                                        \
+    do {                                                        \
+        if ( likely(l4e_get_flags((pl4e)) & _PAGE_PRESENT) )    \
+                l4e_add_flags((pl4e), _PAGE_USER);              \
+    } while ( 0 )
+#else
+#define adjust_l1e(_p) ((void)0)
+#define adjust_l2e(_p) ((void)0)
+#define adjust_l3e(_p) ((void)0)
+#endif
 
 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
 {
@@ -788,9 +825,13 @@
     pl1e = map_domain_page(pfn);
 
     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
+    {
         if ( is_guest_l1_slot(i) &&
              unlikely(!get_page_from_l1e(pl1e[i], d)) )
             goto fail;
+
+        adjust_l1e(pl1e[i]);
+    }
 
     unmap_domain_page(pl1e);
     return 1;
@@ -969,6 +1010,8 @@
         if ( is_guest_l2_slot(type, i) &&
              unlikely(!get_page_from_l2e(pl2e[i], pfn, d, vaddr)) )
             goto fail;
+        
+        adjust_l2e(pl2e[i]);
     }
 
 #if CONFIG_PAGING_LEVELS == 2
@@ -1041,6 +1084,8 @@
         if ( is_guest_l3_slot(i) &&
              unlikely(!get_page_from_l3e(pl3e[i], pfn, d, vaddr)) )
             goto fail;
+        
+        adjust_l3e(pl3e[i]);
     }
 
     if ( !create_pae_xen_mappings(pl3e) )
@@ -1085,6 +1130,8 @@
         if ( is_guest_l4_slot(i) &&
              unlikely(!get_page_from_l4e(pl4e[i], pfn, d, vaddr)) )
             goto fail;
+
+        adjust_l4e(pl4e[i]);
     }
 
     /* Xen private mappings. */
@@ -1236,6 +1283,8 @@
                     l1e_get_flags(nl1e) & L1_DISALLOW_MASK);
             return 0;
         }
+
+        adjust_l1e(nl1e);
 
         /* Fast path for identical mapping, r/w and presence. */
         if ( !l1e_has_changed(ol1e, nl1e, _PAGE_RW | _PAGE_PRESENT))
@@ -1306,6 +1355,8 @@
             return 0;
         }
 
+        adjust_l2e(nl2e);
+
         /* Fast path for identical mapping and presence. */
         if ( !l2e_has_changed(ol2e, nl2e, _PAGE_PRESENT))
             return UPDATE_ENTRY(l2, pl2e, ol2e, nl2e);
@@ -1368,6 +1419,8 @@
                     l3e_get_flags(nl3e) & L3_DISALLOW_MASK);
             return 0;
         }
+
+        adjust_l3e(nl3e);
 
         /* Fast path for identical mapping and presence. */
         if (!l3e_has_changed(ol3e, nl3e, _PAGE_PRESENT))
@@ -1434,6 +1487,8 @@
                     l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
             return 0;
         }
+
+        adjust_l4e(nl4e);
 
         /* Fast path for identical mapping and presence. */
         if (!l4e_has_changed(ol4e, nl4e, _PAGE_PRESENT))
@@ -1770,7 +1825,7 @@
             old_base_mfn = pagetable_get_pfn(v->arch.guest_table);
             v->arch.guest_table = pagetable_null();
             update_pagetables(v);
-            write_cr3(__pa(idle_pg_table));
+            write_cr3(__pa(idle_pg_table), TLB_UPDATE_CR3);
             if ( old_base_mfn != 0 )
                 put_page_and_type(mfn_to_page(old_base_mfn));
 
@@ -3270,7 +3325,9 @@
             domain_crash(d);
             break;
         }
-        
+
+        adjust_l1e(l1page[i]);
+
         put_page_from_l1e(ol1e, d);
     }
 
diff -r 0e32095a7b46 xen/arch/x86/x86_64/traps.c
--- a/xen/arch/x86/x86_64/traps.c	Wed Aug  9 20:34:27 2006
+++ b/xen/arch/x86/x86_64/traps.c	Fri Aug 18 09:39:44 2006
@@ -163,7 +163,10 @@
     v->arch.flags ^= TF_kernel_mode;
     __asm__ __volatile__ ( "swapgs" );
     update_pagetables(v);
-    write_ptbase(v);
+
+    __asm__ __volatile__ ( "mov %0, %%cr3" : : "r" 
+                           (pagetable_get_paddr(v->arch.monitor_table))
+                           : "memory" );
 }
 
 unsigned long do_iret(void)
diff -r 0e32095a7b46 xen/include/asm-x86/flushtlb.h
--- a/xen/include/asm-x86/flushtlb.h	Wed Aug  9 20:34:27 2006
+++ b/xen/include/asm-x86/flushtlb.h	Fri Aug 18 09:39:44 2006
@@ -69,12 +69,15 @@
 }
 
 /* Write pagetable base and implicitly tick the tlbflush clock. */
-extern void write_cr3(unsigned long cr3);
+#define TLB_FLUSH_ONLY 1
+#define TLB_UPDATE_CR3 0
+
+extern void write_cr3(unsigned long cr3, const int flush_only);
 
 #define local_flush_tlb()                                         \
     do {                                                          \
         unsigned long cr3 = read_cr3();                           \
-        write_cr3(cr3);                                           \
+        write_cr3(cr3, TLB_FLUSH_ONLY);				  \
     } while ( 0 )
 
 #define local_flush_tlb_pge()                                     \
diff -r 0e32095a7b46 xen/include/asm-x86/shadow.h
--- a/xen/include/asm-x86/shadow.h	Wed Aug  9 20:34:27 2006
+++ b/xen/include/asm-x86/shadow.h	Fri Aug 18 09:39:44 2006
@@ -1752,10 +1752,22 @@
     if ( hvm_guest(v) )
         paging_enabled = hvm_paging_enabled(v);
     else
+    {
+        if ( !shadow_mode_enabled(d) )
+        {
+#if CONFIG_PAGING_LEVELS == 4
+            if ( !(v->arch.flags & TF_kernel_mode) )
+                v->arch.monitor_table = v->arch.guest_table_user;
+            else
+#endif
+                v->arch.monitor_table = v->arch.guest_table;
+            return;
+        }
         // HACK ALERT: there's currently no easy way to figure out if a domU
         // has set its arch.guest_table to zero, vs not yet initialized it.
         //
         paging_enabled = !!pagetable_get_paddr(v->arch.guest_table);
+    }
 
     /*
      * We don't call __update_pagetables() when hvm guest paging is
@@ -1774,13 +1786,6 @@
     {
         if ( shadow_mode_enabled(d) )
             v->arch.monitor_table = v->arch.shadow_table;
-        else
-#if CONFIG_PAGING_LEVELS == 4
-        if ( !(v->arch.flags & TF_kernel_mode) )
-            v->arch.monitor_table = v->arch.guest_table_user;
-        else
-#endif
-            v->arch.monitor_table = v->arch.guest_table;
     }
 }
 
diff -r 0e32095a7b46 xen/include/asm-x86/x86_64/page.h
--- a/xen/include/asm-x86/x86_64/page.h	Wed Aug  9 20:34:27 2006
+++ b/xen/include/asm-x86/x86_64/page.h	Fri Aug 18 09:39:44 2006
@@ -72,7 +72,7 @@
 /* Bit 23 of a 24-bit flag mask. This corresponds to bit 63 of a pte.*/
 #define _PAGE_NX (cpu_has_nx ? (1U<<23) : 0U)
 
-#define L1_DISALLOW_MASK BASE_DISALLOW_MASK
+#define L1_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_GLOBAL)
 #define L2_DISALLOW_MASK BASE_DISALLOW_MASK
 #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK | 0x180U /* must-be-zero */)
 #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK | 0x180U /* must-be-zero */)

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2006-08-18 20:53 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2006-08-18 17:19 [PATCH] Optimizing x86-64 xenlinux using global pages for user mode Nakajima, Jun
2006-08-18 20:53 ` Aron Griffis

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.