All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] x86: add SSE-based copy_page()
@ 2008-11-12  9:37 Jan Beulich
  2008-11-12 14:51 ` Dan Magenheimer
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2008-11-12  9:37 UTC (permalink / raw)
  To: xen-devel

On top of the highmem assistance hypercalls added earlier, this provides
a performance improvement of another 12% (measured on Xeon E5345) for
the page copying case.

Signed-off-by: Jan Beulich <jbeulich@novell.com>

Index: 2008-10-27/xen/arch/x86/Makefile
===================================================================
--- 2008-10-27.orig/xen/arch/x86/Makefile	2008-11-11 16:19:45.000000000 +0100
+++ 2008-10-27/xen/arch/x86/Makefile	2008-11-11 16:18:36.000000000 +0100
@@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64
 obj-y += apic.o
 obj-y += bitops.o
 obj-y += clear_page.o
+obj-y += copy_page.o
 obj-y += compat.o
 obj-y += delay.o
 obj-y += dmi_scan.o
Index: 2008-10-27/xen/arch/x86/copy_page.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ 2008-10-27/xen/arch/x86/copy_page.S	2008-06-03 14:24:57.000000000 +0200
@@ -0,0 +1,66 @@
+#include <xen/config.h>
+#include <asm/page.h>
+
+#ifdef __i386__ /* 32-bit register aliases */
+#define src_reg %esi
+#define dst_reg %edi
+#define WORD_SIZE 4 /* bytes moved by each mov/movnti */
+#define tmp1_reg %eax
+#define tmp2_reg %edx
+#define tmp3_reg %ebx /* pushed/popped by copy_page_sse2 on i386 */
+#define tmp4_reg %ebp /* pushed/popped by copy_page_sse2 on i386 */
+#else /* 64-bit register aliases; %rdi/%rsi are used as they arrive (no stack loads) */
+#define src_reg %rsi
+#define dst_reg %rdi
+#define WORD_SIZE 8 /* bytes moved by each mov/movnti */
+#define tmp1_reg %r8
+#define tmp2_reg %r9
+#define tmp3_reg %r10
+#define tmp4_reg %r11
+#endif
+
+ENTRY(copy_page_sse2) /* copy PAGE_SIZE bytes from src_reg to dst_reg, 4 words per pass, using movnti stores */
+#ifdef __i386__
+        push    %ebx
+        push    %ebp
+        push    %esi
+        push    %edi
+        mov     6*4(%esp), src_reg /* 2nd stack argument: above 4 pushed registers, return address and 1st argument */
+        mov     5*4(%esp), dst_reg /* 1st stack argument: above 4 pushed registers and return address */
+#endif
+        mov     $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx /* 4-word chunks per page minus 3 = passes that prefetch; two more passes follow via jpe */
+
+        prefetchnta 2*4*WORD_SIZE(src_reg) /* prefetch chunk 2 */
+        mov     (src_reg), tmp1_reg /* preload chunk 0 into the four temporaries */
+        mov     WORD_SIZE(src_reg), tmp2_reg
+        mov     2*WORD_SIZE(src_reg), tmp3_reg
+        mov     3*WORD_SIZE(src_reg), tmp4_reg
+
+0:      prefetchnta 3*4*WORD_SIZE(src_reg) /* prefetch 3 chunks ahead of src */
+1:      add     $4*WORD_SIZE, src_reg /* src -> next chunk; label 1 is the loop entry that skips the prefetch */
+        movnti  tmp1_reg, (dst_reg) /* store the held word, then reload the register from the next chunk */
+        mov     (src_reg), tmp1_reg
+        dec     %ecx /* sets the flags tested by jg/jpe below; mov, movnti and lea do not modify flags */
+        movnti  tmp2_reg, WORD_SIZE(dst_reg)
+        mov     WORD_SIZE(src_reg), tmp2_reg
+        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
+        mov     2*WORD_SIZE(src_reg), tmp3_reg
+        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
+        lea     4*WORD_SIZE(dst_reg), dst_reg /* advance dst; lea leaves the flags from dec intact */
+        mov     3*WORD_SIZE(src_reg), tmp4_reg
+        jg      0b /* counter still > 0: another pass with prefetch */
+        jpe     1b /* counter 0 or -1 (low byte 0x00/0xff, even parity): pass without prefetch; -2 (0xfe, odd parity) falls through */
+
+        movnti  tmp1_reg, (dst_reg) /* store the last chunk, loaded by the final loop pass */
+        movnti  tmp2_reg, WORD_SIZE(dst_reg)
+        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
+        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
+
+#ifdef __i386__
+        pop     %edi
+        pop     %esi
+        pop     %ebp
+        pop     %ebx
+#endif
+        sfence /* fence the weakly-ordered movnti stores before returning */
+        ret
Index: 2008-10-27/xen/arch/x86/domain.c
===================================================================
--- 2008-10-27.orig/xen/arch/x86/domain.c	2008-11-11 14:55:44.000000000 +0100
+++ 2008-10-27/xen/arch/x86/domain.c	2008-11-11 16:24:48.000000000 +0100
@@ -183,7 +183,8 @@ static int setup_compat_l4(struct vcpu *
     /* This page needs to look like a pagetable so that it can be shadowed */
     pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
 
-    l4tab = copy_page(page_to_virt(pg), idle_pg_table);
+    l4tab = page_to_virt(pg);
+    copy_page(l4tab, idle_pg_table);
     l4tab[0] = l4e_empty();
     l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
         l4e_from_page(pg, __PAGE_HYPERVISOR);
Index: 2008-10-27/xen/arch/x86/domain_build.c
===================================================================
--- 2008-10-27.orig/xen/arch/x86/domain_build.c	2008-11-11 16:19:45.000000000 +0100
+++ 2008-10-27/xen/arch/x86/domain_build.c	2008-11-11 16:18:36.000000000 +0100
@@ -467,8 +467,9 @@ int __init construct_dom0(
     /* WARNING: The new domain must have its 'processor' field filled in! */
     l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
     l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
-    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
+        copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
+                  idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
         l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
         l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
             l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
Index: 2008-10-27/xen/include/asm-x86/page.h
===================================================================
--- 2008-10-27.orig/xen/include/asm-x86/page.h	2008-11-11 16:19:45.000000000 +0100
+++ 2008-10-27/xen/include/asm-x86/page.h	2008-11-11 16:18:36.000000000 +0100
@@ -215,7 +215,10 @@ void clear_page_sse2(void *);
 #define clear_page(_p)      (cpu_has_xmm2 ?                             \
                              clear_page_sse2((void *)(_p)) :            \
                              (void)memset((void *)(_p), 0, PAGE_SIZE))
-#define copy_page(_t,_f)    memcpy((void *)(_t), (void *)(_f), PAGE_SIZE)
+void copy_page_sse2(void *, const void *); /* (to, from); implemented in assembly in copy_page.S */
+#define copy_page(_t,_f)    (cpu_has_xmm2 ?                             \
+                             copy_page_sse2(_t, _f) :                   \
+                             (void)memcpy(_t, _f, PAGE_SIZE)) /* NOTE(review): unlike the macro this replaces, _t/_f are no longer cast to void * - confirm all callers pass pointers */
 
 #define mfn_valid(mfn)      ((mfn) < max_page)
 

^ permalink raw reply	[flat|nested] 17+ messages in thread
* RE: [PATCH] x86: add SSE-based copy_page()
@ 2009-01-13  8:31 Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2009-01-13  8:31 UTC (permalink / raw)
  To: Keir Fraser, Dexuan Cui, Dan Magenheimer; +Cc: xen-devel

>>> Dan Magenheimer <dan.magenheimer@oracle.com> 13.01.09 00:29 >>>
>I'm guessing the gcc optimizer for the memcpy code was tuned
>for an Intel pipeline... Jan, were you measuring on an
>AMD processor?

Oh, actually my previous reply was without pushing my thinking fully back
to what I was doing (and measuring) back then. I really measured on quad
core Xeons, as what I was looking at were the highmem helpers. Pretty
likely this almost exclusively covered the cold cache (all levels) case, but I
also think this is the most likely scenario.

Jan

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2009-01-13  8:34 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-11-12  9:37 [PATCH] x86: add SSE-based copy_page() Jan Beulich
2008-11-12 14:51 ` Dan Magenheimer
2008-11-12 15:01   ` Jan Beulich
2008-11-12 17:17     ` Dan Magenheimer
2008-11-13  8:37       ` Jan Beulich
2008-11-13 23:41         ` Dan Magenheimer
2008-11-14  3:08           ` Cui, Dexuan
2008-11-14 14:10             ` Dan Magenheimer
2008-11-14 14:16               ` Keir Fraser
2008-11-19 20:24                 ` Dan Magenheimer
2008-11-19 21:21                   ` Keir Fraser
2008-11-20  8:46                     ` Jan Beulich
2009-01-12 23:29                     ` Dan Magenheimer
2009-01-13  8:13                       ` Keir Fraser
2009-01-13  8:34                         ` Jan Beulich
2009-01-13  8:27                       ` Jan Beulich
  -- strict thread matches above, loose matches on Subject: below --
2009-01-13  8:31 Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.