[PATCH] x86: add SSE-based copy

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] x86: add SSE-based copy_page()
@ 2008-11-12  9:37 Jan Beulich
  2008-11-12 14:51 ` Dan Magenheimer
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2008-11-12  9:37 UTC (permalink / raw)
  To: xen-devel

In top of the highmem asstance hypercalls added earlier, this provides
a performance improvement of another 12% (measured on Xeon E5345) for
the page copying case.

Signed-off-by: Jan Beulich <jbeulich@novell.com>

Index: 2008-10-27/xen/arch/x86/Makefile
===================================================================
--- 2008-10-27.orig/xen/arch/x86/Makefile	2008-11-11 16:19:45.000000000 +0100
+++ 2008-10-27/xen/arch/x86/Makefile	2008-11-11 16:18:36.000000000 +0100
@@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64
 obj-y += apic.o
 obj-y += bitops.o
 obj-y += clear_page.o
+obj-y += copy_page.o
 obj-y += compat.o
 obj-y += delay.o
 obj-y += dmi_scan.o
Index: 2008-10-27/xen/arch/x86/copy_page.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ 2008-10-27/xen/arch/x86/copy_page.S	2008-06-03 14:24:57.000000000 +0200
@@ -0,0 +1,66 @@
+#include <xen/config.h>
+#include <asm/page.h>
+
+#ifdef __i386__
+#define src_reg %esi
+#define dst_reg %edi
+#define WORD_SIZE 4
+#define tmp1_reg %eax
+#define tmp2_reg %edx
+#define tmp3_reg %ebx
+#define tmp4_reg %ebp
+#else
+#define src_reg %rsi
+#define dst_reg %rdi
+#define WORD_SIZE 8
+#define tmp1_reg %r8
+#define tmp2_reg %r9
+#define tmp3_reg %r10
+#define tmp4_reg %r11
+#endif
+
+ENTRY(copy_page_sse2)
+#ifdef __i386__
+        push    %ebx
+        push    %ebp
+        push    %esi
+        push    %edi
+        mov     6*4(%esp), src_reg
+        mov     5*4(%esp), dst_reg
+#endif
+        mov     $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx
+
+        prefetchnta 2*4*WORD_SIZE(src_reg)
+        mov     (src_reg), tmp1_reg
+        mov     WORD_SIZE(src_reg), tmp2_reg
+        mov     2*WORD_SIZE(src_reg), tmp3_reg
+        mov     3*WORD_SIZE(src_reg), tmp4_reg
+
+0:      prefetchnta 3*4*WORD_SIZE(src_reg)
+1:      add     $4*WORD_SIZE, src_reg
+        movnti  tmp1_reg, (dst_reg)
+        mov     (src_reg), tmp1_reg
+        dec     %ecx
+        movnti  tmp2_reg, WORD_SIZE(dst_reg)
+        mov     WORD_SIZE(src_reg), tmp2_reg
+        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
+        mov     2*WORD_SIZE(src_reg), tmp3_reg
+        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
+        lea     4*WORD_SIZE(dst_reg), dst_reg
+        mov     3*WORD_SIZE(src_reg), tmp4_reg
+        jg      0b
+        jpe     1b
+
+        movnti  tmp1_reg, (dst_reg)
+        movnti  tmp2_reg, WORD_SIZE(dst_reg)
+        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
+        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
+
+#ifdef __i386__
+        pop     %edi
+        pop     %esi
+        pop     %ebp
+        pop     %ebx
+#endif
+        sfence
+        ret
Index: 2008-10-27/xen/arch/x86/domain.c
===================================================================
--- 2008-10-27.orig/xen/arch/x86/domain.c	2008-11-11 14:55:44.000000000 +0100
+++ 2008-10-27/xen/arch/x86/domain.c	2008-11-11 16:24:48.000000000 +0100
@@ -183,7 +183,8 @@ static int setup_compat_l4(struct vcpu *
     /* This page needs to look like a pagetable so that it can be shadowed */
     pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
 
-    l4tab = copy_page(page_to_virt(pg), idle_pg_table);
+    l4tab = page_to_virt(pg);
+    copy_page(l4tab, idle_pg_table);
     l4tab[0] = l4e_empty();
     l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
         l4e_from_page(pg, __PAGE_HYPERVISOR);
Index: 2008-10-27/xen/arch/x86/domain_build.c
===================================================================
--- 2008-10-27.orig/xen/arch/x86/domain_build.c	2008-11-11 16:19:45.000000000 +0100
+++ 2008-10-27/xen/arch/x86/domain_build.c	2008-11-11 16:18:36.000000000 +0100
@@ -467,8 +467,9 @@ int __init construct_dom0(
     /* WARNING: The new domain must have its 'processor' field filled in! */
     l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc += PAGE_SIZE;
     l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc += 4*PAGE_SIZE;
-    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
-    for (i = 0; i < 4; i++) {
+    for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
+        copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
+                  idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
         l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
         l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
             l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, __PAGE_HYPERVISOR);
Index: 2008-10-27/xen/include/asm-x86/page.h
===================================================================
--- 2008-10-27.orig/xen/include/asm-x86/page.h	2008-11-11 16:19:45.000000000 +0100
+++ 2008-10-27/xen/include/asm-x86/page.h	2008-11-11 16:18:36.000000000 +0100
@@ -215,7 +215,10 @@ void clear_page_sse2(void *);
 #define clear_page(_p)      (cpu_has_xmm2 ?                             \
                              clear_page_sse2((void *)(_p)) :            \
                              (void)memset((void *)(_p), 0, PAGE_SIZE))
-#define copy_page(_t,_f)    memcpy((void *)(_t), (void *)(_f), PAGE_SIZE)
+void copy_page_sse2(void *, const void *);
+#define copy_page(_t,_f)    (cpu_has_xmm2 ?                             \
+                             copy_page_sse2(_t, _f) :                   \
+                             (void)memcpy(_t, _f, PAGE_SIZE))
 
 #define mfn_valid(mfn)      ((mfn) < max_page)
 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-12  9:37 [PATCH] x86: add SSE-based copy_page() Jan Beulich
@ 2008-11-12 14:51 ` Dan Magenheimer
  2008-11-12 15:01   ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Dan Magenheimer @ 2008-11-12 14:51 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

Jan --

I assume the 12% faster is on a benchmark...
Have you measured how much faster the copy_page_sse2
routine (standalond) is than the memcpy?  Is it a
factor of 2?

Thanks,
Dan

> -----Original Message-----
> From: Jan Beulich [mailto:jbeulich@novell.com]
> Sent: Wednesday, November 12, 2008 2:38 AM
> To: xen-devel@lists.xensource.com
> Subject: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
> 
> 
> In top of the highmem asstance hypercalls added earlier, this provides
> a performance improvement of another 12% (measured on Xeon E5345) for
> the page copying case.
> 
> Signed-off-by: Jan Beulich <jbeulich@novell.com>
> 
> Index: 2008-10-27/xen/arch/x86/Makefile
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/Makefile	2008-11-11 
> 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/Makefile	2008-11-11 
> 16:18:36.000000000 +0100
> @@ -11,6 +11,7 @@ subdir-$(x86_64) += x86_64
>  obj-y += apic.o
>  obj-y += bitops.o
>  obj-y += clear_page.o
> +obj-y += copy_page.o
>  obj-y += compat.o
>  obj-y += delay.o
>  obj-y += dmi_scan.o
> Index: 2008-10-27/xen/arch/x86/copy_page.S
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ 2008-10-27/xen/arch/x86/copy_page.S	2008-06-03 
> 14:24:57.000000000 +0200
> @@ -0,0 +1,66 @@
> +#include <xen/config.h>
> +#include <asm/page.h>
> +
> +#ifdef __i386__
> +#define src_reg %esi
> +#define dst_reg %edi
> +#define WORD_SIZE 4
> +#define tmp1_reg %eax
> +#define tmp2_reg %edx
> +#define tmp3_reg %ebx
> +#define tmp4_reg %ebp
> +#else
> +#define src_reg %rsi
> +#define dst_reg %rdi
> +#define WORD_SIZE 8
> +#define tmp1_reg %r8
> +#define tmp2_reg %r9
> +#define tmp3_reg %r10
> +#define tmp4_reg %r11
> +#endif
> +
> +ENTRY(copy_page_sse2)
> +#ifdef __i386__
> +        push    %ebx
> +        push    %ebp
> +        push    %esi
> +        push    %edi
> +        mov     6*4(%esp), src_reg
> +        mov     5*4(%esp), dst_reg
> +#endif
> +        mov     $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx
> +
> +        prefetchnta 2*4*WORD_SIZE(src_reg)
> +        mov     (src_reg), tmp1_reg
> +        mov     WORD_SIZE(src_reg), tmp2_reg
> +        mov     2*WORD_SIZE(src_reg), tmp3_reg
> +        mov     3*WORD_SIZE(src_reg), tmp4_reg
> +
> +0:      prefetchnta 3*4*WORD_SIZE(src_reg)
> +1:      add     $4*WORD_SIZE, src_reg
> +        movnti  tmp1_reg, (dst_reg)
> +        mov     (src_reg), tmp1_reg
> +        dec     %ecx
> +        movnti  tmp2_reg, WORD_SIZE(dst_reg)
> +        mov     WORD_SIZE(src_reg), tmp2_reg
> +        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
> +        mov     2*WORD_SIZE(src_reg), tmp3_reg
> +        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
> +        lea     4*WORD_SIZE(dst_reg), dst_reg
> +        mov     3*WORD_SIZE(src_reg), tmp4_reg
> +        jg      0b
> +        jpe     1b
> +
> +        movnti  tmp1_reg, (dst_reg)
> +        movnti  tmp2_reg, WORD_SIZE(dst_reg)
> +        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
> +        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
> +
> +#ifdef __i386__
> +        pop     %edi
> +        pop     %esi
> +        pop     %ebp
> +        pop     %ebx
> +#endif
> +        sfence
> +        ret
> Index: 2008-10-27/xen/arch/x86/domain.c
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/domain.c	2008-11-11 
> 14:55:44.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/domain.c	2008-11-11 
> 16:24:48.000000000 +0100
> @@ -183,7 +183,8 @@ static int setup_compat_l4(struct vcpu *
>      /* This page needs to look like a pagetable so that it 
> can be shadowed */
>      pg->u.inuse.type_info = PGT_l4_page_table|PGT_validated|1;
>  
> -    l4tab = copy_page(page_to_virt(pg), idle_pg_table);
> +    l4tab = page_to_virt(pg);
> +    copy_page(l4tab, idle_pg_table);
>      l4tab[0] = l4e_empty();
>      l4tab[l4_table_offset(LINEAR_PT_VIRT_START)] =
>          l4e_from_page(pg, __PAGE_HYPERVISOR);
> Index: 2008-10-27/xen/arch/x86/domain_build.c
> ===================================================================
> --- 2008-10-27.orig/xen/arch/x86/domain_build.c	
> 2008-11-11 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/arch/x86/domain_build.c	2008-11-11 
> 16:18:36.000000000 +0100
> @@ -467,8 +467,9 @@ int __init construct_dom0(
>      /* WARNING: The new domain must have its 'processor' 
> field filled in! */
>      l3start = l3tab = (l3_pgentry_t *)mpt_alloc; mpt_alloc 
> += PAGE_SIZE;
>      l2start = l2tab = (l2_pgentry_t *)mpt_alloc; mpt_alloc 
> += 4*PAGE_SIZE;
> -    memcpy(l2tab, idle_pg_table_l2, 4*PAGE_SIZE);
> -    for (i = 0; i < 4; i++) {
> +    for (i = 0; i < L3_PAGETABLE_ENTRIES; i++) {
> +        copy_page(l2tab + i * L2_PAGETABLE_ENTRIES,
> +                  idle_pg_table_l2 + i * L2_PAGETABLE_ENTRIES);
>          l3tab[i] = l3e_from_paddr((u32)l2tab + i*PAGE_SIZE, L3_PROT);
>          l2tab[(LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT)+i] =
>              l2e_from_paddr((u32)l2tab + i*PAGE_SIZE, 
> __PAGE_HYPERVISOR);
> Index: 2008-10-27/xen/include/asm-x86/page.h
> ===================================================================
> --- 2008-10-27.orig/xen/include/asm-x86/page.h	
> 2008-11-11 16:19:45.000000000 +0100
> +++ 2008-10-27/xen/include/asm-x86/page.h	2008-11-11 
> 16:18:36.000000000 +0100
> @@ -215,7 +215,10 @@ void clear_page_sse2(void *);
>  #define clear_page(_p)      (cpu_has_xmm2 ?                  
>            \
>                               clear_page_sse2((void *)(_p)) : 
>            \
>                               (void)memset((void *)(_p), 0, 
> PAGE_SIZE))
> -#define copy_page(_t,_f)    memcpy((void *)(_t), (void 
> *)(_f), PAGE_SIZE)
> +void copy_page_sse2(void *, const void *);
> +#define copy_page(_t,_f)    (cpu_has_xmm2 ?                  
>            \
> +                             copy_page_sse2(_t, _f) :        
>            \
> +                             (void)memcpy(_t, _f, PAGE_SIZE))
>  
>  #define mfn_valid(mfn)      ((mfn) < max_page)
>  
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-12 14:51 ` Dan Magenheimer
@ 2008-11-12 15:01   ` Jan Beulich
  2008-11-12 17:17     ` Dan Magenheimer
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2008-11-12 15:01 UTC (permalink / raw)
  To: xen-devel, Dan Magenheimer

>>> Dan Magenheimer <dan.magenheimer@oracle.com> 12.11.08 15:51 >>>
>I assume the 12% faster is on a benchmark...

It's the win for an application doing nothing but dirtying private mappings
of a file. That seemed like the least overhead test that wouldn't require any
special testing code in kernel or hypervisor.

>Have you measured how much faster the copy_page_sse2
>routine (standalond) is than the memcpy?  Is it a
>factor of 2?

No, I didn't.

Jan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-12 15:01   ` Jan Beulich
@ 2008-11-12 17:17     ` Dan Magenheimer
  2008-11-13  8:37       ` Jan Beulich
  0 siblings, 1 reply; 17+ messages in thread
From: Dan Magenheimer @ 2008-11-12 17:17 UTC (permalink / raw)
  To: Jan Beulich, xen-devel

> From: Jan Beulich [mailto:jbeulich@novell.com]
> 
> >>> Dan Magenheimer <dan.magenheimer@oracle.com> 12.11.08 15:51 >>>
> >I assume the 12% faster is on a benchmark...
> 
> It's the win for an application doing nothing but dirtying 
> private mappings
> of a file. That seemed like the least overhead test that 
> wouldn't require any
> special testing code in kernel or hypervisor.
> 
> >Have you measured how much faster the copy_page_sse2
> >routine (standalond) is than the memcpy?  Is it a
> >factor of 2?
> 
> No, I didn't.

Hmmm... I'm working on a project that does extensive page-copying
so was eager to give it a spin on two test machines, one a Core 2 Duo
("Weybridge"), the other an as-yet-unreleased Intel box.  I measured
the routine with rdtsc, took many thousands of samples, and
look at the smallest measurement.  The hypervisor measured is
64-bit so "cpu_has_xmm2" appears to always be true.

On the first machine, the change to use sse2 instructions
made no difference.  On the second machine, using sse2 actually
made copy_page() *worse* (by 30-40%).

I'm poor enough with the x86 instruction set that I can't explain
my results, but thought I would report them.  I'm not doubting that
you saw improvements on your box, just noting that YMMV.

Perhaps someone from Intel familiar with the microarchitectures
might be able to explain (and can query me offlist to identify
the as-yet-unreleased box).

Dan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-12 17:17     ` Dan Magenheimer
@ 2008-11-13  8:37       ` Jan Beulich
  2008-11-13 23:41         ` Dan Magenheimer
  0 siblings, 1 reply; 17+ messages in thread
From: Jan Beulich @ 2008-11-13  8:37 UTC (permalink / raw)
  To: Dan Magenheimer; +Cc: xen-devel

>>> Dan Magenheimer <dan.magenheimer@oracle.com> 12.11.08 18:17 >>>
>Hmmm... I'm working on a project that does extensive page-copying
>so was eager to give it a spin on two test machines, one a Core 2 Duo
>("Weybridge"), the other an as-yet-unreleased Intel box.  I measured
>the routine with rdtsc, took many thousands of samples, and
>look at the smallest measurement.  The hypervisor measured is
>64-bit so "cpu_has_xmm2" appears to always be true.
>
>On the first machine, the change to use sse2 instructions
>made no difference.  On the second machine, using sse2 actually
>made copy_page() *worse* (by 30-40%).

This very much depends on whether the page(s) are in any caches - in
the general case (e.g. when dealing with large sets of data, or data
just read from disk), you'd expect both pages (source and destination)
not to be in any cache. This is where using the streaming instructions
helps.

However, when dealing with a small set of pages (or even just a single
source/destination pair), you'd easily run entirely on L1 or L2 data, which
certainly performs better using the non-streaming instructions.

>I'm poor enough with the x86 instruction set that I can't explain
>my results, but thought I would report them.  I'm not doubting that
>you saw improvements on your box, just noting that YMMV.
>
>Perhaps someone from Intel familiar with the microarchitectures
>might be able to explain (and can query me offlist to identify
>the as-yet-unreleased box).

The above is not to say that there are other reasons why this would
perform worse on as-yet-unreleased hardware (which I wasn't able
to test on and hence can't say anything about).

Jan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-13  8:37       ` Jan Beulich
@ 2008-11-13 23:41         ` Dan Magenheimer
  2008-11-14  3:08           ` Cui, Dexuan
  0 siblings, 1 reply; 17+ messages in thread
From: Dan Magenheimer @ 2008-11-13 23:41 UTC (permalink / raw)
  To: Jan Beulich; +Cc: xen-devel

> This very much depends on whether the page(s) are in any caches - in
> the general case (e.g. when dealing with large sets of data, or data
> just read from disk), you'd expect both pages (source and destination)
> not to be in any cache. This is where using the streaming instructions
> helps.
> 
> However, when dealing with a small set of pages (or even just a single
> source/destination pair), you'd easily run entirely on L1 or 
> L2 data, which
> certainly performs better using the non-streaming instructions.

Is there a way to force-flush-cache on a page full of data?  (I knew
how to do this once on ia64...)  I can easily measure warm-start but
am not sure how to measure cold-start without some kind of flush.

Thanks,
Dan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-13 23:41         ` Dan Magenheimer
@ 2008-11-14  3:08           ` Cui, Dexuan
  2008-11-14 14:10             ` Dan Magenheimer
  0 siblings, 1 reply; 17+ messages in thread
From: Cui, Dexuan @ 2008-11-14  3:08 UTC (permalink / raw)
  To: Dan Magenheimer, Jan Beulich; +Cc: xen-devel@lists.xensource.com

[-- Attachment #1: Type: text/plain, Size: 1376 bytes --]

Hi Dan,
clflush(flush a cache line; you may need a loop to flush a page.) or wbinvd(flush all the caches) should be what you need to flush caches on x86.

Thanks,
-- Dexuan


-----Original Message-----
From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of Dan Magenheimer
Sent: 2008年11月14日 7:42
To: Jan Beulich
Cc: xen-devel@lists.xensource.com
Subject: RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()

> This very much depends on whether the page(s) are in any caches - in
> the general case (e.g. when dealing with large sets of data, or data
> just read from disk), you'd expect both pages (source and destination)
> not to be in any cache. This is where using the streaming instructions
> helps.
> 
> However, when dealing with a small set of pages (or even just a single
> source/destination pair), you'd easily run entirely on L1 or 
> L2 data, which
> certainly performs better using the non-streaming instructions.

Is there a way to force-flush-cache on a page full of data?  (I knew
how to do this once on ia64...)  I can easily measure warm-start but
am not sure how to measure cold-start without some kind of flush.

Thanks,
Dan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

[-- Attachment #2: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-14  3:08           ` Cui, Dexuan
@ 2008-11-14 14:10             ` Dan Magenheimer
  2008-11-14 14:16               ` Keir Fraser
  0 siblings, 1 reply; 17+ messages in thread
From: Dan Magenheimer @ 2008-11-14 14:10 UTC (permalink / raw)
  To: Cui, Dexuan, Jan Beulich; +Cc: xen-devel

Thanks Dexuan!  I don't want to flush the TLB (which
wbinvd does) so it looks like the way to flush cache
on a page in Xen is:

#include <xen/flushtlb.h>
flush_area_local(va,FLUSH_CACHE|FLUSH_ORDER(0))


> -----Original Message-----
> From: Cui, Dexuan [mailto:dexuan.cui@intel.com]
> Sent: Thursday, November 13, 2008 8:08 PM
> To: Dan Magenheimer; Jan Beulich
> Cc: xen-devel@lists.xensource.com
> Subject: RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
>
>
> Hi Dan,
> clflush(flush a cache line; you may need a loop to flush a
> page.) or wbinvd(flush all the caches) should be what you
> need to flush caches on x86.
>
> Thanks,
> -- Dexuan
>
>
> -----Original Message-----
> From: xen-devel-bounces@lists.xensource.com
> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of
> Dan Magenheimer
> Sent: 2008年11月14日 7:42
> To: Jan Beulich
> Cc: xen-devel@lists.xensource.com
> Subject: RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
>
> > This very much depends on whether the page(s) are in any caches - in
> > the general case (e.g. when dealing with large sets of data, or data
> > just read from disk), you'd expect both pages (source and
> destination)
> > not to be in any cache. This is where using the streaming
> instructions
> > helps.
> >
> > However, when dealing with a small set of pages (or even
> just a single
> > source/destination pair), you'd easily run entirely on L1 or
> > L2 data, which
> > certainly performs better using the non-streaming instructions.
>
> Is there a way to force-flush-cache on a page full of data?  (I knew
> how to do this once on ia64...)  I can easily measure warm-start but
> am not sure how to measure cold-start without some kind of flush.
>
> Thanks,
> Dan
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] x86: add SSE-based copy_page()
  2008-11-14 14:10             ` Dan Magenheimer
@ 2008-11-14 14:16               ` Keir Fraser
  2008-11-19 20:24                 ` Dan Magenheimer
  0 siblings, 1 reply; 17+ messages in thread
From: Keir Fraser @ 2008-11-14 14:16 UTC (permalink / raw)
  To: Dan Magenheimer, Cui, Dexuan, Jan Beulich; +Cc: xen-devel

That should indeed do the job.

 -- Keir

On 14/11/08 14:10, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> Thanks Dexuan!  I don't want to flush the TLB (which
> wbinvd does) so it looks like the way to flush cache
> on a page in Xen is:
> 
> #include <xen/flushtlb.h>
> flush_area_local(va,FLUSH_CACHE|FLUSH_ORDER(0))
> 
> 
>> -----Original Message-----
>> From: Cui, Dexuan [mailto:dexuan.cui@intel.com]
>> Sent: Thursday, November 13, 2008 8:08 PM
>> To: Dan Magenheimer; Jan Beulich
>> Cc: xen-devel@lists.xensource.com
>> Subject: RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
>> 
>> 
>> Hi Dan,
>> clflush(flush a cache line; you may need a loop to flush a
>> page.) or wbinvd(flush all the caches) should be what you
>> need to flush caches on x86.
>> 
>> Thanks,
>> -- Dexuan
>> 
>> 
>> -----Original Message-----
>> From: xen-devel-bounces@lists.xensource.com
>> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of
>> Dan Magenheimer
>> Sent: 2008年11月14日 7:42
>> To: Jan Beulich
>> Cc: xen-devel@lists.xensource.com
>> Subject: RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
>> 
>>> This very much depends on whether the page(s) are in any caches - in
>>> the general case (e.g. when dealing with large sets of data, or data
>>> just read from disk), you'd expect both pages (source and
>> destination)
>>> not to be in any cache. This is where using the streaming
>> instructions
>>> helps.
>>> 
>>> However, when dealing with a small set of pages (or even
>> just a single
>>> source/destination pair), you'd easily run entirely on L1 or
>>> L2 data, which
>>> certainly performs better using the non-streaming instructions.
>> 
>> Is there a way to force-flush-cache on a page full of data?  (I knew
>> how to do this once on ia64...)  I can easily measure warm-start but
>> am not sure how to measure cold-start without some kind of flush.
>> 
>> Thanks,
>> Dan
>> 
>> _______________________________________________
>> Xen-devel mailing list
>> Xen-devel@lists.xensource.com
>> http://lists.xensource.com/xen-devel
>> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-14 14:16               ` Keir Fraser
@ 2008-11-19 20:24                 ` Dan Magenheimer
  2008-11-19 21:21                   ` Keir Fraser
  0 siblings, 1 reply; 17+ messages in thread
From: Dan Magenheimer @ 2008-11-19 20:24 UTC (permalink / raw)
  To: Keir Fraser, Cui, Dexuan, Jan Beulich; +Cc: xen-devel

I haven't had a chance to test this further yet,
but I see the patch was already taken (c/s 18772).

Why, given that performance gets worse under some
circumstances?  At least maybe there should be two
interfaces: copy_page_cold_cache() and
copy_page_warm_cache() rather than just assume?

I'll post measurements when I get a chance to test,
but bring this up as a placeholder for now.

Thanks,
Dan

> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Friday, November 14, 2008 7:16 AM
> To: Dan Magenheimer; Cui, Dexuan; Jan Beulich
> Cc: xen-devel@lists.xensource.com
> Subject: Re: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
>
>
> That should indeed do the job.
>
>  -- Keir
>
> On 14/11/08 14:10, "Dan Magenheimer"
> <dan.magenheimer@oracle.com> wrote:
>
> > Thanks Dexuan!  I don't want to flush the TLB (which
> > wbinvd does) so it looks like the way to flush cache
> > on a page in Xen is:
> >
> > #include <xen/flushtlb.h>
> > flush_area_local(va,FLUSH_CACHE|FLUSH_ORDER(0))
> >
> >
> >> -----Original Message-----
> >> From: Cui, Dexuan [mailto:dexuan.cui@intel.com]
> >> Sent: Thursday, November 13, 2008 8:08 PM
> >> To: Dan Magenheimer; Jan Beulich
> >> Cc: xen-devel@lists.xensource.com
> >> Subject: RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
> >>
> >>
> >> Hi Dan,
> >> clflush(flush a cache line; you may need a loop to flush a
> >> page.) or wbinvd(flush all the caches) should be what you
> >> need to flush caches on x86.
> >>
> >> Thanks,
> >> -- Dexuan
> >>
> >>
> >> -----Original Message-----
> >> From: xen-devel-bounces@lists.xensource.com
> >> [mailto:xen-devel-bounces@lists.xensource.com] On Behalf Of
> >> Dan Magenheimer
> >> Sent: 2008年11月14日 7:42
> >> To: Jan Beulich
> >> Cc: xen-devel@lists.xensource.com
> >> Subject: RE: [Xen-devel] [PATCH] x86: add SSE-based copy_page()
> >>
> >>> This very much depends on whether the page(s) are in any
> caches - in
> >>> the general case (e.g. when dealing with large sets of
> data, or data
> >>> just read from disk), you'd expect both pages (source and
> >> destination)
> >>> not to be in any cache. This is where using the streaming
> >> instructions
> >>> helps.
> >>>
> >>> However, when dealing with a small set of pages (or even
> >> just a single
> >>> source/destination pair), you'd easily run entirely on L1 or
> >>> L2 data, which
> >>> certainly performs better using the non-streaming instructions.
> >>
> >> Is there a way to force-flush-cache on a page full of
> data?  (I knew
> >> how to do this once on ia64...)  I can easily measure
> warm-start but
> >> am not sure how to measure cold-start without some kind of flush.
> >>
> >> Thanks,
> >> Dan
> >>
> >> _______________________________________________
> >> Xen-devel mailing list
> >> Xen-devel@lists.xensource.com
> >> http://lists.xensource.com/xen-devel
> >>
> >
> > _______________________________________________
> > Xen-devel mailing list
> > Xen-devel@lists.xensource.com
> > http://lists.xensource.com/xen-devel
>
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel
>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] x86: add SSE-based copy_page()
  2008-11-19 20:24                 ` Dan Magenheimer
@ 2008-11-19 21:21                   ` Keir Fraser
  2008-11-20  8:46                     ` Jan Beulich
  2009-01-12 23:29                     ` Dan Magenheimer
  0 siblings, 2 replies; 17+ messages in thread
From: Keir Fraser @ 2008-11-19 21:21 UTC (permalink / raw)
  To: Dan Magenheimer, Cui, Dexuan, Jan Beulich; +Cc: xen-devel

On 19/11/08 20:24, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> I haven't had a chance to test this further yet,
> but I see the patch was already taken (c/s 18772).
> 
> Why, given that performance gets worse under some
> circumstances?  At least maybe there should be two
> interfaces: copy_page_cold_cache() and
> copy_page_warm_cache() rather than just assume?
> 
> I'll post measurements when I get a chance to test,
> but bring this up as a placeholder for now.

If more extensive testing shows it not to be a win in general then we can
revert the patch.

Looking at c/s 18724 again also I wonder what fixmap_domain_page() was
introduced for, rather than just using map_domain_page(). Is it an attempt
to reduce global TLB shootdowns? It's not very clear nor very pretty and I'd
rather just kill it and use map_domain_page().

 -- Keir

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] x86: add SSE-based copy_page()
  2008-11-19 21:21                   ` Keir Fraser
@ 2008-11-20  8:46                     ` Jan Beulich
  2009-01-12 23:29                     ` Dan Magenheimer
  1 sibling, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2008-11-20  8:46 UTC (permalink / raw)
  To: Keir Fraser; +Cc: Dan Magenheimer, xen-devel, Dexuan Cui

>>> Keir Fraser <keir.fraser@eu.citrix.com> 19.11.08 22:21 >>>
>Looking at c/s 18724 again also I wonder what fixmap_domain_page() was
>introduced for, rather than just using map_domain_page(). Is it an attempt
>to reduce global TLB shootdowns? It's not very clear nor very pretty and I'd
>rather just kill it and use map_domain_page().

Yes, this and avoiding the need to take another lock.

Jan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2008-11-19 21:21                   ` Keir Fraser
  2008-11-20  8:46                     ` Jan Beulich
@ 2009-01-12 23:29                     ` Dan Magenheimer
  2009-01-13  8:13                       ` Keir Fraser
  2009-01-13  8:27                       ` Jan Beulich
  1 sibling, 2 replies; 17+ messages in thread
From: Dan Magenheimer @ 2009-01-12 23:29 UTC (permalink / raw)
  To: Keir Fraser, Cui, Dexuan, Jan Beulich; +Cc: xen-devel

> On 19/11/08 20:24, "Dan Magenheimer" 
> <dan.magenheimer@oracle.com> wrote:
> 
> > I haven't had a chance to test this further yet,
> > but I see the patch was already taken (c/s 18772).
> > 
> > Why, given that performance gets worse under some
> > circumstances?  At least maybe there should be two
> > interfaces: copy_page_cold_cache() and
> > copy_page_warm_cache() rather than just assume?
> > 
> > I'll post measurements when I get a chance to test,
> > but bring this up as a placeholder for now.
> 
> If more extensive testing shows it not to be a win in general 
> then we can
> revert the patch.
> 
>  -- Keir

I finally got around to measuring this.  On my two machines,
an Intel "Weybridge" box and an Intel TBD quadcore box,
the new sse2 code was at best nearly the same for cold cache
and much worse for warm cache.

I can't explain the sampling variation as I have interrupts off,
a lock held, and pre-warmed TLB... I suppose maybe another
processor could be causing rare TLB misses?  But in any case
the min number is probably best for comparison.

I'm guessing the gcc optimizer for the memcpy code was tuned
for an Intel pipeline... Jan, were you measuring on an
AMD processor?

I've included the raw data and measurement code below.

Dan (whose reason for interest in page-copy performance is now public)

=================

Dual core:

(XEN) Cycles for cold sse2: avg=5811, max=25839, min=4383, samples=208965
(XEN) Cycles for hot sse2: avg=2177, max=19665, min=1980, samples=208965
(XEN) Cycles for cold memcpy: avg=6125, max=27171, min=3969, samples=208965
(XEN) Cycles for hot memcpy: avg=668, max=17460, min=594, samples=208965

Quad core:

(raw numbers removed pending Intel OK, but the ratios reinforce
my claim)

Measurement code:

/* interrupts are off and lock is held */
void tmem_copy_page(char *to, char*from)
{
    *to = *from;  /* don't measure TLB misses */
    flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
    flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
    START_CYC_COUNTER(pg_copy1);
    copy_page_sse2(to, from);  /* cold cache */
    END_CYC_COUNTER(pg_copy1);
    START_CYC_COUNTER(pg_copy2);
    copy_page_sse2(to, from);  /* hot cache */
    END_CYC_COUNTER(pg_copy2);
    flush_area_local(to,FLUSH_CACHE|FLUSH_ORDER(0));
    flush_area_local(from,FLUSH_CACHE|FLUSH_ORDER(0));
    START_CYC_COUNTER(pg_copy3);
    memcpy(to, from, PAGE_SIZE);  /* cold cache */
    END_CYC_COUNTER(pg_copy3);
    START_CYC_COUNTER(pg_copy4);
    memcpy(to, from, PAGE_SIZE); /* hot cache */
    END_CYC_COUNTER(pg_copy4);
}

#define START_CYC_COUNTER(x) x##_start = get_cycles()
#define END_CYC_COUNTER(x) \
do { \
    x##_start = (int32_t)get_cycles() - x##_start; \
    if ((int32_t)(x##_start) < 0) x##_start = -x##_start; \
    if (x##_start < 10000000) { /* ignore context switches etc */ \
     x##_sum_cycles += x##_start; x##_count++; \
     if (x##_start < x##_min_cycles) x##_min_cycles = x##_start; \
     if (x##_start > x##_max_cycles) x##_max_cycles = x##_start; \
    } \
} while (0)
#define PRINTK_CYC_COUNTER(x,text) \
  if (x##_count) printk(text" avg=%"PRIu64", max=%"PRId32", " \
  "min=%"PRId32", samples=%"PRIu64"\n", \
  x##_sum_cycles ? (x##_sum_cycles/x##_count) : 0, \
  x##_max_cycles, x##_min_cycles, x##_count)

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] x86: add SSE-based copy_page()
  2009-01-12 23:29                     ` Dan Magenheimer
@ 2009-01-13  8:13                       ` Keir Fraser
  2009-01-13  8:34                         ` Jan Beulich
  2009-01-13  8:27                       ` Jan Beulich
  1 sibling, 1 reply; 17+ messages in thread
From: Keir Fraser @ 2009-01-13  8:13 UTC (permalink / raw)
  To: Dan Magenheimer, Cui, Dexuan, Jan Beulich; +Cc: xen-devel

On 12/01/2009 23:29, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> I finally got around to measuring this.  On my two machines,
> an Intel "Weybridge" box and an Intel TBD quadcore box,
> the new sse2 code was at best nearly the same for cold cache
> and much worse for warm cache.
> 
> I can't explain the sampling variation as I have interrupts off,
> a lock held, and pre-warmed TLB... I suppose maybe another
> processor could be causing rare TLB misses?  But in any case
> the min number is probably best for comparison.
> 
> I'm guessing the gcc optimizer for the memcpy code was tuned
> for an Intel pipeline... Jan, were you measuring on an
> AMD processor?
> 
> I've included the raw data and measurement code below.

Seems like unless we dynamically choose the copy routine, we're better off
without the SSE2 alternative. Shall I revert it then?

 -- Keir

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
  2009-01-12 23:29                     ` Dan Magenheimer
  2009-01-13  8:13                       ` Keir Fraser
@ 2009-01-13  8:27                       ` Jan Beulich
  1 sibling, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2009-01-13  8:27 UTC (permalink / raw)
  To: Keir Fraser, Dexuan Cui, Dan Magenheimer; +Cc: xen-devel

>>> Dan Magenheimer <dan.magenheimer@oracle.com> 13.01.09 00:29 >>>
>I'm guessing the gcc optimizer for the memcpy code was tuned
>for an Intel pipeline... Jan, were you measuring on an
>AMD processor?

Indeed.

Jan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH] x86: add SSE-based copy_page()
@ 2009-01-13  8:31 Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2009-01-13  8:31 UTC (permalink / raw)
  To: Keir Fraser, Dexuan Cui, Dan Magenheimer; +Cc: xen-devel

>>> Dan Magenheimer <dan.magenheimer@oracle.com> 13.01.09 00:29 >>>
>I'm guessing the gcc optimizer for the memcpy code was tuned
>for an Intel pipeline... Jan, were you measuring on an
>AMD processor?

Oh, actually my previous reply was without pushing my thinking fully back
to what I was doing (and measuring) back then. I really measured on quad
core Xeons, as what I was looking at were the highmem helpers. Pretty
likely this almost exclusively covered the cold cache (all levels) case, but I
also think this is the most likely scenario.

Jan

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] x86: add SSE-based copy_page()
  2009-01-13  8:13                       ` Keir Fraser
@ 2009-01-13  8:34                         ` Jan Beulich
  0 siblings, 0 replies; 17+ messages in thread
From: Jan Beulich @ 2009-01-13  8:34 UTC (permalink / raw)
  To: Keir Fraser, Dexuan Cui, Dan Magenheimer; +Cc: xen-devel

>>> Keir Fraser <keir.fraser@eu.citrix.com> 13.01.09 09:13 >>>
>Seems like unless we dynamically choose the copy routine, we're better off
>without the SSE2 alternative. Shall I revert it then?

I'd vote for not reverting it - Xen itself doesn't use copy_page() much, and
whether a domain wants to use MMUEXT_COPY_PAGE could be decided by
it based on its own judgement.

Ultimately we should improve it as you say to dynamically select the copy
method, but then the question really is whether to base the selection on
static CPU information, best cold, or best warm cache performance.

Jan

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2009-01-13  8:34 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-11-12  9:37 [PATCH] x86: add SSE-based copy_page() Jan Beulich
2008-11-12 14:51 ` Dan Magenheimer
2008-11-12 15:01   ` Jan Beulich
2008-11-12 17:17     ` Dan Magenheimer
2008-11-13  8:37       ` Jan Beulich
2008-11-13 23:41         ` Dan Magenheimer
2008-11-14  3:08           ` Cui, Dexuan
2008-11-14 14:10             ` Dan Magenheimer
2008-11-14 14:16               ` Keir Fraser
2008-11-19 20:24                 ` Dan Magenheimer
2008-11-19 21:21                   ` Keir Fraser
2008-11-20  8:46                     ` Jan Beulich
2009-01-12 23:29                     ` Dan Magenheimer
2009-01-13  8:13                       ` Keir Fraser
2009-01-13  8:34                         ` Jan Beulich
2009-01-13  8:27                       ` Jan Beulich
  -- strict thread matches above, loose matches on Subject: below --
2009-01-13  8:31 Jan Beulich

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.