public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
@ 2011-06-17 22:51 ling.ma
  2011-06-22 20:05 ` Andi Kleen
  0 siblings, 1 reply; 10+ messages in thread
From: ling.ma @ 2011-06-17 22:51 UTC (permalink / raw)
  To: mingo; +Cc: hpa, tglx, linux-kernel, ling.ma

From: Ma Ling <ling.ma@intel.com>

Programs' temporal & spatial locality introduced the cache unit to overcome
the processor-memory performance gap; hardware prefetch is very important
for improving performance by reducing cache misses. Modern CPU micro-architectures
mainly support two kinds of prefetch mechanisms in the L1 data cache:

a. Data cache unit (DCU) prefetcher. Data spatial locality asks us to provide
   adjacent data while handling current data. A larger cache line size
   is one choice, but it would cause more cached data to be evicted and latency
   to load, so we simply prefetch next line when accessing current data.
   This mode only prefetch data of ascending address.
  
b. Instruction pointer (IP)- based strided prefetcher. Based on Load/write
   instruction address, the mechanism predicts and prefetches data with an
   adaptive stride, covering both ascending and descending addresses.

DCU mode is good when the time a program spends operating on data is longer
than that of prefetching the next line; however, the copy-page function breaks
that assumption, so DCU mode is hardly helpful. Especially when we append software
prefetch and the data is already in cache, bus traffic becomes busier, which impacts performance seriously.

In this patch we introduce backward copy to successfully avoid HW prefetch
impact (DCU prefetcher), and simplify the original code.
The performance is improved about 15% on core2, 36% on snb respectively.
(We use our micro-benchmark, and will do further test according to your requirement)

Thanks
Ling

---
 arch/x86/lib/copy_page_64.S |  124 +++++++++++++++++++-----------------------
 1 files changed, 56 insertions(+), 68 deletions(-)

diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 6fec2d1..3d17280 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,4 +1,5 @@
 /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+/* Updated 2011 by Ma Ling to introduce backward copy */
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
@@ -17,83 +18,70 @@ ENDPROC(copy_page_c)
 	    
 /* Could vary the prefetch distance based on SMP/UP */
 
+/* 
+ * By backward copy we manage to reduce impact from HW prefetch
+ * when data is in L1 cache, and get benefit when data is not in L1 cache.
+ */
 ENTRY(copy_page)
 	CFI_STARTPROC
-	subq	$3*8,%rsp
-	CFI_ADJUST_CFA_OFFSET 3*8
-	movq	%rbx,(%rsp)
-	CFI_REL_OFFSET rbx, 0
-	movq	%r12,1*8(%rsp)
-	CFI_REL_OFFSET r12, 1*8
-	movq	%r13,2*8(%rsp)
-	CFI_REL_OFFSET r13, 2*8
-
-	movl	$(4096/64)-5,%ecx
-	.p2align 4
+	lea	4096(%rsi), %rsi
+	lea	4096(%rdi), %rdi
+	mov	$(4096/64)-5,	%cl
+	mov	$5,	%dl
+	/*
+	 * Nop force following instruction to be 16 bytes aligned.
+	 */
+	nop
 .Loop64:
-  	dec     %rcx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	prefetcht0 5*64(%rsi)
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-
-	leaq    64 (%rsi), %rsi
-	leaq    64 (%rdi), %rdi
+	prefetchnta	-5*64(%rsi)
+  	dec	%cl
+
+	movq	-0x8*1(%rsi),	%rax
+	movq	-0x8*2(%rsi),	%r8
+	movq	-0x8*3(%rsi),	%r9
+	movq	-0x8*4(%rsi),	%r10
+	movq	%rax,	-0x8*1(%rdi)
+	movq	%r8,	-0x8*2(%rdi)
+	movq	%r9,	-0x8*3(%rdi)
+	movq	%r10,	-0x8*4(%rdi)
+
+	movq	-0x8*5(%rsi),	%rax
+	movq	-0x8*6(%rsi),	%r8
+	movq	-0x8*7(%rsi),	%r9
+	movq	-0x8*8(%rsi),	%r10
+	leaq	-64(%rsi),	%rsi
+	movq	%rax,	-0x8*5(%rdi)
+	movq	%r8,	-0x8*6(%rdi)
+	movq	%r9,	-0x8*7(%rdi)
+	movq	%r10,	-0x8*8(%rdi)
+	leaq	-64(%rdi),	%rdi
 
 	jnz     .Loop64
 
-	movl	$5,%ecx
-	.p2align 4
 .Loop2:
-	decl   %ecx
-
-	movq        (%rsi), %rax
-	movq      8 (%rsi), %rbx
-	movq     16 (%rsi), %rdx
-	movq     24 (%rsi), %r8
-	movq     32 (%rsi), %r9
-	movq     40 (%rsi), %r10
-	movq     48 (%rsi), %r11
-	movq     56 (%rsi), %r12
-
-	movq     %rax,    (%rdi)
-	movq     %rbx,  8 (%rdi)
-	movq     %rdx, 16 (%rdi)
-	movq     %r8,  24 (%rdi)
-	movq     %r9,  32 (%rdi)
-	movq     %r10, 40 (%rdi)
-	movq     %r11, 48 (%rdi)
-	movq     %r12, 56 (%rdi)
-
-	leaq	64(%rdi),%rdi
-	leaq	64(%rsi),%rsi
-
+	dec	%dl
+
+	movq	-0x8*1(%rsi),	%rax
+	movq	-0x8*2(%rsi),	%r8
+	movq	-0x8*3(%rsi),	%r9
+	movq	-0x8*4(%rsi),	%r10
+	movq	%rax,	-0x8*1(%rdi)
+	movq	%r8,	-0x8*2(%rdi)
+	movq	%r9,	-0x8*3(%rdi)
+	movq	%r10,	-0x8*4(%rdi)
+
+	movq	-0x8*5(%rsi),	%rax
+	movq	-0x8*6(%rsi),	%r8
+	movq	-0x8*7(%rsi),	%r9
+	movq	-0x8*8(%rsi),	%r10
+	leaq	-64(%rsi),	%rsi
+	movq	%rax,	-0x8*5(%rdi)
+	movq	%r8,	-0x8*6(%rdi)
+	movq	%r9,	-0x8*7(%rdi)
+	movq	%r10,	-0x8*8(%rdi)
+	leaq	-64(%rdi),	%rdi
 	jnz	.Loop2
 
-	movq	(%rsp),%rbx
-	CFI_RESTORE rbx
-	movq	1*8(%rsp),%r12
-	CFI_RESTORE r12
-	movq	2*8(%rsp),%r13
-	CFI_RESTORE r13
-	addq	$3*8,%rsp
-	CFI_ADJUST_CFA_OFFSET -3*8
 	ret
 .Lcopy_page_end:
 	CFI_ENDPROC
-- 
1.6.5.2


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-17 22:51 [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch ling.ma
@ 2011-06-22 20:05 ` Andi Kleen
  2011-06-23  1:01   ` Ma, Ling
  2011-06-23  7:04   ` Ingo Molnar
  0 siblings, 2 replies; 10+ messages in thread
From: Andi Kleen @ 2011-06-22 20:05 UTC (permalink / raw)
  To: ling.ma; +Cc: mingo, hpa, tglx, linux-kernel

ling.ma@intel.com writes:
> impact(DCU prefetcher), and simplify original code.
> The performance is improved about 15% on core2, 36% on snb respectively.
> (We use our micro-benchmark, and will do further test according to your requirment)

This doesn't make a lot of sense because neither Core-2 nor SNB use the
code path you patched. They all use the rep ; movs path

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-22 20:05 ` Andi Kleen
@ 2011-06-23  1:01   ` Ma, Ling
  2011-06-23  2:29     ` Andi Kleen
  2011-06-23  7:04   ` Ingo Molnar
  1 sibling, 1 reply; 10+ messages in thread
From: Ma, Ling @ 2011-06-23  1:01 UTC (permalink / raw)
  To: Andi Kleen
  Cc: mingo@elte.hu, hpa@zytor.com, tglx@linutronix.de,
	linux-kernel@vger.kernel.org

Yes, I also have tested 64bit atom, it got 11.6% improvement.
Because older CPU almost all use prefetch-next-line mechanism, the patch should be useful to them.

Thanks
Ling
> -----Original Message-----
> From: Ma, Ling
> Sent: Monday, June 20, 2011 11:43 AM
> To: Ma, Ling; mingo@elte.hu
> Cc: hpa@zytor.com; tglx@linutronix.de; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH RFC V2] [x86] Optimize copy-page by reducing impact
> from HW prefetch
> 
> New experiment shows, for 4096 bytes no improvement on snb,
> 10~15% improvement on Core2, 11.6% improvement on 64bit atom.
> 
> Thanks
> Ling

> -----Original Message-----
> From: Andi Kleen [mailto:andi@firstfloor.org]
> Sent: Thursday, June 23, 2011 4:06 AM
> To: Ma, Ling
> Cc: mingo@elte.hu; hpa@zytor.com; tglx@linutronix.de; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH RFC] [x86] Optimize copy-page by reducing impact
> from HW prefetch
> 
> ling.ma@intel.com writes:
> > impact(DCU prefetcher), and simplify original code.
> > The performance is improved about 15% on core2, 36% on snb
> respectively.
> > (We use our micro-benchmark, and will do further test according to
> your requirment)
> 
> This doesn't make a lot of sense because neither Core-2 nor SNB use the
> code path you patched. They all use the rep ; movs path
> 
> -Andi
> 
> --
> ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-23  1:01   ` Ma, Ling
@ 2011-06-23  2:29     ` Andi Kleen
  0 siblings, 0 replies; 10+ messages in thread
From: Andi Kleen @ 2011-06-23  2:29 UTC (permalink / raw)
  To: Ma, Ling
  Cc: Andi Kleen, mingo@elte.hu, hpa@zytor.com, tglx@linutronix.de,
	linux-kernel@vger.kernel.org

On Thu, Jun 23, 2011 at 09:01:19AM +0800, Ma, Ling wrote:
> Yes, I also have tested 64bit atom, it got 11.6% improvement.

That's a nice improvement, however I should add that in my experience 
copy_page micro benchmark improvements do not necessarily translate to
real world improvements. Most simple micro benchmark do not simulate
the typical page fault access pattern very well.

> Because older CPU almost all use prefetch-next-line mechanism, the patch should be useful to them.

Old in this case is P4 and ancient early stepping K8 only.

-Andi

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-22 20:05 ` Andi Kleen
  2011-06-23  1:01   ` Ma, Ling
@ 2011-06-23  7:04   ` Ingo Molnar
  2011-06-24  2:01     ` Ma, Ling
                       ` (4 more replies)
  1 sibling, 5 replies; 10+ messages in thread
From: Ingo Molnar @ 2011-06-23  7:04 UTC (permalink / raw)
  To: Andi Kleen; +Cc: ling.ma, hpa, tglx, linux-kernel


* Andi Kleen <andi@firstfloor.org> wrote:

> ling.ma@intel.com writes:
>
> > impact(DCU prefetcher), and simplify original code. The 
> > performance is improved about 15% on core2, 36% on snb 
> > respectively. (We use our micro-benchmark, and will do further 
> > test according to your requirment)
> 
> This doesn't make a lot of sense because neither Core-2 nor SNB use 
> the code path you patched. They all use the rep ; movs path

Ling, mind double checking which one is the faster/better one on SNB, 
in cold-cache and hot-cache situations, copy_page or copy_page_c?

Also, while looking at this file please fix the countless pieces of 
style excrements it has before modifying it:

 - non-Linux comment style (and needless two comments - it can 
   be in one comment block):

  /* Don't use streaming store because it's better when the target
     ends up in cache. */
            
  /* Could vary the prefetch distance based on SMP/UP */

 - (there's other non-standard comment blocks in this file as well)

 - The copy_page/copy_page_c naming is needlessly obfuscated, it 
   should be copy_page, copy_page_norep or so - the _c postfix has no
   obvious meaning.

 - all #include's should be at the top

 - please standardize it on the 'instrn %x, %y' pattern that we 
   generally use in arch/x86/, not 'instrn %x,%y' pattern.

and do this cleanup patch first and the speedup on top of it, and 
keep the two in two separate patches so that the modification to the 
assembly code can be reviewed more easily.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-23  7:04   ` Ingo Molnar
@ 2011-06-24  2:01     ` Ma, Ling
  2011-06-24  2:09     ` Ma, Ling
                       ` (3 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Ma, Ling @ 2011-06-24  2:01 UTC (permalink / raw)
  To: Ingo Molnar, Andi Kleen
  Cc: hpa@zytor.com, tglx@linutronix.de, linux-kernel@vger.kernel.org

Sure, I separate two patches ASAP, one is for performance tuning code after some experiments,
another code style patch.

Thanks
Ling

> -----Original Message-----
> From: Ingo Molnar [mailto:mingo@elte.hu]
> Sent: Thursday, June 23, 2011 3:05 PM
> To: Andi Kleen
> Cc: Ma, Ling; hpa@zytor.com; tglx@linutronix.de; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH RFC] [x86] Optimize copy-page by reducing impact
> from HW prefetch
> 
> 
> * Andi Kleen <andi@firstfloor.org> wrote:
> 
> > ling.ma@intel.com writes:
> >
> > > impact(DCU prefetcher), and simplify original code. The
> > > performance is improved about 15% on core2, 36% on snb
> > > respectively. (We use our micro-benchmark, and will do further
> > > test according to your requirment)
> >
> > This doesn't make a lot of sense because neither Core-2 nor SNB use
> > the code path you patched. They all use the rep ; movs path
> 
> Ling, mind double checking which one is the faster/better one on SNB,
> in cold-cache and hot-cache situations, copy_page or copy_page_c?
> 
> Also, while looking at this file please fix the countless pieces of
> style excrements it has before modifying it:
> 
>  - non-Linux comment style (and needless two comments - it can
>    be in one comment block):
> 
>   /* Don't use streaming store because it's better when the target
>      ends up in cache. */
> 
>   /* Could vary the prefetch distance based on SMP/UP */
> 
>  - (there's other non-standard comment blocks in this file as well)
> 
>  - The copy_page/copy_page_c naming is needlessly obfuscated, it
>    should be copy_page, copy_page_norep or so - the _c postfix has no
>    obvious meaning.
> 
>  - all #include's should be at the top
> 
>  - please standardize it on the 'instrn %x, %y' pattern that we
>    generally use in arch/x86/, not 'instrn %x,%y' pattern.
> 
> and do this cleanup patch first and the speedup on top of it, and
> keep the two in two separate patches so that the modification to the
> assembly code can be reviewed more easily.
> 
> Thanks,
> 
> 	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-23  7:04   ` Ingo Molnar
  2011-06-24  2:01     ` Ma, Ling
@ 2011-06-24  2:09     ` Ma, Ling
  2011-06-28 15:23     ` Ma, Ling
                       ` (2 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Ma, Ling @ 2011-06-24  2:09 UTC (permalink / raw)
  To: Ma, Ling, Ingo Molnar, Andi Kleen
  Cc: hpa@zytor.com, tglx@linutronix.de, linux-kernel@vger.kernel.org

Yes, clean up patch is first. 

> -----Original Message-----
> From: Ma, Ling
> Sent: Friday, June 24, 2011 10:01 AM
> To: 'Ingo Molnar'; Andi Kleen
> Cc: hpa@zytor.com; tglx@linutronix.de; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact
> from HW prefetch
> 
> Sure, I separate two patches ASAP, one is for performance tuning code
> after some experiments,
> another code style patch.
> 
> Thanks
> Ling
> 
> > -----Original Message-----
> > From: Ingo Molnar [mailto:mingo@elte.hu]
> > Sent: Thursday, June 23, 2011 3:05 PM
> > To: Andi Kleen
> > Cc: Ma, Ling; hpa@zytor.com; tglx@linutronix.de; linux-
> > kernel@vger.kernel.org
> > Subject: Re: [PATCH RFC] [x86] Optimize copy-page by reducing impact
> > from HW prefetch
> >
> >
> > * Andi Kleen <andi@firstfloor.org> wrote:
> >
> > > ling.ma@intel.com writes:
> > >
> > > > impact(DCU prefetcher), and simplify original code. The
> > > > performance is improved about 15% on core2, 36% on snb
> > > > respectively. (We use our micro-benchmark, and will do further
> > > > test according to your requirment)
> > >
> > > This doesn't make a lot of sense because neither Core-2 nor SNB use
> > > the code path you patched. They all use the rep ; movs path
> >
> > Ling, mind double checking which one is the faster/better one on SNB,
> > in cold-cache and hot-cache situations, copy_page or copy_page_c?
> >
> > Also, while looking at this file please fix the countless pieces of
> > style excrements it has before modifying it:
> >
> >  - non-Linux comment style (and needless two comments - it can
> >    be in one comment block):
> >
> >   /* Don't use streaming store because it's better when the target
> >      ends up in cache. */
> >
> >   /* Could vary the prefetch distance based on SMP/UP */
> >
> >  - (there's other non-standard comment blocks in this file as well)
> >
> >  - The copy_page/copy_page_c naming is needlessly obfuscated, it
> >    should be copy_page, copy_page_norep or so - the _c postfix has no
> >    obvious meaning.
> >
> >  - all #include's should be at the top
> >
> >  - please standardize it on the 'instrn %x, %y' pattern that we
> >    generally use in arch/x86/, not 'instrn %x,%y' pattern.
> >
> > and do this cleanup patch first and the speedup on top of it, and
> > keep the two in two separate patches so that the modification to the
> > assembly code can be reviewed more easily.
> >
> > Thanks,
> >
> > 	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-23  7:04   ` Ingo Molnar
  2011-06-24  2:01     ` Ma, Ling
  2011-06-24  2:09     ` Ma, Ling
@ 2011-06-28 15:23     ` Ma, Ling
  2011-07-01  8:10     ` Ma, Ling
  2011-07-01 10:26     ` Ma, Ling
  4 siblings, 0 replies; 10+ messages in thread
From: Ma, Ling @ 2011-06-28 15:23 UTC (permalink / raw)
  To: Ingo Molnar, Andi Kleen
  Cc: hpa@zytor.com, tglx@linutronix.de, linux-kernel@vger.kernel.org

Hi Ingo
 
> Ling, mind double checking which one is the faster/better one on SNB,
> in cold-cache and hot-cache situations, copy_page or copy_page_c?
Copy_page_c 
on hot-cache copy_page_c on SNB combines data to 128bit (processor limit 128bit/cycle for write) after startup latency
so it is faster than copy_page which provides 64bit/cycle for write.

on cold-cache, copy_page_c doesn't use prefetch, while copy_page uses prefetch according to copy size,
so the copy_page function is better. 

Thanks
Ling


^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-23  7:04   ` Ingo Molnar
                       ` (2 preceding siblings ...)
  2011-06-28 15:23     ` Ma, Ling
@ 2011-07-01  8:10     ` Ma, Ling
  2011-07-01 10:26     ` Ma, Ling
  4 siblings, 0 replies; 10+ messages in thread
From: Ma, Ling @ 2011-07-01  8:10 UTC (permalink / raw)
  To: Ma, Ling, Ingo Molnar, Andi Kleen
  Cc: hpa@zytor.com, tglx@linutronix.de, linux-kernel@vger.kernel.org

Forget to append experiment data:

1. We copy 4096 bytes for 32 times on snb, and extract minimum execution time 
On hot cache case: 
  Copy_page          copy_page_c 
  482 cycles          350 cycles

2. the same routine with hot-caches, but before each execution we copy 512k data to push original data out of L1 &L2.
On cold cache case:
  copy_page(with prefetch)    copy_page(without prefetch)      copy_page_c
   853~873 cycles                  1037~1051 cycles            959~976 cycles 

Thanks
Ling

> -----Original Message-----
> From: Ma, Ling
> Sent: Tuesday, June 28, 2011 11:24 PM
> To: 'Ingo Molnar'; Andi Kleen
> Cc: hpa@zytor.com; tglx@linutronix.de; linux-kernel@vger.kernel.org
> Subject: RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact
> from HW prefetch
> 
> Hi Ingo
> 
> > Ling, mind double checking which one is the faster/better one on SNB,
> > in cold-cache and hot-cache situations, copy_page or copy_page_c?
> Copy_page_c
> on hot-cache copy_page_c on SNB combines data to 128bit (processor
> limit 128bit/cycle for write) after startup latency
> so it is faster than copy_page which provides 64bit/cycle for write.
> 
> on cold-cache copy_page_c doesn't use prefetch, which uses prfetch
> according to copy size,
> so copy_page function is better.
> 
> Thanks
> Ling


^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch
  2011-06-23  7:04   ` Ingo Molnar
                       ` (3 preceding siblings ...)
  2011-07-01  8:10     ` Ma, Ling
@ 2011-07-01 10:26     ` Ma, Ling
  4 siblings, 0 replies; 10+ messages in thread
From: Ma, Ling @ 2011-07-01 10:26 UTC (permalink / raw)
  To: Ma, Ling, Ingo Molnar, Andi Kleen
  Cc: hpa@zytor.com, tglx@linutronix.de, linux-kernel@vger.kernel.org

[-- Attachment #1: Type: text/plain, Size: 2945 bytes --]

Sorry, the earlier copy_page_c results were incorrect — they were measured with movsb, not movsq.

Update results :
(the benchmark is not enough accurate, but it could tell us which is faster)

1. We copy 4096 bytes for 32 times on snb, and extract minimum execution time

On hot cache case:
   Copy_page          copy_page_c    copy_page_sse2 without preftch (128bit write /cycle)   copy_page_sse2 with prefetch (128bit write /cycle)
   437 cycles          226 cycles    183                                                    208
 

2. the same routine with hot-caches, but before each execution we copy
 512k data to push original data out of L1 &L2.
 On cold cache case:

 copy_page(with prefetch)  copy_page(without prefetch)  copy_page_c  copy_page_sse2 without preftch (128bit write /cycle)  copy_page_sse2 with prefetch(128bit write /cycle)
  688~713                  847~860                      636~648      661~673                                               609~615                                         

Answer to the question from Ingo, copy_page_c is always faster to copy page,
but copy_page_c doesn't use prefetch for cold-cache cases, and append prefetch according to copy size.

Thanks
Ling
  



> -----Original Message-----
> From: Ma, Ling
> Sent: Friday, July 01, 2011 4:11 PM
> To: Ma, Ling; 'Ingo Molnar'; 'Andi Kleen'
> Cc: 'hpa@zytor.com'; 'tglx@linutronix.de'; 'linux-
> kernel@vger.kernel.org'
> Subject: RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact
> from HW prefetch
> 
> Forget to append experiment data:
> 
> 1. We copy 4096 bytes for 32 times on snb, and extract minimum
> execution time
> On hot cache case:
>   Copy_page          copy_page_c
>   482 cycles          350 cycles
> 
> 2. the same routine with hot-caches, but before each execution we copy
> 512k data to push original data out of L1 &L2.
> On cold cache case:
>   copy_page(with prefetch)    copy_page(without prefetch)
> copy_page_c
>    853~873 cycles                  1037~1051 cycles            959~976
> cycles
> 
> Thanks
> Ling
> 
> > -----Original Message-----
> > From: Ma, Ling
> > Sent: Tuesday, June 28, 2011 11:24 PM
> > To: 'Ingo Molnar'; Andi Kleen
> > Cc: hpa@zytor.com; tglx@linutronix.de; linux-kernel@vger.kernel.org
> > Subject: RE: [PATCH RFC] [x86] Optimize copy-page by reducing impact
> > from HW prefetch
> >
> > Hi Ingo
> >
> > > Ling, mind double checking which one is the faster/better one on
> SNB,
> > > in cold-cache and hot-cache situations, copy_page or copy_page_c?
> > Copy_page_c
> > on hot-cache copy_page_c on SNB combines data to 128bit (processor
> > limit 128bit/cycle for write) after startup latency
> > so it is faster than copy_page which provides 64bit/cycle for write.
> >
> > on cold-cache copy_page_c doesn't use prefetch, which uses prfetch
> > according to copy size,
> > so copy_page function is better.
> >
> > Thanks
> > Ling


[-- Attachment #2: snb_info --]
[-- Type: application/octet-stream, Size: 6888 bytes --]

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Genuine Intel(R) CPU 0 @ 2.60GHz
stepping	: 3
cpu MHz		: 1600.000
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 0
cpu cores	: 4
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 5199.89
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Genuine Intel(R) CPU 0 @ 2.60GHz
stepping	: 3
cpu MHz		: 1600.000
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 1
cpu cores	: 4
apicid		: 2
initial apicid	: 2
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 5200.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 2
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Genuine Intel(R) CPU 0 @ 2.60GHz
stepping	: 3
cpu MHz		: 1600.000
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 2
cpu cores	: 4
apicid		: 4
initial apicid	: 4
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 5200.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 3
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Genuine Intel(R) CPU 0 @ 2.60GHz
stepping	: 3
cpu MHz		: 1600.000
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 3
cpu cores	: 4
apicid		: 6
initial apicid	: 6
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 5200.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 4
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Genuine Intel(R) CPU 0 @ 2.60GHz
stepping	: 3
cpu MHz		: 1600.000
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 0
cpu cores	: 4
apicid		: 1
initial apicid	: 1
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 5200.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 5
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Genuine Intel(R) CPU 0 @ 2.60GHz
stepping	: 3
cpu MHz		: 1600.000
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 1
cpu cores	: 4
apicid		: 3
initial apicid	: 3
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 5200.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 6
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Genuine Intel(R) CPU 0 @ 2.60GHz
stepping	: 3
cpu MHz		: 1600.000
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 2
cpu cores	: 4
apicid		: 5
initial apicid	: 5
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 5200.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 7
vendor_id	: GenuineIntel
cpu family	: 6
model		: 42
model name	: Genuine Intel(R) CPU 0 @ 2.60GHz
stepping	: 3
cpu MHz		: 1600.000
cache size	: 8192 KB
physical id	: 0
siblings	: 8
core id		: 3
cpu cores	: 4
apicid		: 7
initial apicid	: 7
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm sse4_1 sse4_2 x2apic popcnt aes xsave avx lahf_lm ida arat xsaveopt pln pts dts tpr_shadow vnmi flexpriority ept vpid
bogomips	: 5200.03
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2011-07-01 10:26 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-06-17 22:51 [PATCH RFC] [x86] Optimize copy-page by reducing impact from HW prefetch ling.ma
2011-06-22 20:05 ` Andi Kleen
2011-06-23  1:01   ` Ma, Ling
2011-06-23  2:29     ` Andi Kleen
2011-06-23  7:04   ` Ingo Molnar
2011-06-24  2:01     ` Ma, Ling
2011-06-24  2:09     ` Ma, Ling
2011-06-28 15:23     ` Ma, Ling
2011-07-01  8:10     ` Ma, Ling
2011-07-01 10:26     ` Ma, Ling

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox