public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Miao Xie <miaox@cn.fujitsu.com>
To: Ingo Molnar <mingo@redhat.com>, Andi Kleen <andi@firstfloor.org>,
	Ma Ling <ling.ma@intel.com>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy
Date: Thu, 23 Sep 2010 20:01:43 +0800	[thread overview]
Message-ID: <4C9B41A7.3010608@cn.fujitsu.com> (raw)

The x86_64 memcpy() has not been optimized for unaligned copies the way
other architectures' implementations have; this patch fixes that.

I have tested this patch by doing a 500-byte memory copy 5,000,000 times,
with various alignments and buffer sizes, on my x86_64 box:

Len	Src/Dest	Old memcpy	New memcpy
	alignment
---	---------	-------------	-------------
32	0/0		0s 59553us	0s 39597us
32	0/4		0s 37675us	0s 39583us
32	4/0		0s 35720us	0s 39568us
32	4/4		0s 35721us	0s 39564us
256	0/0		0s 88783us	0s 86759us
256	0/4		0s 182896us	0s 166298us
256	4/0		0s 209244us	0s 191853us
256	4/4		0s 262847us	0s 165768us
512	0/0		0s 156486us	0s 148370us
512	0/4		0s 318856us	0s 302365us
512	4/0		0s 412763us	0s 338468us
512	4/4		0s 518688us	0s 218969us
1024	0/0		0s 298076us	0s 268443us
1024	0/4		0s 592114us	0s 575168us
1024	4/0		0s 819969us	0s 654752us
1024	4/4		1s 16405us	0s 343318us

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 arch/x86/lib/memcpy_64.S |  134 ++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 134 insertions(+), 0 deletions(-)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..32dade1 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -49,6 +49,35 @@ ENTRY(memcpy)
 	jb .Lhandle_tail
 
 	/*
+	 * The unaligned-copy path below only pays off for large copies
+	 * (more than 80 bytes), so for small sizes we skip the src/dst
+	 * alignment checks and use the both-aligned path directly.
+	 */
+	cmp $80, %edx
+	jbe .Lboth_aligned
+
+	/*
+	 * We found that when src is aligned, the both-aligned copy
+	 * beats the unaligned copy even if dest is unaligned. So when
+	 * src is aligned we skip the dest alignment check and go
+	 * straight to the both-aligned copy.
+	 */
+	movq %rsi, %rcx
+	andq $7, %rcx		/* src align check */
+	jz .Lboth_aligned
+
+	/* if dest and src are both unaligned, go fix up dest first */
+	movq %rdi, %rcx
+	andq $7, %rcx		/* dst align check */
+	jnz .Ldst_unaligned
+
+	/* src unaligned, dest aligned: use the shift-based copy below */
+	movq %rsi, %rcx
+	andq $7, %rcx		/* src align check */
+	jnz .Lsrc_unaligned_dst_aligned
+
+.Lboth_aligned:
+	/*
 	 * We check whether memory false dependece could occur,
 	 * then jump to corresponding copy mode.
 	 */
@@ -166,6 +195,111 @@ ENTRY(memcpy)
 
 .Lend:
 	retq
+
+	.p2align 4
+.Ldst_unaligned:
+	negq %rcx
+	andq $7, %rcx
+	subq %rcx, %rdx
+
+	/* tune dst address */
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+	addq %rcx, %rdi
+	addq %rcx, %rsi
+
+	cmp $0x20, %rdx
+	jb .Lhandle_tail
+
+	movq %rsi, %rcx
+	andq $7, %rcx		/* src align check */
+	jz .Lboth_aligned
+
+	.p2align 4
+.Lsrc_unaligned_dst_aligned:
+	push %rbx
+	push %r12
+	push %r13
+	push %r14
+	push %r15
+	/*
+	 * Compute the bit-shift counts (%r14/%r15) needed to realign
+	 * words read from the aligned-down src for the copy below.
+	 */
+	movq %rsi, %r14
+	andq $7, %r14
+	shlq $3, %r14
+	
+	movq $64, %r15
+	subq %r14, %r15
+
+	andq $-8, %rsi		/* src aligned */
+	movq 0*8(%rsi), %r8
+
+	movq %rdx, %rbx
+	shrq $5, %rbx
+	jz .Lsrc_unaligned_less32
+
+	/*
+	 * %r8 : store src[0]
+	 * %r9 : store src[1]
+	 * %r10: store src[2]
+	 * %r11: store src[3]
+	 * %r12: store src[4]
+	 * %r13: store the tmp data
+	 */ 
+	.p2align 4
+.Lsrc_unaligned_loop32:
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	movq 4*8(%rsi), %r12
+
+	movq %r9, %r13
+	movb %r14b, %cl
+	shrq %cl, %r8
+	shrq %cl, %r13
+	movb %r15b, %cl
+	shlq  %cl, %r9
+	orq %r8, %r9
+	movq %r10, %r8
+	shlq  %cl, %r10
+	orq %r13, %r10
+
+	movq %r11, %r13
+	movb %r14b, %cl
+	shrq %cl, %r8
+	shrq %cl, %r13
+	movb %r15b, %cl
+	shlq  %cl, %r11
+	orq %r8, %r11	
+	movq %r12, %r8
+	shlq  %cl, %r12
+	orq %r13, %r12
+
+	movq %r9, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r11, 2*8(%rdi)
+	movq %r12, 3*8(%rdi)
+	
+	leaq 4*8(%rdi), %rdi
+	leaq 4*8(%rsi), %rsi
+	decq %rbx
+	jnz .Lsrc_unaligned_loop32
+
+	.p2align 4
+.Lsrc_unaligned_less32:
+	shrq $3, %r14
+	addq %r14, %rsi
+	pop %r15
+	pop %r14
+	pop %r13
+	pop %r12
+	pop %rbx
+	andq $31, %rdx
+	jnz .Lhandle_tail
+	retq
+	
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
-- 
1.7.0.1

                 reply	other threads:[~2010-09-23 12:02 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4C9B41A7.3010608@cn.fujitsu.com \
    --to=miaox@cn.fujitsu.com \
    --cc=andi@firstfloor.org \
    --cc=ling.ma@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox