From: Miao Xie
Reply-To: miaox@cn.fujitsu.com
Date: Fri, 08 Oct 2010 15:28:28 +0800
To: Ingo Molnar, Andi Kleen, Ma Ling, "H. Peter Anvin", Thomas Gleixner, ykzhao
Cc: Linux Kernel
Subject: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

memcpy() on x86_64 has not been optimized for unaligned copies the way it
has on other architectures; this patch fixes that.

I tested this patch with my benchmark tool (each case repeats the copy
5,000,000 times) at various alignments and buffer sizes on my Core2 box.

Len   Src/Dst   Old memcpy      New memcpy
      align
----  -------   -------------   -------------
1     0/0       0s 47015us      0s 28265us
1     0/4       0s 28201us      0s 28199us
1     4/0       0s 28200us      0s 28199us
1     4/4       0s 28199us      0s 28206us
7     0/0       0s 24441us      0s 24438us
7     0/4       0s 24439us      0s 24438us
7     4/0       0s 24439us      0s 24438us
7     4/4       0s 24439us      0s 24439us
8     0/0       0s 20699us      0s 20687us
8     0/4       0s 20689us      0s 20901us
8     4/0       0s 20692us      0s 20679us
8     4/4       0s 20679us      0s 20679us
16    0/0       0s 18807us      0s 18802us
16    0/4       0s 26319us      0s 18800us
16    4/0       0s 18800us      0s 18806us
16    4/4       0s 26317us      0s 18803us
32    0/0       0s 35728us      0s 18800us
32    0/4       0s 35716us      0s 18800us
32    4/0       0s 35717us      0s 18800us
32    4/4       0s 35724us      0s 18803us
48    0/0       0s 26897us      0s 30080us
48    0/4       0s 33837us      0s 33838us
48    4/0       0s 27600us      0s 30079us
48    4/4       0s 30087us      0s 33854us
64    0/0       0s 41369us      0s 45115us
64    0/4       0s 62042us      0s 65800us
64    4/0       0s 56400us      0s 58278us
64    4/4       0s 84596us      0s 84606us
80    0/0       0s 35877us      0s 37611us
80    0/4       0s 77083us      0s 56404us
80    4/0       0s 52652us      0s 55611us
80    4/4       0s 75200us      0s 78968us
128   0/0       0s 52642us      0s 56403us
128   0/4       0s 95883us      0s 95891us
128   4/0       0s 114683us     0s 108511us
128   4/4       0s 144780us     0s 110927us
256   0/0       0s 80832us      0s 86489us
256   0/4       0s 178586us     0s 163562us
256   4/0       0s 208670us     0s 181719us
256   4/4       0s 270705us     0s 148525us
512   0/0       0s 156049us     0s 148348us
512   0/4       0s 313933us     0s 298908us
512   4/0       0s 411671us     0s 329025us
512   4/4       0s 516971us     0s 208746us
1024  0/0       0s 297067us     0s 274019us
1024  0/4       0s 584703us     0s 569604us
1024  4/0       0s 818104us     0s 616419us
1024  4/4       1s 22839us      0s 328953us
2048  0/0       0s 577077us     0s 524148us
2048  0/4       1s 125953us     1s 111258us
2048  4/0       1s 894000us     1s 202724us
2048  4/4       2s 331807us     0s 822437us
4096  0/0       1s 25881us      1s 34128us
4096  0/4       2s 619273us     2s 606489us
4096  4/0       3s 553989us     2s 390272us
4096  4/4       4s 737789us     1s 433213us

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 arch/x86/lib/memcpy_64.S |  135 +++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 134 insertions(+), 1 deletions(-)

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..b0224f8 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -46,9 +46,39 @@ ENTRY(memcpy)
 	 * Use 32bit CMP here to avoid long NOP padding.
 	 */
 	cmp $0x20, %edx
-	jb .Lhandle_tail
+	jbe .Lhandle_tail
 
+	/*
+	 * The unaligned-copy code below only pays off for large copies
+	 * (> 100 bytes), so for smaller sizes we do not bother checking
+	 * whether dst and src are aligned.
+	 */
+	cmp $100, %edx
+	jb .Lboth_aligned
+
+	/*
+	 * Unaligned access always hurts performance, so align both src
+	 * and dst first, then copy from the aligned src to the aligned
+	 * dst using shifts.
+	 * However, we found that when src is aligned, a generic copy
+	 * (aligned reads from src, unaligned writes to dst) beats the
+	 * shift-based copy even if dst is unaligned.  So when src is
+	 * aligned we do not check dst at all and just go to
+	 * .Lboth_aligned.
+	 */
+	test $7, %esi		/* src align check */
+	jz .Lboth_aligned
+
+	/* src is unaligned; if dst is unaligned too, align it first */
+	test $7, %edi
+	jnz .Ldst_unaligned
+
+	jmp .Lsrc_unaligned_dst_aligned
+
+.Lboth_aligned:
 
 	/*
 	 * We check whether memory false dependece could occur,
 	 * then jump to corresponding copy mode.
 	 */
@@ -166,6 +196,109 @@ ENTRY(memcpy)
 .Lend:
 	retq
+
+	.p2align 4
+.Ldst_unaligned:
+	/* %rcx = number of bytes needed to 8-byte-align dst */
+	movq %rdi, %rcx
+	andq $7, %rcx
+	negq %rcx
+	andq $7, %rcx
+	subq %rcx, %rdx
+
+	/*
+	 * Copy one (possibly unaligned) word to cover the head, then
+	 * advance both pointers past it so that dst is aligned.
+	 */
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+	addq %rcx, %rdi
+	addq %rcx, %rsi
+
+	test $7, %esi		/* src align check */
+	jz .Lboth_aligned
+
+	.p2align 4
+.Lsrc_unaligned_dst_aligned:
+	push %rbx
+	push %r12
+	push %r13
+	push %r14
+	push %r15
+	/*
+	 * Compute the shift counts used to splice two aligned words
+	 * read from src into one aligned word for dst:
+	 * %r14 = (src & 7) * 8, %r15 = 64 - %r14.
+	 */
+	movq %rsi, %r14
+	andq $7, %r14
+	shlq $3, %r14
+
+	movq $64, %r15
+	subq %r14, %r15
+
+	andq $-8, %rsi		/* round src down to an aligned word */
+	movq 0*8(%rsi), %r8
+
+	movq %rdx, %rbx
+	shrq $5, %rbx		/* number of 32-byte blocks */
+	jz .Lsrc_unaligned_less32
+
+	/*
+	 * %r8 : src[0]
+	 * %r9 : src[1]
+	 * %r10: src[2]
+	 * %r11: src[3]
+	 * %r12: src[4]
+	 * %r13: temporary
+	 */
+	.p2align 4
+.Lsrc_unaligned_loop32:
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	movq 4*8(%rsi), %r12
+
+	/* splice src[0]/src[1] into %r9 and src[1]/src[2] into %r10 */
+	movq %r9, %r13
+	movb %r14b, %cl
+	shrq %cl, %r8
+	shrq %cl, %r13
+	movb %r15b, %cl
+	shlq %cl, %r9
+	orq %r8, %r9
+	movq %r10, %r8
+	shlq %cl, %r10
+	orq %r13, %r10
+
+	/* splice src[2]/src[3] into %r11 and src[3]/src[4] into %r12 */
+	movq %r11, %r13
+	movb %r14b, %cl
+	shrq %cl, %r8
+	shrq %cl, %r13
+	movb %r15b, %cl
+	shlq %cl, %r11
+	orq %r8, %r11
+	movq %r12, %r8
+	shlq %cl, %r12
+	orq %r13, %r12
+
+	movq %r9, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r11, 2*8(%rdi)
+	movq %r12, 3*8(%rdi)
+
+	leaq 4*8(%rdi), %rdi
+	leaq 4*8(%rsi), %rsi
+	decq %rbx
+	jnz .Lsrc_unaligned_loop32
+
+	.p2align 4
+.Lsrc_unaligned_less32:
+	shrq $3, %r14		/* bit shift back to byte offset */
+	addq %r14, %rsi		/* restore the unaligned src pointer */
+	pop %r15
+	pop %r14
+	pop %r13
+	pop %r12
+	pop %rbx
+	andq $31, %rdx		/* tail bytes */
+	jnz .Lhandle_tail
+	retq
+
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)

-- 
1.7.0.1
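
For readers of the asm: the head-alignment math at .Ldst_unaligned
corresponds to the following C (an illustrative sketch only; the helper
name is made up and is not part of the patch):

#include <stdint.h>

/*
 * Bytes needed to advance p to the next 8-byte boundary; this is the
 * value the andq/negq/andq sequence leaves in %rcx.
 */
static unsigned int bytes_to_align8(const void *p)
{
	return (unsigned int)(-(uintptr_t)p & 7);
}

The copied head word covers exactly those bytes, and both pointers are
then advanced past them so the rest of the copy writes to an aligned dst.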
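
The 32-byte loop at .Lsrc_unaligned_loop32 implements the classic
shift-and-or splice.  In C it looks roughly like this (a minimal sketch,
assuming x86 little-endian byte order and a caller that, like the asm,
only takes this path when src is actually misaligned; the function name
is invented):

#include <stddef.h>
#include <stdint.h>

static void copy_src_unaligned_dst_aligned(uint64_t *dst,
					   const unsigned char *src,
					   size_t words)
{
	/* src & 7 is 1..7 here, so both shift counts stay in 8..56 */
	unsigned int rshift = ((uintptr_t)src & 7) * 8;	/* %r14 */
	unsigned int lshift = 64 - rshift;		/* %r15 */
	/* round src down to an aligned word, as andq $-8, %rsi does */
	const uint64_t *s = (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
	uint64_t low = *s++;	/* read the first aligned word ahead */

	while (words--) {
		uint64_t high = *s++;
		/* splice two adjacent aligned words into one output word */
		*dst++ = (low >> rshift) | (high << lshift);
		low = high;
	}
}

Every load here is 8-byte aligned; the unrolled asm version merely
produces four output words per iteration and carries the last word read
in %r8.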
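
As for the numbers in the table: the benchmark tool itself is not
included here, but the measurement amounts to a loop of the following
shape (a rough sketch only, not the actual tool; a real harness must
also keep the compiler from optimizing the copies away):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define LOOPS 5000000

/* time LOOPS copies of len bytes at the given src/dst byte offsets */
static void bench(size_t len, unsigned int src_off, unsigned int dst_off)
{
	char *src = malloc(len + 16);	/* slack so the offsets stay in bounds */
	char *dst = malloc(len + 16);
	struct timeval start, end;
	long sec, usec, i;

	memset(src, 1, len + 16);
	gettimeofday(&start, NULL);
	for (i = 0; i < LOOPS; i++)
		memcpy(dst + dst_off, src + src_off, len);
	gettimeofday(&end, NULL);

	/* report elapsed time in the same "Ns Mus" form as the table */
	sec = end.tv_sec - start.tv_sec;
	usec = end.tv_usec - start.tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	printf("%zu\t%u/%u\t%lds %ldus\n", len, src_off, dst_off, sec, usec);
	free(src);
	free(dst);
}

Each (Len, Src/Dst align) cell in the table corresponds to one such
call, e.g. bench(512, 4, 4).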