From: zhichang.yuan <zhichang.yuan@linaro.org>
To: linux-arm-kernel@lists.infradead.org
Subject: [PATCH 1/6] arm64: lib: Implement optimized memcpy routine
Date: Wed, 11 Dec 2013 14:24:37 +0800	[thread overview]
Message-ID: <1386743082-5231-2-git-send-email-zhichang.yuan@linaro.org> (raw)
In-Reply-To: <1386743082-5231-1-git-send-email-zhichang.yuan@linaro.org>

From: "zhichang.yuan" <zhichang.yuan@linaro.org>

This patch, based on Linaro's Cortex Strings library, improves
the performance of the assembly-optimized memcpy() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
---
 arch/arm64/lib/memcpy.S |  182 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 160 insertions(+), 22 deletions(-)

diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 27b5003..e3bab71 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on the glibc Cortex Strings work originally authored
+ * by Linaro and re-licensed under GPLv2 for the Linux kernel. The original
+ * code can be found at:
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -27,27 +35,157 @@
  * Returns:
  *	x0 - dest
  */
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define tmp1	x3
+#define tmp1w	w3
+#define tmp2	x4
+#define tmp2w	w4
+#define tmp3	x5
+#define tmp3w	w5
+#define dst	x6
+
+#define A_l	x7
+#define A_h	x8
+#define B_l	x9
+#define B_h	x10
+#define C_l	x11
+#define C_h	x12
+#define D_l	x13
+#define D_h	x14
+
 ENTRY(memcpy)
-	mov	x4, x0
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-	str	x3, [x4], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-	str	w3, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-	strh	w3, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-	strb	w3, [x4]
-5:	ret
+	mov	dst, dstin
+	cmp	count, #16
+	/* If the length is below 16 bytes, ldp/stp cannot be used. */
+	b.lo	.Ltiny15
+.Lover16:
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15	/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* Copying 16 bytes with ldp/stp and then rewinding src to an
+	* aligned address would be more efficient, but it risks
+	* overwriting part of the source area when the buffers overlap.
+	* Prefer strictly forward accesses instead: this takes a few
+	* more instructions, but every access is aligned.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+	tbz	tmp2, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	tmp2, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src],#8
+	str	tmp1, [dst],#8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+1:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+2:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+.Ltiny15:
+	/*
+	* To keep memmove simple, do not step src backwards here:
+	* a backwards copy could overwrite source data that the next
+	* copy still needs when dst is not far from src.
+	*/
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 1f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+1:
+	tbz	count, #1, 1f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+1:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src],#16
+	stp	A_l, A_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop.  Start at a new cache line boundary.  Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	6
+.Lcpy_body_large:
+	/* There are at least 128 bytes to copy.  */
+	ldp	A_l, A_h, [src],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	ldp	D_l, D_h, [src],#16
+1:
+	stp	A_l, A_h, [dst],#16
+	ldp	A_l, A_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	C_l, C_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
 ENDPROC(memcpy)
-- 
1.7.9.5
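
For readers following the assembly, here is a rough C rendering of the copy
strategy the routine implements: align src to 16 bytes with 1/2/4/8-byte head
copies, stream the bulk in 64-byte chunks (the four ldp/stp pairs of the
unrolled loop, which the assembly additionally software-pipelines), then
finish with 16-byte chunks and an 8/4/2/1-byte tail. This sketch is
illustrative only and not part of the patch; the name memcpy_sketch is
invented here, and fixed-size memcpy() calls stand in for the ldp/stp pairs.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Illustrative sketch only -- not part of the patch. Fixed-size memcpy()
 * calls model the ldrb/ldrh/ldr/ldp accesses of the assembly; the real
 * routine also software-pipelines the 64-byte loop (loads for the next
 * iteration are issued before the stores of the current one).
 */
static void *memcpy_sketch(void *dstin, const void *srcin, size_t count)
{
	unsigned char *dst = dstin;
	const unsigned char *src = srcin;

	if (count >= 16) {
		/* Head: 1/2/4/8-byte copies until src is 16-byte aligned. */
		size_t head = (size_t)(-(uintptr_t)src & 15);

		if (head & 1) { *dst++ = *src++; }
		if (head & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
		if (head & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
		if (head & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
		count -= head;

		/* Bulk: 64 bytes per iteration (four ldp/stp pairs). */
		while (count >= 64) {
			memcpy(dst, src, 64);
			dst += 64; src += 64; count -= 64;
		}

		/* Up to three more 16-byte chunks (.Ltail63). */
		while (count >= 16) {
			memcpy(dst, src, 16);
			dst += 16; src += 16; count -= 16;
		}
	}

	/* Tail: at most 15 bytes, copied forwards in 8/4/2/1-byte steps. */
	if (count & 8) { memcpy(dst, src, 8); dst += 8; src += 8; }
	if (count & 4) { memcpy(dst, src, 4); dst += 4; src += 4; }
	if (count & 2) { memcpy(dst, src, 2); dst += 2; src += 2; }
	if (count & 1) { *dst = *src; }

	return dstin;
}

int main(void)
{
	unsigned char src[129], dst[129];
	size_t i, n;

	for (i = 0; i < sizeof(src); i++)
		src[i] = (unsigned char)i;
	for (n = 0; n <= sizeof(src); n++) {
		memset(dst, 0xa5, sizeof(dst));
		memcpy_sketch(dst, src, n);
		assert(memcmp(dst, src, n) == 0);
	}
	puts("memcpy_sketch: ok");
	return 0;
}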

Thread overview: 14+ messages
2013-12-11  6:24 [PATCH 0/6] arm64:lib: the optimized string library routines for armv8 processors zhichang.yuan@linaro.org
2013-12-11  6:24 ` zhichang.yuan@linaro.org [this message]
2013-12-16 16:08   ` [PATCH 1/6] arm64: lib: Implement optimized memcpy routine Will Deacon
2013-12-18  1:54     ` zhichang.yuan
2013-12-11  6:24 ` [PATCH 2/6] arm64: lib: Implement optimized memmove routine zhichang.yuan@linaro.org
2013-12-11  6:24 ` [PATCH 3/6] arm64: lib: Implement optimized memset routine zhichang.yuan@linaro.org
2013-12-16 16:55   ` Will Deacon
2013-12-18  2:37     ` zhichang.yuan
2013-12-11  6:24 ` [PATCH 4/6] arm64: lib: Implement optimized memcmp routine zhichang.yuan@linaro.org
2013-12-16 16:56   ` Will Deacon
2013-12-19  8:18     ` Deepak Saxena
2013-12-19 16:14       ` Catalin Marinas
2013-12-11  6:24 ` [PATCH 5/6] arm64: lib: Implement optimized string compare routines zhichang.yuan@linaro.org
2013-12-11  6:24 ` [PATCH 6/6] arm64: lib: Implement optimized string length routines zhichang.yuan@linaro.org
