From: wei.guo.simon@gmail.com
To: linuxppc-dev@lists.ozlabs.org
Cc: Paul Mackerras <paulus@ozlabs.org>,
	Michael Ellerman <mpe@ellerman.id.au>,
	"Naveen N.  Rao" <naveen.n.rao@linux.vnet.ibm.com>,
	David Laight <David.Laight@ACULAB.COM>,
	Christophe LEROY <christophe.leroy@c-s.fr>,
	Simon Guo <wei.guo.simon@gmail.com>
Subject: [PATCH v2 1/3] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp().
Date: Thu, 21 Sep 2017 07:34:38 +0800	[thread overview]
Message-ID: <1505950480-14830-2-git-send-email-wei.guo.simon@gmail.com> (raw)
In-Reply-To: <1505950480-14830-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

Currently the powerpc64 memcmp() falls back to .Lshort (byte-by-byte
compare) if either the src or dst address is not 8-byte aligned. It can
be optimized when both addresses have the same offset from an 8-byte
boundary.

memcmp() can first compare the unaligned leading bytes up to the 8-byte
boundary and then compare the remaining 8-byte-aligned content via the
.Llong path.

This patch optimizes memcmp() for this case.
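
For illustration only, here is a rough C sketch of the idea
(memcmp_sketch() is a hypothetical helper, not part of this patch; the
assembly handles the leading unaligned bytes with a single shifted
double-word compare rather than a byte loop):
-------
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p1 = s1, *p2 = s2;

	/* Only possible when both pointers share the same misalignment. */
	if ((((unsigned long)p1 ^ (unsigned long)p2) & 7) == 0) {
		/* Step byte-wise up to the next 8-byte boundary. */
		while (n && ((unsigned long)p1 & 7)) {
			if (*p1 != *p2)
				return *p1 < *p2 ? -1 : 1;
			p1++, p2++, n--;
		}
		/* Both pointers are now 8-byte aligned: compare 8 bytes at a time. */
		while (n >= 8) {
			uint64_t a, b;

			memcpy(&a, p1, 8);
			memcpy(&b, p2, 8);
			if (a != b)
				break;	/* first difference is within this word */
			p1 += 8, p2 += 8, n -= 8;
		}
	}
	/* Tail, differing word, or fully unaligned case: plain byte loop. */
	while (n--) {
		if (*p1 != *p2)
			return *p1 < *p2 ? -1 : 1;
		p1++, p2++;
	}
	return 0;
}
-------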

Test result:

(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
      50.996607479 seconds time elapsed                                          ( +- 0.01% )
- with patch
      28.033316997 seconds time elapsed                                          ( +- 0.01% )
		-> ~81% improvement

(2) 32 bytes
To observe the performance impact for sizes below 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 10000

 int test_memcmp(const void *s1, const void *s2, size_t n);
--------

- Without patch
       0.392578831 seconds time elapsed                                          ( +- 0.05% )
- with patch
       0.358446662 seconds time elapsed                                          ( +- 0.04% )
		-> ~9% improvement

(3) 0~8 bytes
To observe the performance impact for 0 to 8 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000

 int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
       3.168752060 seconds time elapsed                                          ( +- 0.10% )
- With patch
       3.153030138 seconds time elapsed                                          ( +- 0.09% )
		-> They are nearly the same. (-0.4%)
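
A single unsigned double-word compare (cmpld) can produce the memcmp()
result directly in .LcmpAB_lightweight because LD is ldbrx on
little-endian: the byte-reversed load puts the first byte in memory in
the most significant position, so unsigned integer order matches byte
order (on big-endian, plain ldx already has this property). A small
user-space illustration of that property (load_be64() is a hypothetical
helper using a GCC builtin, not part of this patch):
-------
#include <stdint.h>
#include <string.h>
#include <assert.h>

static uint64_t load_be64(const void *p)	/* like ldbrx on little-endian */
{
	uint64_t v;

	memcpy(&v, p, 8);
	return __builtin_bswap64(v);	/* first memory byte becomes most significant */
}

int main(void)
{
	const unsigned char a[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	const unsigned char b[8] = { 1, 2, 3, 9, 0, 0, 0, 0 };

	/* The first differing byte (index 3) decides both comparisons. */
	assert((memcmp(a, b, 8) < 0) == (load_be64(a) < load_be64(b)));
	return 0;
}
-------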

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/lib/memcmp_64.S | 99 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 93 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..6dccfb8 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,35 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
 #endif
 
 _GLOBAL(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses do not
+	 * have the same offset from an 8-byte boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to the short loop if fewer than 8 bytes
+	 * are being compared.
+	 */
+	cmpdi   cr6,r5,7
 
 	beq	cr1,.Lzero
 	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.L8bytes_make_align_start
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,6 +85,78 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.L8bytes_make_align_start:
+	/* Compare the leading bytes that are not 8-byte aligned so that
+	 * the remaining comparison can run on 8-byte aligned addresses.
+	 */
+	andi.   r6,r3,7
+	beq     .L8bytes_aligned
+
+	/* Try to compare the first double word, which is not 8-byte aligned:
+	 * load the double word at (src & ~7UL) and shift out the bytes that
+	 * precede src before the comparison.
+	 */
+	clrlwi  r6,r3,29
+	rlwinm  r6,r6,3,0,28
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	srwi	r6,r6,3
+	subfic  r6,r6,8
+	subfc.	r5,r6,r5
+	beq	.Lzero
+	addi	r3,r3,8
+	addi	r4,r4,8
+
+.L8bytes_aligned:
+	/* Now the addresses are 8-byte aligned.
+	 * Use the .Llong loop if 32 or more bytes remain.
+	 */
+	cmpdi   cr6,r5,31
+	bgt	cr6,.Llong
+
+	cmpdi   cr6,r5,7
+	bgt	cr6,.Lcmp_8bytes_31bytes
+
+.Lcmp_rest_lt8bytes:
+	/* Fewer than 8 bytes remain to compare and the addresses are
+	 * 8-byte aligned.
+	 * Load the next double words and shift them right by the
+	 * appropriate number of bits.
+	 */
+	subfic  r6,r5,8
+	rlwinm  r6,r6,3,0,28
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	beq	.Lzero
+
+.Lcmp_8bytes_31bytes:
+	/* compare 8 to 31 bytes, 8-byte aligned */
+	srdi.   r0,r5,3
+	clrldi  r5,r5,61
+	mtctr   r0
+831:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bdnz	831b
+
+	cmpwi   r5,0
+	beq	.Lzero
+	b	.Lcmp_rest_lt8bytes
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
@@ -232,4 +311,12 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:   /* skip NV GPRS restore */
+	li	r3,1
+	bgt	cr0,8f
+	li	r3,-1
+8:
+	blr
+
 EXPORT_SYMBOL(memcmp)
-- 
1.8.3.1
