[PATCH v2 1/3] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp().

All of lore.kernel.org
 help / color / mirror / Atom feed

From: wei.guo.simon@gmail.com
To: linuxppc-dev@lists.ozlabs.org
Cc: Paul Mackerras <paulus@ozlabs.org>,
	Michael Ellerman <mpe@ellerman.id.au>,
	"Naveen N.  Rao" <naveen.n.rao@linux.vnet.ibm.com>,
	David Laight <David.Laight@ACULAB.COM>,
	Christophe LEROY <christophe.leroy@c-s.fr>,
	Simon Guo <wei.guo.simon@gmail.com>
Subject: [PATCH v2 1/3] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp().
Date: Thu, 21 Sep 2017 07:34:38 +0800	[thread overview]
Message-ID: <1505950480-14830-2-git-send-email-wei.guo.simon@gmail.com> (raw)
In-Reply-To: <1505950480-14830-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

Currently the 64-bit powerpc memcmp() falls back to .Lshort
(byte-by-byte comparison) if either the src or dst address is not 8-byte
aligned. This can be optimized when both addresses have the same offset
from an 8-byte boundary.

In that case memcmp() can first compare the unaligned bytes up to the
8-byte boundary, and then compare the remaining 8-byte-aligned content
with .Llong mode.

This patch optimizes memcmp() behavior in this situation.

Test result:

(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
      50.996607479 seconds time elapsed                                          ( +- 0.01% )
- with patch
      28.033316997 seconds time elapsed                                          ( +- 0.01% )
		-> There is a ~+81% improvement

(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 10000

 int test_memcmp(const void *s1, const void *s2, size_t n);
--------

- Without patch
       0.392578831 seconds time elapsed                                          ( +- 0.05% )
- with patch
       0.358446662 seconds time elapsed                                          ( +- 0.04% )
		-> There is a ~+9% improvement

(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
 #include <string.h>
 #include "utils.h"

-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000

 int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
       3.168752060 seconds time elapsed                                          ( +- 0.10% )
- With patch
       3.153030138 seconds time elapsed                                          ( +- 0.09% )
		-> They are nearly the same. (-0.4%)

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/lib/memcmp_64.S | 99 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 93 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..6dccfb8 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,35 @@
 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
 #endif
 
 _GLOBAL(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not
+	 * with the same offset of 8 bytes align boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* fall back to short loop if compare at aligned addrs
+	 * with less than 8 bytes.
+	 */
+	cmpdi   cr6,r5,7
 
 	beq	cr1,.Lzero
 	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.L8bytes_make_align_start
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,6 +85,78 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.L8bytes_make_align_start:
+	/* attempt to compare bytes not aligned with 8 bytes so that
+	 * left comparison can run based on 8 bytes alignment.
+	 */
+	andi.   r6,r3,7
+	beq     .L8bytes_aligned
+
+	/* Try to compare the first double word which is not 8 bytes aligned:
+	 * load the first double word at (src & ~7UL) and shift left appropriate
+	 * bits before comparison.
+	 */
+	clrlwi  r6,r3,29
+	rlwinm  r6,r6,3,0,28
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	srwi	r6,r6,3
+	subfic  r6,r6,8
+	subfc.	r5,r6,r5
+	beq	.Lzero
+	addi	r3,r3,8
+	addi	r4,r4,8
+
+.L8bytes_aligned:
+	/* now we are aligned with 8 bytes.
+	 * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+	 */
+	cmpdi   cr6,r5,31
+	bgt	cr6,.Llong
+
+	cmpdi   cr6,r5,7
+	bgt	cr6,.Lcmp_8bytes_31bytes
+
+.Lcmp_rest_lt8bytes:
+	/* Here we have only less than 8 bytes to compare with. Addresses
+	 * are aligned with 8 bytes.
+	 * The next double words are load and shift right with appropriate
+	 * bits.
+	 */
+	subfic  r6,r5,8
+	rlwinm  r6,r6,3,0,28
+	LD	rA,0,r3
+	LD	rB,0,r4
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	beq	.Lzero
+
+.Lcmp_8bytes_31bytes:
+	/* compare 8 ~ 31 bytes with 8 bytes aligned */
+	srdi.   r0,r5,3
+	clrldi  r5,r5,61
+	mtctr   r0
+831:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bdnz	831b
+
+	cmpwi   r5,0
+	beq	.Lzero
+	b	.Lcmp_rest_lt8bytes
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
@@ -232,4 +311,12 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
+
+.LcmpAB_lightweight:   /* skip NV GPRS restore */
+	li	r3,1
+	bgt	cr0,8f
+	li	r3,-1
+8:
+	blr
+
 EXPORT_SYMBOL(memcmp)
-- 
1.8.3.1

next prev parent reply	other threads:[~2017-09-22  5:38 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-09-20 23:34 [PATCH v2 0/3] powerpc/64: memcmp() optimization wei.guo.simon
2017-09-20 23:34 ` wei.guo.simon [this message]
2017-09-20 23:34 ` [PATCH v2 2/3] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision wei.guo.simon
2017-09-21  0:54   ` Simon Guo
2017-09-22 14:06   ` Cyril Bur
2017-09-23 21:18     ` Simon Guo
2017-09-25 23:59       ` Cyril Bur
2017-09-26  5:34         ` Michael Ellerman
2017-09-26 11:26           ` Segher Boessenkool
2017-09-27  3:38             ` Michael Ellerman
2017-09-27  9:27               ` Segher Boessenkool
2017-09-27  9:43                 ` David Laight
2017-09-27 18:33                   ` Simon Guo
2017-09-28  9:24                     ` David Laight
2017-09-27 16:22               ` Simon Guo
2017-09-20 23:34 ` [PATCH v2 3/3] powerpc:selftest update memcmp_64 selftest for VMX implementation wei.guo.simon
2017-09-25  9:30   ` David Laight
2017-09-24  6:19     ` Simon Guo

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:d75d18b dfblob:6dccfb8 )
 OR (
bs:"[PATCH v2 1/3] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp()." )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1505950480-14830-2-git-send-email-wei.guo.simon@gmail.com \
    --to=wei.guo.simon@gmail.com \
    --cc=David.Laight@ACULAB.COM \
    --cc=christophe.leroy@c-s.fr \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=mpe@ellerman.id.au \
    --cc=naveen.n.rao@linux.vnet.ibm.com \
    --cc=paulus@ozlabs.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.