From: wei.guo.simon@gmail.com
To: linuxppc-dev@lists.ozlabs.org
Cc: Paul Mackerras <paulus@ozlabs.org>,
Michael Ellerman <mpe@ellerman.id.au>,
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>,
David Laight <David.Laight@ACULAB.COM>,
Christophe LEROY <christophe.leroy@c-s.fr>,
Simon Guo <wei.guo.simon@gmail.com>
Subject: [PATCH v2 1/3] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp().
Date: Thu, 21 Sep 2017 07:34:38 +0800 [thread overview]
Message-ID: <1505950480-14830-2-git-send-email-wei.guo.simon@gmail.com> (raw)
In-Reply-To: <1505950480-14830-1-git-send-email-wei.guo.simon@gmail.com>
From: Simon Guo <wei.guo.simon@gmail.com>
Currently the powerpc64 version of memcmp() falls back to .Lshort
(byte-by-byte comparison) if either the src or the dst address is not
8-byte aligned. This can be optimized when both addresses have the same
offset from an 8-byte boundary.
In that case memcmp() can first compare the unaligned bytes up to the next
8-byte boundary and then compare the remaining 8-byte-aligned content in
.Llong mode. This patch optimizes memcmp() behavior in this situation.
Test result:
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
50.996607479 seconds time elapsed ( +- 0.01% )
- with patch
28.033316997 seconds time elapsed ( +- 0.01% )
-> There is ~+81% improvement
(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
#include <string.h>
#include "utils.h"
-#define SIZE 256
+#define SIZE 32
#define ITERATIONS 10000
int test_memcmp(const void *s1, const void *s2, size_t n);
--------
- Without patch
0.392578831 seconds time elapsed ( +- 0.05% )
- with patch
0.358446662 seconds time elapsed ( +- 0.04% )
-> There is ~+9% improvement
(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
#include <string.h>
#include "utils.h"
-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000
int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
3.168752060 seconds time elapsed ( +- 0.10% )
- With patch
3.153030138 seconds time elapsed ( +- 0.09% )
-> They are nearly the same. (-0.4%)
Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
arch/powerpc/lib/memcmp_64.S | 99 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 93 insertions(+), 6 deletions(-)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..6dccfb8 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,35 @@
#define rH r31
#ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
#define LD ldbrx
#else
+#define LH lhzx
+#define LW lwzx
#define LD ldx
#endif
_GLOBAL(memcmp)
cmpdi cr1,r5,0
- /* Use the short loop if both strings are not 8B aligned */
- or r6,r3,r4
+ /* Use the short loop if the src/dst addresses are not
+ * with the same offset of 8 bytes align boundary.
+ */
+ xor r6,r3,r4
andi. r6,r6,7
- /* Use the short loop if length is less than 32B */
- cmpdi cr6,r5,31
+ /* fall back to short loop if compare at aligned addrs
+ * with less than 8 bytes.
+ */
+ cmpdi cr6,r5,7
beq cr1,.Lzero
bne .Lshort
- bgt cr6,.Llong
+ bgt cr6,.L8bytes_make_align_start
.Lshort:
mtctr r5
-
1: lbz rA,0(r3)
lbz rB,0(r4)
subf. rC,rB,rA
@@ -78,6 +85,78 @@ _GLOBAL(memcmp)
li r3,0
blr
+.L8bytes_make_align_start:
+ /* attempt to compare bytes not aligned with 8 bytes so that
+ * left comparison can run based on 8 bytes alignment.
+ */
+ andi. r6,r3,7
+ beq .L8bytes_aligned
+
+ /* Try to compare the first double word which is not 8 bytes aligned:
+ * load the first double word at (src & ~7UL) and shift left appropriate
+ * bits before comparision.
+ */
+ clrlwi r6,r3,29
+ rlwinm r6,r6,3,0,28
+ clrrdi r3,r3,3
+ clrrdi r4,r4,3
+ LD rA,0,r3
+ LD rB,0,r4
+ sld rA,rA,r6
+ sld rB,rB,r6
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ srwi r6,r6,3
+ subfic r6,r6,8
+ subfc. r5,r6,r5
+ beq .Lzero
+ addi r3,r3,8
+ addi r4,r4,8
+
+.L8bytes_aligned:
+ /* now we are aligned with 8 bytes.
+ * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+ */
+ cmpdi cr6,r5,31
+ bgt cr6,.Llong
+
+ cmpdi cr6,r5,7
+ bgt cr6,.Lcmp_8bytes_31bytes
+
+.Lcmp_rest_lt8bytes:
+ /* Here we have only less than 8 bytes to compare with. Addresses
+ * are aligned with 8 bytes.
+ * The next double words are load and shift right with appropriate
+ * bits.
+ */
+ subfic r6,r5,8
+ rlwinm r6,r6,3,0,28
+ LD rA,0,r3
+ LD rB,0,r4
+ srd rA,rA,r6
+ srd rB,rB,r6
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ beq .Lzero
+
+.Lcmp_8bytes_31bytes:
+ /* compare 8 ~ 31 bytes with 8 bytes aligned */
+ srdi. r0,r5,3
+ clrldi r5,r5,61
+ mtctr r0
+831:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ addi r3,r3,8
+ addi r4,r4,8
+ bdnz 831b
+
+ cmpwi r5,0
+ beq .Lzero
+ b .Lcmp_rest_lt8bytes
+
.Lnon_zero:
mr r3,rC
blr
@@ -232,4 +311,12 @@ _GLOBAL(memcmp)
ld r28,-32(r1)
ld r27,-40(r1)
blr
+
+.LcmpAB_lightweight: /* skip NV GPRS restore */
+ li r3,1
+ bgt cr0,8f
+ li r3,-1
+8:
+ blr
+
EXPORT_SYMBOL(memcmp)
--
1.8.3.1
next prev parent reply other threads:[~2017-09-22 5:38 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-09-20 23:34 [PATCH v2 0/3] powerpc/64: memcmp() optimization wei.guo.simon
2017-09-20 23:34 ` wei.guo.simon [this message]
2017-09-20 23:34 ` [PATCH v2 2/3] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision wei.guo.simon
2017-09-21 0:54 ` Simon Guo
2017-09-22 14:06 ` Cyril Bur
2017-09-23 21:18 ` Simon Guo
2017-09-25 23:59 ` Cyril Bur
2017-09-26 5:34 ` Michael Ellerman
2017-09-26 11:26 ` Segher Boessenkool
2017-09-27 3:38 ` Michael Ellerman
2017-09-27 9:27 ` Segher Boessenkool
2017-09-27 9:43 ` David Laight
2017-09-27 18:33 ` Simon Guo
2017-09-28 9:24 ` David Laight
2017-09-27 16:22 ` Simon Guo
2017-09-20 23:34 ` [PATCH v2 3/3] powerpc:selftest update memcmp_64 selftest for VMX implementation wei.guo.simon
2017-09-25 9:30 ` David Laight
2017-09-24 6:19 ` Simon Guo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1505950480-14830-2-git-send-email-wei.guo.simon@gmail.com \
--to=wei.guo.simon@gmail.com \
--cc=David.Laight@ACULAB.COM \
--cc=christophe.leroy@c-s.fr \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=mpe@ellerman.id.au \
--cc=naveen.n.rao@linux.vnet.ibm.com \
--cc=paulus@ozlabs.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).