From: wei.guo.simon@gmail.com
To: linuxppc-dev@lists.ozlabs.org
Cc: Paul Mackerras <paulus@ozlabs.org>,
Michael Ellerman <mpe@ellerman.id.au>,
"Naveen N. Rao" <naveen.n.rao@linux.vnet.ibm.com>,
David Laight <David.Laight@ACULAB.COM>,
Christophe LEROY <christophe.leroy@c-s.fr>,
Simon Guo <wei.guo.simon@gmail.com>
Subject: [PATCH v2 1/3] powerpc/64: Align bytes before fall back to .Lshort in powerpc64 memcmp().
Date: Thu, 21 Sep 2017 07:34:38 +0800 [thread overview]
Message-ID: <1505950480-14830-2-git-send-email-wei.guo.simon@gmail.com> (raw)
In-Reply-To: <1505950480-14830-1-git-send-email-wei.guo.simon@gmail.com>
From: Simon Guo <wei.guo.simon@gmail.com>
Currently the 64-bit powerpc version of memcmp() falls back to .Lshort
(byte-by-byte compare mode) if either the src or dst address is not 8-byte
aligned. This can be optimized when both addresses have the same offset
from an 8-byte boundary.
memcmp() can first compare the unaligned bytes up to the 8-byte boundary
and then compare the remaining 8-byte-aligned content in .Llong mode.
This patch optimizes memcmp() behavior in this situation.
Test result:
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
50.996607479 seconds time elapsed ( +- 0.01% )
- with patch
28.033316997 seconds time elapsed ( +- 0.01% )
-> There is ~81% improvement
(2) 32 bytes
To observe performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
#include <string.h>
#include "utils.h"
-#define SIZE 256
+#define SIZE 32
#define ITERATIONS 10000
int test_memcmp(const void *s1, const void *s2, size_t n);
--------
- Without patch
0.392578831 seconds time elapsed ( +- 0.05% )
- with patch
0.358446662 seconds time elapsed ( +- 0.04% )
-> There is ~+9% improvement
(3) 0~8 bytes
To observe <8 bytes performance impact, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c with following:
-------
#include <string.h>
#include "utils.h"
-#define SIZE 256
-#define ITERATIONS 10000
+#define SIZE 8
+#define ITERATIONS 1000000
int test_memcmp(const void *s1, const void *s2, size_t n);
-------
- Without patch
3.168752060 seconds time elapsed ( +- 0.10% )
- With patch
3.153030138 seconds time elapsed ( +- 0.09% )
-> They are nearly the same. (-0.4%)
Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
arch/powerpc/lib/memcmp_64.S | 99 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 93 insertions(+), 6 deletions(-)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..6dccfb8 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,35 @@
#define rH r31
#ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
#define LD ldbrx
#else
+#define LH lhzx
+#define LW lwzx
#define LD ldx
#endif
_GLOBAL(memcmp)
cmpdi cr1,r5,0
- /* Use the short loop if both strings are not 8B aligned */
- or r6,r3,r4
+ /* Use the short loop if the src/dst addresses are not
+ * with the same offset of 8 bytes align boundary.
+ */
+ xor r6,r3,r4
andi. r6,r6,7
- /* Use the short loop if length is less than 32B */
- cmpdi cr6,r5,31
+ /* fall back to short loop if compare at aligned addrs
+ * with less than 8 bytes.
+ */
+ cmpdi cr6,r5,7
beq cr1,.Lzero
bne .Lshort
- bgt cr6,.Llong
+ bgt cr6,.L8bytes_make_align_start
.Lshort:
mtctr r5
-
1: lbz rA,0(r3)
lbz rB,0(r4)
subf. rC,rB,rA
@@ -78,6 +85,78 @@ _GLOBAL(memcmp)
li r3,0
blr
+.L8bytes_make_align_start:
+ /* First compare the leading bytes that are not 8-byte aligned so that
+ * the remaining comparison can run on 8-byte-aligned addresses.
+ */
+ andi. r6,r3,7
+ beq .L8bytes_aligned
+
+ /* Try to compare the first double word which is not 8 bytes aligned:
+ * load the first double word at (src & ~7UL) and shift left by the
+ * appropriate number of bits before comparison.
+ */
+ clrlwi r6,r3,29
+ rlwinm r6,r6,3,0,28
+ clrrdi r3,r3,3
+ clrrdi r4,r4,3
+ LD rA,0,r3
+ LD rB,0,r4
+ sld rA,rA,r6
+ sld rB,rB,r6
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ srwi r6,r6,3
+ subfic r6,r6,8
+ subfc. r5,r6,r5
+ beq .Lzero
+ addi r3,r3,8
+ addi r4,r4,8
+
+.L8bytes_aligned:
+ /* now we are aligned with 8 bytes.
+ * Use .Llong loop if left cmp bytes are equal or greater than 32B.
+ */
+ cmpdi cr6,r5,31
+ bgt cr6,.Llong
+
+ cmpdi cr6,r5,7
+ bgt cr6,.Lcmp_8bytes_31bytes
+
+.Lcmp_rest_lt8bytes:
+ /* Here we have fewer than 8 bytes left to compare. Addresses
+ * are aligned to 8 bytes.
+ * The next double words are loaded and shifted right by the
+ * appropriate number of bits.
+ */
+ subfic r6,r5,8
+ rlwinm r6,r6,3,0,28
+ LD rA,0,r3
+ LD rB,0,r4
+ srd rA,rA,r6
+ srd rB,rB,r6
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ beq .Lzero
+
+.Lcmp_8bytes_31bytes:
+ /* compare 8 ~ 31 bytes with 8 bytes aligned */
+ srdi. r0,r5,3
+ clrldi r5,r5,61
+ mtctr r0
+831:
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr0,rA,rB
+ bne cr0,.LcmpAB_lightweight
+ addi r3,r3,8
+ addi r4,r4,8
+ bdnz 831b
+
+ cmpwi r5,0
+ beq .Lzero
+ b .Lcmp_rest_lt8bytes
+
.Lnon_zero:
mr r3,rC
blr
@@ -232,4 +311,12 @@ _GLOBAL(memcmp)
ld r28,-32(r1)
ld r27,-40(r1)
blr
+
+.LcmpAB_lightweight: /* skip NV GPRS restore */
+ li r3,1
+ bgt cr0,8f
+ li r3,-1
+8:
+ blr
+
EXPORT_SYMBOL(memcmp)
--
1.8.3.1
next prev parent reply other threads:[~2017-09-22 5:38 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-09-20 23:34 [PATCH v2 0/3] powerpc/64: memcmp() optimization wei.guo.simon
2017-09-20 23:34 ` wei.guo.simon [this message]
2017-09-20 23:34 ` [PATCH v2 2/3] powerpc/64: enhance memcmp() with VMX instruction for long bytes comparision wei.guo.simon
2017-09-21 0:54 ` Simon Guo
2017-09-22 14:06 ` Cyril Bur
2017-09-23 21:18 ` Simon Guo
2017-09-25 23:59 ` Cyril Bur
2017-09-26 5:34 ` Michael Ellerman
2017-09-26 11:26 ` Segher Boessenkool
2017-09-27 3:38 ` Michael Ellerman
2017-09-27 9:27 ` Segher Boessenkool
2017-09-27 9:43 ` David Laight
2017-09-27 18:33 ` Simon Guo
2017-09-28 9:24 ` David Laight
2017-09-27 16:22 ` Simon Guo
2017-09-20 23:34 ` [PATCH v2 3/3] powerpc:selftest update memcmp_64 selftest for VMX implementation wei.guo.simon
2017-09-25 9:30 ` David Laight
2017-09-24 6:19 ` Simon Guo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1505950480-14830-2-git-send-email-wei.guo.simon@gmail.com \
--to=wei.guo.simon@gmail.com \
--cc=David.Laight@ACULAB.COM \
--cc=christophe.leroy@c-s.fr \
--cc=linuxppc-dev@lists.ozlabs.org \
--cc=mpe@ellerman.id.au \
--cc=naveen.n.rao@linux.vnet.ibm.com \
--cc=paulus@ozlabs.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.