* [PATCH v3 1/3] powerpc/lib: move PPC32 specific functions out of string.S
From: Christophe Leroy @ 2018-05-22 16:06 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
Cc: linux-kernel, linuxppc-dev
In preparation for the optimisation patches that follow, move the
PPC32-specific memcmp() and __clear_user() into string_32.S.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/Makefile | 5 +--
arch/powerpc/lib/string.S | 61 -------------------------------------
arch/powerpc/lib/string_32.S | 72 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 75 insertions(+), 63 deletions(-)
create mode 100644 arch/powerpc/lib/string_32.S
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 653901042ad7..2c9b8c0adf22 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -26,13 +26,14 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
memcpy_power7.o
obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
- string_64.o memcpy_64.o memcmp_64.o pmem.o
+ memcpy_64.o memcmp_64.o pmem.o
obj64-$(CONFIG_SMP) += locks.o
obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
-obj-y += checksum_$(BITS).o checksum_wrappers.o
+obj-y += checksum_$(BITS).o checksum_wrappers.o \
+ string_$(BITS).o
obj-y += sstep.o ldstfp.o quad.o
obj64-y += quad.o
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 0378def28d41..b03c1a6861f4 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -56,23 +56,6 @@ _GLOBAL(strncmp)
blr
EXPORT_SYMBOL(strncmp)
-#ifdef CONFIG_PPC32
-_GLOBAL(memcmp)
- PPC_LCMPI 0,r5,0
- beq- 2f
- mtctr r5
- addi r6,r3,-1
- addi r4,r4,-1
-1: lbzu r3,1(r6)
- lbzu r0,1(r4)
- subf. r3,r0,r3
- bdnzt 2,1b
- blr
-2: li r3,0
- blr
-EXPORT_SYMBOL(memcmp)
-#endif
-
_GLOBAL(memchr)
PPC_LCMPI 0,r5,0
beq- 2f
@@ -86,47 +69,3 @@ _GLOBAL(memchr)
2: li r3,0
blr
EXPORT_SYMBOL(memchr)
-
-#ifdef CONFIG_PPC32
-_GLOBAL(__clear_user)
- addi r6,r3,-4
- li r3,0
- li r5,0
- cmplwi 0,r4,4
- blt 7f
- /* clear a single word */
-11: stwu r5,4(r6)
- beqlr
- /* clear word sized chunks */
- andi. r0,r6,3
- add r4,r0,r4
- subf r6,r0,r6
- srwi r0,r4,2
- andi. r4,r4,3
- mtctr r0
- bdz 7f
-1: stwu r5,4(r6)
- bdnz 1b
- /* clear byte sized chunks */
-7: cmpwi 0,r4,0
- beqlr
- mtctr r4
- addi r6,r6,3
-8: stbu r5,1(r6)
- bdnz 8b
- blr
-90: mr r3,r4
- blr
-91: mfctr r3
- slwi r3,r3,2
- add r3,r3,r4
- blr
-92: mfctr r3
- blr
-
- EX_TABLE(11b, 90b)
- EX_TABLE(1b, 91b)
- EX_TABLE(8b, 92b)
-
-EXPORT_SYMBOL(__clear_user)
-#endif
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
new file mode 100644
index 000000000000..204db8a834fd
--- /dev/null
+++ b/arch/powerpc/lib/string_32.S
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * String handling functions for PowerPC32
+ *
+ * Copyright (C) 1996 Paul Mackerras.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+
+ .text
+
+_GLOBAL(memcmp)
+ cmpwi cr0, r5, 0
+ beq- 2f
+ mtctr r5
+ addi r6,r3,-1
+ addi r4,r4,-1
+1: lbzu r3,1(r6)
+ lbzu r0,1(r4)
+ subf. r3,r0,r3
+ bdnzt 2,1b
+ blr
+2: li r3,0
+ blr
+EXPORT_SYMBOL(memcmp)
+
+_GLOBAL(__clear_user)
+ addi r6,r3,-4
+ li r3,0
+ li r5,0
+ cmplwi 0,r4,4
+ blt 7f
+ /* clear a single word */
+11: stwu r5,4(r6)
+ beqlr
+ /* clear word sized chunks */
+ andi. r0,r6,3
+ add r4,r0,r4
+ subf r6,r0,r6
+ srwi r0,r4,2
+ andi. r4,r4,3
+ mtctr r0
+ bdz 7f
+1: stwu r5,4(r6)
+ bdnz 1b
+ /* clear byte sized chunks */
+7: cmpwi 0,r4,0
+ beqlr
+ mtctr r4
+ addi r6,r6,3
+8: stbu r5,1(r6)
+ bdnz 8b
+ blr
+90: mr r3,r4
+ blr
+91: mfctr r3
+ slwi r3,r3,2
+ add r3,r3,r4
+ blr
+92: mfctr r3
+ blr
+
+ EX_TABLE(11b, 90b)
+ EX_TABLE(1b, 91b)
+ EX_TABLE(8b, 92b)
+
+EXPORT_SYMBOL(__clear_user)
--
2.13.3
* [PATCH v3 2/3] powerpc/lib: optimise 32 bits __clear_user()
From: Christophe Leroy @ 2018-05-22 16:06 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
Cc: linux-kernel, linuxppc-dev
Rewrite __clear_user() on the same principle as memset(0), making use
of dcbz to clear complete cache lines.
This code is a copy of memset(), modified so that the remaining number
of bytes to be cleared is tracked, as that count has to be returned in
case of a fault.
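For readers who prefer C, the strategy implemented by the assembly can be
sketched roughly as follows. This is illustrative only, not the patch
itself: the 16-byte line size, the inline dcbz, and the absence of fault
handling (the EX_TABLE entries that let __clear_user() return the number
of uncleared bytes) are simplifications.

#include <stdint.h>

#define CACHE_LINE_BYTES 16u	/* 8xx L1 line size, assumed for illustration */

/*
 * Illustrative sketch of the clearing strategy: align to a word, store
 * words up to the next cache line boundary, zero whole lines with dcbz,
 * then finish the tail.  The destination must be cacheable for dcbz to
 * work, and fault handling is deliberately omitted.
 */
static void clear_sketch(void *to, unsigned long n)
{
	uint8_t *p = to;

	while (n && ((uintptr_t)p & 3)) {		/* leading bytes */
		*p++ = 0;
		n--;
	}
	while (n >= 4 && ((uintptr_t)p & (CACHE_LINE_BYTES - 1))) {
		*(uint32_t *)p = 0;			/* words up to the line boundary */
		p += 4;
		n -= 4;
	}
	while (n >= CACHE_LINE_BYTES) {			/* one dcbz zeroes a full line */
		__asm__ volatile("dcbz 0, %0" : : "r"(p) : "memory");
		p += CACHE_LINE_BYTES;
		n -= CACHE_LINE_BYTES;
	}
	while (n >= 4) {				/* trailing words */
		*(uint32_t *)p = 0;
		p += 4;
		n -= 4;
	}
	while (n--)					/* trailing bytes */
		*p++ = 0;
}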
On an MPC885, throughput is almost doubled (dd reading from /dev/zero
fills the user buffer through __clear_user(), so it exercises this path
directly):
Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s
After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s
On an MPC8321, throughput is multiplied by 2.12:
Before:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s
After:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/string_32.S | 85 +++++++++++++++++++++++++++++++-------------
1 file changed, 60 insertions(+), 25 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 204db8a834fd..40a576d56ac7 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -11,6 +11,7 @@
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/cache.h>
.text
@@ -29,44 +30,78 @@ _GLOBAL(memcmp)
blr
EXPORT_SYMBOL(memcmp)
+CACHELINE_BYTES = L1_CACHE_BYTES
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT
+CACHELINE_MASK = (L1_CACHE_BYTES-1)
+
_GLOBAL(__clear_user)
- addi r6,r3,-4
- li r3,0
- li r5,0
- cmplwi 0,r4,4
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero. This requires that the destination
+ * area is cacheable.
+ */
+ cmplwi cr0, r4, 4
+ mr r10, r3
+ li r3, 0
blt 7f
- /* clear a single word */
-11: stwu r5,4(r6)
+
+11: stw r3, 0(r10)
beqlr
- /* clear word sized chunks */
- andi. r0,r6,3
- add r4,r0,r4
- subf r6,r0,r6
- srwi r0,r4,2
- andi. r4,r4,3
+ andi. r0, r10, 3
+ add r11, r0, r4
+ subf r6, r0, r10
+
+ clrlwi r7, r6, 32 - LG_CACHELINE_BYTES
+ add r8, r7, r11
+ srwi r9, r8, LG_CACHELINE_BYTES
+ addic. r9, r9, -1 /* total number of complete cachelines */
+ ble 2f
+ xori r0, r7, CACHELINE_MASK & ~3
+ srwi. r0, r0, 2
+ beq 3f
+ mtctr r0
+4: stwu r3, 4(r6)
+ bdnz 4b
+3: mtctr r9
+ li r7, 4
+10: dcbz r7, r6
+ addi r6, r6, CACHELINE_BYTES
+ bdnz 10b
+ clrlwi r11, r8, 32 - LG_CACHELINE_BYTES
+ addi r11, r11, 4
+
+2: srwi r0, r11, 2
mtctr r0
- bdz 7f
-1: stwu r5,4(r6)
+ bdz 6f
+1: stwu r3, 4(r6)
bdnz 1b
- /* clear byte sized chunks */
-7: cmpwi 0,r4,0
+6: andi. r11, r11, 3
beqlr
- mtctr r4
- addi r6,r6,3
-8: stbu r5,1(r6)
+ mtctr r11
+ addi r6, r6, 3
+8: stbu r3, 1(r6)
bdnz 8b
blr
-90: mr r3,r4
+
+7: cmpwi cr0, r4, 0
+ beqlr
+ mtctr r4
+ addi r6, r10, -1
+9: stbu r3, 1(r6)
+ bdnz 9b
blr
-91: mfctr r3
- slwi r3,r3,2
- add r3,r3,r4
+
+90: mr r3, r4
blr
-92: mfctr r3
+91: add r3, r10, r4
+ subf r3, r6, r3
blr
EX_TABLE(11b, 90b)
+ EX_TABLE(4b, 91b)
+ EX_TABLE(10b, 91b)
EX_TABLE(1b, 91b)
- EX_TABLE(8b, 92b)
+ EX_TABLE(8b, 91b)
+ EX_TABLE(9b, 91b)
EXPORT_SYMBOL(__clear_user)
--
2.13.3
* [PATCH v3 3/3] powerpc/lib: optimise PPC32 memcmp
From: Christophe Leroy @ 2018-05-22 16:06 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
Cc: linux-kernel, linuxppc-dev
At present, memcmp() compares two chunks of memory byte by byte.
This patch optimises the comparison by comparing them word by word.
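The idea, sketched in C (illustrative only, not the proposed
implementation): compare word-sized chunks as unsigned integers,
byte-swapping them on little-endian (the lwbrx loads in the assembly) so
that the numeric comparison matches memcmp() byte ordering, then finish
the last 1 to 3 bytes byte by byte. The __LITTLE_ENDIAN__ test mirrors
the macro used in the patch; other compilers would check __BYTE_ORDER__.

#include <stdint.h>
#include <string.h>

/* Illustrative sketch of the word-by-word comparison, not the actual code. */
static int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const uint8_t *p1 = s1, *p2 = s2;

	while (n >= 4) {
		uint32_t a, b;

		memcpy(&a, p1, 4);		/* unaligned-safe word loads */
		memcpy(&b, p2, 4);
#ifdef __LITTLE_ENDIAN__
		a = __builtin_bswap32(a);	/* mirrors lwbrx: the byte at the */
		b = __builtin_bswap32(b);	/* lowest address is most significant */
#endif
		if (a != b)
			return a < b ? -1 : 1;
		p1 += 4;
		p2 += 4;
		n -= 4;
	}
	while (n--) {				/* remaining 1 to 3 bytes */
		if (*p1 != *p2)
			return *p1 - *p2;
		p1++;
		p2++;
	}
	return 0;
}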
A small benchmark performed on an 8xx, comparing two chunks
of 512 bytes 100000 times, gives:
Before: 5852274 TB ticks
After:  1488638 TB ticks
This is almost 4 times faster.
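The benchmark harness itself is not part of the series; a minimal
kernel-side measurement loop along these lines would produce comparable
numbers (the buffer handling, loop count, and get_tb() usage here are
assumptions for illustration):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <asm/time.h>

/* Hypothetical measurement loop, not the harness actually used. */
static void memcmp_bench(void)
{
	u8 *a = kzalloc(512, GFP_KERNEL);
	u8 *b = kzalloc(512, GFP_KERNEL);
	u64 t0, ticks;
	int i, r = 0;

	if (!a || !b)
		goto out;

	t0 = get_tb();
	for (i = 0; i < 100000; i++)
		r |= memcmp(a, b, 512);	/* use the result so calls are not optimised out */
	ticks = get_tb() - t0;
	pr_info("memcmp x100000: %llu TB ticks (r=%d)\n",
		(unsigned long long)ticks, r);
out:
	kfree(a);
	kfree(b);
}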
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/string_32.S | 47 +++++++++++++++++++++++++++++++++++++-------
1 file changed, 40 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 40a576d56ac7..d83b7d996f61 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -18,16 +18,49 @@
_GLOBAL(memcmp)
cmpwi cr0, r5, 0
beq- 2f
- mtctr r5
- addi r6,r3,-1
- addi r4,r4,-1
-1: lbzu r3,1(r6)
- lbzu r0,1(r4)
- subf. r3,r0,r3
- bdnzt 2,1b
+ srawi. r7, r5, 2 /* Divide len by 4 */
+ mr r6, r3
+ beq- 3f
+ mtctr r7
+ li r7, 0
+1:
+#ifdef __LITTLE_ENDIAN__
+ lwbrx r3, r6, r7
+ lwbrx r0, r4, r7
+#else
+ lwzx r3, r6, r7
+ lwzx r0, r4, r7
+#endif
+ addi r7, r7, 4
+ cmplw cr0, r3, r0
+ bdnzt eq, 1b
+ bne 5f
+ andi. r5, r5, 3
+ li r3, 0
+ beqlr
+3: cmplwi cr1, r5, 2
+ blt- cr1, 4f
+#ifdef __LITTLE_ENDIAN__
+ lhbrx r3, r6, r7
+ lhbrx r0, r4, r7
+#else
+ lhzx r3, r6, r7
+ lhzx r0, r4, r7
+#endif
+ addi r7, r7, 2
+ subf. r3, r0, r3
+ beqlr cr1
+ bnelr
+4: lbzx r3, r6, r7
+ lbzx r0, r4, r7
+ subf. r3, r0, r3
blr
2: li r3,0
blr
+5: li r3, 1
+ bgtlr
+ li r3, -1
+ blr
EXPORT_SYMBOL(memcmp)
CACHELINE_BYTES = L1_CACHE_BYTES
--
2.13.3
* [PATCH v4 3/3] powerpc/lib: optimise PPC32 memcmp
From: Christophe Leroy @ 2018-05-23 7:47 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
Cc: linuxppc-dev, linux-kernel
At present, memcmp() compares two chunks of memory byte by byte.
This patch optimises the comparison by comparing them word by word.
A small benchmark performed on an 8xx, comparing two chunks
of 512 bytes 100000 times, gives:
Before: 5852274 TB ticks
After:  1488638 TB ticks
This is almost 4 times faster.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
Not resending the entire series.
v4: Dropped the special handling for a length of 0; it is now handled
through the small-length path.
arch/powerpc/lib/string_32.S | 48 +++++++++++++++++++++++++++++++++++---------
1 file changed, 38 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 40a576d56ac7..542e6cecbcaf 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -16,17 +16,45 @@
.text
_GLOBAL(memcmp)
- cmpwi cr0, r5, 0
- beq- 2f
- mtctr r5
- addi r6,r3,-1
- addi r4,r4,-1
-1: lbzu r3,1(r6)
- lbzu r0,1(r4)
- subf. r3,r0,r3
- bdnzt 2,1b
+ srawi. r7, r5, 2 /* Divide len by 4 */
+ mr r6, r3
+ beq- 3f
+ mtctr r7
+ li r7, 0
+1:
+#ifdef __LITTLE_ENDIAN__
+ lwbrx r3, r6, r7
+ lwbrx r0, r4, r7
+#else
+ lwzx r3, r6, r7
+ lwzx r0, r4, r7
+#endif
+ addi r7, r7, 4
+ cmplw cr0, r3, r0
+ bdnzt eq, 1b
+ bne 5f
+3: andi. r3, r5, 3
+ beqlr
+ cmplwi cr1, r3, 2
+ blt- cr1, 4f
+#ifdef __LITTLE_ENDIAN__
+ lhbrx r3, r6, r7
+ lhbrx r0, r4, r7
+#else
+ lhzx r3, r6, r7
+ lhzx r0, r4, r7
+#endif
+ addi r7, r7, 2
+ subf. r3, r0, r3
+ beqlr cr1
+ bnelr
+4: lbzx r3, r6, r7
+ lbzx r0, r4, r7
+ subf. r3, r0, r3
blr
-2: li r3,0
+5: li r3, 1
+ bgtlr
+ li r3, -1
blr
EXPORT_SYMBOL(memcmp)
--
2.13.3
* Re: [PATCH v4 3/3] powerpc/lib: optimise PPC32 memcmp
From: Segher Boessenkool @ 2018-05-24 17:24 UTC (permalink / raw)
To: Christophe Leroy
Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
linuxppc-dev, linux-kernel
On Wed, May 23, 2018 at 09:47:32AM +0200, Christophe Leroy wrote:
> At present, memcmp() compares two chunks of memory byte by byte.
>
> This patch optimises the comparison by comparing them word by word.
>
> A small benchmark performed on an 8xx, comparing two chunks
> of 512 bytes 100000 times, gives:
>
> Before: 5852274 TB ticks
> After:  1488638 TB ticks
> diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
> index 40a576d56ac7..542e6cecbcaf 100644
> --- a/arch/powerpc/lib/string_32.S
> +++ b/arch/powerpc/lib/string_32.S
> @@ -16,17 +16,45 @@
> .text
>
> _GLOBAL(memcmp)
> - cmpwi cr0, r5, 0
> - beq- 2f
> - mtctr r5
> - addi r6,r3,-1
> - addi r4,r4,-1
> -1: lbzu r3,1(r6)
> - lbzu r0,1(r4)
> - subf. r3,r0,r3
> - bdnzt 2,1b
> + srawi. r7, r5, 2 /* Divide len by 4 */
> + mr r6, r3
> + beq- 3f
> + mtctr r7
> + li r7, 0
> +1:
> +#ifdef __LITTLE_ENDIAN__
> + lwbrx r3, r6, r7
> + lwbrx r0, r4, r7
> +#else
> + lwzx r3, r6, r7
> + lwzx r0, r4, r7
> +#endif
You don't test whether the pointers are word-aligned. Does that work?
Say, when a load is crossing a page boundary, or segment boundary.
Segher
* Re: [PATCH v4 3/3] powerpc/lib: optimise PPC32 memcmp
From: Christophe LEROY @ 2018-05-25 5:55 UTC (permalink / raw)
To: Segher Boessenkool
Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
linuxppc-dev, linux-kernel
On 24/05/2018 at 19:24, Segher Boessenkool wrote:
> On Wed, May 23, 2018 at 09:47:32AM +0200, Christophe Leroy wrote:
>> At present, memcmp() compares two chunks of memory byte by byte.
>>
>> This patch optimises the comparison by comparing them word by word.
>>
>> A small benchmark performed on an 8xx, comparing two chunks
>> of 512 bytes 100000 times, gives:
>>
>> Before: 5852274 TB ticks
>> After:  1488638 TB ticks
>
>> diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
>> index 40a576d56ac7..542e6cecbcaf 100644
>> --- a/arch/powerpc/lib/string_32.S
>> +++ b/arch/powerpc/lib/string_32.S
>> @@ -16,17 +16,45 @@
>> .text
>>
>> _GLOBAL(memcmp)
>> - cmpwi cr0, r5, 0
>> - beq- 2f
>> - mtctr r5
>> - addi r6,r3,-1
>> - addi r4,r4,-1
>> -1: lbzu r3,1(r6)
>> - lbzu r0,1(r4)
>> - subf. r3,r0,r3
>> - bdnzt 2,1b
>> + srawi. r7, r5, 2 /* Divide len by 4 */
>> + mr r6, r3
>> + beq- 3f
>> + mtctr r7
>> + li r7, 0
>> +1:
>> +#ifdef __LITTLE_ENDIAN__
>> + lwbrx r3, r6, r7
>> + lwbrx r0, r4, r7
>> +#else
>> + lwzx r3, r6, r7
>> + lwzx r0, r4, r7
>> +#endif
>
> You don't test whether the pointers are word-aligned. Does that work?
copy_tofrom_user() word-aligns the store address but does not take care
of the load address, so I believe it works.
That said, I just read in the MPC885 Reference Manual that an unaligned
access generates an alignment exception when the processor runs in LE mode.
Referring to the discussion on the patch "powerpc/32be: use stmw/lmw for
registers save/restore in asm"
(https://patchwork.ozlabs.org/patch/899465/), I will drop the handling
for LE mode.
Christophe
> Say, when a load is crossing a page boundary, or segment boundary.
>
>
> Segher
>