* [PATCH v3 1/3] powerpc/lib: move PPC32 specific functions out of string.S
From: Christophe Leroy @ 2018-05-22 16:06 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
Cc: linux-kernel, linuxppc-dev
In preparation for the optimisation patches that follow, move the
PPC32-specific memcmp() and __clear_user() into string_32.S.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/Makefile | 5 +--
arch/powerpc/lib/string.S | 61 -------------------------------------
arch/powerpc/lib/string_32.S | 72 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 75 insertions(+), 63 deletions(-)
create mode 100644 arch/powerpc/lib/string_32.S
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 653901042ad7..2c9b8c0adf22 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -26,13 +26,14 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o copypage_power7.o \
memcpy_power7.o
obj64-y += copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
- string_64.o memcpy_64.o memcmp_64.o pmem.o
+ memcpy_64.o memcmp_64.o pmem.o
obj64-$(CONFIG_SMP) += locks.o
obj64-$(CONFIG_ALTIVEC) += vmx-helper.o
obj64-$(CONFIG_KPROBES_SANITY_TEST) += test_emulate_step.o
-obj-y += checksum_$(BITS).o checksum_wrappers.o
+obj-y += checksum_$(BITS).o checksum_wrappers.o \
+ string_$(BITS).o
obj-y += sstep.o ldstfp.o quad.o
obj64-y += quad.o
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 0378def28d41..b03c1a6861f4 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -56,23 +56,6 @@ _GLOBAL(strncmp)
blr
EXPORT_SYMBOL(strncmp)
-#ifdef CONFIG_PPC32
-_GLOBAL(memcmp)
- PPC_LCMPI 0,r5,0
- beq- 2f
- mtctr r5
- addi r6,r3,-1
- addi r4,r4,-1
-1: lbzu r3,1(r6)
- lbzu r0,1(r4)
- subf. r3,r0,r3
- bdnzt 2,1b
- blr
-2: li r3,0
- blr
-EXPORT_SYMBOL(memcmp)
-#endif
-
_GLOBAL(memchr)
PPC_LCMPI 0,r5,0
beq- 2f
@@ -86,47 +69,3 @@ _GLOBAL(memchr)
2: li r3,0
blr
EXPORT_SYMBOL(memchr)
-
-#ifdef CONFIG_PPC32
-_GLOBAL(__clear_user)
- addi r6,r3,-4
- li r3,0
- li r5,0
- cmplwi 0,r4,4
- blt 7f
- /* clear a single word */
-11: stwu r5,4(r6)
- beqlr
- /* clear word sized chunks */
- andi. r0,r6,3
- add r4,r0,r4
- subf r6,r0,r6
- srwi r0,r4,2
- andi. r4,r4,3
- mtctr r0
- bdz 7f
-1: stwu r5,4(r6)
- bdnz 1b
- /* clear byte sized chunks */
-7: cmpwi 0,r4,0
- beqlr
- mtctr r4
- addi r6,r6,3
-8: stbu r5,1(r6)
- bdnz 8b
- blr
-90: mr r3,r4
- blr
-91: mfctr r3
- slwi r3,r3,2
- add r3,r3,r4
- blr
-92: mfctr r3
- blr
-
- EX_TABLE(11b, 90b)
- EX_TABLE(1b, 91b)
- EX_TABLE(8b, 92b)
-
-EXPORT_SYMBOL(__clear_user)
-#endif
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
new file mode 100644
index 000000000000..204db8a834fd
--- /dev/null
+++ b/arch/powerpc/lib/string_32.S
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * String handling functions for PowerPC32
+ *
+ * Copyright (C) 1996 Paul Mackerras.
+ *
+ */
+
+#include <asm/processor.h>
+#include <asm/errno.h>
+#include <asm/ppc_asm.h>
+#include <asm/export.h>
+
+ .text
+
+_GLOBAL(memcmp)
+ cmpwi cr0, r5, 0
+ beq- 2f
+ mtctr r5
+ addi r6,r3,-1
+ addi r4,r4,-1
+1: lbzu r3,1(r6)
+ lbzu r0,1(r4)
+ subf. r3,r0,r3
+ bdnzt 2,1b
+ blr
+2: li r3,0
+ blr
+EXPORT_SYMBOL(memcmp)
+
+_GLOBAL(__clear_user)
+ addi r6,r3,-4
+ li r3,0
+ li r5,0
+ cmplwi 0,r4,4
+ blt 7f
+ /* clear a single word */
+11: stwu r5,4(r6)
+ beqlr
+ /* clear word sized chunks */
+ andi. r0,r6,3
+ add r4,r0,r4
+ subf r6,r0,r6
+ srwi r0,r4,2
+ andi. r4,r4,3
+ mtctr r0
+ bdz 7f
+1: stwu r5,4(r6)
+ bdnz 1b
+ /* clear byte sized chunks */
+7: cmpwi 0,r4,0
+ beqlr
+ mtctr r4
+ addi r6,r6,3
+8: stbu r5,1(r6)
+ bdnz 8b
+ blr
+90: mr r3,r4
+ blr
+91: mfctr r3
+ slwi r3,r3,2
+ add r3,r3,r4
+ blr
+92: mfctr r3
+ blr
+
+ EX_TABLE(11b, 90b)
+ EX_TABLE(1b, 91b)
+ EX_TABLE(8b, 92b)
+
+EXPORT_SYMBOL(__clear_user)
--
2.13.3
* [PATCH v3 2/3] powerpc/lib: optimise 32 bits __clear_user()
From: Christophe Leroy @ 2018-05-22 16:06 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
Cc: linux-kernel, linuxppc-dev
Rewrite __clear_user() on the same principle as memset(0), making use
of dcbz to clear complete cache lines.
This code is a copy of memset(), modified so that the remaining number
of bytes to be cleared is tracked, as that count has to be returned in
case of a fault.
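For readers who prefer C, the strategy implemented by the assembly can be
sketched roughly as follows. This is illustrative only, not the patch
itself: the 16-byte line size, the inline dcbz, and the absence of fault
handling (the EX_TABLE entries that let __clear_user() return the number
of uncleared bytes) are simplifications.

#include <stdint.h>

#define CACHE_LINE_BYTES 16u	/* 8xx L1 line size, assumed for illustration */

/*
 * Illustrative sketch of the clearing strategy: align to a word, store
 * words up to the next cache line boundary, zero whole lines with dcbz,
 * then finish the tail.  The destination must be cacheable for dcbz to
 * work, and fault handling is deliberately omitted.
 */
static void clear_sketch(void *to, unsigned long n)
{
	uint8_t *p = to;

	while (n && ((uintptr_t)p & 3)) {		/* leading bytes */
		*p++ = 0;
		n--;
	}
	while (n >= 4 && ((uintptr_t)p & (CACHE_LINE_BYTES - 1))) {
		*(uint32_t *)p = 0;			/* words up to the line boundary */
		p += 4;
		n -= 4;
	}
	while (n >= CACHE_LINE_BYTES) {			/* one dcbz zeroes a full line */
		__asm__ volatile("dcbz 0, %0" : : "r"(p) : "memory");
		p += CACHE_LINE_BYTES;
		n -= CACHE_LINE_BYTES;
	}
	while (n >= 4) {				/* trailing words */
		*(uint32_t *)p = 0;
		p += 4;
		n -= 4;
	}
	while (n--)					/* trailing bytes */
		*p++ = 0;
}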
On an MPC885, throughput is almost doubled (dd reading from /dev/zero
fills the user buffer through __clear_user(), so it exercises this path
directly):
Before:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 18.990779 seconds, 52.7MB/s
After:
~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 9.611468 seconds, 104.0MB/s
On an MPC8321, throughput is multiplied by 2.12:
Before:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 6.844352 seconds, 146.1MB/s
After:
root@vgoippro:~# dd if=/dev/zero of=/dev/null bs=1M count=1000
1048576000 bytes (1000.0MB) copied, 3.218854 seconds, 310.7MB/s
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/string_32.S | 85 +++++++++++++++++++++++++++++++-------------
1 file changed, 60 insertions(+), 25 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 204db8a834fd..40a576d56ac7 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -11,6 +11,7 @@
#include <asm/errno.h>
#include <asm/ppc_asm.h>
#include <asm/export.h>
+#include <asm/cache.h>
.text
@@ -29,44 +30,78 @@ _GLOBAL(memcmp)
blr
EXPORT_SYMBOL(memcmp)
+CACHELINE_BYTES = L1_CACHE_BYTES
+LG_CACHELINE_BYTES = L1_CACHE_SHIFT
+CACHELINE_MASK = (L1_CACHE_BYTES-1)
+
_GLOBAL(__clear_user)
- addi r6,r3,-4
- li r3,0
- li r5,0
- cmplwi 0,r4,4
+/*
+ * Use dcbz on the complete cache lines in the destination
+ * to set them to zero. This requires that the destination
+ * area is cacheable.
+ */
+ cmplwi cr0, r4, 4
+ mr r10, r3
+ li r3, 0
blt 7f
- /* clear a single word */
-11: stwu r5,4(r6)
+
+11: stw r3, 0(r10)
beqlr
- /* clear word sized chunks */
- andi. r0,r6,3
- add r4,r0,r4
- subf r6,r0,r6
- srwi r0,r4,2
- andi. r4,r4,3
+ andi. r0, r10, 3
+ add r11, r0, r4
+ subf r6, r0, r10
+
+ clrlwi r7, r6, 32 - LG_CACHELINE_BYTES
+ add r8, r7, r11
+ srwi r9, r8, LG_CACHELINE_BYTES
+ addic. r9, r9, -1 /* total number of complete cachelines */
+ ble 2f
+ xori r0, r7, CACHELINE_MASK & ~3
+ srwi. r0, r0, 2
+ beq 3f
+ mtctr r0
+4: stwu r3, 4(r6)
+ bdnz 4b
+3: mtctr r9
+ li r7, 4
+10: dcbz r7, r6
+ addi r6, r6, CACHELINE_BYTES
+ bdnz 10b
+ clrlwi r11, r8, 32 - LG_CACHELINE_BYTES
+ addi r11, r11, 4
+
+2: srwi r0, r11, 2
mtctr r0
- bdz 7f
-1: stwu r5,4(r6)
+ bdz 6f
+1: stwu r3, 4(r6)
bdnz 1b
- /* clear byte sized chunks */
-7: cmpwi 0,r4,0
+6: andi. r11, r11, 3
beqlr
- mtctr r4
- addi r6,r6,3
-8: stbu r5,1(r6)
+ mtctr r11
+ addi r6, r6, 3
+8: stbu r3, 1(r6)
bdnz 8b
blr
-90: mr r3,r4
+
+7: cmpwi cr0, r4, 0
+ beqlr
+ mtctr r4
+ addi r6, r10, -1
+9: stbu r3, 1(r6)
+ bdnz 9b
blr
-91: mfctr r3
- slwi r3,r3,2
- add r3,r3,r4
+
+90: mr r3, r4
blr
-92: mfctr r3
+91: add r3, r10, r4
+ subf r3, r6, r3
blr
EX_TABLE(11b, 90b)
+ EX_TABLE(4b, 91b)
+ EX_TABLE(10b, 91b)
EX_TABLE(1b, 91b)
- EX_TABLE(8b, 92b)
+ EX_TABLE(8b, 91b)
+ EX_TABLE(9b, 91b)
EXPORT_SYMBOL(__clear_user)
--
2.13.3
* [PATCH v3 3/3] powerpc/lib: optimise PPC32 memcmp
From: Christophe Leroy @ 2018-05-22 16:06 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
Cc: linux-kernel, linuxppc-dev
At present, memcmp() compares two chunks of memory byte by byte.
This patch optimises the comparison by comparing them word by word.
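The idea, sketched in C (illustrative only, not the proposed
implementation): compare word-sized chunks as unsigned integers,
byte-swapping them on little-endian (the lwbrx loads in the assembly) so
that the numeric comparison matches memcmp() byte ordering, then finish
the last 1 to 3 bytes byte by byte. The __LITTLE_ENDIAN__ test mirrors
the macro used in the patch; other compilers would check __BYTE_ORDER__.

#include <stdint.h>
#include <string.h>

/* Illustrative sketch of the word-by-word comparison, not the actual code. */
static int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const uint8_t *p1 = s1, *p2 = s2;

	while (n >= 4) {
		uint32_t a, b;

		memcpy(&a, p1, 4);		/* unaligned-safe word loads */
		memcpy(&b, p2, 4);
#ifdef __LITTLE_ENDIAN__
		a = __builtin_bswap32(a);	/* mirrors lwbrx: the byte at the */
		b = __builtin_bswap32(b);	/* lowest address is most significant */
#endif
		if (a != b)
			return a < b ? -1 : 1;
		p1 += 4;
		p2 += 4;
		n -= 4;
	}
	while (n--) {				/* remaining 1 to 3 bytes */
		if (*p1 != *p2)
			return *p1 - *p2;
		p1++;
		p2++;
	}
	return 0;
}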
A small benchmark performed on an 8xx, comparing two chunks
of 512 bytes 100000 times, gives:
Before: 5852274 TB ticks
After:  1488638 TB ticks
This is almost 4 times faster.
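The benchmark harness itself is not part of the series; a minimal
kernel-side measurement loop along these lines would produce comparable
numbers (the buffer handling, loop count, and get_tb() usage here are
assumptions for illustration):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <asm/time.h>

/* Hypothetical measurement loop, not the harness actually used. */
static void memcmp_bench(void)
{
	u8 *a = kzalloc(512, GFP_KERNEL);
	u8 *b = kzalloc(512, GFP_KERNEL);
	u64 t0, ticks;
	int i, r = 0;

	if (!a || !b)
		goto out;

	t0 = get_tb();
	for (i = 0; i < 100000; i++)
		r |= memcmp(a, b, 512);	/* use the result so calls are not optimised out */
	ticks = get_tb() - t0;
	pr_info("memcmp x100000: %llu TB ticks (r=%d)\n",
		(unsigned long long)ticks, r);
out:
	kfree(a);
	kfree(b);
}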
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
arch/powerpc/lib/string_32.S | 47 +++++++++++++++++++++++++++++++++++++-------
1 file changed, 40 insertions(+), 7 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 40a576d56ac7..d83b7d996f61 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -18,16 +18,49 @@
_GLOBAL(memcmp)
cmpwi cr0, r5, 0
beq- 2f
- mtctr r5
- addi r6,r3,-1
- addi r4,r4,-1
-1: lbzu r3,1(r6)
- lbzu r0,1(r4)
- subf. r3,r0,r3
- bdnzt 2,1b
+ srawi. r7, r5, 2 /* Divide len by 4 */
+ mr r6, r3
+ beq- 3f
+ mtctr r7
+ li r7, 0
+1:
+#ifdef __LITTLE_ENDIAN__
+ lwbrx r3, r6, r7
+ lwbrx r0, r4, r7
+#else
+ lwzx r3, r6, r7
+ lwzx r0, r4, r7
+#endif
+ addi r7, r7, 4
+ cmplw cr0, r3, r0
+ bdnzt eq, 1b
+ bne 5f
+ andi. r5, r5, 3
+ li r3, 0
+ beqlr
+3: cmplwi cr1, r5, 2
+ blt- cr1, 4f
+#ifdef __LITTLE_ENDIAN__
+ lhbrx r3, r6, r7
+ lhbrx r0, r4, r7
+#else
+ lhzx r3, r6, r7
+ lhzx r0, r4, r7
+#endif
+ addi r7, r7, 2
+ subf. r3, r0, r3
+ beqlr cr1
+ bnelr
+4: lbzx r3, r6, r7
+ lbzx r0, r4, r7
+ subf. r3, r0, r3
blr
2: li r3,0
blr
+5: li r3, 1
+ bgtlr
+ li r3, -1
+ blr
EXPORT_SYMBOL(memcmp)
CACHELINE_BYTES = L1_CACHE_BYTES
--
2.13.3
* [PATCH v4 3/3] powerpc/lib: optimise PPC32 memcmp
From: Christophe Leroy @ 2018-05-23 7:47 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman, segher
Cc: linuxppc-dev, linux-kernel
At present, memcmp() compares two chunks of memory byte by byte.
This patch optimises the comparison by comparing them word by word.
A small benchmark performed on an 8xx, comparing two chunks
of 512 bytes 100000 times, gives:
Before: 5852274 TB ticks
After:  1488638 TB ticks
This is almost 4 times faster.
Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
Not resending the entire series.
v4: Dropped the special handling for a length of 0; it is now handled
through the small-length path.
arch/powerpc/lib/string_32.S | 48 +++++++++++++++++++++++++++++++++++---------
1 file changed, 38 insertions(+), 10 deletions(-)
diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
index 40a576d56ac7..542e6cecbcaf 100644
--- a/arch/powerpc/lib/string_32.S
+++ b/arch/powerpc/lib/string_32.S
@@ -16,17 +16,45 @@
.text
_GLOBAL(memcmp)
- cmpwi cr0, r5, 0
- beq- 2f
- mtctr r5
- addi r6,r3,-1
- addi r4,r4,-1
-1: lbzu r3,1(r6)
- lbzu r0,1(r4)
- subf. r3,r0,r3
- bdnzt 2,1b
+ srawi. r7, r5, 2 /* Divide len by 4 */
+ mr r6, r3
+ beq- 3f
+ mtctr r7
+ li r7, 0
+1:
+#ifdef __LITTLE_ENDIAN__
+ lwbrx r3, r6, r7
+ lwbrx r0, r4, r7
+#else
+ lwzx r3, r6, r7
+ lwzx r0, r4, r7
+#endif
+ addi r7, r7, 4
+ cmplw cr0, r3, r0
+ bdnzt eq, 1b
+ bne 5f
+3: andi. r3, r5, 3
+ beqlr
+ cmplwi cr1, r3, 2
+ blt- cr1, 4f
+#ifdef __LITTLE_ENDIAN__
+ lhbrx r3, r6, r7
+ lhbrx r0, r4, r7
+#else
+ lhzx r3, r6, r7
+ lhzx r0, r4, r7
+#endif
+ addi r7, r7, 2
+ subf. r3, r0, r3
+ beqlr cr1
+ bnelr
+4: lbzx r3, r6, r7
+ lbzx r0, r4, r7
+ subf. r3, r0, r3
blr
-2: li r3,0
+5: li r3, 1
+ bgtlr
+ li r3, -1
blr
EXPORT_SYMBOL(memcmp)
--
2.13.3
* Re: [PATCH v4 3/3] powerpc/lib: optimise PPC32 memcmp
From: Segher Boessenkool @ 2018-05-24 17:24 UTC (permalink / raw)
To: Christophe Leroy
Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
linuxppc-dev, linux-kernel
On Wed, May 23, 2018 at 09:47:32AM +0200, Christophe Leroy wrote:
> At present, memcmp() compares two chunks of memory byte by byte.
>
> This patch optimises the comparison by comparing them word by word.
>
> A small benchmark performed on an 8xx, comparing two chunks
> of 512 bytes 100000 times, gives:
>
> Before: 5852274 TB ticks
> After:  1488638 TB ticks
> diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
> index 40a576d56ac7..542e6cecbcaf 100644
> --- a/arch/powerpc/lib/string_32.S
> +++ b/arch/powerpc/lib/string_32.S
> @@ -16,17 +16,45 @@
> .text
>
> _GLOBAL(memcmp)
> - cmpwi cr0, r5, 0
> - beq- 2f
> - mtctr r5
> - addi r6,r3,-1
> - addi r4,r4,-1
> -1: lbzu r3,1(r6)
> - lbzu r0,1(r4)
> - subf. r3,r0,r3
> - bdnzt 2,1b
> + srawi. r7, r5, 2 /* Divide len by 4 */
> + mr r6, r3
> + beq- 3f
> + mtctr r7
> + li r7, 0
> +1:
> +#ifdef __LITTLE_ENDIAN__
> + lwbrx r3, r6, r7
> + lwbrx r0, r4, r7
> +#else
> + lwzx r3, r6, r7
> + lwzx r0, r4, r7
> +#endif
You don't test whether the pointers are word-aligned. Does that work?
Say, when a load is crossing a page boundary, or segment boundary.
Segher
* Re: [PATCH v4 3/3] powerpc/lib: optimise PPC32 memcmp
From: Christophe LEROY @ 2018-05-25 5:55 UTC (permalink / raw)
To: Segher Boessenkool
Cc: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
linuxppc-dev, linux-kernel
On 24/05/2018 at 19:24, Segher Boessenkool wrote:
> On Wed, May 23, 2018 at 09:47:32AM +0200, Christophe Leroy wrote:
>> At present, memcmp() compares two chunks of memory byte by byte.
>>
>> This patch optimises the comparison by comparing them word by word.
>>
>> A small benchmark performed on an 8xx, comparing two chunks
>> of 512 bytes 100000 times, gives:
>>
>> Before: 5852274 TB ticks
>> After:  1488638 TB ticks
>
>> diff --git a/arch/powerpc/lib/string_32.S b/arch/powerpc/lib/string_32.S
>> index 40a576d56ac7..542e6cecbcaf 100644
>> --- a/arch/powerpc/lib/string_32.S
>> +++ b/arch/powerpc/lib/string_32.S
>> @@ -16,17 +16,45 @@
>> .text
>>
>> _GLOBAL(memcmp)
>> - cmpwi cr0, r5, 0
>> - beq- 2f
>> - mtctr r5
>> - addi r6,r3,-1
>> - addi r4,r4,-1
>> -1: lbzu r3,1(r6)
>> - lbzu r0,1(r4)
>> - subf. r3,r0,r3
>> - bdnzt 2,1b
>> + srawi. r7, r5, 2 /* Divide len by 4 */
>> + mr r6, r3
>> + beq- 3f
>> + mtctr r7
>> + li r7, 0
>> +1:
>> +#ifdef __LITTLE_ENDIAN__
>> + lwbrx r3, r6, r7
>> + lwbrx r0, r4, r7
>> +#else
>> + lwzx r3, r6, r7
>> + lwzx r0, r4, r7
>> +#endif
>
> You don't test whether the pointers are word-aligned. Does that work?
copy_tofrom_user() word-aligns the store address but does not take care
of the load address, so I believe it works.
That said, I just read in the MPC885 Reference Manual that an unaligned
access generates an alignment exception when the processor runs in LE mode.
Referring to the discussion on the patch "powerpc/32be: use stmw/lmw for
registers save/restore in asm"
(https://patchwork.ozlabs.org/patch/899465/), I will drop the handling
for LE mode.
Christophe
> Say, when a load is crossing a page boundary, or segment boundary.
>
>
> Segher
>