linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
From: wei.guo.simon@gmail.com
To: linuxppc-dev@lists.ozlabs.org
Cc: Paul Mackerras <paulus@ozlabs.org>,
	Michael Ellerman <mpe@ellerman.id.au>,
	"Naveen N.  Rao" <naveen.n.rao@linux.vnet.ibm.com>,
	Simon Guo <wei.guo.simon@gmail.com>
Subject: [PATCH v1 2/3] powerpc: enhance memcmp() with VMX instruction for long bytes comparison
Date: Tue, 19 Sep 2017 18:03:58 +0800	[thread overview]
Message-ID: <1505815439-18720-3-git-send-email-wei.guo.simon@gmail.com> (raw)
In-Reply-To: <1505815439-18720-1-git-send-email-wei.guo.simon@gmail.com>

From: Simon Guo <wei.guo.simon@gmail.com>

This patch adds VMX primitives to memcmp() for the case where the compare
size exceeds 4K bytes.

Test results with the following test program:
------
tools/testing/selftests/powerpc/stringloops# cat memcmp.c

int test_memcmp(const void *s1, const void *s2, size_t n);

/*
 * Benchmark body: allocate two SIZE-byte buffers, fill them with identical
 * contents and call test_memcmp() on them ITERATIONS times.
 *
 * SIZE and ITERATIONS are not defined in this excerpt.
 *
 * Returns 0. Exits the process with status 1 if either allocation fails.
 * The buffers are not freed before returning.
 */
static int testcase(void)
{
	char *s1;
	char *s2;
	unsigned long i;

	/* Both buffers are requested with 128-byte alignment. */
	s1 = memalign(128, SIZE);
	if (!s1) {
		perror("memalign");
		exit(1);
	}

	s2 = memalign(128, SIZE);
	if (!s2) {
		perror("memalign");
		exit(1);
	}

	/*
	 * Write the same pattern (low 8 bits of the index) into both
	 * buffers so that they compare equal over all SIZE bytes.
	 */
	for (i = 0; i < SIZE; i++)  {
		s1[i] = i & 0xff;
		s2[i] = i & 0xff;
	}
	/* Timed work: the comparison result itself is discarded. */
	for (i = 0; i < ITERATIONS; i++)
		test_memcmp(s1, s2, SIZE);

	return 0;
}

/*
 * Program entry point: hand testcase() to test_harness() under the name
 * "memcmp" and return whatever the harness returns.
 */
int main(void)
{
	return test_harness(testcase, "memcmp");
}

------
Without VMX patch:
       5.085776331 seconds time elapsed                                          ( +-  0.28% )
With VMX patch:
       4.584002052 seconds time elapsed                                          ( +-  0.02% )

		There is a ~10% improvement.

However, I am not yet aware of any use case in the kernel for memcmp() on
such large sizes.

Signed-off-by: Simon Guo <wei.guo.simon@gmail.com>
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/lib/copypage_power7.S        |  2 +-
 arch/powerpc/lib/memcmp_64.S              | 79 +++++++++++++++++++++++++++++++
 arch/powerpc/lib/memcpy_power7.S          |  2 +-
 arch/powerpc/lib/vmx-helper.c             |  2 +-
 5 files changed, 83 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index 7330150..e6530d8 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -49,7 +49,7 @@ void __trace_hcall_exit(long opcode, unsigned long retval,
 /* VMX copying */
 int enter_vmx_usercopy(void);
 int exit_vmx_usercopy(void);
-int enter_vmx_copy(void);
+int enter_vmx_ops(void);
 void * exit_vmx_copy(void *dest);
 
 /* Traps */
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
index ca5fc8f..9e7729e 100644
--- a/arch/powerpc/lib/copypage_power7.S
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -60,7 +60,7 @@ _GLOBAL(copypage_power7)
 	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index 6dbafdb..b86a1d3 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -153,6 +153,13 @@ _GLOBAL(memcmp)
 	blr
 
 .Llong:
+#ifdef CONFIG_ALTIVEC
+	/* Try to use vmx loop if length is larger than 4K */
+	cmpldi  cr6,r5,4096
+	bgt	cr6,.Lvmx_cmp
+
+.Llong_novmx_cmp:
+#endif
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -310,4 +317,76 @@ _GLOBAL(memcmp)
 8:
 	blr
 
+#ifdef CONFIG_ALTIVEC
+.Lvmx_cmp:
+	mflr    r0
+	std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+	std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
+	std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
+	std     r0,16(r1)
+	stdu    r1,-STACKFRAMESIZE(r1)
+	bl      enter_vmx_ops
+	cmpwi   cr1,r3,0
+	ld      r0,STACKFRAMESIZE+16(r1)
+	ld      r3,STK_REG(R31)(r1)
+	ld      r4,STK_REG(R30)(r1)
+	ld      r5,STK_REG(R29)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	mtlr    r0
+	beq     cr1,.Llong_novmx_cmp
+
+3:
+	/* Enter with src/dst address 8 bytes aligned, and len is
+	 * no less than 4KB. Need to align with 16 bytes further.
+	 */
+	andi.	rA,r3,8
+	beq	4f
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+
+	addi	r3,r3,8
+	addi	r4,r4,8
+
+4:
+	/* compare 32 bytes for each loop */
+	srdi	r0,r5,5
+	mtctr	r0
+	andi.	r5,r5,31
+	li	off16,16
+5:
+	lvx 	v0,0,r3
+	lvx 	v1,0,r4
+	vcmpequd. v0,v0,v1
+	bf	24,7f
+	lvx 	v0,off16,r3
+	lvx 	v1,off16,r4
+	vcmpequd. v0,v0,v1
+	bf	24,6f
+	addi	r3,r3,32
+	addi	r4,r4,32
+	bdnz	5b
+
+	cmpdi	r5,0
+	beq	.Lzero
+	b	.Lshort
+
+6:
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+7:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+
+	li	off8,8
+	LD	rA,off8,r3
+	LD	rB,off8,r4
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+#endif
 EXPORT_SYMBOL(memcmp)
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index 193909a..682e386 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -230,7 +230,7 @@ _GLOBAL(memcpy_power7)
 	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	enter_vmx_copy
+	bl	enter_vmx_ops
 	cmpwi	cr1,r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STK_REG(R31)(r1)
diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c
index bf925cd..923a9ab 100644
--- a/arch/powerpc/lib/vmx-helper.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -53,7 +53,7 @@ int exit_vmx_usercopy(void)
 	return 0;
 }
 
-int enter_vmx_copy(void)
+int enter_vmx_ops(void)
 {
 	if (in_interrupt())
 		return 0;
-- 
1.8.3.1

  parent reply	other threads:[~2017-09-19 10:04 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-09-19 10:03 [PATCH v1 0/3] powerpc: memcmp() optimization wei.guo.simon
2017-09-19 10:03 ` [PATCH v1 1/3] powerpc: Align bytes before fall back to .Lshort in powerpc memcmp wei.guo.simon
2017-09-19 10:12   ` David Laight
2017-09-20  9:56     ` Simon Guo
2017-09-20 10:05       ` David Laight
2017-09-19 12:20   ` Christophe LEROY
2017-09-19 10:03 ` wei.guo.simon [this message]
2017-09-19 10:03 ` [PATCH v1 3/3] powerpc:selftest update memcmp selftest according to kernel change wei.guo.simon
2017-09-19 12:21 ` [PATCH v1 0/3] powerpc: memcmp() optimization Christophe LEROY
2017-09-20  9:57   ` Simon Guo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1505815439-18720-3-git-send-email-wei.guo.simon@gmail.com \
    --to=wei.guo.simon@gmail.com \
    --cc=linuxppc-dev@lists.ozlabs.org \
    --cc=mpe@ellerman.id.au \
    --cc=naveen.n.rao@linux.vnet.ibm.com \
    --cc=paulus@ozlabs.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).