All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] MIPS: lib: csum_partial: more instruction paral
@ 2014-05-15  7:09 chenj
  2014-05-15  7:09 ` [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3 chenj
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: chenj @ 2014-05-15  7:09 UTC (permalink / raw)
  To: linux-mips; +Cc: chenj

This will bring at most 50% performance gain on loongson3a.
See
http://dev.lemote.com/files/upload/software/csum-opti/csum-opti-benchmark.html

The benchmark is done in userspace through
http://dev.lemote.com/files/upload/software/csum-opti/csum-test.tar.gz
---
 arch/mips/lib/csum_partial.S | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 9901237..6cea101 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -76,10 +76,10 @@
 	LOAD	_t1, (offset + UNIT(1))(src);			\
 	LOAD	_t2, (offset + UNIT(2))(src);			\
 	LOAD	_t3, (offset + UNIT(3))(src);			\
+	ADDC(_t0, _t1);						\
+	ADDC(_t2, _t3);						\
 	ADDC(sum, _t0);						\
-	ADDC(sum, _t1);						\
-	ADDC(sum, _t2);						\
-	ADDC(sum, _t3)
+	ADDC(sum, _t2)
 
 #ifdef USE_DOUBLE
 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
@@ -501,21 +501,21 @@ LEAF(csum_partial)
 	SUB	len, len, 8*NBYTES
 	ADD	src, src, 8*NBYTES
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	STORE(t4, UNIT(4)(dst),	.Ls_exc\@)
-	ADDC(sum, t4)
+	ADDC(t4, t5)
 	STORE(t5, UNIT(5)(dst),	.Ls_exc\@)
-	ADDC(sum, t5)
+	ADDC(sum, t4)
 	STORE(t6, UNIT(6)(dst),	.Ls_exc\@)
-	ADDC(sum, t6)
+	ADDC(t6, t7)
 	STORE(t7, UNIT(7)(dst),	.Ls_exc\@)
-	ADDC(sum, t7)
+	ADDC(sum, t6)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 8*NBYTES
 	bgez	len, 1b
@@ -541,13 +541,13 @@ LEAF(csum_partial)
 	SUB	len, len, 4*NBYTES
 	ADD	src, src, 4*NBYTES
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
 	beqz	len, .Ldone\@
@@ -646,13 +646,13 @@ LEAF(csum_partial)
 	nop				# improves slotting
 #endif
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-15  7:09 [PATCH 1/2] MIPS: lib: csum_partial: more instruction paral chenj
@ 2014-05-15  7:09 ` chenj
  2014-05-15 11:40     ` Paul Burton
                     ` (2 more replies)
  2014-05-15  8:20   ` Markos Chandras
  2014-05-19  3:14 ` [PATCH, v2] " chenj
  2 siblings, 3 replies; 24+ messages in thread
From: chenj @ 2014-05-15  7:09 UTC (permalink / raw)
  To: linux-mips; +Cc: chenj

wsbh & movn are available on loongson3 CPU.
---
 arch/mips/lib/csum_partial.S | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 6cea101..ed88647 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -277,9 +277,12 @@ LEAF(csum_partial)
 #endif
 
 	/* odd buffer alignment? */
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
+	.set	push
+	.set	arch=mips32r2
 	wsbh	v1, sum
 	movn	sum, v1, t7
+	.set	pop
 #else
 	beqz	t7, 1f			/* odd buffer alignment? */
 	 lui	v1, 0x00ff
@@ -726,9 +729,12 @@ LEAF(csum_partial)
 	addu	sum, v1
 #endif
 
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
+	.set	push
+	.set	arch=mips32r2
 	wsbh	v1, sum
 	movn	sum, v1, odd
+	.set	pop
 #else
 	beqz	odd, 1f			/* odd buffer alignment? */
 	 lui	v1, 0x00ff
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/2] MIPS: lib: csum_partial: more instruction paral
@ 2014-05-15  8:20   ` Markos Chandras
  0 siblings, 0 replies; 24+ messages in thread
From: Markos Chandras @ 2014-05-15  8:20 UTC (permalink / raw)
  To: linux-mips

On 05/15/2014 08:09 AM, chenj wrote:
> This will bring at most 50% performance gain on loongson3a.
> See
> http://dev.lemote.com/files/upload/software/csum-opti/csum-opti-benchmark.html
> 
> The benchmark is done in userspace through
> http://dev.lemote.com/files/upload/software/csum-opti/csum-test.tar.gz
> ---
>  arch/mips/lib/csum_partial.S | 38 +++++++++++++++++++-------------------
>  1 file changed, 19 insertions(+), 19 deletions(-)

Hi chenj,

My opinion is that the commit message is not ideal. If the http links
ever go away, the commit message will be mostly useless for someone
trying to understand the reason for this patch. I would suggest to
explain the reason for the optimizations in the commit message and put
the http links as a comment to this patch which will still be visible in
the mailing list archives. Or you can keep them in the commit message
but I think the reason for the patch should be explained as well.

-- 
markos

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/2] MIPS: lib: csum_partial: more instruction paral
@ 2014-05-15  8:20   ` Markos Chandras
  0 siblings, 0 replies; 24+ messages in thread
From: Markos Chandras @ 2014-05-15  8:20 UTC (permalink / raw)
  To: linux-mips

On 05/15/2014 08:09 AM, chenj wrote:
> This will bring at most 50% performance gain on loongson3a.
> See
> http://dev.lemote.com/files/upload/software/csum-opti/csum-opti-benchmark.html
> 
> The benchmark is done in userspace through
> http://dev.lemote.com/files/upload/software/csum-opti/csum-test.tar.gz
> ---
>  arch/mips/lib/csum_partial.S | 38 +++++++++++++++++++-------------------
>  1 file changed, 19 insertions(+), 19 deletions(-)

Hi chenj,

My opinion is that the commit message is not ideal. If the http links
ever go away, the commit message will be mostly useless for someone
trying to understand the reason for this patch. I would suggest to
explain the reason for the optimizations in the commit message and put
the http links as a comment to this patch which will still be visible in
the mailing list archives. Or you can keep them in the commit message
but I think the reason for the patch should be explained as well.

-- 
markos

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
@ 2014-05-15 11:40     ` Paul Burton
  0 siblings, 0 replies; 24+ messages in thread
From: Paul Burton @ 2014-05-15 11:40 UTC (permalink / raw)
  To: chenj; +Cc: linux-mips

[-- Attachment #1: Type: text/plain, Size: 1217 bytes --]

On Thu, May 15, 2014 at 03:09:03PM +0800, chenj wrote:
> wsbh & movn are available on loongson3 CPU.
> ---
>  arch/mips/lib/csum_partial.S | 10 ++++++++--
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
> index 6cea101..ed88647 100644
> --- a/arch/mips/lib/csum_partial.S
> +++ b/arch/mips/lib/csum_partial.S
> @@ -277,9 +277,12 @@ LEAF(csum_partial)
>  #endif
>  
>  	/* odd buffer alignment? */
> -#ifdef CONFIG_CPU_MIPSR2
> +#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)

Is there some reason CPU_LOONGSON3 can't select CPU_MIPSR2?

Thanks,
    Paul

> +	.set	push
> +	.set	arch=mips32r2
>  	wsbh	v1, sum
>  	movn	sum, v1, t7
> +	.set	pop
>  #else
>  	beqz	t7, 1f			/* odd buffer alignment? */
>  	 lui	v1, 0x00ff
> @@ -726,9 +729,12 @@ LEAF(csum_partial)
>  	addu	sum, v1
>  #endif
>  
> -#ifdef CONFIG_CPU_MIPSR2
> +#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
> +	.set	push
> +	.set	arch=mips32r2
>  	wsbh	v1, sum
>  	movn	sum, v1, odd
> +	.set	pop
>  #else
>  	beqz	odd, 1f			/* odd buffer alignment? */
>  	 lui	v1, 0x00ff
> -- 
> 1.9.0
> 
> 

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
@ 2014-05-15 11:40     ` Paul Burton
  0 siblings, 0 replies; 24+ messages in thread
From: Paul Burton @ 2014-05-15 11:40 UTC (permalink / raw)
  To: chenj; +Cc: linux-mips

[-- Attachment #1: Type: text/plain, Size: 1217 bytes --]

On Thu, May 15, 2014 at 03:09:03PM +0800, chenj wrote:
> wsbh & movn are available on loongson3 CPU.
> ---
>  arch/mips/lib/csum_partial.S | 10 ++++++++--
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
> index 6cea101..ed88647 100644
> --- a/arch/mips/lib/csum_partial.S
> +++ b/arch/mips/lib/csum_partial.S
> @@ -277,9 +277,12 @@ LEAF(csum_partial)
>  #endif
>  
>  	/* odd buffer alignment? */
> -#ifdef CONFIG_CPU_MIPSR2
> +#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)

Is there some reason CPU_LOONGSON3 can't select CPU_MIPSR2?

Thanks,
    Paul

> +	.set	push
> +	.set	arch=mips32r2
>  	wsbh	v1, sum
>  	movn	sum, v1, t7
> +	.set	pop
>  #else
>  	beqz	t7, 1f			/* odd buffer alignment? */
>  	 lui	v1, 0x00ff
> @@ -726,9 +729,12 @@ LEAF(csum_partial)
>  	addu	sum, v1
>  #endif
>  
> -#ifdef CONFIG_CPU_MIPSR2
> +#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
> +	.set	push
> +	.set	arch=mips32r2
>  	wsbh	v1, sum
>  	movn	sum, v1, odd
> +	.set	pop
>  #else
>  	beqz	odd, 1f			/* odd buffer alignment? */
>  	 lui	v1, 0x00ff
> -- 
> 1.9.0
> 
> 

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-15 11:40     ` Paul Burton
  (?)
@ 2014-05-16 13:29     ` Chen Jie
  2014-05-16 15:21       ` Paul Burton
  -1 siblings, 1 reply; 24+ messages in thread
From: Chen Jie @ 2014-05-16 13:29 UTC (permalink / raw)
  To: Paul Burton; +Cc: linux-mips, 陈华才, 王锐

2014-05-15 19:40 GMT+08:00 Paul Burton <paul.burton@imgtec.com>:
> On Thu, May 15, 2014 at 03:09:03PM +0800, chenj wrote:
>> wsbh & movn are available on loongson3 CPU.
>> ---
>>  arch/mips/lib/csum_partial.S | 10 ++++++++--
>>  1 file changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
>> index 6cea101..ed88647 100644
>> --- a/arch/mips/lib/csum_partial.S
>> +++ b/arch/mips/lib/csum_partial.S
>> @@ -277,9 +277,12 @@ LEAF(csum_partial)
>>  #endif
>>
>>       /* odd buffer alignment? */
>> -#ifdef CONFIG_CPU_MIPSR2
>> +#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
>
> Is there some reason CPU_LOONGSON3 can't select CPU_MIPSR2?

Loongson 3a is not fully mips32r2 compatible, but Loongson 3b is.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-16 13:29     ` Chen Jie
@ 2014-05-16 15:21       ` Paul Burton
  0 siblings, 0 replies; 24+ messages in thread
From: Paul Burton @ 2014-05-16 15:21 UTC (permalink / raw)
  To: Chen Jie; +Cc: linux-mips, 陈华才, 王锐

[-- Attachment #1: Type: text/plain, Size: 870 bytes --]

On Fri, May 16, 2014 at 09:29:39PM +0800, Chen Jie wrote:
> >> -#ifdef CONFIG_CPU_MIPSR2
> >> +#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
> >
> > Is there some reason CPU_LOONGSON3 can't select CPU_MIPSR2?
> 
> Loongson 3a is not fully mips32r2 compatible, but Loongson 3b is.

Interesting, I was led to believe that the 3A was MIPS64r2 compliant
(and thus by nature MIPS32r2 compliant) with the one difference with
other implementations being how it handles Status.FR=0 (appearing as
16x64b registers rather than the 32x32b registers of other CPUs). Are
there any other ways in which Loongson 3A is not MIPS64r2/MIPS32r2
compliant?

If not then wouldn't it make more sense to select CPU_MIPSR2 & then
special case any FPU differences? That would get you all the R2 bits of
the kernel with minimal #ifdef-ery.

Thanks,
    Paul

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
@ 2014-05-18  8:23 chenj
  2014-05-18 14:39 ` Huacai Chen
  0 siblings, 1 reply; 24+ messages in thread
From: chenj @ 2014-05-18  8:23 UTC (permalink / raw)
  To: linux-mips, paul.burton; +Cc: chenhc

[PATCH] MIPS: Loongson 3: Select CPU_MIPS64_R2

To chenhc: Please review this.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-18  8:23 [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3 chenj
@ 2014-05-18 14:39 ` Huacai Chen
  2014-05-19 10:02   ` Paul Burton
  0 siblings, 1 reply; 24+ messages in thread
From: Huacai Chen @ 2014-05-18 14:39 UTC (permalink / raw)
  To: chenj; +Cc: Linux MIPS Mailing List, Paul Burton

Due to Wang Rui's tests, Loongson-3's EI/DI instructions don't have
correct behaviors, its Status.FR is also different with MIPS64R2. So,
I don't want to select CPU_MIPS64_R2.

On Sun, May 18, 2014 at 4:23 PM, chenj <chenj@lemote.com> wrote:
> [PATCH] MIPS: Loongson 3: Select CPU_MIPS64_R2
>
> To chenhc: Please review this.
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH, v2] MIPS: lib: csum_partial: more instruction paral
  2014-05-15  7:09 [PATCH 1/2] MIPS: lib: csum_partial: more instruction paral chenj
  2014-05-15  7:09 ` [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3 chenj
  2014-05-15  8:20   ` Markos Chandras
@ 2014-05-19  3:14 ` chenj
  2014-05-19  6:59   ` James Hogan
  2 siblings, 1 reply; 24+ messages in thread
From: chenj @ 2014-05-19  3:14 UTC (permalink / raw)
  To: markos.chandras; +Cc: linux-mips, chenhc, chenj

Computing sum introduces true data dependency, e.g.
	ADDC(sum, t0)
	ADDC(sum, t1)
	ADDC(sum, t2)
	ADDC(sum, t3)
Here, each ADDC(sum, ...) references the sum value updated by previous ADDC.

In this patch, above sequence is adjusted as following:
	ADDC(t0, t1)
	ADDC(t2, t3)
	ADDC(sum, t0)
	ADDC(sum, t2)
The first two ADDC operations are independent, hence can be executed
simultaneously if possible.

This patch improves instruction level parallelism, and brings at most 50%
csum performance gain on Loongson 3a processor[1].

---
1. The result can be found at
http://dev.lemote.com/files/upload/software/csum-opti/csum-opti-benchmark.html
And is generated by a userspace test program:
http://dev.lemote.com/files/upload/software/csum-opti/csum-test.tar.gz

[v2: amend commit message]

 arch/mips/lib/csum_partial.S | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 9901237..6cea101 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -76,10 +76,10 @@
 	LOAD	_t1, (offset + UNIT(1))(src);			\
 	LOAD	_t2, (offset + UNIT(2))(src);			\
 	LOAD	_t3, (offset + UNIT(3))(src);			\
+	ADDC(_t0, _t1);						\
+	ADDC(_t2, _t3);						\
 	ADDC(sum, _t0);						\
-	ADDC(sum, _t1);						\
-	ADDC(sum, _t2);						\
-	ADDC(sum, _t3)
+	ADDC(sum, _t2)
 
 #ifdef USE_DOUBLE
 #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
@@ -501,21 +501,21 @@ LEAF(csum_partial)
 	SUB	len, len, 8*NBYTES
 	ADD	src, src, 8*NBYTES
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	STORE(t4, UNIT(4)(dst),	.Ls_exc\@)
-	ADDC(sum, t4)
+	ADDC(t4, t5)
 	STORE(t5, UNIT(5)(dst),	.Ls_exc\@)
-	ADDC(sum, t5)
+	ADDC(sum, t4)
 	STORE(t6, UNIT(6)(dst),	.Ls_exc\@)
-	ADDC(sum, t6)
+	ADDC(t6, t7)
 	STORE(t7, UNIT(7)(dst),	.Ls_exc\@)
-	ADDC(sum, t7)
+	ADDC(sum, t6)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 8*NBYTES
 	bgez	len, 1b
@@ -541,13 +541,13 @@ LEAF(csum_partial)
 	SUB	len, len, 4*NBYTES
 	ADD	src, src, 4*NBYTES
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
 	beqz	len, .Ldone\@
@@ -646,13 +646,13 @@ LEAF(csum_partial)
 	nop				# improves slotting
 #endif
 	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
-	ADDC(sum, t0)
+	ADDC(t0, t1)
 	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
-	ADDC(sum, t1)
+	ADDC(sum, t0)
 	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
-	ADDC(sum, t2)
+	ADDC(t2, t3)
 	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
-	ADDC(sum, t3)
+	ADDC(sum, t2)
 	.set	reorder				/* DADDI_WAR */
 	ADD	dst, dst, 4*NBYTES
 	bne	len, rem, 1b
-- 
1.9.0

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH, v2] MIPS: lib: csum_partial: more instruction paral
  2014-05-19  3:14 ` [PATCH, v2] " chenj
@ 2014-05-19  6:59   ` James Hogan
  2014-05-19 15:32     ` Chen Jie
  0 siblings, 1 reply; 24+ messages in thread
From: James Hogan @ 2014-05-19  6:59 UTC (permalink / raw)
  To: linux-mips; +Cc: chenj, markos.chandras, chenhc

[-- Attachment #1: Type: text/plain, Size: 3679 bytes --]

On Monday 19 May 2014 11:14:07 chenj wrote:
> Computing sum introduces true data dependency, e.g.
> 	ADDC(sum, t0)
> 	ADDC(sum, t1)
> 	ADDC(sum, t2)
> 	ADDC(sum, t3)
> Here, each ADDC(sum, ...) references the sum value updated by previous ADDC.
> 
> In this patch, above sequence is adjusted as following:
> 	ADDC(t0, t1)
> 	ADDC(t2, t3)
> 	ADDC(sum, t0)
> 	ADDC(sum, t2)
> The first two ADDC operations are independent, hence can be executed
> simultaneously if possible.

The actual patch appears to change it to this:
ADDC(t0, t1)
ADDC(sum, t0)
ADDC(t2, t3)
ADDC(sum, t2)

which is slightly different (presumably due to the interleaved stores in some 
of the cases).

> This patch improves instruction level parallelism, and brings at most 50%
> csum performance gain on Loongson 3a processor[1].

Nice results.

The stuff below the --- will get dropped when the patch is applied though, 
after which the "[1]" won't refer to anything.

Cheers
James

> 
> ---
> 1. The result can be found at
> http://dev.lemote.com/files/upload/software/csum-opti/csum-opti-benchmark.ht
> ml And is generated by a userspace test program:
> http://dev.lemote.com/files/upload/software/csum-opti/csum-test.tar.gz
> 
> [v2: amend commit message]
> 
>  arch/mips/lib/csum_partial.S | 38 +++++++++++++++++++-------------------
>  1 file changed, 19 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
> index 9901237..6cea101 100644
> --- a/arch/mips/lib/csum_partial.S
> +++ b/arch/mips/lib/csum_partial.S
> @@ -76,10 +76,10 @@
>  	LOAD	_t1, (offset + UNIT(1))(src);			\
>  	LOAD	_t2, (offset + UNIT(2))(src);			\
>  	LOAD	_t3, (offset + UNIT(3))(src);			\
> +	ADDC(_t0, _t1);						\
> +	ADDC(_t2, _t3);						\
>  	ADDC(sum, _t0);						\
> -	ADDC(sum, _t1);						\
> -	ADDC(sum, _t2);						\
> -	ADDC(sum, _t3)
> +	ADDC(sum, _t2)
> 
>  #ifdef USE_DOUBLE
>  #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3)	\
> @@ -501,21 +501,21 @@ LEAF(csum_partial)
>  	SUB	len, len, 8*NBYTES
>  	ADD	src, src, 8*NBYTES
>  	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
> -	ADDC(sum, t0)
> +	ADDC(t0, t1)
>  	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
> -	ADDC(sum, t1)
> +	ADDC(sum, t0)
>  	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
> -	ADDC(sum, t2)
> +	ADDC(t2, t3)
>  	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
> -	ADDC(sum, t3)
> +	ADDC(sum, t2)
>  	STORE(t4, UNIT(4)(dst),	.Ls_exc\@)
> -	ADDC(sum, t4)
> +	ADDC(t4, t5)
>  	STORE(t5, UNIT(5)(dst),	.Ls_exc\@)
> -	ADDC(sum, t5)
> +	ADDC(sum, t4)
>  	STORE(t6, UNIT(6)(dst),	.Ls_exc\@)
> -	ADDC(sum, t6)
> +	ADDC(t6, t7)
>  	STORE(t7, UNIT(7)(dst),	.Ls_exc\@)
> -	ADDC(sum, t7)
> +	ADDC(sum, t6)
>  	.set	reorder				/* DADDI_WAR */
>  	ADD	dst, dst, 8*NBYTES
>  	bgez	len, 1b
> @@ -541,13 +541,13 @@ LEAF(csum_partial)
>  	SUB	len, len, 4*NBYTES
>  	ADD	src, src, 4*NBYTES
>  	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
> -	ADDC(sum, t0)
> +	ADDC(t0, t1)
>  	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
> -	ADDC(sum, t1)
> +	ADDC(sum, t0)
>  	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
> -	ADDC(sum, t2)
> +	ADDC(t2, t3)
>  	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
> -	ADDC(sum, t3)
> +	ADDC(sum, t2)
>  	.set	reorder				/* DADDI_WAR */
>  	ADD	dst, dst, 4*NBYTES
>  	beqz	len, .Ldone\@
> @@ -646,13 +646,13 @@ LEAF(csum_partial)
>  	nop				# improves slotting
>  #endif
>  	STORE(t0, UNIT(0)(dst),	.Ls_exc\@)
> -	ADDC(sum, t0)
> +	ADDC(t0, t1)
>  	STORE(t1, UNIT(1)(dst),	.Ls_exc\@)
> -	ADDC(sum, t1)
> +	ADDC(sum, t0)
>  	STORE(t2, UNIT(2)(dst),	.Ls_exc\@)
> -	ADDC(sum, t2)
> +	ADDC(t2, t3)
>  	STORE(t3, UNIT(3)(dst),	.Ls_exc\@)
> -	ADDC(sum, t3)
> +	ADDC(sum, t2)
>  	.set	reorder				/* DADDI_WAR */
>  	ADD	dst, dst, 4*NBYTES
>  	bne	len, rem, 1b

[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-18 14:39 ` Huacai Chen
@ 2014-05-19 10:02   ` Paul Burton
  2014-05-19 15:15     ` Chen Jie
  0 siblings, 1 reply; 24+ messages in thread
From: Paul Burton @ 2014-05-19 10:02 UTC (permalink / raw)
  To: Huacai Chen; +Cc: chenj, Linux MIPS Mailing List

[-- Attachment #1: Type: text/plain, Size: 683 bytes --]

On Sun, May 18, 2014 at 10:39:20PM +0800, Huacai Chen wrote:
> Due to Wang Rui's tests, Loongson-3's EI/DI instructions don't have
> correct behaviors, its Status.FR is also different with MIPS64R2. So,
> I don't want to select CPU_MIPS64_R2.

Out of curiosity, how do ei & di misbehave? There are still a number of
other areas in the kernel which special case >= R2, so if the 3A isn't
too incompatible it may still make sense to select R2 & special case
those incompatibilities.

Thanks,
    Paul

> 
> On Sun, May 18, 2014 at 4:23 PM, chenj <chenj@lemote.com> wrote:
> > [PATCH] MIPS: Loongson 3: Select CPU_MIPS64_R2
> >
> > To chenhc: Please review this.
> >

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-19 10:02   ` Paul Burton
@ 2014-05-19 15:15     ` Chen Jie
  2014-05-19 17:06       ` Ralf Baechle
  0 siblings, 1 reply; 24+ messages in thread
From: Chen Jie @ 2014-05-19 15:15 UTC (permalink / raw)
  To: Paul Burton; +Cc: Huacai Chen, Linux MIPS Mailing List, 王锐

2014-05-19 18:02 GMT+08:00 Paul Burton <paul.burton@imgtec.com>:
> On Sun, May 18, 2014 at 10:39:20PM +0800, Huacai Chen wrote:
>> Due to Wang Rui's tests, Loongson-3's EI/DI instructions don't have
>> correct behaviors, its Status.FR is also different with MIPS64R2. So,
>> I don't want to select CPU_MIPS64_R2.
>
> Out of curiosity, how do ei & di misbehave?
In our test, it may cause machine stall if use ei&di in kernel,
especially in smp case.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH, v2] MIPS: lib: csum_partial: more instruction paral
  2014-05-19  6:59   ` James Hogan
@ 2014-05-19 15:32     ` Chen Jie
  2014-08-15 20:15       ` Chen Jie
  0 siblings, 1 reply; 24+ messages in thread
From: Chen Jie @ 2014-05-19 15:32 UTC (permalink / raw)
  To: James Hogan; +Cc: linux-mips, markos.chandras, 陈华才

2014-05-19 14:59 GMT+08:00 James Hogan <james.hogan@imgtec.com>:
> On Monday 19 May 2014 11:14:07 chenj wrote:
>> Computing sum introduces true data dependency, e.g.
>>       ADDC(sum, t0)
>>       ADDC(sum, t1)
>>       ADDC(sum, t2)
>>       ADDC(sum, t3)
>> Here, each ADDC(sum, ...) references the sum value updated by previous ADDC.
>>
>> In this patch, above sequence is adjusted as following:
>>       ADDC(t0, t1)
>>       ADDC(t2, t3)
>>       ADDC(sum, t0)
>>       ADDC(sum, t2)
>> The first two ADDC operations are independent, hence can be executed
>> simultaneously if possible.
>
> The actual patch appears to change it to this:
> ADDC(t0, t1)
> ADDC(sum, t0)
> ADDC(t2, t3)
> ADDC(sum, t2)
>
> which is slightly different (presumably due to the interleaved stores in some
> of the cases).
>
>> This patch improves instruction level parallelism, and brings at most 50%
>> csum performance gain on Loongson 3a processor[1].
>
> Nice results.
>
> The stuff below the --- will get dropped when the patch is applied though,
> after which the "[1]" won't refer to anything.
>
Thanks for your suggestion, I'll amend the commit message further later.

Basically, the patch reduces the case of one ADDC depending on the
result of the previous ADDC.

BTW, I'm not sure whether the sum value of the new implementation is
equivalent to the original one, but in my test(make run_test of the
csum_test.tar.gz, and a comparing patch in kernel) it is.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/2] MIPS: lib: csum_partial: more instruction paral
  2014-05-15  8:20   ` Markos Chandras
  (?)
@ 2014-05-19 16:36   ` Ralf Baechle
  -1 siblings, 0 replies; 24+ messages in thread
From: Ralf Baechle @ 2014-05-19 16:36 UTC (permalink / raw)
  To: Markos Chandras; +Cc: linux-mips

On Thu, May 15, 2014 at 09:20:39AM +0100, Markos Chandras wrote:

> On 05/15/2014 08:09 AM, chenj wrote:
> > This will bring at most 50% performance gain on loongson3a.
> > See
> > http://dev.lemote.com/files/upload/software/csum-opti/csum-opti-benchmark.html
> > 
> > The benchmark is done in userspace through
> > http://dev.lemote.com/files/upload/software/csum-opti/csum-test.tar.gz
> > ---
> >  arch/mips/lib/csum_partial.S | 38 +++++++++++++++++++-------------------
> >  1 file changed, 19 insertions(+), 19 deletions(-)
> 
> Hi chenj,
> 
> My opinion is that the commit message is not ideal. If the http links
> ever go away, the commit message will be mostly useless for someone
> trying to understand the reason for this patch. I would suggest to
> explain the reason for the optimizations in the commit message and put
> the http links as a comment to this patch which will still be visible in
> the mailing list archives. Or you can keep them in the commit message
> but I think the reason for the patch should be explained as well.

I can certainly offer a FTP space for such things.  Or alternatively,
post the archive's content as a shar archive (GNU sharutils) which will
end up in the mailing list archives itself and which provides the best
guarantee for a stable URL.

That said, tinkering with the checksum code shouldn't be done lightly.
The results on some CPUs are quite surprising and the current
impleemntation which is working well on R4000 or Sibyte class cores
has worked reasonably well for most more modern cores and some of the
attempted tweaks have benefited a few but hurt many cores.

But time progresses and we optimize the kernel for modern hardware so
I'm open to reevaluate things, including runtime generation.

  Ralf

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-19 15:15     ` Chen Jie
@ 2014-05-19 17:06       ` Ralf Baechle
  2014-05-20 12:33         ` cee1
  0 siblings, 1 reply; 24+ messages in thread
From: Ralf Baechle @ 2014-05-19 17:06 UTC (permalink / raw)
  To: Chen Jie
  Cc: Paul Burton, Huacai Chen, Linux MIPS Mailing List,
	王锐

On Mon, May 19, 2014 at 11:15:15PM +0800, Chen Jie wrote:

> 2014-05-19 18:02 GMT+08:00 Paul Burton <paul.burton@imgtec.com>:
> > On Sun, May 18, 2014 at 10:39:20PM +0800, Huacai Chen wrote:
> >> Due to Wang Rui's tests, Loongson-3's EI/DI instructions don't have
> >> correct behaviors, its Status.FR is also different with MIPS64R2. So,
> >> I don't want to select CPU_MIPS64_R2.
> >
> > Out of curiosity, how do ei & di misbehave?
> In our test, it may cause machine stall if use ei&di in kernel,
> especially in smp case.

In that case we could make the use of DI/EI depend on a new errata flag
in war.h or similar.

  Ralf

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-19 17:06       ` Ralf Baechle
@ 2014-05-20 12:33         ` cee1
  2014-05-24  1:33           ` Huacai Chen
  0 siblings, 1 reply; 24+ messages in thread
From: cee1 @ 2014-05-20 12:33 UTC (permalink / raw)
  To: Ralf Baechle
  Cc: Paul Burton, Linux MIPS Mailing List, huacai chen,
	王锐

2014-05-20 1:06 GMT+08:00 Ralf Baechle <ralf@linux-mips.org>:
> On Mon, May 19, 2014 at 11:15:15PM +0800, Chen Jie wrote:
>
>> 2014-05-19 18:02 GMT+08:00 Paul Burton <paul.burton@imgtec.com>:
>> > On Sun, May 18, 2014 at 10:39:20PM +0800, Huacai Chen wrote:
>> >> Due to Wang Rui's tests, Loongson-3's EI/DI instructions don't have
>> >> correct behaviors, its Status.FR is also different with MIPS64R2. So,
>> >> I don't want to select CPU_MIPS64_R2.
>> >
>> > Out of curiosity, how do ei & di misbehave?
>> In our test, it may cause machine stall if use ei&di in kernel,
>> especially in smp case.
>
> In that case we could make the use of DI/EI depend on a new errata flag
> in war.h or similar.

Thanks for the suggest!
Huacai,I think we would try to select MIPS64_R2 for loongson3
processor, and take care of the above bits to see whether it works.


-- 
Regards,

- cee1

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-20 12:33         ` cee1
@ 2014-05-24  1:33           ` Huacai Chen
  0 siblings, 0 replies; 24+ messages in thread
From: Huacai Chen @ 2014-05-24  1:33 UTC (permalink / raw)
  To: cee1; +Cc: Ralf Baechle, Paul Burton, Linux MIPS Mailing List,
	王锐

No, I think we need more experiments to test the compatiability of
Loongson-3. So, ralf, I suggest to merge the first patch and drop this
one.

On Tue, May 20, 2014 at 8:33 PM, cee1 <fykcee1@gmail.com> wrote:
> 2014-05-20 1:06 GMT+08:00 Ralf Baechle <ralf@linux-mips.org>:
>> On Mon, May 19, 2014 at 11:15:15PM +0800, Chen Jie wrote:
>>
>>> 2014-05-19 18:02 GMT+08:00 Paul Burton <paul.burton@imgtec.com>:
>>> > On Sun, May 18, 2014 at 10:39:20PM +0800, Huacai Chen wrote:
>>> >> Due to Wang Rui's tests, Loongson-3's EI/DI instructions don't have
>>> >> correct behaviors, its Status.FR is also different with MIPS64R2. So,
>>> >> I don't want to select CPU_MIPS64_R2.
>>> >
>>> > Out of curiosity, how do ei & di misbehave?
>>> In our test, it may cause machine stall if use ei&di in kernel,
>>> especially in smp case.
>>
>> In that case we could make the use of DI/EI depend on a new errata flag
>> in war.h or similar.
>
> Thanks for the suggest!
> Huacai,I think we would try to select MIPS64_R2 for loongson3
> processor, and take care of the above bits to see whether it works.
>
>
> --
> Regards,
>
> - cee1

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-15  7:09 ` [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3 chenj
  2014-05-15 11:40     ` Paul Burton
@ 2014-06-03 11:03   ` Ralf Baechle
  2014-06-03 15:03     ` Chen Jie
  2014-06-03 18:44   ` Ralf Baechle
  2 siblings, 1 reply; 24+ messages in thread
From: Ralf Baechle @ 2014-06-03 11:03 UTC (permalink / raw)
  To: chenj; +Cc: linux-mips

On Thu, May 15, 2014 at 03:09:03PM +0800, chenj wrote:

> wsbh & movn are available on loongson3 CPU.
> ---
>  arch/mips/lib/csum_partial.S | 10 ++++++++--

Does Loongson 3 also have both the DSBH and DSHD instructions?

  Ralf

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-06-03 11:03   ` Ralf Baechle
@ 2014-06-03 15:03     ` Chen Jie
  0 siblings, 0 replies; 24+ messages in thread
From: Chen Jie @ 2014-06-03 15:03 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Linux MIPS Mailing List, 王锐

2014-06-03 19:03 GMT+08:00 Ralf Baechle <ralf@linux-mips.org>:
> On Thu, May 15, 2014 at 03:09:03PM +0800, chenj wrote:
>
>> wsbh & movn are available on loongson3 CPU.
>> ---
>>  arch/mips/lib/csum_partial.S | 10 ++++++++--
>
> Does Loongson 3 also have both the DSBH and DSHD instructions?
Both of these two instructions are available on Loongson 3a.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-05-15  7:09 ` [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3 chenj
  2014-05-15 11:40     ` Paul Burton
  2014-06-03 11:03   ` Ralf Baechle
@ 2014-06-03 18:44   ` Ralf Baechle
  2014-06-04  7:57     ` Chen Jie
  2 siblings, 1 reply; 24+ messages in thread
From: Ralf Baechle @ 2014-06-03 18:44 UTC (permalink / raw)
  To: chenj; +Cc: linux-mips

On Thu, May 15, 2014 at 03:09:03PM +0800, chenj wrote:

> wsbh & movn are available on loongson3 CPU.

I think there are a few more case that need to be fixed than just
this file to make best use of WSBH and similar on Loongson 3A.  How
about below patch?

As I don't have Loongson 3 hardware I am not able to runtime test this.

Thanks,

  Ralf

 arch/mips/include/asm/cpu-features.h                           | 10 ++++++++++
 .../include/asm/mach-cavium-octeon/cpu-feature-overrides.h     |  1 +
 arch/mips/include/asm/mach-loongson/cpu-feature-overrides.h    |  2 ++
 arch/mips/include/uapi/asm/swab.h                              |  5 +++--
 arch/mips/lib/csum_partial.S                                   | 10 ++++++++--
 arch/mips/net/bpf_jit.c                                        |  2 +-
 6 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h
index c7d8c99..d927bda 100644
--- a/arch/mips/include/asm/cpu-features.h
+++ b/arch/mips/include/asm/cpu-features.h
@@ -222,6 +222,16 @@
 #define cpu_has_clo_clz	cpu_has_mips_r
 #endif
 
+/*
+ * MIPS32 R2, MIPS64 R2, Loongson 3A and Octeon have WSBH.
+ * MIPS64 R2, Loongson 3A and Octeon have WSBH, DSBH and DSHD.
+ * This indicates the availability of WSBH and in case of 64 bit CPUs also
+ * DSBH and DSHD.
+ */
+#ifndef cpu_has_wsbh
+#define cpu_has_wsbh		cpu_has_mips_r2
+#endif
+
 #ifndef cpu_has_dsp
 #define cpu_has_dsp		(cpu_data[0].ases & MIPS_ASE_DSP)
 #endif
diff --git a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
index cf80228..fa1f3cf 100644
--- a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
+++ b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
@@ -57,6 +57,7 @@
 #define cpu_has_vint		0
 #define cpu_has_veic		0
 #define cpu_hwrena_impl_bits	0xc0000000
+#define cpu_has_wsbh            1
 
 #define cpu_has_rixi		(cpu_data[0].cputype != CPU_CAVIUM_OCTEON)
 
diff --git a/arch/mips/include/asm/mach-loongson/cpu-feature-overrides.h b/arch/mips/include/asm/mach-loongson/cpu-feature-overrides.h
index c0f3ef4..7d28f95 100644
--- a/arch/mips/include/asm/mach-loongson/cpu-feature-overrides.h
+++ b/arch/mips/include/asm/mach-loongson/cpu-feature-overrides.h
@@ -59,4 +59,6 @@
 #define cpu_has_watch		1
 #define cpu_has_local_ebase	0
 
+#define cpu_has_wsbh		IS_ENABLED(CONFIG_CPU_LOONGSON3)
+
 #endif /* __ASM_MACH_LOONGSON_CPU_FEATURE_OVERRIDES_H */
diff --git a/arch/mips/include/uapi/asm/swab.h b/arch/mips/include/uapi/asm/swab.h
index ac9a8f9..b2ab2cf 100644
--- a/arch/mips/include/uapi/asm/swab.h
+++ b/arch/mips/include/uapi/asm/swab.h
@@ -13,7 +13,8 @@
 
 #define __SWAB_64_THRU_32__
 
-#if defined(__mips_isa_rev) && (__mips_isa_rev >= 2)
+#if (defined(__mips_isa_rev) && (__mips_isa_rev >= 2)) ||		\
+    defined(_MIPS_ARCH_LOONGSON3A)
 
 static inline __attribute_const__ __u16 __arch_swab16(__u16 x)
 {
@@ -55,5 +56,5 @@ static inline __attribute_const__ __u64 __arch_swab64(__u64 x)
 }
 #define __arch_swab64 __arch_swab64
 #endif /* __mips64 */
-#endif /* MIPS R2 or newer  */
+#endif /* MIPS R2 or newer or Loongson 3A  */
 #endif /* _ASM_SWAB_H */
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 9901237..4c721e2 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -277,9 +277,12 @@ LEAF(csum_partial)
 #endif
 
 	/* odd buffer alignment? */
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
+	.set	push
+	.set	arch=mips32r2
 	wsbh	v1, sum
 	movn	sum, v1, t7
+	.set	pop
 #else
 	beqz	t7, 1f			/* odd buffer alignment? */
 	 lui	v1, 0x00ff
@@ -726,9 +729,12 @@ LEAF(csum_partial)
 	addu	sum, v1
 #endif
 
-#ifdef CONFIG_CPU_MIPSR2
+#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
+	.set	push
+	.set	arch=mips32r2
 	wsbh	v1, sum
 	movn	sum, v1, odd
+	.set	pop
 #else
 	beqz	odd, 1f			/* odd buffer alignment? */
 	 lui	v1, 0x00ff
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
index a67b975..b2a560b 100644
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -1240,7 +1240,7 @@ jmp_cmp:
 			emit_half_load(r_A, r_skb, off, ctx);
 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 			/* This needs little endian fixup */
-			if (cpu_has_mips_r2) {
+			if (cpu_has_wsbh) {
 				/* R2 and later have the wsbh instruction */
 				emit_wsbh(r_A, r_A, ctx);
 			} else {

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3
  2014-06-03 18:44   ` Ralf Baechle
@ 2014-06-04  7:57     ` Chen Jie
  0 siblings, 0 replies; 24+ messages in thread
From: Chen Jie @ 2014-06-04  7:57 UTC (permalink / raw)
  To: Ralf Baechle; +Cc: Linux MIPS Mailing List

I've tested the patch:
1. arch/mips/net/bpf_jit.c is missing in master branch of linux-mips,
so it was ignored.
2. Other things are fine.

2014-06-04 2:44 GMT+08:00 Ralf Baechle <ralf@linux-mips.org>:
> diff --git a/arch/mips/include/uapi/asm/swab.h b/arch/mips/include/uapi/asm/swab.h
> index ac9a8f9..b2ab2cf 100644
> --- a/arch/mips/include/uapi/asm/swab.h
> +++ b/arch/mips/include/uapi/asm/swab.h
> @@ -13,7 +13,8 @@
>
>  #define __SWAB_64_THRU_32__
>
> -#if defined(__mips_isa_rev) && (__mips_isa_rev >= 2)
> +#if (defined(__mips_isa_rev) && (__mips_isa_rev >= 2)) ||              \
> +    defined(_MIPS_ARCH_LOONGSON3A)
>
>  static inline __attribute_const__ __u16 __arch_swab16(__u16 x)
>  {
> @@ -55,5 +56,5 @@ static inline __attribute_const__ __u64 __arch_swab64(__u64 x)
Should we add ".set arch=mips64r2" for using of wsbh/dsbh?
I can compile 3.15-rc8 with the patch applied, but when I tried the
patch on 3.10, it reported "unrecognized op dsbh..."

> diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
> index 9901237..4c721e2 100644
> --- a/arch/mips/lib/csum_partial.S
> +++ b/arch/mips/lib/csum_partial.S
> @@ -277,9 +277,12 @@ LEAF(csum_partial)
>  #endif
>
>         /* odd buffer alignment? */
> -#ifdef CONFIG_CPU_MIPSR2
> +#if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_LOONGSON3)
Should above line be replaced with the following line?
"#if cpu_has_wsbh"

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH, v2] MIPS: lib: csum_partial: more instruction paral
  2014-05-19 15:32     ` Chen Jie
@ 2014-08-15 20:15       ` Chen Jie
  0 siblings, 0 replies; 24+ messages in thread
From: Chen Jie @ 2014-08-15 20:15 UTC (permalink / raw)
  To: James Hogan
  Cc: Linux MIPS Mailing List, markos.chandras,
	陈华才

2014-05-19 23:32 GMT+08:00 Chen Jie <chenj@lemote.com>:
> 2014-05-19 14:59 GMT+08:00 James Hogan <james.hogan@imgtec.com>:
>> On Monday 19 May 2014 11:14:07 chenj wrote:
>>> Computing sum introduces true data dependency, e.g.
>>>       ADDC(sum, t0)
>>>       ADDC(sum, t1)
>>>       ADDC(sum, t2)
>>>       ADDC(sum, t3)
>>> Here, each ADDC(sum, ...) references the sum value updated by previous ADDC.
>>>
>>> In this patch, above sequence is adjusted as following:
>>>       ADDC(t0, t1)
>>>       ADDC(t2, t3)
>>>       ADDC(sum, t0)
>>>       ADDC(sum, t2)
>>> The first two ADDC operations are independent, hence can be executed
>>> simultaneously if possible.
>>
>> The actual patch appears to change it to this:
>> ADDC(t0, t1)
>> ADDC(sum, t0)
>> ADDC(t2, t3)
>> ADDC(sum, t2)
>>
>> which is slightly different (presumably due to the interleaved stores in some
>> of the cases).
>>
>>> This patch improves instruction level parallelism, and brings at most 50%
>>> csum performance gain on Loongson 3a processor[1].
>>
>> Nice results.
>>
>> The stuff below the --- will get dropped when the patch is applied though,
>> after which the "[1]" won't refer to anything.
>>
> Thanks for your suggestion, I'll amend the commit message further later.
>
> Basically, the patch reduces the case of one ADDC depending on the
> result of the previous ADDC.
>
> BTW, I'm not sure whether the sum value of the new implementation is
> equivalent to the original one, but in my test(make run_test of the
> csum_test.tar.gz, and a comparing patch in kernel) it is.

It is equivalent to the original one.
More explanation about the math behind "x ADDC y ADDC z == x ADDC (y ADDC z)":

Let C = the value of '(uint64_t) -1 + 1' in 64bit case, then
0 <= x <= C-1
0 <= y <= C-1
0 <= z <= C-1

Here 'x ADDC y' is defined as
x + y >= C ? x + y - C + 1 : x + y


Case 1: x + y >= C && x + y - C + 1 + z < C (i.e. x ADDC y ADDC z = x
+ y - C +1 + z)
if y + z >= C:
=> y ADDC z = y + z - C + 1
C > x + y + z - C + 1
=> The result is x + y + z - C + 1

if y + z < C:
=> y ADDC z = y + z
x + y >= C
=> x + (y + z) >= C + z >= C
=> The result is x + y + z - C + 1

Case 2: x + y < C && x + y + z >= C (i.e. x ADDC y ADDC z = x + y + z - C + 1)
if y + z >= C:
=> y ADDC z = y + z - C + 1
C > x + y
=> C + z - C + 1 > x + y + z - C + 1
=> z + 1 > x + y + z - C + 1
=> C >= z + 1 > x + y + z - C + 1
=> The result is x + y + z - C + 1

if y + z < C:
=> y ADDC z = y + z
x + y + z >= C
=> The result is x + y + z - C + 1

Case 3: x + y >= C && x + y - C + 1 + z >= C (i.e. x ADDC y ADDC z = x
+ y + z - 2C + 2)
x + y - C + 1 + z >= C
=> y + z >= 2C - 1 - x
C >= 1 + x
=> 2C - 1 - x >= C
=> y + z >= C
=> y ADDC z = y + z - C + 1

x + y - C + 1 + z >= C
=> The result is  x + y + z - 2C + 2


Case 4: x + y < C && x + y + z < C (i.e. x ADDC y ADDC z = x + y + z)
x + y + z < C
=> y + z < C - x <= C
=> y + z < C
=> y ADDC z = y + z

x + y + z < C
=> The result is x + y + z

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2014-08-15 20:16 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-05-15  7:09 [PATCH 1/2] MIPS: lib: csum_partial: more instruction paral chenj
2014-05-15  7:09 ` [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3 chenj
2014-05-15 11:40   ` Paul Burton
2014-05-15 11:40     ` Paul Burton
2014-05-16 13:29     ` Chen Jie
2014-05-16 15:21       ` Paul Burton
2014-06-03 11:03   ` Ralf Baechle
2014-06-03 15:03     ` Chen Jie
2014-06-03 18:44   ` Ralf Baechle
2014-06-04  7:57     ` Chen Jie
2014-05-15  8:20 ` [PATCH 1/2] MIPS: lib: csum_partial: more instruction paral Markos Chandras
2014-05-15  8:20   ` Markos Chandras
2014-05-19 16:36   ` Ralf Baechle
2014-05-19  3:14 ` [PATCH, v2] " chenj
2014-05-19  6:59   ` James Hogan
2014-05-19 15:32     ` Chen Jie
2014-08-15 20:15       ` Chen Jie
  -- strict thread matches above, loose matches on Subject: below --
2014-05-18  8:23 [PATCH 2/2] MIPS: lib: csum_partial: use wsbh/movn on ls3 chenj
2014-05-18 14:39 ` Huacai Chen
2014-05-19 10:02   ` Paul Burton
2014-05-19 15:15     ` Chen Jie
2014-05-19 17:06       ` Ralf Baechle
2014-05-20 12:33         ` cee1
2014-05-24  1:33           ` Huacai Chen

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.