linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
@ 2009-05-27 20:00 Albrecht Dreß
  2009-05-28 16:13 ` Joakim Tjernlund
                   ` (3 more replies)
  0 siblings, 4 replies; 13+ messages in thread
From: Albrecht Dreß @ 2009-05-27 20:00 UTC (permalink / raw)
  To: Linux PPC Development

[-- Attachment #1: Type: text/plain, Size: 1012 bytes --]

This trivial patch changes memcpy_(to|from)io as to transfer as many  
32-bit words as possible in 32-bit accesses (in the current solution,  
the last 32-bit word was transferred as 4 byte accesses).

Signed-off-by: Albrecht Dreß <albrecht.dress@arcor.de>
---

diff -urpN -X linux-2.6.29.1.orig/Documentation/dontdiff  
linux-2.6.29.1.orig/arch/powerpc/kernel/io.c  
linux-2.6.29.1/arch/powerpc/kernel/io.c
--- linux-2.6.29.1.orig/arch/powerpc/kernel/io.c	2009-04-02  
22:55:27.000000000 +0200
+++ linux-2.6.29.1/arch/powerpc/kernel/io.c	2009-05-27  
11:36:09.000000000 +0200
@@ -161,7 +161,7 @@ void _memcpy_fromio(void *dest, const vo
  		dest++;
  		n--;
  	}
-	while(n > 4) {
+	while(n >= 4) {
  		*((u32 *)dest) = *((volatile u32 *)vsrc);
  		eieio();
  		vsrc += 4;
@@ -190,7 +190,7 @@ void _memcpy_toio(volatile void __iomem
  		vdest++;
  		n--;
  	}
-	while(n > 4) {
+	while(n >= 4) {
  		*((volatile u32 *)vdest) = *((volatile u32 *)src);
  		src += 4;
  		vdest += 4;

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-05-27 20:00 [PATCH] powerpc: tiny memcpy_(to|from)io optimisation Albrecht Dreß
@ 2009-05-28 16:13 ` Joakim Tjernlund
  2009-05-28 19:50   ` Albrecht Dreß
  2009-06-11 17:07 ` Wolfram Sang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 13+ messages in thread
From: Joakim Tjernlund @ 2009-05-28 16:13 UTC (permalink / raw)
  To: Albrecht Dreß; +Cc: Linux PPC Development


>
> This trivial patch changes memcpy_(to|from)io as to transfer as many
> 32-bit words as possible in 32-bit accesses (in the current solution,
> the last 32-bit word was transferred as 4 byte accesses).
>
> Signed-off-by: Albrecht Dreß <albrecht.dress@arcor.de>
> ---
>
> diff -urpN -X linux-2.6.29.1.orig/Documentation/dontdiff
> linux-2.6.29.1.orig/arch/powerpc/kernel/io.c
> linux-2.6.29.1/arch/powerpc/kernel/io.c
> --- linux-2.6.29.1.orig/arch/powerpc/kernel/io.c   2009-04-02
> 22:55:27.000000000 +0200
> +++ linux-2.6.29.1/arch/powerpc/kernel/io.c   2009-05-27
> 11:36:09.000000000 +0200
> @@ -161,7 +161,7 @@ void _memcpy_fromio(void *dest, const vo
>         dest++;
>         n--;
>      }
> -   while(n > 4) {
> +   while(n >= 4) {
>         *((u32 *)dest) = *((volatile u32 *)vsrc);
>         eieio();
>         vsrc += 4;
> @@ -190,7 +190,7 @@ void _memcpy_toio(volatile void __iomem
>         vdest++;
>         n--;
>      }
> -   while(n > 4) {
> +   while(n >= 4) {
>         *((volatile u32 *)vdest) = *((volatile u32 *)src);
>         src += 4;
>         vdest += 4;

hmm, these do look a bit unoptimal anyway. Any reason not to write
them something like below(written by me for uClibc long time ago). You will
have to add eieio()/sync

void *memcpy(void *to, const void *from, size_t n)
/* PPC can do pre increment and load/store, but not post increment and load/store.
   Therefore use *++ptr instead of *ptr++. */
{
	unsigned long rem, chunks, tmp1, tmp2;
	unsigned char *tmp_to;
	unsigned char *tmp_from = (unsigned char *)from;

	chunks = n / 8;
	tmp_from -= 4;
	tmp_to = to - 4;
	if (!chunks)
		goto lessthan8;
	rem = (unsigned long )tmp_to % 4;
	if (rem)
		goto align;
 copy_chunks:
	do {
		/* make gcc to load all data, then store it */
		tmp1 = *(unsigned long *)(tmp_from+4);
		tmp_from += 8;
		tmp2 = *(unsigned long *)tmp_from;
		*(unsigned long *)(tmp_to+4) = tmp1;
		tmp_to += 8;
		*(unsigned long *)tmp_to = tmp2;
	} while (--chunks);
 lessthan8:
	n = n % 8;
	if (n >= 4) {
		*(unsigned long *)(tmp_to+4) = *(unsigned long *)(tmp_from+4);
		tmp_from += 4;
		tmp_to += 4;
		n = n-4;
	}
	if (!n ) return to;
	tmp_from += 3;
	tmp_to += 3;
	do {
		*++tmp_to = *++tmp_from;
	} while (--n);

	return to;
 align:
	rem = 4 - rem;
	n = n - rem;
	do {
		*(tmp_to+4) = *(tmp_from+4);
		++tmp_from;
		++tmp_to;
	} while (--rem);
	chunks = n / 8;
	if (chunks)
		goto copy_chunks;
	goto lessthan8;
}

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-05-28 16:13 ` Joakim Tjernlund
@ 2009-05-28 19:50   ` Albrecht Dreß
  2009-05-29  6:31     ` Joakim Tjernlund
  0 siblings, 1 reply; 13+ messages in thread
From: Albrecht Dreß @ 2009-05-28 19:50 UTC (permalink / raw)
  To: linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 1122 bytes --]

Am 28.05.09 18:13 schrieb(en) Joakim Tjernlund:
> hmm, these do look a bit unoptimal anyway. Any reason not to write  
> them something like below(written by me for uClibc long time ago).  
> You will have to add eieio()/sync

No (and I wasn't aware of the PPC pre-inc vs. post-inc stuff) - I just  
stumbled over this while fixing mtd accesses to the MPC5200's Local Bus  
in 16-bit mode which doesn't allow byte accesses.  And I didn't want to  
go too deep into this as the real fix for me is actually somewhat  
different...

> /* PPC can do pre increment and load/store, but not post increment  
> and load/store.
>    Therefore use *++ptr instead of *ptr++. */
[snip]
>  copy_chunks:
> 	do {
> 		/* make gcc to load all data, then store it */
> 		tmp1 = *(unsigned long *)(tmp_from+4);
> 		tmp_from += 8;
> 		tmp2 = *(unsigned long *)tmp_from;
> 		*(unsigned long *)(tmp_to+4) = tmp1;
> 		tmp_to += 8;
> 		*(unsigned long *)tmp_to = tmp2;
> 	} while (--chunks);

Is this the same for all PPC cores, i.e. do they all benefit from  
loading/storing 8 instead of 4 bytes?

Best, Albrecht.

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-05-28 19:50   ` Albrecht Dreß
@ 2009-05-29  6:31     ` Joakim Tjernlund
  2009-05-31 10:11       ` Albrecht Dreß
  0 siblings, 1 reply; 13+ messages in thread
From: Joakim Tjernlund @ 2009-05-29  6:31 UTC (permalink / raw)
  To: Albrecht Dreß; +Cc: linuxppc-dev

> Am 28.05.09 18:13 schrieb(en) Joakim Tjernlund:
> > hmm, these do look a bit unoptimal anyway. Any reason not to write
> > them something like below(written by me for uClibc long time ago).
> > You will have to add eieio()/sync
>
> No (and I wasn't aware of the PPC pre-inc vs. post-inc stuff) - I just

I think this is true for most RISC based CPU's. It is a pity as
post ops are a lot more common. The do {} while(--chunks) is also
better. Basically the "while(--chunks)" is free(but only if you don't use
chunks inside the loop).

> stumbled over this while fixing mtd accesses to the MPC5200's Local Bus
> in 16-bit mode which doesn't allow byte accesses.  And I didn't want to
> go too deep into this as the real fix for me is actually somewhat
> different...

OK.
>
> > /* PPC can do pre increment and load/store, but not post increment
> > and load/store.
> >    Therefore use *++ptr instead of *ptr++. */
> [snip]
> >  copy_chunks:
> >    do {
> >       /* make gcc to load all data, then store it */
> >       tmp1 = *(unsigned long *)(tmp_from+4);
> >       tmp_from += 8;
> >       tmp2 = *(unsigned long *)tmp_from;
> >       *(unsigned long *)(tmp_to+4) = tmp1;
> >       tmp_to += 8;
> >       *(unsigned long *)tmp_to = tmp2;
> >    } while (--chunks);
>
> Is this the same for all PPC cores, i.e. do they all benefit from
> loading/storing 8 instead of 4 bytes?

As I recall there is an extra cycle between load and store,
so you will benefit from doing all your loads first and then
stores. The kernel memcpy has loads 16 bytes before storing. I selected
8 as uClibc should also be small.
Since there has to be eieio between ops I am not sure it will
matter here. Perhaps it is better to do 4 bytes in the main loop, making
the whole function smaller. There are memset and memmove functions in
uClibc too.

 Jocke

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-05-29  6:31     ` Joakim Tjernlund
@ 2009-05-31 10:11       ` Albrecht Dreß
  2009-06-01  6:14         ` Joakim Tjernlund
  0 siblings, 1 reply; 13+ messages in thread
From: Albrecht Dreß @ 2009-05-31 10:11 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 961 bytes --]

Hi Jocke:

Am 29.05.09 08:31 schrieb(en) Joakim Tjernlund:
> > No (and I wasn't aware of the PPC pre-inc vs. post-inc stuff) - I  
> just
> 
> I think this is true for most RISC based CPU's. It is a pity as
> post ops are a lot more common. The do {} while(--chunks) is also
> better. Basically the "while(--chunks)" is free(but only if you don't  
> use
> chunks inside the loop).

Just a side note:  I looked at the assembly output of gcc 4.3.3 coming  
with Ubuntu Jaunty/PowerPC for

<snip case="1">
   n >>= 2;
   do {
     *++dst = *++src;
   } while (--n);
</snip>

and

<snip case="2">
   n >>= 2;
   while (n--)
     *dst++ = *src++;
</snip>

Using the gcc options "-O2 -mcpu=603e -mtune=603e" (same effect with  
"-O3" instead of "-O2") the loop core is *exactly* the same in both  
cases.

With gcc 4.2.2 (coming with ELDK 4.2) the loop core in case 2 is indeed  
one statement longer, though...

Best, Albrecht.

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-05-31 10:11       ` Albrecht Dreß
@ 2009-06-01  6:14         ` Joakim Tjernlund
  2009-06-02 18:45           ` Albrecht Dreß
  0 siblings, 1 reply; 13+ messages in thread
From: Joakim Tjernlund @ 2009-06-01  6:14 UTC (permalink / raw)
  To: Albrecht Dreß; +Cc: linuxppc-dev

>
> Hi Jocke:
>
> Am 29.05.09 08:31 schrieb(en) Joakim Tjernlund:
> > > No (and I wasn't aware of the PPC pre-inc vs. post-inc stuff) - I
> > just
> >
> > I think this is true for most RISC based CPU's. It is a pity as
> > post ops are a lot more common. The do {} while(--chunks) is also
> > better. Basically the "while(--chunks)" is free(but only if you don't
> > use
> > chunks inside the loop).
>
> Just a side note:  I looked at the assembly output of gcc 4.3.3 coming
> with Ubuntu Jaunty/PowerPC for
>
> <snip case="1">
>    n >>= 2;
>    do {
>      *++dst = *++src;
>    } while (--n);
> </snip>
>
> and
>
> <snip case="2">
>    n >>= 2;
>    while (n--)
>      *dst++ = *src++;
> </snip>
>
> Using the gcc options "-O2 -mcpu=603e -mtune=603e" (same effect with
> "-O3" instead of "-O2") the loop core is *exactly* the same in both
> cases.

Yes, the compiler can/should optimize this but ...

>
> With gcc 4.2.2 (coming with ELDK 4.2) the loop core in case 2 is indeed
> one statement longer, though...

.. not even 4.2.2 which is fairly modern will get it right. It breaks very
easy as gcc has never been any good at this type of optimization. Sometimes
small changes will make gcc unhappy and it won't do the right optimization.

 Jocke

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-06-01  6:14         ` Joakim Tjernlund
@ 2009-06-02 18:45           ` Albrecht Dreß
  2009-06-02 22:51             ` Benjamin Herrenschmidt
  0 siblings, 1 reply; 13+ messages in thread
From: Albrecht Dreß @ 2009-06-02 18:45 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 1857 bytes --]

Am 01.06.09 08:14 schrieb(en) Joakim Tjernlund:
> .. not even 4.2.2 which is fairly modern will get it right. It breaks  
> very easy as gcc has never been any good at this type of  
> optimization. Sometimes small changes will make gcc unhappy and it  
> won't do the right optimization.

It's even worse...  Looking at the assembly output of the simple  
function

<snip>
void loop2(void * src, void * dst, int n)
{
   volatile uint32_t * _dst = (volatile uint32_t *) (dst - 4);
   volatile uint32_t * _src = (volatile uint32_t *) (src - 4);
   n >>= 2;
   do {
     *(++_dst) = *(++_src);
   } while (--n);
}
</snip>

gcc 4.0.1 coming with Apple's Developer Tools (on Tiger) with options  
"-O3 -mcpu=603e -mtune=603e" produces

<snip>
_loop2:
         srawi r5,r5,2
         mtctr r5
         addi r4,r4,-4
         addi r3,r3,-4
L11:
         lwzu r0,4(r3)
         stwu r0,4(r4)
         bdnz L11
         blr
</snip>

which looks perfect to me.  However, gcc 4.3.3 on Ubuntu/PPC produces  
with the same options

<snip>
loop2:
         srawi 5,5,2
         stwu 1,-16(1)
         mtctr 5
         li 9,0
.L8:
         lwzx 0,3,9
         stwx 0,4,9
         addi 9,9,4
         bdnz .L8
         addi 1,1,16
         blr
</snip>

wasting a register and a statement in the loop core, and fiddles around  
with the stack pointer for no good reason.  Gcc 4.4.0 produces

<snip>
loop2:
         srawi 5,5,2
         mtctr 5
         li 9,0
.L9:
         lwzx 0,3,9
         stwx 0,4,9
         addi 9,9,4
         bdnz .L9
         blr
</snip>

which drops the r1 accesses, but still produces the sub-optimal loop.   
Is this a gcc regression, or did I miss something here?  Probably the  
only bullet-proof way is to write some core loops in assembly... :-/

Thanks, Albrecht.

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-06-02 18:45           ` Albrecht Dreß
@ 2009-06-02 22:51             ` Benjamin Herrenschmidt
  2009-06-03 14:36               ` Kenneth Johansson
  0 siblings, 1 reply; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2009-06-02 22:51 UTC (permalink / raw)
  To: Albrecht Dreß; +Cc: linuxppc-dev

On Tue, 2009-06-02 at 20:45 +0200, Albrecht Dreß wrote:

> 
> which drops the r1 accesses, but still produces the sub-optimal loop.   
> Is this a gcc regression, or did I miss something here?  Probably the  
> only bullet-proof way is to write some core loops in assembly... :-/

Well, gcc may be right here. What you call the "optimal" loop uses the
lwzu instruction. An interesting thing about this instruction is that
it updates two GPRs at completion (I'm ignoring the load multiple and
string instructions on purpose here).

Now, quite a few simple implementations don't have two write ports to
the GPR file, nor the logic to handle hazards properly with two GPRs
being updated... which means the instruction is very likely to take a
very inefficient path through the pipeline. On server processors, I'm
pretty sure it's just cracked into a load and an add anyway.

I wouldn't be surprised thus if the loop variant with the separate add
ends up more efficient on most implementations around.

Of course, the loop above could use some unrolling to put some distance
between the load and the store of the loaded value.

Cheers,
Ben.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-06-02 22:51             ` Benjamin Herrenschmidt
@ 2009-06-03 14:36               ` Kenneth Johansson
  2009-06-03 18:35                 ` Albrecht Dreß
  0 siblings, 1 reply; 13+ messages in thread
From: Kenneth Johansson @ 2009-06-03 14:36 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: Albrecht Dreß, linuxppc-dev

On Wed, 2009-06-03 at 08:51 +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2009-06-02 at 20:45 +0200, Albrecht Dreß wrote:
> 
> > 
> > which drops the r1 accesses, but still produces the sub-optimal loop.   
> > Is this a gcc regression, or did I miss something here?  Probably the  
> > only bullet-proof way is to write some core loops in assembly... :-/
> 
> Well, gcc may be right here. What you call the "optimal" loop uses the
> lwzu instruction. An interesting thing about this instruction is that
> it updates two GPRs at completion (I'm ignoring the load multiple and
> string instructions on purpose here).

> I wouldn't be surprised thus if the loop variant with the separate add
> ends up more efficient on most implementations around.

On an e300 core using the lwzu/stwu is about 20% faster so at least one
core prefer that optimization. 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-06-03 14:36               ` Kenneth Johansson
@ 2009-06-03 18:35                 ` Albrecht Dreß
  0 siblings, 0 replies; 13+ messages in thread
From: Albrecht Dreß @ 2009-06-03 18:35 UTC (permalink / raw)
  To: Kenneth Johansson; +Cc: linuxppc-dev

[-- Attachment #1: Type: text/plain, Size: 287 bytes --]

Am 03.06.09 16:36 schrieb(en) Kenneth Johansson:
> On an e300 core using the lwzu/stwu is about 20% faster so at least  
> one core prefer that optimization.

Bingo.  The '5200 on my test system has that core.  So "-mcpu=603e"  
should select this strategy!

Cheers, Albrecht.


[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-05-27 20:00 [PATCH] powerpc: tiny memcpy_(to|from)io optimisation Albrecht Dreß
  2009-05-28 16:13 ` Joakim Tjernlund
@ 2009-06-11 17:07 ` Wolfram Sang
  2009-06-11 17:30 ` Grant Likely
  2009-06-19 18:42 ` Lorenz Kolb
  3 siblings, 0 replies; 13+ messages in thread
From: Wolfram Sang @ 2009-06-11 17:07 UTC (permalink / raw)
  To: Albrecht Dreß; +Cc: Linux PPC Development

[-- Attachment #1: Type: text/plain, Size: 602 bytes --]

On Wed, May 27, 2009 at 10:00:41PM +0200, Albrecht Dreß wrote:
> This trivial patch changes memcpy_(to|from)io as to transfer as many  
> 32-bit words as possible in 32-bit accesses (in the current solution,  
> the last 32-bit word was transferred as 4 byte accesses).
>
> Signed-off-by: Albrecht Dreß <albrecht.dress@arcor.de>

Besides the discussion about optimizing the loop, what about the patch itself?
Looks plausible to me...

-- 
Pengutronix e.K.                           | Wolfram Sang                |
Industrial Linux Solutions                 | http://www.pengutronix.de/  |

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 197 bytes --]

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-05-27 20:00 [PATCH] powerpc: tiny memcpy_(to|from)io optimisation Albrecht Dreß
  2009-05-28 16:13 ` Joakim Tjernlund
  2009-06-11 17:07 ` Wolfram Sang
@ 2009-06-11 17:30 ` Grant Likely
  2009-06-19 18:42 ` Lorenz Kolb
  3 siblings, 0 replies; 13+ messages in thread
From: Grant Likely @ 2009-06-11 17:30 UTC (permalink / raw)
  To: Albrecht Dreß; +Cc: Linux PPC Development

On Wed, May 27, 2009 at 2:00 PM, Albrecht Dreß<albrecht.dress@arcor.de> wrote:
> This trivial patch changes memcpy_(to|from)io as to transfer as many 32-bit
> words as possible in 32-bit accesses (in the current solution, the last
> 32-bit word was transferred as 4 byte accesses).
>
> Signed-off-by: Albrecht Dreß <albrecht.dress@arcor.de>
Acked-by: Grant Likely <grant.likely@secretlab.ca>

> ---
>
> diff -urpN -X linux-2.6.29.1.orig/Documentation/dontdiff
> linux-2.6.29.1.orig/arch/powerpc/kernel/io.c
> linux-2.6.29.1/arch/powerpc/kernel/io.c
> --- linux-2.6.29.1.orig/arch/powerpc/kernel/io.c        2009-04-02
> 22:55:27.000000000 +0200
> +++ linux-2.6.29.1/arch/powerpc/kernel/io.c     2009-05-27
> 11:36:09.000000000 +0200
> @@ -161,7 +161,7 @@ void _memcpy_fromio(void *dest, const vo
>                dest++;
>                n--;
>        }
> -       while(n > 4) {
> +       while(n >= 4) {
>                *((u32 *)dest) = *((volatile u32 *)vsrc);
>                eieio();
>                vsrc += 4;
> @@ -190,7 +190,7 @@ void _memcpy_toio(volatile void __iomem
>                vdest++;
>                n--;
>        }
> -       while(n > 4) {
> +       while(n >= 4) {
>                *((volatile u32 *)vdest) = *((volatile u32 *)src);
>                src += 4;
>                vdest += 4;
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@ozlabs.org
> https://ozlabs.org/mailman/listinfo/linuxppc-dev
>



-- 
Grant Likely, B.Sc., P.Eng.
Secret Lab Technologies Ltd.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc: tiny memcpy_(to|from)io optimisation
  2009-05-27 20:00 [PATCH] powerpc: tiny memcpy_(to|from)io optimisation Albrecht Dreß
                   ` (2 preceding siblings ...)
  2009-06-11 17:30 ` Grant Likely
@ 2009-06-19 18:42 ` Lorenz Kolb
  3 siblings, 0 replies; 13+ messages in thread
From: Lorenz Kolb @ 2009-06-19 18:42 UTC (permalink / raw)
  To: Albrecht Dreß; +Cc: Linux PPC Development

Albrecht Dreß wrote:
> This trivial patch changes memcpy_(to|from)io as to transfer as many 
> 32-bit words as possible in 32-bit accesses (in the current solution, 
> the last 32-bit word was transferred as 4 byte accesses).
> 
> Signed-off-by: Albrecht Dreß <albrecht.dress@arcor.de>
> ---
> 
> diff -urpN -X linux-2.6.29.1.orig/Documentation/dontdiff 
> linux-2.6.29.1.orig/arch/powerpc/kernel/io.c 
> linux-2.6.29.1/arch/powerpc/kernel/io.c
> --- linux-2.6.29.1.orig/arch/powerpc/kernel/io.c    2009-04-02 
> 22:55:27.000000000 +0200
> +++ linux-2.6.29.1/arch/powerpc/kernel/io.c    2009-05-27 
> 11:36:09.000000000 +0200
> @@ -161,7 +161,7 @@ void _memcpy_fromio(void *dest, const vo
>          dest++;
>          n--;
>      }
> -    while(n > 4) {
> +    while(n >= 4) {
>          *((u32 *)dest) = *((volatile u32 *)vsrc);
>          eieio();
>          vsrc += 4;
> @@ -190,7 +190,7 @@ void _memcpy_toio(volatile void __iomem
>          vdest++;
>          n--;
>      }
> -    while(n > 4) {
> +    while(n >= 4) {
>          *((volatile u32 *)vdest) = *((volatile u32 *)src);
>          src += 4;
>          vdest += 4;
> 
> 
> ------------------------------------------------------------------------

Works for me (and some custom hardware I created [quite a while ago, 
when I was young ;-)] that is not very tolerant to Byte-by-Byte-Transfers).

Thus:
Acked

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2009-06-19 18:42 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2009-05-27 20:00 [PATCH] powerpc: tiny memcpy_(to|from)io optimisation Albrecht Dreß
2009-05-28 16:13 ` Joakim Tjernlund
2009-05-28 19:50   ` Albrecht Dreß
2009-05-29  6:31     ` Joakim Tjernlund
2009-05-31 10:11       ` Albrecht Dreß
2009-06-01  6:14         ` Joakim Tjernlund
2009-06-02 18:45           ` Albrecht Dreß
2009-06-02 22:51             ` Benjamin Herrenschmidt
2009-06-03 14:36               ` Kenneth Johansson
2009-06-03 18:35                 ` Albrecht Dreß
2009-06-11 17:07 ` Wolfram Sang
2009-06-11 17:30 ` Grant Likely
2009-06-19 18:42 ` Lorenz Kolb

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).