LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] Optimize __arch_swab32 and __arch_swab16
@ 2011-08-11  8:13 Joakim Tjernlund
  0 siblings, 0 replies; 7+ messages in thread
From: Joakim Tjernlund @ 2011-08-11  8:13 UTC (permalink / raw)
  To: linuxppc-dev


PPC __arch_swab32 and __arch_swab16 generates non optimal code.
It doesn't schedule very well, need to copy its input register and
and swab16 needs an extra insn to clear its upper bits. I have improved
these functions(see my__xx). Any problem with the new asm? If
not I will send a patch.

Below some example code to illustrate:

#include <stdio.h>

unsigned long __arch_swab32(unsigned long value)
{
	unsigned long result;

	__asm__("rlwimi %0,%1,24,16,23\n\t"
	    "rlwimi %0,%1,8,8,15\n\t"
	    "rlwimi %0,%1,24,0,7"
	    : "=r" (result)
	    : "r" (value), "0" (value >> 24));
	return result;
}

unsigned long my__arch_swab32(unsigned long value)
{
	unsigned long tmp;

	__asm__("rlwimi %0,%0,24,0xffffffff"
		: "+r" (value));
	__asm__("rlwinm %0,%1,16,0xffffffff"
		: "=r" (tmp), "+r" (value));
	__asm__("rlwimi %0,%1,0,0x00ff0000"
		: "+r" (value), "+r" (tmp));
	__asm__("rlwimi %0,%1,0,0x000000ff"
		: "+r" (value), "+r" (tmp));
	return value;
}

unsigned short __arch_swab16(unsigned short value)
{
	unsigned short result;

	__asm__("rlwimi %0,%1,8,16,23"
	    : "=r" (result)
	    : "r" (value), "0" (value >> 8));
	return result;
}

unsigned short my__arch_swab16(unsigned short value)
{
	__asm__("rlwimi %0,%0,16,0x00ff0000"
		: "+r" (value));
	__asm__("rlwinm %0,%0,24,0x0000ffff"
		: "+r"(value));
	return value;
}

main()
{
	unsigned long x=0x12345678, y;

	y = my__arch_swab32(x);
	printf("swab32 x:%x, y:%x\n", x, y);
	y = my__arch_swab16(x);
	printf("swab16 x:%x, y:%x\n", x, y);
}

Generated asm:

	.file	"tst.c"
	.section	".text"
	.align 2
	.globl __arch_swab32
	.type	__arch_swab32, @function
__arch_swab32:
	mr %r0,%r3
	srwi %r3,%r3,24
#APP
	rlwimi %r3,%r0,24,16,23
	rlwimi %r3,%r0,8,8,15
	rlwimi %r3,%r0,24,0,7
#NO_APP
	blr
	.size	__arch_swab32, .-__arch_swab32
	.align 2
	.globl my__arch_swab32
	.type	my__arch_swab32, @function
my__arch_swab32:
#APP
	rlwimi %r3,%r3,24,0xffffffff
	rlwinm %r0,%r3,16,0xffffffff
	rlwimi %r3,%r0,0,0x00ff0000
	rlwimi %r3,%r0,0,0x000000ff
#NO_APP
	blr
	.size	my__arch_swab32, .-my__arch_swab32
	.align 2
	.globl __arch_swab16
	.type	__arch_swab16, @function
__arch_swab16:
	mr %r0,%r3
	srwi %r3,%r3,8
#APP
	rlwimi %r3,%r0,8,16,23
#NO_APP
	rlwinm %r3,%r3,0,0xffff
	blr
	.size	__arch_swab16, .-__arch_swab16
	.align 2
	.globl my__arch_swab16
	.type	my__arch_swab16, @function
my__arch_swab16:
#APP
	rlwimi %r3,%r3,16,0x00ff0000
	rlwinm %r3,%r3,24,0x0000ffff
#NO_APP
	blr
	.size	my__arch_swab16, .-my__arch_swab16
	.section	.rodata.str1.4,"aMS",@progbits,1
	.align 2
.LC0:
	.string	"swab32 x:%x, y:%x\n"
	.align 2
.LC1:
	.string	"swab16 x:%x, y:%x\n"
	.section	".text"
	.align 2
	.globl main
	.type	main, @function
main:
	mflr %r0
	lis %r3,0x1234
	stwu %r1,-16(%r1)
	ori %r3,%r3,22136
	stw %r0,20(%r1)
	bl my__arch_swab32
	mr %r5,%r3
	lis %r4,0x1234
	lis %r3,.LC0@ha
	ori %r4,%r4,22136
	la %r3,.LC0@l(%r3)
	bl printf
	li %r3,22136
	bl my__arch_swab16
	lis %r4,0x1234
	mr %r5,%r3
	lis %r3,.LC1@ha
	la %r3,.LC1@l(%r3)
	ori %r4,%r4,22136
	bl printf
	lwz %r0,20(%r1)
	addi %r1,%r1,16
	mtlr %r0
	blr
	.size	main, .-main
	.section	.note.GNU-stack,"",@progbits
	.ident	"GCC: (GNU) 3.4.6 (Gentoo 3.4.6-r2, ssp-3.4.6-1.0, pie-8.7.9)"

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] Optimize __arch_swab32 and __arch_swab16
       [not found] <OF9F5D0B06.A12659DA-ONC12578E9.002B23BB-C12578E9.002D304C__34569.7653633126$1313050887$gmane$org@transmode.se>
@ 2011-08-11  8:45 ` Andreas Schwab
  2011-08-11  8:51   ` Joakim Tjernlund
  2011-08-11  8:56   ` David Laight
  0 siblings, 2 replies; 7+ messages in thread
From: Andreas Schwab @ 2011-08-11  8:45 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: linuxppc-dev

Joakim Tjernlund <joakim.tjernlund@transmode.se> writes:

> unsigned short my__arch_swab16(unsigned short value)
> {
> 	__asm__("rlwimi %0,%0,16,0x00ff0000"
> 		: "+r" (value));

You are creating a value that does not fit in a short.

Andreas.

-- 
Andreas Schwab, schwab@redhat.com
GPG Key fingerprint = D4E8 DBE3 3813 BB5D FA84  5EC7 45C6 250E 6F00 984E
"And now for something completely different."

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC] Optimize __arch_swab32 and __arch_swab16
  2011-08-11  8:45 ` [RFC] Optimize __arch_swab32 and __arch_swab16 Andreas Schwab
@ 2011-08-11  8:51   ` Joakim Tjernlund
  2011-08-11  8:56   ` David Laight
  1 sibling, 0 replies; 7+ messages in thread
From: Joakim Tjernlund @ 2011-08-11  8:51 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: linuxppc-dev

Andreas Schwab <schwab@redhat.com> wrote on 2011/08/11 10:45:42:
>
> Joakim Tjernlund <joakim.tjernlund@transmode.se> writes:
>
> > unsigned short my__arch_swab16(unsigned short value)
> > {
> >    __asm__("rlwimi %0,%0,16,0x00ff0000"
> >       : "+r" (value));
>
> You are creating a value that does not fit in a short.

Short is passed in a 32 bit register with the upper 16 bits cleared. I just
temporarily use the upper bits and shift it back with the next insn:
__asm__("rlwinm %0,%0,24,0x0000ffff"
    : "+r"(value));
Can I not use the upper 16 bits in this manner?

 Jocke

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC] Optimize __arch_swab32 and __arch_swab16
  2011-08-11  8:45 ` [RFC] Optimize __arch_swab32 and __arch_swab16 Andreas Schwab
  2011-08-11  8:51   ` Joakim Tjernlund
@ 2011-08-11  8:56   ` David Laight
  2011-08-11  9:23     ` Joakim Tjernlund
  1 sibling, 1 reply; 7+ messages in thread
From: David Laight @ 2011-08-11  8:56 UTC (permalink / raw)
  To: Andreas Schwab, Joakim Tjernlund; +Cc: linuxppc-dev

> Joakim Tjernlund <joakim.tjernlund@transmode.se> writes:
>=20
> > unsigned short my__arch_swab16(unsigned short value)
> > {
> > 	__asm__("rlwimi %0,%0,16,0x00ff0000"
> > 		: "+r" (value));
>=20
> You are creating a value that does not fit in a short.

Which is a problem because the compiler could schedule
it be written back to real memory between the instructions.

Actually the generated code would be better if the swap16()
functions operated on 'unsigned int' fields - since it would
save the compiler from doing a lot of shifts/masks elsewhere.

For instance there is likely to be a mask with 0xffff prior
to the call to swab16().

Since one use of these is for htons() (etc), when
the host is the correct endianness these are #defines that
do nothing - so out of range values aren't masked.
So it seems to me that defining:
   unsigned int swab(unsigned int);
would be fine - except it clashes with standard headers :-(

	David

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC] Optimize __arch_swab32 and __arch_swab16
  2011-08-11  8:56   ` David Laight
@ 2011-08-11  9:23     ` Joakim Tjernlund
  2011-08-11  9:29       ` David Laight
  0 siblings, 1 reply; 7+ messages in thread
From: Joakim Tjernlund @ 2011-08-11  9:23 UTC (permalink / raw)
  To: David Laight; +Cc: Andreas Schwab, linuxppc-dev

"David Laight" <David.Laight@ACULAB.COM> wrote on 2011/08/11 10:56:26:
>
> > Joakim Tjernlund <joakim.tjernlund@transmode.se> writes:
> >
> > > unsigned short my__arch_swab16(unsigned short value)
> > > {
> > >    __asm__("rlwimi %0,%0,16,0x00ff0000"
> > >       : "+r" (value));
> >
> > You are creating a value that does not fit in a short.
>
> Which is a problem because the compiler could schedule
> it be written back to real memory between the instructions.

It can? There is no memory here, just registers. Even if it
is written to memory, how would that affect the register?

Assuming you are right, would rewriting it to
  __asm__("rlwimi %0,%0,16,0x00ff0000\n\t"
	    "rlwinm %0,%0,24,0x0000ffff"
	    : "+r"(value));
help?

>
> Actually the generated code would be better if the swap16()
> functions operated on 'unsigned int' fields - since it would
> save the compiler from doing a lot of shifts/masks elsewhere.
>
> For instance there is likely to be a mask with 0xffff prior
> to the call to swab16().
>
> Since one use of these is for htons() (etc), when
> the host is the correct endianness these are #defines that
> do nothing - so out of range values aren't masked.
> So it seems to me that defining:
>    unsigned int swab(unsigned int);
> would be fine - except it clashes with standard headers :-(
>
>    David
>
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC] Optimize __arch_swab32 and __arch_swab16
  2011-08-11  9:23     ` Joakim Tjernlund
@ 2011-08-11  9:29       ` David Laight
  2011-08-11  9:43         ` Joakim Tjernlund
  0 siblings, 1 reply; 7+ messages in thread
From: David Laight @ 2011-08-11  9:29 UTC (permalink / raw)
  To: Joakim Tjernlund; +Cc: Andreas Schwab, linuxppc-dev

=20
> > Which is a problem because the compiler could schedule
> > it be written back to real memory between the instructions.
>=20
> It can? There is no memory here, just registers. Even if it
> is written to memory, how would that affect the register?

Although the function argument is passed in a register, the
compiler could generate a store-load sequence before and
after each __asm__() line.

> Assuming you are right, would rewriting it to
>   __asm__("rlwimi %0,%0,16,0x00ff0000\n\t"
> 	    "rlwinm %0,%0,24,0x0000ffff"
> 	    : "+r"(value));
> help?

Except that now you've stopped the compiler scheduling
another instruction between the two - probably forcing a
execution stall.

	David

^ permalink raw reply	[flat|nested] 7+ messages in thread

* RE: [RFC] Optimize __arch_swab32 and __arch_swab16
  2011-08-11  9:29       ` David Laight
@ 2011-08-11  9:43         ` Joakim Tjernlund
  0 siblings, 0 replies; 7+ messages in thread
From: Joakim Tjernlund @ 2011-08-11  9:43 UTC (permalink / raw)
  To: David Laight; +Cc: Andreas Schwab, linuxppc-dev

"David Laight" <David.Laight@ACULAB.COM> wrote on 2011/08/11 11:29:33:
>
>
> > > Which is a problem because the compiler could schedule
> > > it be written back to real memory between the instructions.
> >
> > It can? There is no memory here, just registers. Even if it
> > is written to memory, how would that affect the register?
>
> Although the function argument is passed in a register, the
> compiler could generate a store-load sequence before and
> after each __asm__() line.

Ah, I see. Seems strange that the complier would do that for
the register in use(value). Other regs perhaps.

>
> > Assuming you are right, would rewriting it to
> >   __asm__("rlwimi %0,%0,16,0x00ff0000\n\t"
> >        "rlwinm %0,%0,24,0x0000ffff"
> >        : "+r"(value));
> > help?
>
> Except that now you've stopped the compiler scheduling
> another instruction between the two - probably forcing a
> execution stall.

But this should be better than using 2 more insns and an
extra register.

 Jocke

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2011-08-11  9:44 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <OF9F5D0B06.A12659DA-ONC12578E9.002B23BB-C12578E9.002D304C__34569.7653633126$1313050887$gmane$org@transmode.se>
2011-08-11  8:45 ` [RFC] Optimize __arch_swab32 and __arch_swab16 Andreas Schwab
2011-08-11  8:51   ` Joakim Tjernlund
2011-08-11  8:56   ` David Laight
2011-08-11  9:23     ` Joakim Tjernlund
2011-08-11  9:29       ` David Laight
2011-08-11  9:43         ` Joakim Tjernlund
2011-08-11  8:13 Joakim Tjernlund

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox