* Re: [PATCH] PPC assembly implementation of SHA1
@ 2005-04-23 12:42 linux
2005-04-23 13:03 ` linux
` (2 more replies)
0 siblings, 3 replies; 17+ messages in thread
From: linux @ 2005-04-23 12:42 UTC (permalink / raw)
To: paulus; +Cc: git, linux
I was working on the same thing, but hindered by lack of access to PPC
hardware. I notice that you also took advantage of the unaligned load
support and native byte order to do the hash straight from the source.
But I came up with a few additional refinements:
- You are using three temporaries (%r0, %r6, and RT(x)) for your
round functions. You only need one temporary (%r0) for all the functions.
(Plus %r15 for k)
The core round function is
e += (a <<< 5) + f(b,c,d) + W[i] + K
b = (b <<< 30)
followed by renaming the registers for the next round:
d += (e <<< 5) + f(a,b,c) + W[i+1] + K
a = (a <<< 30)
That can all be computed with one temporary.
There are three possible f() functions:
f1(x,y,z) = "bitwise x ? y : z"
= (x & y) | (~x & z)
= (x & y) + (~x & z)
= z ^ (x & (y ^ z))
All are three logical instructions on PPC. The second form
lets you add it into the accumulator e in two pieces:
+#define STEPD0(t) \
+ add %r0,%r15,W(t); \
+ add RE(t),RE(t),%r0; \
+ and %r0,RC(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ andc %r0,RD(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
The XOR version f2(x,y,z) = x^y^z is of course trivial:
+#define STEPD1(t) \
+ add %r0,%r15,W(t); \
+ add RE(t),RE(t),%r0; \
+ xor %r0,RC(t),RB(t); \
+ xor %r0,%r0,RA(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
And the last function, majority(x,y,z), can be written as:
f3(x,y,z) = (x & y) | (y & z) | (z & x)
= (x & y) | z & (x | y)
= (x & y) | z & (x ^ y)
= (x & y) + z & (x ^ y)
The last form again allows evaluation with one temporary and
two adds into RE(t):
+#define STEPD2(t) \
+ add %r0,%r15,W(t); \
+ add RE(t),RE(t),%r0; \
+ and %r0,RC(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ xor %r0,RC(t),RB(t); \
+ and %r0,%r0,RA(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
Other notes:
- You don't need to decrement %r1 before saving registers.
The PPC calling convention defines a "red zone" below the
current stack pointer that is guaranteed never to be touched
by signal handlers or the like. This is specifically for
leaf procedure optimization, and is at least 224 bytes.
- Is that many stw/lwz instructions faster than stmw/lmw?
The latter is at least more cache-friendly.
- You can avoid saving and restoring %r15 by recycling %r5 for that
purpose; it's not used after the mtctr %r5.
- (Note, by the way, that while the PPC supports unaligned loads, using
lmw to load unaligned data is SLOW on the G5; multiple lwz instructions
are faster in that case.)
- The above changes actually save enough registers to cache the whole hash[5]
in registers as well, eliminating *all* unnecessary load/store traffic.
With all of the above changes, your sha1ppc.S file turns into:
diff -urN git.orig/ppc/sha1ppc.S git/ppc/sha1ppc.S
--- /dev/null 2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1ppc.S 2005-04-22 16:29:19.000000000 +1000
@@ -0,0 +1,142 @@
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+/*
+ * We roll the registers for A, B, C, D, E around on each
+ * iteration; A on iteration t is B on iteration t+1, and so on.
+ * We use registers 6 - 10 for this. (Registers 27 - 31 hold
+ * the previous values.)
+ */
+#define RA(t) ((((t)+4)%5)+6)
+#define RB(t) ((((t)+3)%5)+6)
+#define RC(t) ((((t)+2)%5)+6)
+#define RD(t) ((((t)+1)%5)+6)
+#define RE(t) ((((t)+0)%5)+6)
+
+/* We use registers 11 - 26 for the W values */
+#define W(t) (((t)%16)+11)
+
+#define STEPD0(t) \
+ add %r0,%r5,W(t); \
+ add RE(t),RE(t),%r0; \
+ and %r0,RC(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ andc %r0,RD(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
+
+#define STEPD1(t) \
+ add %r0,%r5,W(t); \
+ add RE(t),RE(t),%r0; \
+ xor %r0,RC(t),RB(t); \
+ xor %r0,%r0,RA(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
+
+#define STEPD2(t) \
+ add %r0,%r5,W(t); \
+ add RE(t),RE(t),%r0; \
+ and %r0,RC(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ xor %r0,RC(t),RB(t); \
+ and %r0,%r0,RA(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
+
+#define LOADW(t) \
+ lwz W(t),(t)*4(%r4)
+
+#define UPDATEW(t) \
+ xor %r0,W((t)-3),W((t)-8); \
+ xor W(t),W((t)-16),W((t)-14); \
+ xor W(t),W(t),%r0; \
+ rotlwi W(t),W(t),1
+
+#define STEP0LD4(t) \
+ STEPD0(t); LOADW((t)+4); \
+ STEPD0((t)+1); LOADW((t)+5); \
+ STEPD0((t)+2); LOADW((t)+6); \
+ STEPD0((t)+3); LOADW((t)+7)
+
+#define STEPUP4(t, fn) \
+ STEP##fn(t); UPDATEW((t)+4); \
+ STEP##fn((t)+1); UPDATEW((t)+5); \
+ STEP##fn((t)+2); UPDATEW((t)+6); \
+ STEP##fn((t)+3); UPDATEW((t)+7)
+
+#define STEPUP20(t, fn) \
+ STEPUP4(t, fn); \
+ STEPUP4((t)+4, fn); \
+ STEPUP4((t)+8, fn); \
+ STEPUP4((t)+12, fn); \
+ STEPUP4((t)+16, fn)
+
+ .globl sha1_core
+sha1_core:
+ stmw %r13,-76(%r1)
+
+ /* Load up A - E */
+ lmw %r27,0(%r3)
+
+ mtctr %r5
+
+1: mr RA(0),%r27
+ LOADW(0)
+ mr RB(0),%r28
+ LOADW(1)
+ mr RC(0),%r29
+ LOADW(2)
+ mr RD(0),%r30
+ LOADW(3)
+ mr RE(0),%r31
+
+ lis %r5,0x5a82 /* K0-19 */
+ ori %r5,%r5,0x7999
+ STEP0LD4(0)
+ STEP0LD4(4)
+ STEP0LD4(8)
+ STEPUP4(12, D0)
+ STEPUP4(16, D0)
+
+ lis %r5,0x6ed9 /* K20-39 */
+ ori %r5,%r5,0xeba1
+ STEPUP20(20, D1)
+
+ lis %r5,0x8f1b /* K40-59 */
+ ori %r5,%r5,0xbcdc
+ STEPUP20(40, D2)
+
+ lis %r5,0xca62 /* K60-79 */
+ ori %r5,%r5,0xc1d6
+ STEPUP4(60, D1)
+ STEPUP4(64, D1)
+ STEPUP4(68, D1)
+ STEPUP4(72, D1)
+ STEPD1(76)
+ STEPD1(77)
+ STEPD1(78)
+ STEPD1(79)
+
+ /* Add results to original values */
+ add %r27,%r27,RA(0)
+ add %r28,%r28,RB(0)
+ add %r29,%r29,RC(0)
+ add %r30,%r30,RD(0)
+ add %r31,%r31,RE(0)
+
+ addi %r4,%r4,64
+ bdnz 1b
+
+ /* Save final hash, restore registers, and return */
+ stmw %r27,0(%r3)
+ lmw %r13,-76(%r1)
+ blr
The above changes, if you want them, are placed in the public domain.
It assembles, but I can't test it. Have fun.
(I might also replace "t" with "i" in the macros, but figured that
would make comparing versions annoying. Of course, part of the reason
was confusion with the T register, which is now eliminated...)
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] PPC assembly implementation of SHA1
2005-04-23 12:42 [PATCH] PPC assembly implementation of SHA1 linux
@ 2005-04-23 13:03 ` linux
2005-04-24 2:49 ` Benjamin Herrenschmidt
2005-04-24 4:40 ` Paul Mackerras
2 siblings, 0 replies; 17+ messages in thread
From: linux @ 2005-04-23 13:03 UTC (permalink / raw)
To: paulus; +Cc: git, linux
One bug I spotted too late: the first line of the STEPD2
macro still references %r15; that should be the new home of k, %r5.
Sorry about that.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] PPC assembly implementation of SHA1
2005-04-23 12:42 [PATCH] PPC assembly implementation of SHA1 linux
2005-04-23 13:03 ` linux
@ 2005-04-24 2:49 ` Benjamin Herrenschmidt
2005-04-24 4:40 ` Paul Mackerras
2 siblings, 0 replies; 17+ messages in thread
From: Benjamin Herrenschmidt @ 2005-04-24 2:49 UTC (permalink / raw)
To: linux; +Cc: Paul Mackerras, git
On Sat, 2005-04-23 at 12:42 +0000, linux@horizon.com wrote:
> - You don't need to decrement %r1 before saving registers.
> The PPC calling convention defines a "red zone" below the
> current stack pointer that is guaranteed never to be touched
> by signal handlers or the like. This is specifically for
> leaf procedure optimization, and is at least 224 bytes.
On ELF ppc32 ABI ? Hrm... the SysV ABI document says
"The only permitted references with negative offsets from the stack
pointer are those described here for establishing a stack frame."
I know MacOS had a red zone, but do we ?
> - Is that many stw/lwz instructions faster than stmw/lmw?
> The latter is at least more cahce-friendly.
More cache friendly ? Hrm.. I wouldn't be sure. Also, I remember reading
a while ago that those store/load multiple instructions aren't that
recommended. They aren't very good on some CPU models. I would expect
the bus/cache bandwidth to be the limiting factor here anyway, and as
you point out below, they don't deal with unaligned access very well
in all cases.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] PPC assembly implementation of SHA1
2005-04-23 12:42 [PATCH] PPC assembly implementation of SHA1 linux
2005-04-23 13:03 ` linux
2005-04-24 2:49 ` Benjamin Herrenschmidt
@ 2005-04-24 4:40 ` Paul Mackerras
2005-04-24 12:04 ` Wayne Scott
` (2 more replies)
2 siblings, 3 replies; 17+ messages in thread
From: Paul Mackerras @ 2005-04-24 4:40 UTC (permalink / raw)
To: linux; +Cc: git
linux@horizon.com writes:
> I was working on the same thing, but hindered by lack of access to PPC
> hardware. I notice that you also took advantage of the unaligned load
> support and native byte order to do the hash straight from the source.
Yes. :) In previous experiments (in the context of trying different
ways to do memcpy) I found that doing unaligned word loads is faster
than doing aligned loads plus extra rotate and mask instructions to
get the bytes you want together.
> But I came up with a few additional refinements:
>
> - You are using three temporaries (%r0, %r6, and RT(x)) for your
> round functions. You only need one temporary (%r0) for all the functions.
> (Plus %r15 for k)
The reason I used more than one temporary is that I was trying to put
dependent instructions as far apart as reasonably possible, to
minimize the chances of pipeline stalls. Given that the 970 does
register renaming and out-of-order execution, I don't know how
essential that is, but it can't hurt.
> All are three logical instrunctions on PPC. The second form
> lets you add it into the accumulator e in two pieces:
A sequence of adds into a single register is going to incur the
2-cycle latency between generation and use of a value; i.e. the adds
will only issue on every second cycle. I think we are better off
making the dataflow more like a tree than a linear chain where
possible.
> And the last function, majority(x,y,z), can be written as:
> f3(x,y,z) = (x & y) | (y & z) | (z & x)
> = (x & y) | z & (x | y)
> = (x & y) | z & (x ^ y)
> = (x & y) + z & (x ^ y)
That's cute, I hadn't thought of that.
> - You don't need to decrement %r1 before saving registers.
> The PPC calling convention defines a "red zone" below the
> current stack pointer that is guaranteed never to be touched
> by signal handlers or the like. This is specifically for
> leaf procedure optimization, and is at least 224 bytes.
Not in the ppc32 ELF ABI - you are not supposed to touch memory below
the stack pointer. The kernel is more forgiving than that, and in
fact you can currently use the red zone without anything bad
happening, but you really shouldn't.
> - Is that many stw/lwz instructions faster than stmw/lmw?
> The latter is at least more cahce-friendly.
I believe the stw/lwz and the stmw/lmw will actually execute at the
same speed on the 970, but I have seen lwz/stw go faster than lmw/stmw
on other machines. In any case we aren't executing the prolog and
epilog as often as the instructions in the main loop, hopefully.
> - You can avoid saving and restoring %r15 by recycling %r5 for that
> purpose; it's not used after the mtctr %r5.
True.
> - The above changes actually save enough registers to cache the whole hash[5]
> in registers as well, eliminating *all* unnecessary load/store traffic.
That's cool.
> With all of the above changes, your sha1ppc.S file turns into:
I added a stwu and an addi to make a stack frame, and changed %r15 to
%r5 as you mentioned in another message. I tried it in a little test
program I have that calls SHA1_Update 256,000 times with a buffer of
4096 zero bytes, i.e. it processes 1000MB. Your version seems to be
about 2% faster; it took 4.53 seconds compared to 4.62 for mine. But
it also gives the wrong answer; I haven't investigated why.
Thanks,
Paul.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] PPC assembly implementation of SHA1
2005-04-24 4:40 ` Paul Mackerras
@ 2005-04-24 12:04 ` Wayne Scott
2005-04-25 0:16 ` linux
2005-04-25 3:13 ` Revised PPC assembly implementation linux
2 siblings, 0 replies; 17+ messages in thread
From: Wayne Scott @ 2005-04-24 12:04 UTC (permalink / raw)
To: Paul Mackerras; +Cc: linux, git
_The_ book on expression rewriting tricks like this, especially for
the PPC, is "Hacker's Delight" by Henry Warren. Great reading!!!
http://www.hackersdelight.org/
-Wayne
On 4/23/05, Paul Mackerras <paulus@samba.org> wrote:
> linux@horizon.com writes:
>
> > I was working on the same thing, but hindered by lack of access to PPC
> > hardware. I notice that you also took advantage of the unaligned load
> > support and native byte order to do the hash straight from the source.
>
> Yes. :) In previous experiments (in the context of trying different
> ways to do memcpy) I found that doing unaligned word loads is faster
> than doing aligned loads plus extra rotate and mask instructions to
> get the bytes you want together.
>
> > But I came up with a few additional refinements:
> >
> > - You are using three temporaries (%r0, %r6, and RT(x)) for your
> > round functions. You only need one temporary (%r0) for all the functions.
> > (Plus %r15 for k)
>
> The reason I used more than one temporary is that I was trying to put
> dependent instructions as far apart as reasonably possible, to
> minimize the chances of pipeline stalls. Given that the 970 does
> register renaming and out-of-order execution, I don't know how
> essential that is, but it can't hurt.
>
> > All are three logical instrunctions on PPC. The second form
> > lets you add it into the accumulator e in two pieces:
>
> A sequence of adds into a single register is going to incur the
> 2-cycle latency between generation and use of a value; i.e. the adds
> will only issue on every second cycle. I think we are better off
> making the dataflow more like a tree than a linear chain where
> possible.
>
> > And the last function, majority(x,y,z), can be written as:
> > f3(x,y,z) = (x & y) | (y & z) | (z & x)
> > = (x & y) | z & (x | y)
> > = (x & y) | z & (x ^ y)
> > = (x & y) + z & (x ^ y)
>
> That's cute, I hadn't thought of that.
>
> > - You don't need to decrement %r1 before saving registers.
> > The PPC calling convention defines a "red zone" below the
> > current stack pointer that is guaranteed never to be touched
> > by signal handlers or the like. This is specifically for
> > leaf procedure optimization, and is at least 224 bytes.
>
> Not in the ppc32 ELF ABI - you are not supposed to touch memory below
> the stack pointer. The kernel is more forgiving than that, and in
> fact you can currently use the red zone without anything bad
> happening, but you really shouldn't.
>
> > - Is that many stw/lwz instructions faster than stmw/lmw?
> > The latter is at least more cahce-friendly.
>
> I believe the stw/lwz and the stmw/lmw will actually execute at the
> same speed on the 970, but I have seen lwz/stw go faster than lmw/stmw
> on other machines. In any case we aren't executing the prolog and
> epilog as often as the instructions in the main loop, hopefully.
>
> > - You can avoid saving and restoring %r15 by recycling %r5 for that
> > purpose; it's not used after the mtctr %r5.
>
> True.
>
> > - The above changes actually save enough registers to cache the whole hash[5]
> > in registers as well, eliminating *all* unnecessary load/store traffic.
>
> That's cool.
>
> > With all of the above changes, your sha1ppc.S file turns into:
>
> I added a stwu and an addi to make a stack frame, and changed %r15 to
> %r5 as you mentioned in another message. I tried it in a little test
> program I have that calls SHA1_Update 256,000 times with a buffer of
> 4096 zero bytes, i.e. it processes 1000MB. Your version seems to be
> about 2% faster; it took 4.53 seconds compared to 4.62 for mine. But
> it also gives the wrong answer; I haven't investigated why.
>
> Thanks,
> Paul.
> -
> To unsubscribe from this list: send the line "unsubscribe git" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: [PATCH] PPC assembly implementation of SHA1
2005-04-24 4:40 ` Paul Mackerras
2005-04-24 12:04 ` Wayne Scott
@ 2005-04-25 0:16 ` linux
2005-04-25 3:13 ` Revised PPC assembly implementation linux
2 siblings, 0 replies; 17+ messages in thread
From: linux @ 2005-04-25 0:16 UTC (permalink / raw)
To: paulus; +Cc: git, linux
> Yes. :) In previous experiments (in the context of trying different
> ways to do memcpy) I found that doing unaligned word loads is faster
> than doing aligned loads plus extra rotate and mask instructions to
> get the bytes you want together.
The PPC970, at least, supports unaligned loads within one cache line
(64 bytes for L1 hit; 32 bytes for L1 miss) directly. If the load
crosses the line, the processor backs up and re-issues it as two
loads and a merge.
Multiple-word loads can really suffer from this, as when the fault
hits, the *entire instruction* is aborted and re-issued as a series
of aligned loads and merges.
But for a single load, it's probably cheaper on average to use the
hardware 15 times out of 16 and take the retry the 16th.
> But I came up with a few additional refinements:
>
>> - You are using three temporaries (%r0, %r6, and RT(x)) for your
>> round functions. You only need one temporary (%r0) for all the functions.
>> (Plus %r15 for k)
> The reason I used more than one temporary is that I was trying to put
> dependent instructions as far apart as reasonably possible, to
> minimize the chances of pipeline stalls. Given that the 970 does
> register renaming and out-of-order execution, I don't know how
> essential that is, but it can't hurt.
It's a good general idea, but the PPC970 only has two integer ALUs, so
it can't get too clever.
>> All are three logical instrunctions on PPC. The second form
>> lets you add it into the accumulator e in two pieces:
> A sequence of adds into a single register is going to incur the
> 2-cycle latency between generation and use of a value; i.e. the adds
> will only issue on every second cycle. I think we are better off
> making the dataflow more like a tree than a linear chain where
> possible.
Grumble, complain... you're right. I didn't know it had a 2-cycle
dependency. Time to reschedule those inner loops. Still, the multi-input
sum representation gives you a lot of scheduling flexibility.
Given this, it has to be scheduled 4-wide, which is a lot trickier.
The steps are 9, 8, and 10 instructions long (plus 4 instructions
for UPDATEW on most of them), and there's a 5- or 6-input sum to
compute in that time.
I'll stare at the dependency graph a bit and see if I can do better.
>> - You don't need to decrement %r1 before saving registers.
>> The PPC calling convention defines a "red zone" below the
>> current stack pointer that is guaranteed never to be touched
>> by signal handlers or the like. This is specifically for
>> leaf procedure optimization, and is at least 224 bytes.
> Not in the ppc32 ELF ABI - you are not supposed to touch memory below
> the stack pointer. The kernel is more forgiving than that, and in
> fact you can currently use the red zone without anything bad
> happening, but you really shouldn't.
Oh! I didn't know that! Thank you for enlightening me!
>> - Is that many stw/lwz instructions faster than stmw/lmw?
>> The latter is at least more cache-friendly.
> I believe the stw/lwz and the stmw/lmw will actually execute at the
> same speed on the 970, but I have seen lwz/stw go faster than lmw/stmw
> on other machines. In any case we aren't executing the prolog and
> epilog as often as the instructions in the main loop, hopefully.
Yes, and reducing the I-cache footprint seems useful.
>> With all of the above changes, your sha1ppc.S file turns into:
> I added a stwu and an addi to make a stack frame, and changed %r15 to
> %r5 as you mentioned in another message. I tried it in a little test
> program I have that calls SHA1_Update 256,000 times with a buffer of
> 4096 zero bytes, i.e. it processes 1000MB. Your version seems to be
> about 2% faster; it took 4.53 seconds compared to 4.62 for mine. But
> it also gives the wrong answer; I haven't investigated why.
Grumble, damn. The standard document has all the register values for
every round, so if you set up the same plaintext they use and step
through the code with that at your side, the problem should jump
out at you.
Anyway, now that I know the pipeline issues better, I'll try to
reschedule it and see if I can improve the situation. I may have
to understand the dispatch group issues pretty thoroughly, too.
http://www.alphaworks.ibm.com/tech/simppc
might be of interest.
I'll try to find the bug while I'm at it. Would you be willing to
benchmark some code for me?
Thanks!
^ permalink raw reply [flat|nested] 17+ messages in thread
* Revised PPC assembly implementation
2005-04-24 4:40 ` Paul Mackerras
2005-04-24 12:04 ` Wayne Scott
2005-04-25 0:16 ` linux
@ 2005-04-25 3:13 ` linux
2005-04-25 9:40 ` Paul Mackerras
2 siblings, 1 reply; 17+ messages in thread
From: linux @ 2005-04-25 3:13 UTC (permalink / raw)
To: paulus; +Cc: git, linux
Three changes:
- Added stack frame as per your description.
- Found two bugs. (Cutting & pasting too fast.) Fixed.
- Minor scheduling improvements. More to come.
Which lead to three questions:
- Is the stack set properly now?
- Does it produce the right answer now?
- Is it any faster?
Thanks for your help!
/*
* SHA-1 implementation for PowerPC.
*
* Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
*/
/*
* We roll the registers for A, B, C, D, E around on each
* iteration; E on iteration t is A on iteration t+1, and so on.
* We use registers 6 - 10 for this. (Registers 27 - 31 hold
* the previous values.)
*/
/*
 * Register assignment for the five chaining variables, rotated by the
 * round number t: the register that holds A at round t holds B at
 * round t+1, etc., so the rename between rounds costs no instructions.
 * RA..RE map onto registers 6 - 10.
 */
#define RA(t) ((((t)+4)%5)+6)
#define RB(t) ((((t)+3)%5)+6)
#define RC(t) ((((t)+2)%5)+6)
#define RD(t) ((((t)+1)%5)+6)
#define RE(t) ((((t)+0)%5)+6)
/* We use registers 11 - 26 for the W values */
#define W(t) (((t)%16)+11)
/* Register 5 is used for the constant k */
/*
* Note that, in the previous step, RC was rotated, and RA was computed.
* So try to postpone using them, *especially* the latter.
*/
/* f(b,c,d) = "bitwise b ? c : d" = (b & c) + (~b & d) */
/* Rounds 0-19: E += (A<<<5) + f(B,C,D) + W[t] + K, B <<<= 30.
   The sum is accumulated into RE(t) in pieces via %r0; %r5 holds K. */
#define STEPD0(t) \
andc %r0,RD(t),RB(t); \
add %r0,%r0,W(t); \
add RE(t),RE(t),%r0; \
and %r0,RC(t),RB(t); \
add %r0,%r0,%r5; \
add RE(t),RE(t),%r0; \
rotlwi %r0,RA(t),5; \
rotlwi RB(t),RB(t),30; \
add RE(t),RE(t),%r0;
/* f(b,c,d) = b ^ c ^ d */
/* Rounds 20-39 and 60-79: same round shape as STEPD0 with the XOR
   mixing function; K is folded into the rotated-A term instead. */
#define STEPD1(t) \
xor %r0,RD(t),RB(t); \
xor %r0,%r0,RC(t); \
add %r0,%r0,W(t); \
add RE(t),RE(t),%r0; \
rotlwi %r0,RA(t),5; \
add %r0,%r0,%r5; \
rotlwi RB(t),RB(t),30; \
add RE(t),RE(t),%r0;
/* f(b,c,d) = majority(b,c,d) = (b & d) + (c & (b ^ d)) */
/* Rounds 40-59: the additive majority decomposition lets the two
   halves of f() be summed into RE(t) independently; %r5 holds K. */
#define STEPD2(t) \
and %r0,RD(t),RB(t); \
add %r0,%r0,W(t); \
add RE(t),RE(t),%r0; \
xor %r0,RD(t),RB(t); \
and %r0,%r0,RC(t); \
add RE(t),RE(t),%r0; \
rotlwi %r0,RA(t),5; \
add %r0,%r0,%r5; \
rotlwi RB(t),RB(t),30; \
add RE(t),RE(t),%r0;
/* Load input word t of the current 64-byte block (base %r4) into W(t). */
#define LOADW(t) \
lwz W(t),(t)*4(%r4)
/* Message schedule: W[t] = rotl1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]),
   computed in place in the 16-register circular W buffer. */
#define UPDATEW(t) \
xor %r0,W((t)-3),W((t)-8); \
xor W(t),W((t)-16),W((t)-14); \
xor W(t),W(t),%r0; \
rotlwi W(t),W(t),1
/* Four D0 rounds, each interleaved with the load of a later input word. */
#define STEP0LD4(t) \
STEPD0(t); LOADW((t)+4); \
STEPD0((t)+1); LOADW((t)+5); \
STEPD0((t)+2); LOADW((t)+6); \
STEPD0((t)+3); LOADW((t)+7)
/* Four rounds of the given type, each interleaved with one UPDATEW. */
#define STEPUP4(t, fn) \
STEP##fn(t); UPDATEW((t)+4); \
STEP##fn((t)+1); UPDATEW((t)+5); \
STEP##fn((t)+2); UPDATEW((t)+6); \
STEP##fn((t)+3); UPDATEW((t)+7)
/* Twenty rounds of the given type (five STEPUP4 groups). */
#define STEPUP20(t, fn) \
STEPUP4(t, fn); \
STEPUP4((t)+4, fn); \
STEPUP4((t)+8, fn); \
STEPUP4((t)+12, fn); \
STEPUP4((t)+16, fn)
/*
 * sha1_core:
 *   %r3 = pointer to the 5-word hash state (read, updated in place)
 *   %r4 = pointer to input data (64 bytes consumed per block)
 *   %r5 = number of 64-byte blocks (moved into CTR, then %r5 is
 *         recycled to hold the round constant K)
 * Clobbers %r0, %r4 - %r12 and CTR; %r13 - %r31 are saved/restored.
 */
.globl sha1_core
sha1_core:
/* NOTE(review): the 19 callee-saved GPRs are stored at 4(%r1), which
   is this frame's LR-save word; that only works because this is a
   leaf routine -- a 96-byte frame with stmw at 16(%r1) would be the
   conventional ppc32 ELF layout (see follow-up in this thread). */
stwu %r1,-80(%r1)
stmw %r13,4(%r1)
/* Load up A - E */
lmw %r27,0(%r3)
mtctr %r5
/* Per-block loop: copy the running hash (r27-r31) into the working
   registers and start loading the first input words. */
1: mr RA(0),%r27
LOADW(0)
mr RB(0),%r28
LOADW(1)
mr RC(0),%r29
LOADW(2)
mr RD(0),%r30
LOADW(3)
mr RE(0),%r31
lis %r5,0x5a82 /* K0-19 */
ori %r5,%r5,0x7999
STEP0LD4(0)
STEP0LD4(4)
STEP0LD4(8)
STEPUP4(12, D0)
STEPUP4(16, D0)
lis %r5,0x6ed9 /* K20-39 */
ori %r5,%r5,0xeba1
STEPUP20(20, D1)
lis %r5,0x8f1b /* K40-59 */
ori %r5,%r5,0xbcdc
STEPUP20(40, D2)
lis %r5,0xca62 /* K60-79 */
ori %r5,%r5,0xc1d6
STEPUP4(60, D1)
STEPUP4(64, D1)
STEPUP4(68, D1)
STEPUP4(72, D1)
STEPD1(76)
STEPD1(77)
STEPD1(78)
STEPD1(79)
/* Add results to original values */
add %r27,%r27,RA(0)
add %r28,%r28,RB(0)
add %r29,%r29,RC(0)
add %r30,%r30,RD(0)
add %r31,%r31,RE(0)
/* Advance to the next 64-byte block and loop while CTR > 0. */
addi %r4,%r4,64
bdnz 1b
/* Save final hash, restore registers, and return */
stmw %r27,0(%r3)
lmw %r13,4(%r1)
addi %r1,%r1,80
blr
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-25 3:13 ` Revised PPC assembly implementation linux
@ 2005-04-25 9:40 ` Paul Mackerras
2005-04-25 17:34 ` linux
0 siblings, 1 reply; 17+ messages in thread
From: Paul Mackerras @ 2005-04-25 9:40 UTC (permalink / raw)
To: linux; +Cc: git
linux@horizon.com writes:
> Three changes:
> - Added stack frame as per your description.
> - Found two bugs. (Cutting & pasting too fast.) Fixed.
> - Minor scheduling improvements. More to come.
>
> Which lead to three questions:
> - Is the stack set properly now?
Not quite; you are saving 20 registers, so you need a 96-byte stack
frame, like this:
stwu %r1,-96(%r1)
stmw %r13,16(%r1)
...
lmw %r13,16(%r1)
addi %r1,%r1,96
blr
Since sha1_core is a leaf function, I suppose you could use the lr
save area (do stwu %r1,-80(%r1); stmw %r13,0(%r1)) but it seems a bit
dodgy.
> - Does it produce the right answer now?
Yes.
> - Is it any faster?
I did 10 repetitions of my program that calls SHA1_Update with a
4096-byte block of zeroes 256,000 times. With my version, the average
time was 4.6191 seconds with a standard deviation of 0.0157. With your
version, the average was 4.6063 and the standard deviation 0.0148. So
I would say that your version is probably just a little faster - of the
order of 0.3% faster.
Paul.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-25 9:40 ` Paul Mackerras
@ 2005-04-25 17:34 ` linux
2005-04-25 23:00 ` Paul Mackerras
0 siblings, 1 reply; 17+ messages in thread
From: linux @ 2005-04-25 17:34 UTC (permalink / raw)
To: paulus; +Cc: git, linux
>> Which lead to three questions:
>> - Is the stack set properly now?
> Not quite; you are saving 20 registers, so you need a 96-byte stack
> frame, like this:
> stwu %r1,-96(%r1)
> stmw %r13,16(%r1)
> ...
> lmw %r13,16(%r1)
> addi %r1,%r1,96
> blr
Huh? I'm saving 19 registers, r13..r31, and not saving 13, namely
r0..r12.
The dodgy thing *I'm* thinking of is saving %r2 (the TOC pointer)
and using it as an extra temporary. (The alternative is spilling
one of the "old" hash values to the stack, which is not
too big a disaster.)
>> - Is it any faster?
> I did 10 repetitions of my program that calls SHA1_Update with a
> 4096-byte block of zeroes 256,000 times. With my version, the average
> time was 4.6191 seconds with a standard deviation of 0.0157. With your
> version, the average was 4.6063 and the standard deviation 0.0148. So
> I would say that your version is probably just a little faster - of the
> order of 0.3% faster.
Damn. So that's actually *worse* than my earlier version which achieved
an (also piddling) 2% speedup?
As you can see, I tried to make the addition tree bushier, but I guess
it didn't help. Or the processor isn't out-of-order enough to find
the parallelism I made available.
Damn, I wish I had that IBM pipeline profiling tool. If it could
just tell me which cycles didn't have both ALUs busy, I could solve it
in relatively little time.
The place that could really use scheduling help is the G4, which has three
integer ALUs, but can only *think* about executing the bottom three entries
in the reorder queue. So if one of those instructions isn't ready, it
stalls in the queue and idles the ALU with it.
Especially there, it may be necessary to interleave the EXPANDW code
with the round code to avoid having the (non-critical-path) EXPANDW
code scheduled ahead of critical-path round code.
The two critical-path inter-round dependencies are:
- summing into E to be rotated by 5 and added to D next round.
(this is the "A<<<5" code in the current round)
- rotating B left for use in the next round's F(a,b,c) function.
(this is the current round's C input)
Actually, the E variable isn't critical-path at all; it was last
modified several rounds ago.
Maybe I can improve the scheduling some more...
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-25 17:34 ` linux
@ 2005-04-25 23:00 ` Paul Mackerras
2005-04-25 23:17 ` David S. Miller
0 siblings, 1 reply; 17+ messages in thread
From: Paul Mackerras @ 2005-04-25 23:00 UTC (permalink / raw)
To: linux; +Cc: git
linux@horizon.com writes:
> Huh? I'm saving 19 registers, r13..r31, and not saving 13, namely
> r0..r12.
Oops. :) Somehow I thought you were saving r13..r32 or something. :)
> Damn. So that's actually *worse* than me earlier version which achieved
> an (also piddling) 2% speedup?
I wouldn't say it is worse, I would say it is the same. I didn't do
as many runs of the previous version. The spread of times looked
about the same with both of your versions.
> Damn, I wish I had at that IBM pipeline profiling tool. If it could
> just tell me which cycles didn't have both ALUs busy, I could solve it
> in relatively little time.
I'm going to look at trying to get it going.
> The place that could really use scheduing help is the G4, which has three
> integer ALUs, but can only *think* about executing the bottom three entries
> in the reorder queue. So if one of those instructions isn't ready, it
> stalls in the queue and idles the ALU with it.
Yes, the performance on the G4 is also important. Not everyone has a
G5. ;)
> Maybe I can improve the scheduling some more...
The main loop seems to be taking about 560 cycles (assuming that
essentially all the time spent in my little test program is spent in
the main loop). It contains about 1000 integer instructions, which
will take at least 500 cycles, as we have 2 ALUs. So we are already
within about 10% of the theoretical optimum.
So I think we are already at the point of diminishing returns as far
as the overall performance of git is concerned. But if you want to
try to get that last 10%, go for it... :)
Paul.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-25 23:00 ` Paul Mackerras
@ 2005-04-25 23:17 ` David S. Miller
2005-04-26 1:22 ` Paul Mackerras
` (2 more replies)
0 siblings, 3 replies; 17+ messages in thread
From: David S. Miller @ 2005-04-25 23:17 UTC (permalink / raw)
To: Paul Mackerras; +Cc: linux, git
On Tue, 26 Apr 2005 09:00:45 +1000
Paul Mackerras <paulus@samba.org> wrote:
> The main loop seems to be taking about 560 cycles (assuming that
> essentially all the time spent in my little test program is spent in
> the main loop). It contains about 1000 integer instructions, which
> will take at least 500 cycles, as we have 2 ALUs. So we are already
> within about 10% of the theoretical optimum.
Time to bust out the altivec perhaps :)
Do a block with the integer ALUs in parallel with a block done using
Altivec :-) There should be enough spare insn slots so that the loads
are absorbed properly.
Unlike UltraSPARC's VIS, with altivec you can reasonably do shifts and
rotates, which is the only reason I'm suggesting this.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-25 23:17 ` David S. Miller
@ 2005-04-26 1:22 ` Paul Mackerras
2005-04-27 1:47 ` linux
2005-04-26 2:14 ` linux
2005-04-26 2:35 ` linux
2 siblings, 1 reply; 17+ messages in thread
From: Paul Mackerras @ 2005-04-26 1:22 UTC (permalink / raw)
To: David S. Miller; +Cc: linux, git
David S. Miller writes:
> Time to bust out the altivec perhaps :)
I looked at this but I couldn't see a way to use altivec effectively
for SHA1.
The problem is that we have a chain of dependencies with the A
variable (which is 32-bit) where each A value depends on the previous
A value and on one of the 80 W values. The W values are derived from
the 16 words (32-bit) of the input data block.
It might be possible to use altivec for generating the W values
(although there is the problem that W[k] depends on W[k-3], making it
hard to do a 4-way parallelization), but I don't see any way of
parallelizing the calculation of the A values, which is the critical
path. Using altivec for generating the W values but the integer ALUs
for the A calculations would mean we had to go via memory, too, since
there isn't any way to transfer stuff directly between altivec
registers and GPRs.
We can't do four blocks from the same sequence in parallel either. We
could do four blocks from four separate streams in parallel, but that
seems hard to organize...
Regards,
Paul.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-25 23:17 ` David S. Miller
2005-04-26 1:22 ` Paul Mackerras
@ 2005-04-26 2:14 ` linux
2005-04-26 2:35 ` linux
2 siblings, 0 replies; 17+ messages in thread
From: linux @ 2005-04-26 2:14 UTC (permalink / raw)
To: davem, paulus; +Cc: git, linux
>From davem@davemloft.net Mon Apr 25 23:26:06 2005
Date: Mon, 25 Apr 2005 16:17:46 -0700
From: "David S. Miller" <davem@davemloft.net>
To: Paul Mackerras <paulus@samba.org>
Cc: linux@horizon.com, git@vger.kernel.org
Subject: Re: Revised PPC assembly implementation
In-Reply-To: <17005.30365.995256.963911@cargo.ozlabs.ibm.com>
References: <17004.47876.414.756912@cargo.ozlabs.ibm.com>
<20050425173430.11031.qmail@science.horizon.com>
<17005.30365.995256.963911@cargo.ozlabs.ibm.com>
X-Mailer: Sylpheed version 1.0.4 (GTK+ 1.2.10; sparc-unknown-linux-gnu)
X-Face: "_;p5u5aPsO,_Vsx"^v-pEq09'CU4&Dc1$fQExov$62l60cgCc%FnIwD=.UF^a>?5'9Kn[;433QFVV9M..2eN.@4ZWPGbdi<=?[:T>y?SD(R*-3It"Vj:)"dP
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7bit
On Tue, 26 Apr 2005 09:00:45 +1000
Paul Mackerras <paulus@samba.org> wrote:
> The main loop seems to be taking about 560 cycles (assuming that
> essentially all the time spent in my little test program is spent in
> the main loop). It contains about 1000 integer instructions, which
> will take at least 500 cycles, as we have 2 ALUs. So we are already
> within about 10% of the theoretical optimum.
Time to bust out the altivec perhaps :)
Do a block with the integer ALUs in parallel with a block done using
Altivec :-) There should be enough spare insn slots so that the loads
are absorbed properly.
Unlike UltraSPARC's VIS, with altivec you can reasonably do shifts and
rotates, which is the only reason I'm suggesting this.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-25 23:17 ` David S. Miller
2005-04-26 1:22 ` Paul Mackerras
2005-04-26 2:14 ` linux
@ 2005-04-26 2:35 ` linux
2 siblings, 0 replies; 17+ messages in thread
From: linux @ 2005-04-26 2:35 UTC (permalink / raw)
To: davem, paulus; +Cc: git, linux
(Sorry about that last e-mail. gnome-terminal crashed and sent the file
before I edited it. Here's what I meant to send.)
> Do a block with the integer ALUs in parallel with a block done using
> Altivec :-) There should be enough spare insn slots so that the loads
> are absorbed properly.
Unfortunately, the blocks are connected by a data dependency.
It's basically a large-key block cipher, chained by:
iv[] = fixed_initial_value.
iv[] += encrypt(iv, text[0..63])
iv[] += encrypt(iv, text[64..127])
iv[] += encrypt(iv, text[128..191])
iv[] += encrypt(iv, text[192..255])
etc.
There is no coarse-grain parallelism to exploit, unless you want
to be hashing two separate files at once. Which would do too much
damage to the structure of the source to be worth considering.
> Unlike UltraSPARC's VIS, with altivec you can reasonably do shifts and
> rotates, which is the only reason I'm suggesting this.
I don't quite think it's worth it, though. It's not data-parallel
enough.
We could theoretically use it to form the w[] vector, but that's only
4 instructions in registers which are very flexibly schedulable and
nicely fill in the cracks between other instructions.
Oh, here's STEPD1+UPDATEW scheduled optimally for the G4. %r5 holds the
constant K. Note that t < s <= t+16. W(s) and W((s)-16) are actually
the same register.
add RE(t),RE(t),W(t); xor %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3);
add RE(t),RE(t),%r5; xor %r0,%r0,RC(t); xor W(s),W(s),W((s)-8);
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; xor W(s),W(s),W((s)-14);
add RE(t),RE(t),%r0; rotlwi RB(t),RB(t),30; rotlwi W(s),W(s),1;
However, whether that can be done in 6 cycles on a G5 is a bit unclear.
It can't be 6 consecutive cycles, but with some motion of code
across the edges, perhaps...
0: add RE(t),RE(t),W(t); xor %r0,RD(t),RB(t);
1: xor W(s),W((s)-16),W((s)-3); (add)
2: add RE(t),RE(t),%r5; xor %r0,%r0,RC(t);
3: xor W(s),W(s),W((s)-8); (rotlwi)
4: add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5;
5: xor W(s),W(s),W((s)-14); rotlwi RB(t),RB(t),30;
6:
7: add RE(t),RE(t),%r0;
8:
9: rotlwi W(s),W(s),1;
The problem there is forcing that ordering, rather than issuing the final
add in cycle 6 and pushing everything else ahead of it.
STEPD0+UPDATEW and STEPD1+UPDATEW are 13 and 14 instructions,
respectively, and don't fit into a 3-issue machine as neatly.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-26 1:22 ` Paul Mackerras
@ 2005-04-27 1:47 ` linux
2005-04-27 3:39 ` Paul Mackerras
0 siblings, 1 reply; 17+ messages in thread
From: linux @ 2005-04-27 1:47 UTC (permalink / raw)
To: paulus; +Cc: davem, git, linux
Here's a massively revised version, scheduled very close to optimally for
the G4. (The main remaining limitation is the loading of the k value
in %r5, which could be split up more.)
My hope is that the G5 will do decently on it as well.
The G4 can in theory do 3 integer operations per cycle, but only
if everything is arranged just right. Every cycle, it tries to
dispatch the 3 instructions at the bottom of the GIQ. If any
of them stall, that issue slot is lost.
So although it's theoretically out-of-order, if you want it to
sustain 3 instructions per cycle, you have to treat it as in-order.
It required interleaving the STEPDx and UPDATEW macros in a few
complicated ways. I don't have access to a machine for testing,
so some poor schmuck^W^Wgenerous person is needed to find the bugs.
This should be *much* faster than the previous code on a G4, and I hope
it will do better on a G5 as well.
I'm curious if *reducing* the amount of fetch-ahead to 2 words
instead of 4 would help things or not.
Still to do: improve the comments. This level of hackery needs a
lot of commenting...
/*
* SHA-1 implementation for PowerPC.
*
* Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
*/
/*
* We roll the registers for A, B, C, D, E around on each
* iteration; E on iteration t is D on iteration t+1, and so on.
* We use registers 6 - 10 for this. (Registers 27 - 31 hold
* the previous values.)
*/
#define RA(t) (((t)+4)%5+6)
#define RB(t) (((t)+3)%5+6)
#define RC(t) (((t)+2)%5+6)
#define RD(t) (((t)+1)%5+6)
#define RE(t) (((t)+0)%5+6)
/* We use registers 11 - 26 for the W values */
#define W(t) ((t)%16+11)
/* Register 5 is used for the constant k */
/*
* There are three F functions, used in four groups of 20:
* - 20 rounds of f0(b,c,d) = "bit wise b ? c : d" = (^b & d) + (b & c)
* - 20 rounds of f1(b,c,d) = b^c^d = (b^d)^c
* - 20 rounds of f2(b,c,d) = majority(b,c,d) = (b&d) + ((b^d)&c)
* - 20 more rounds of f1(b,c,d)
*
* These are all scheduled for near-optimal performance on a G4.
* The G4 is a 3-issue out-of-order machine with 3 ALUs, but it can only
* *consider* starting the oldest 3 instructions per cycle. So to get
* maximum performance out of it, you have to treat it as an in-order
* machine. Which means interleaving the computation round t with the
* computation of W[t+4].
*
* The first 16 rounds use W values loaded directly from memory, while the
* remaining 64 use values computed from those first 16. We preload
* 4 values before starting, so there are three kinds of rounds:
* - The first 12 (all f0) also load the W values from memory.
* - The next 64 compute W(i+4) in parallel. 8*f0, 20*f1, 20*f2, 16*f1.
* - The last 4 (all f1) do not do anything with W.
*
* Therefore, we have 6 different round functions:
* STEPD0_LOAD(t,s) - Perform round t and load W(s). s < 16
* STEPD0_UPDATE(t,s) - Perform round t and compute W(s). s >= 16.
* STEPD1_UPDATE(t,s)
* STEPD2_UPDATE(t,s)
* STEPD1(t) - Perform round t with no load or update.
*
* The G5 is more fully out-of-order, and can find the parallelism
* by itself. The big limit is that it has a 2-cycle ALU latency, so
* even though it's 2-way, the code has to be scheduled as if it's
* 4-way, which can be a limit. To help it, we try to schedule the
* read of RA(t) as late as possible so it doesn't stall waiting for
* the previous round's RE(t-1), and we try to rotate RB(t) as early
* as possible while reading RC(t) (= RB(t-1)) as late as possible.
*/
/* the initial loads. */
#define LOADW(s) \
lwz W(s),(s)*4(%r4)
/*
* This is actually 13 instructions, which is an awkward fit,
* and uses W(s) as a temporary before loading it.
*/
#define STEPD0_LOAD(t,s) \
add RE(t),RE(t),W(t); andc %r0,RD(t),RB(t); /* spare slot */ \
add RE(t),RE(t),%r0; and W(s),RC(t),RB(t); rotlwi %r0,RA(t),5; \
add RE(t),RE(t),W(s); add %r0,%r0,%r5; rotlwi RB(t),RB(t),30; \
add RE(t),RE(t),%r0; lwz W(s),(s)*4(%r4);
/*
* This can execute starting with 2 out of 3 possible moduli, so it
* does 2 rounds in 9 cycles, 4.5 cycles/round.
*/
#define STEPD0_UPDATE(t,s) \
add RE(t),RE(t),W(t); andc %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r0; and %r0,RC(t),RB(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r5; rotlwi RB(t),RB(t),30; rotlwi W(s),W(s),1; \
add RE(t),RE(t),%r0;
/* Nicely optimal. Conveniently, also the most common. */
#define STEPD1_UPDATE(t,s) \
add RE(t),RE(t),W(t); xor %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r5; xor %r0,%r0,RC(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r0; rotlwi RB(t),RB(t),30; rotlwi W(s),W(s),1;
/*
* The naked version, no UPDATE, for the last 4 rounds. 3 cycles per.
* We could use W(s) as a temp register, but we don't need it.
*/
#define STEPD1(t) \
/* spare slot */ add RE(t),RE(t),W(t); xor %r0,RD(t),RB(t); \
rotlwi RB(t),RB(t),30; add RE(t),RE(t),%r5; xor %r0,%r0,RC(t); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; /* idle */ \
add RE(t),RE(t),%r0;
/* 5 cycles per */
#define STEPD2_UPDATE(t,s) \
add RE(t),RE(t),W(t); and %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r0; xor %r0,RD(t),RB(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r5; and %r0,%r0,RC(t); xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; rotlwi W(s),W(s),1; \
add RE(t),RE(t),%r0; rotlwi RB(t),RB(t),30;
#define STEP0_LOAD4(t,s) \
STEPD0_LOAD(t,s); \
STEPD0_LOAD((t+1),(s)+1); \
STEPD0_LOAD((t)+2,(s)+2); \
STEPD0_LOAD((t)+3,(s)+3);
#define STEPUP4(fn, t, s) \
STEP##fn##_UPDATE(t,s); \
STEP##fn##_UPDATE((t)+1,(s)+1); \
STEP##fn##_UPDATE((t)+2,(s)+2); \
STEP##fn##_UPDATE((t)+3,(s)+3); \
#define STEPUP20(fn, t, s) \
STEPUP4(fn, t, s); \
STEPUP4(fn, (t)+4, (s)+4); \
STEPUP4(fn, (t)+4, (s)+4); \
STEPUP4(fn, (t)+12, (s)+12); \
STEPUP4(fn, (t)+16, (s)+16)
.globl sha1_core
sha1_core:
stwu %r1,-80(%r1)
stmw %r13,4(%r1)
/* Load up A - E */
lmw %r27,0(%r3)
mtctr %r5
1:
lis %r5,0x5a82 /* K0-19 */
mr RA(0),%r27
LOADW(0)
mr RB(0),%r28
LOADW(1)
mr RC(0),%r29
LOADW(2)
ori %r5,%r5,0x7999
mr RD(0),%r30
LOADW(3)
mr RE(0),%r31
STEP0_LOAD4(0, 4)
STEP0_LOAD4(4, 8)
STEP0_LOAD4(8, 12)
STEPUP4(D0, 12, 16)
STEPUP4(D0, 16, 20)
lis %r5,0x6ed9 /* K20-39 */
ori %r5,%r5,0xeba1
STEPUP20(D1, 20, 24)
lis %r5,0x8f1b /* K40-59 */
ori %r5,%r5,0xbcdc
STEPUP20(D2, 40, 44)
lis %r5,0xca62 /* K60-79 */
ori %r5,%r5,0xc1d6
STEPUP4(D1, 60, 64)
STEPUP4(D1, 64, 68)
STEPUP4(D1, 68, 72)
STEPUP4(D1, 72, 76)
STEPD1(76)
STEPD1(77)
STEPD1(78)
STEPD1(79)
/* Add results to original values */
add %r31,%r31,RE(0)
add %r30,%r30,RD(0)
add %r29,%r29,RC(0)
add %r28,%r28,RB(0)
add %r27,%r27,RA(0)
addi %r4,%r4,64
bdnz 1b
/* Save final hash, restore registers, and return */
stmw %r27,0(%r3)
lmw %r13,4(%r1)
addi %r1,%r1,80
blr
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-27 1:47 ` linux
@ 2005-04-27 3:39 ` Paul Mackerras
2005-04-27 16:01 ` linux
0 siblings, 1 reply; 17+ messages in thread
From: Paul Mackerras @ 2005-04-27 3:39 UTC (permalink / raw)
To: linux; +Cc: davem, git
linux@horizon.com writes:
> Here's a massively revised version, scheduled very close to optimally for
> the G4. (The main remaining limitation is the loading of the k value
> in %r5, which could be split up more.)
>
> My hope is that the G5 will do decently on it as well.
Nice... your new version takes 4.413 seconds on my G5 for 1000MB,
compared to 4.606 for your old version, i.e. it's about 4.4% faster.
Unfortunately it gives the wrong answer, though.
On my powerbook, which has a 1.5GHz G4 (7447A), the same test takes
4.68 seconds with my version, 4.72 seconds with your old version, but
only 3.90 seconds with your new version.
Care to check the code and find out why it's giving the wrong answer?
Regards,
Paul.
^ permalink raw reply [flat|nested] 17+ messages in thread
* Re: Revised PPC assembly implementation
2005-04-27 3:39 ` Paul Mackerras
@ 2005-04-27 16:01 ` linux
0 siblings, 0 replies; 17+ messages in thread
From: linux @ 2005-04-27 16:01 UTC (permalink / raw)
To: paulus; +Cc: davem, git, linux
> On my powerbook, which has a 1.5GHz G4 (7447A), the same test takes
> 4.68 seconds with my version, 4.72 seconds with your old version, but
> only 3.90 seconds with your new version.
20%; now we're getting somewhere! Thanks for running the tests.
> Care to check the code and find out why it's giving the wrong answer?
You *could* be nice to me and breakpoint it every 20 rounds and tell me
which group is delivering the wrong answer..
But I'll look...
Hey! It's not the tricky code at all; it's the STEPUP20 macro.
The third line should be +8, not +4.
Fix appended, but you can just edit line 127.
I can add one more tweak (scheduling the load of k better), and the
comments, then I think I'm done.
Would you mind playing with the number of words of fetchahead and see if
a value less than 4 is any faster? It'll probably be a pretty minimal
change, but it doesn't affect the code size any.
I suppose we should also test it in a more realistic setting,
hashing *different* data a lot. A dcbt in the loop might help.
(Does any PPC since the G3 have a cache line less than 64 bytes?
I know the G5 is 64 L1 and 128 L2...)
BTW, what's the best way to refer to PPC processors? MPC74xx and PPC970FX?
Or Apple's names? Or something else?
/*
* SHA-1 implementation for PowerPC.
*
* Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
*/
/*
* We roll the registers for A, B, C, D, E around on each
* iteration; E on iteration t is D on iteration t+1, and so on.
* We use registers 6 - 10 for this. (Registers 27 - 31 hold
* the previous values.)
*/
#define RA(t) (((t)+4)%5+6)
#define RB(t) (((t)+3)%5+6)
#define RC(t) (((t)+2)%5+6)
#define RD(t) (((t)+1)%5+6)
#define RE(t) (((t)+0)%5+6)
/* We use registers 11 - 26 for the W values */
#define W(t) ((t)%16+11)
/* Register 5 is used for the constant k */
/*
* There are three F functions, used in four groups of 20:
* - 20 rounds of f0(b,c,d) = "bit wise b ? c : d" = (^b & d) + (b & c)
* - 20 rounds of f1(b,c,d) = b^c^d = (b^d)^c
* - 20 rounds of f2(b,c,d) = majority(b,c,d) = (b&d) + ((b^d)&c)
* - 20 more rounds of f1(b,c,d)
*
* These are all scheduled for near-optimal performance on a G4.
* The G4 is a 3-issue out-of-order machine with 3 ALUs, but it can only
* *consider* starting the oldest 3 instructions per cycle. So to get
* maximum performance out of it, you have to treat it as an in-order
* machine. Which means interleaving the computation round t with the
* computation of W[t+4].
*
* The first 16 rounds use W values loaded directly from memory, while the
* remaining 64 use values computed from those first 16. We preload
* 4 values before starting, so there are three kinds of rounds:
* - The first 12 (all f0) also load the W values from memory.
* - The next 64 compute W(i+4) in parallel. 8*f0, 20*f1, 20*f2, 16*f1.
* - The last 4 (all f1) do not do anything with W.
*
* Therefore, we have 6 different round functions:
* STEPD0_LOAD(t,s) - Perform round t and load W(s). s < 16
* STEPD0_UPDATE(t,s) - Perform round t and compute W(s). s >= 16.
* STEPD1_UPDATE(t,s)
* STEPD2_UPDATE(t,s)
* STEPD1(t) - Perform round t with no load or update.
*
* The G5 is more fully out-of-order, and can find the parallelism
* by itself. The big limit is that it has a 2-cycle ALU latency, so
* even though it's 2-way, the code has to be scheduled as if it's
* 4-way, which can be a limit. To help it, we try to schedule the
* read of RA(t) as late as possible so it doesn't stall waiting for
* the previous round's RE(t-1), and we try to rotate RB(t) as early
* as possible while reading RC(t) (= RB(t-1)) as late as possible.
*/
/* the initial loads. */
#define LOADW(s) \
lwz W(s),(s)*4(%r4)
/*
* This is actually 13 instructions, which is an awkward fit,
* and uses W(s) as a temporary before loading it.
*/
#define STEPD0_LOAD(t,s) \
add RE(t),RE(t),W(t); andc %r0,RD(t),RB(t); /* spare slot */ \
add RE(t),RE(t),%r0; and W(s),RC(t),RB(t); rotlwi %r0,RA(t),5; \
add RE(t),RE(t),W(s); add %r0,%r0,%r5; rotlwi RB(t),RB(t),30; \
add RE(t),RE(t),%r0; lwz W(s),(s)*4(%r4);
/*
* This can execute starting with 2 out of 3 possible moduli, so it
* does 2 rounds in 9 cycles, 4.5 cycles/round.
*/
#define STEPD0_UPDATE(t,s) \
add RE(t),RE(t),W(t); andc %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r0; and %r0,RC(t),RB(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r5; rotlwi RB(t),RB(t),30; rotlwi W(s),W(s),1; \
add RE(t),RE(t),%r0;
/* Nicely optimal. Conveniently, also the most common. */
#define STEPD1_UPDATE(t,s) \
add RE(t),RE(t),W(t); xor %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r5; xor %r0,%r0,RC(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r0; rotlwi RB(t),RB(t),30; rotlwi W(s),W(s),1;
/*
* The naked version, no UPDATE, for the last 4 rounds. 3 cycles per.
* We could use W(s) as a temp register, but we don't need it.
*/
#define STEPD1(t) \
/* spare slot */ add RE(t),RE(t),W(t); xor %r0,RD(t),RB(t); \
rotlwi RB(t),RB(t),30; add RE(t),RE(t),%r5; xor %r0,%r0,RC(t); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; /* idle */ \
add RE(t),RE(t),%r0;
/* 5 cycles per */
#define STEPD2_UPDATE(t,s) \
add RE(t),RE(t),W(t); and %r0,RD(t),RB(t); xor W(s),W((s)-16),W((s)-3); \
add RE(t),RE(t),%r0; xor %r0,RD(t),RB(t); xor W(s),W(s),W((s)-8); \
add RE(t),RE(t),%r5; and %r0,%r0,RC(t); xor W(s),W(s),W((s)-14); \
add RE(t),RE(t),%r0; rotlwi %r0,RA(t),5; rotlwi W(s),W(s),1; \
add RE(t),RE(t),%r0; rotlwi RB(t),RB(t),30;
#define STEP0_LOAD4(t,s) \
STEPD0_LOAD(t,s); \
STEPD0_LOAD((t+1),(s)+1); \
STEPD0_LOAD((t)+2,(s)+2); \
STEPD0_LOAD((t)+3,(s)+3);
#define STEPUP4(fn, t, s) \
STEP##fn##_UPDATE(t,s); \
STEP##fn##_UPDATE((t)+1,(s)+1); \
STEP##fn##_UPDATE((t)+2,(s)+2); \
STEP##fn##_UPDATE((t)+3,(s)+3); \
#define STEPUP20(fn, t, s) \
STEPUP4(fn, t, s); \
STEPUP4(fn, (t)+4, (s)+4); \
STEPUP4(fn, (t)+8, (s)+8); \
STEPUP4(fn, (t)+12, (s)+12); \
STEPUP4(fn, (t)+16, (s)+16)
.globl sha1_core
sha1_core:
stwu %r1,-80(%r1)
stmw %r13,4(%r1)
/* Load up A - E */
lmw %r27,0(%r3)
mtctr %r5
1:
lis %r5,0x5a82 /* K0-19 */
mr RA(0),%r27
LOADW(0)
mr RB(0),%r28
LOADW(1)
mr RC(0),%r29
LOADW(2)
ori %r5,%r5,0x7999
mr RD(0),%r30
LOADW(3)
mr RE(0),%r31
STEP0_LOAD4(0, 4)
STEP0_LOAD4(4, 8)
STEP0_LOAD4(8, 12)
STEPUP4(D0, 12, 16)
STEPUP4(D0, 16, 20)
lis %r5,0x6ed9 /* K20-39 */
ori %r5,%r5,0xeba1
STEPUP20(D1, 20, 24)
lis %r5,0x8f1b /* K40-59 */
ori %r5,%r5,0xbcdc
STEPUP20(D2, 40, 44)
lis %r5,0xca62 /* K60-79 */
ori %r5,%r5,0xc1d6
STEPUP4(D1, 60, 64)
STEPUP4(D1, 64, 68)
STEPUP4(D1, 68, 72)
STEPUP4(D1, 72, 76)
STEPD1(76)
STEPD1(77)
STEPD1(78)
STEPD1(79)
/* Add results to original values */
add %r31,%r31,RE(0)
add %r30,%r30,RD(0)
add %r29,%r29,RC(0)
add %r28,%r28,RB(0)
add %r27,%r27,RA(0)
addi %r4,%r4,64
bdnz 1b
/* Save final hash, restore registers, and return */
stmw %r27,0(%r3)
lmw %r13,4(%r1)
addi %r1,%r1,80
blr
^ permalink raw reply [flat|nested] 17+ messages in thread
end of thread, other threads:[~2005-04-27 15:57 UTC | newest]
Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-04-23 12:42 [PATCH] PPC assembly implementation of SHA1 linux
2005-04-23 13:03 ` linux
2005-04-24 2:49 ` Benjamin Herrenschmidt
2005-04-24 4:40 ` Paul Mackerras
2005-04-24 12:04 ` Wayne Scott
2005-04-25 0:16 ` linux
2005-04-25 3:13 ` Revised PPC assembly implementation linux
2005-04-25 9:40 ` Paul Mackerras
2005-04-25 17:34 ` linux
2005-04-25 23:00 ` Paul Mackerras
2005-04-25 23:17 ` David S. Miller
2005-04-26 1:22 ` Paul Mackerras
2005-04-27 1:47 ` linux
2005-04-27 3:39 ` Paul Mackerras
2005-04-27 16:01 ` linux
2005-04-26 2:14 ` linux
2005-04-26 2:35 ` linux
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).