From: linux@horizon.com
To: paulus@samba.org
Cc: git@vger.kernel.org, linux@horizon.com
Subject: Re: [PATCH] PPC assembly implementation of SHA1
Date: 23 Apr 2005 12:42:46 -0000 [thread overview]
Message-ID: <20050423124246.30071.qmail@science.horizon.com> (raw)
I was working on the same thing, but hindered by lack of access to PPC
hardware. I notice that you also took advantage of the unaligned load
support and native byte order to do the hash straight from the source.
But I came up with a few additional refinements:
- You are using three temporaries (%r0, %r6, and RT(x)) for your
round functions. You only need one temporary (%r0) for all the functions.
(Plus %r15 for k)
The core round function is
e += (a <<< 5) + f(b,c,d) + W[i] + K
b = (b <<< 30)
followed by renaming the registers for the next round:
d += (e <<< 5) + f(a,b,c) + W[i+1] + K
a = (a <<< 30)
That can all be computed with one temporary.
There are three possible f() functions:
f1(x,y,z) = "bitwise x ? y : z"
= (x & y) | (~x & z)
= (x & y) + (~x & z)
= z ^ (x & (y ^ z))
All are three logical instrunctions on PPC. The second form
lets you add it into the accumulator e in two pieces:
+#define STEPD0(t) \
+ add %r0,%r15,W(t); \
+ add RE(t),RE(t),%r0; \
+ and %r0,RC(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ andc %r0,RD(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
The XOR version f2(x,y,z) = x^y^z is of course trivial:
+#define STEPD1(t) \
+ add %r0,%r15,W(t); \
+ add RE(t),RE(t),%r0; \
+ xor %r0,RC(t),RB(t); \
+ xor %r0,%r0,RA(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
And the last function, majority(x,y,z), can be written as:
f3(x,y,z) = (x & y) | (y & z) | (z & x)
= (x & y) | z & (x | y)
= (x & y) | z & (x ^ y)
= (x & y) + z & (x ^ y)
The last form again allows evaluation with one temporary and
two adds into RE(t):
+#define STEPD2(t) \
+ add %r0,%r15,W(t); \
+ add RE(t),RE(t),%r0; \
+ and %r0,RC(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ xor %r0,RC(t),RB(t); \
+ and %r0,%r0,RA(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
Other notes:
- You don't need to decrement %r1 before saving registers.
The PPC calling convention defines a "red zone" below the
current stack pointer that is guaranteed never to be touched
by signal handlers or the like. This is specifically for
leaf procedure optimization, and is at least 224 bytes.
- Is that many stw/lwz instructions faster than stmw/lmw?
The latter is at least more cahce-friendly.
- You can avoid saving and restoring %r15 by recycling %r5 for that
purpose; it's not used after the mtctr %r5.
- (Note, by the way, that while the PPC supports unaligned loads, using
lmw to load unaligned data is SLOW onthe G5; multiple lwz instructions
are faster in that case.)
- The above changes actually save enough registers to cache the whole hash[5]
in registers as well, eliminating *all* unnecessary load/store traffic.
With all of the above changes, your sha1ppc.S file turns into:
diff -urN git.orig/ppc/sha1ppc.S git/ppc/sha1ppc.S
--- /dev/null 2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1ppc.S 2005-04-22 16:29:19.000000000 +1000
@@ -0,0 +1,142 @@
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+/*
+ * We roll the registers for A, B, C, D, E around on each
+ * iteration; A on iteration t is B on iteration t+1, and so on.
+ * We use registers 6 - 10 for this. (Registers 27 - 31 hold
+ * the previous values.)
+ */
+#define RA(t) ((((t)+4)%5)+6)
+#define RB(t) ((((t)+3)%5)+6)
+#define RC(t) ((((t)+2)%5)+6)
+#define RD(t) ((((t)+1)%5)+6)
+#define RE(t) ((((t)+0)%5)+6)
+
+/* We use registers 11 - 26 for the W values */
+#define W(t) (((t)%16)+11)
+
+#define STEPD0(t) \
+ add %r0,%r5,W(t); \
+ add RE(t),RE(t),%r0; \
+ and %r0,RC(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ andc %r0,RD(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
+
+#define STEPD1(t) \
+ add %r0,%r5,W(t); \
+ add RE(t),RE(t),%r0; \
+ xor %r0,RC(t),RB(t); \
+ xor %r0,%r0,RA(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
+
+#define STEPD2(t) \
+ add %r0,%r15,W(t); \
+ add RE(t),RE(t),%r0; \
+ and %r0,RC(t),RB(t); \
+ add RE(t),RE(t),%r0; \
+ xor %r0,RC(t),RB(t); \
+ and %r0,%r0,RA(t); \
+ add RE(t),RE(t),%r0; \
+ rotlwi %r0,RA(t),5; \
+ rotlwi RB(t),RB(t),30; \
+ add RE(t),RE(t),%r0;
+
+#define LOADW(t) \
+ lwz W(t),(t)*4(%r4)
+
+#define UPDATEW(t) \
+ xor %r0,W((t)-3),W((t)-8); \
+ xor W(t),W((t)-16),W((t)-14); \
+ xor W(t),W(t),%r0; \
+ rotlwi W(t),W(t),1
+
+#define STEP0LD4(t) \
+ STEPD0(t); LOADW((t)+4); \
+ STEPD0((t)+1); LOADW((t)+5); \
+ STEPD0((t)+2); LOADW((t)+6); \
+ STEPD0((t)+3); LOADW((t)+7)
+
+#define STEPUP4(t, fn) \
+ STEP##fn(t); UPDATEW((t)+4); \
+ STEP##fn((t)+1); UPDATEW((t)+5); \
+ STEP##fn((t)+2); UPDATEW((t)+6); \
+ STEP##fn((t)+3); UPDATEW((t)+7)
+
+#define STEPUP20(t, fn) \
+ STEPUP4(t, fn); \
+ STEPUP4((t)+4, fn); \
+ STEPUP4((t)+8, fn); \
+ STEPUP4((t)+12, fn); \
+ STEPUP4((t)+16, fn)
+
+ .globl sha1_core
+sha1_core:
+ stmw %r13,-76(%r1)
+
+ /* Load up A - E */
+ lmw %r27,0(%r3)
+
+ mtctr %r5
+
+1: mr RA(0),%r27
+ LOADW(0)
+ mr RB(0),%r28
+ LOADW(1)
+ mr RC(0),%r29
+ LOADW(2)
+ mr RD(0),%r30
+ LOADW(3)
+ mr RE(0),%r31
+
+ lis %r5,0x5a82 /* K0-19 */
+ ori %r5,%r5,0x7999
+ STEP0LD4(0)
+ STEP0LD4(4)
+ STEP0LD4(8)
+ STEPUP4(12, D0)
+ STEPUP4(16, D0)
+
+ lis %r5,0x6ed9 /* K20-39 */
+ ori %r5,%r5,0xeba1
+ STEPUP20(20, D1)
+
+ lis %r5,0x8f1b /* K40-59 */
+ ori %r5,%r5,0xbcdc
+ STEPUP20(40, D2)
+
+ lis %r5,0xca62 /* K60-79 */
+ ori %r5,%r5,0xc1d6
+ STEPUP4(60, D1)
+ STEPUP4(64, D1)
+ STEPUP4(68, D1)
+ STEPUP4(72, D1)
+ STEPD1(76)
+ STEPD1(77)
+ STEPD1(78)
+ STEPD1(79)
+
+ /* Add results to original values */
+ add %r27,%r27,RA(0)
+ add %r28,%r28,RB(0)
+ add %r29,%r29,RC(0)
+ add %r30,%r30,RD(0)
+ add %r31,%r31,RE(0)
+
+ addi %r4,%r4,64
+ bdnz 1b
+
+ /* Save final hash, restore registers, and return */
+ stmw %r27,0(%r3)
+ lmw %r13,-76(%r1)
+ blr
The above changes, if you want them, are placed in the public domain.
It assembles, but I can't test it. Have fun.
(I might also replace "t" with "i" in the macros, but figured that
would make comparing versions annoying. Of course, part of the reason
was confusion with the T register, which is now eliminated...)
next reply other threads:[~2005-04-23 12:38 UTC|newest]
Thread overview: 18+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-04-23 12:42 linux [this message]
2005-04-23 13:03 ` [PATCH] PPC assembly implementation of SHA1 linux
2005-04-24 2:49 ` Benjamin Herrenschmidt
2005-04-24 4:40 ` Paul Mackerras
2005-04-24 12:04 ` Wayne Scott
2005-04-25 0:16 ` linux
2005-04-25 3:13 ` Revised PPC assembly implementation linux
2005-04-25 9:40 ` Paul Mackerras
2005-04-25 17:34 ` linux
2005-04-25 23:00 ` Paul Mackerras
2005-04-25 23:17 ` David S. Miller
2005-04-26 1:22 ` Paul Mackerras
2005-04-27 1:47 ` linux
2005-04-27 3:39 ` Paul Mackerras
2005-04-27 16:01 ` linux
2005-04-26 2:14 ` linux
2005-04-26 2:35 ` linux
-- strict thread matches above, loose matches on Subject: below --
2005-04-23 5:33 [PATCH] PPC assembly implementation of SHA1 Paul Mackerras
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20050423124246.30071.qmail@science.horizon.com \
--to=linux@horizon.com \
--cc=git@vger.kernel.org \
--cc=paulus@samba.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).