git.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] PPC assembly implementation of SHA1
@ 2005-04-23  5:33 Paul Mackerras
  0 siblings, 0 replies; 7+ messages in thread
From: Paul Mackerras @ 2005-04-23  5:33 UTC (permalink / raw)
  To: torvalds; +Cc: git

Here is a SHA1 implementation with the core written in PPC assembly.
On my 2GHz G5, it does 218MB/s, compared to 135MB/s for the openssl
version or 45MB/s for the mozilla version.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---

diff -urN git.orig/Makefile git/Makefile
--- git.orig/Makefile	2005-04-22 16:23:44.000000000 +1000
+++ git/Makefile	2005-04-22 16:43:31.000000000 +1000
@@ -34,9 +34,14 @@
   SHA1_HEADER="mozilla-sha1/sha1.h"
   LIB_OBJS += mozilla-sha1/sha1.o
 else
+ifdef PPC_SHA1
+  SHA1_HEADER="ppc/sha1.h"
+  LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+else
   SHA1_HEADER=<openssl/sha.h>
   LIBS += -lssl
 endif
+endif
 
 CFLAGS += '-DSHA1_HEADER=$(SHA1_HEADER)'
 
@@ -77,7 +82,7 @@
 write-tree.o: $(LIB_H)
 
 clean:
-	rm -f *.o mozilla-sha1/*.o $(PROG) $(LIB_FILE)
+	rm -f *.o mozilla-sha1/*.o ppc/*.o $(PROG) $(LIB_FILE)
 
 backup: clean
 	cd .. ; tar czvf dircache.tar.gz dir-cache
diff -urN git.orig/ppc/sha1.c git/ppc/sha1.c
--- /dev/null	2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1.c	2005-04-22 16:29:19.000000000 +1000
@@ -0,0 +1,72 @@
+/*
+ * SHA-1 implementation.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ *
+ * This version assumes we are running on a big-endian machine.
+ * It calls an external sha1_core() to process blocks of 64 bytes.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "sha1.h"
+
+extern void sha1_core(uint32_t *hash, const unsigned char *p,
+		      unsigned int nblocks);
+
+int SHA1_Init(SHA_CTX *c)
+{
+	c->hash[0] = 0x67452301;
+	c->hash[1] = 0xEFCDAB89;
+	c->hash[2] = 0x98BADCFE;
+	c->hash[3] = 0x10325476;
+	c->hash[4] = 0xC3D2E1F0;
+	c->len = 0;
+	c->cnt = 0;
+	return 0;
+}
+
+int SHA1_Update(SHA_CTX *c, const void *ptr, unsigned long n)
+{
+	unsigned long nb;
+	const unsigned char *p = ptr;
+
+	c->len += n << 3;
+	while (n != 0) {
+		if (c->cnt || n < 64) {
+			nb = 64 - c->cnt;
+			if (nb > n)
+				nb = n;
+			memcpy(&c->buf.b[c->cnt], p, nb);
+			if ((c->cnt += nb) == 64) {
+				sha1_core(c->hash, c->buf.b, 1);
+				c->cnt = 0;
+			}
+		} else {
+			nb = n >> 6;
+			sha1_core(c->hash, p, nb);
+			nb <<= 6;
+		}
+		n -= nb;
+		p += nb;
+	}
+	return 0;
+}	
+
+int SHA1_Final(unsigned char *hash, SHA_CTX *c)
+{
+	unsigned int cnt = c->cnt;
+
+	c->buf.b[cnt++] = 0x80;
+	if (cnt > 56) {
+		if (cnt < 64)
+			memset(&c->buf.b[cnt], 0, 64 - cnt);
+		sha1_core(c->hash, c->buf.b, 1);
+		cnt = 0;
+	}
+	if (cnt < 56)
+		memset(&c->buf.b[cnt], 0, 56 - cnt);
+	c->buf.l[7] = c->len;
+	sha1_core(c->hash, c->buf.b, 1);
+	memcpy(hash, c->hash, 20);
+	return 0;
+}
diff -urN git.orig/ppc/sha1.h git/ppc/sha1.h
--- /dev/null	2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1.h	2005-04-22 16:45:28.000000000 +1000
@@ -0,0 +1,20 @@
+/*
+ * SHA-1 implementation.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+#include <stdint.h>
+
+typedef struct sha_context {
+	uint32_t hash[5];
+	uint32_t cnt;
+	uint64_t len;
+	union {
+		unsigned char b[64];
+		uint64_t l[8];
+	} buf;
+} SHA_CTX;
+
+int SHA1_Init(SHA_CTX *c);
+int SHA1_Update(SHA_CTX *c, const void *p, unsigned long n);
+int SHA1_Final(unsigned char *hash, SHA_CTX *c);
diff -urN git.orig/ppc/sha1ppc.S git/ppc/sha1ppc.S
--- /dev/null	2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1ppc.S	2005-04-22 16:29:19.000000000 +1000
@@ -0,0 +1,185 @@
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+#define FS	80
+
+/*
+ * We roll the registers for T, A, B, C, D, E around on each
+ * iteration; T on iteration t is A on iteration t+1, and so on.
+ * We use registers 7 - 12 for this.
+ */
+#define RT(t)	((((t)+5)%6)+7)
+#define RA(t)	((((t)+4)%6)+7)
+#define RB(t)	((((t)+3)%6)+7)
+#define RC(t)	((((t)+2)%6)+7)
+#define RD(t)	((((t)+1)%6)+7)
+#define RE(t)	((((t)+0)%6)+7)
+
+/* We use registers 16 - 31 for the W values */
+#define W(t)	(((t)%16)+16)
+
+#define STEPD0(t)				\
+	and	%r6,RB(t),RC(t);		\
+	andc	%r0,RD(t),RB(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	or	%r6,%r6,%r0;			\
+	add	%r0,RE(t),%r15;			\
+	add	RT(t),RT(t),%r6;		\
+	add	%r0,%r0,W(t);			\
+	add	RT(t),RT(t),%r0
+
+#define STEPD1(t)				\
+	xor	%r6,RB(t),RC(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	xor	%r6,%r6,RD(t);			\
+	add	%r0,RE(t),%r15;			\
+	add	RT(t),RT(t),%r6;		\
+	add	%r0,%r0,W(t);			\
+	add	RT(t),RT(t),%r0
+
+#define STEPD2(t)				\
+	and	%r6,RB(t),RC(t);		\
+	and	%r0,RB(t),RD(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	or	%r6,%r6,%r0;			\
+	and	%r0,RC(t),RD(t);		\
+	or	%r6,%r6,%r0;			\
+	add	%r0,RE(t),%r15;			\
+	add	RT(t),RT(t),%r6;		\
+	add	%r0,%r0,W(t);			\
+	add	RT(t),RT(t),%r0
+
+#define LOADW(t)				\
+	lwz	W(t),(t)*4(%r4)
+
+#define UPDATEW(t)				\
+	xor	%r0,W((t)-3),W((t)-8);		\
+	xor	W(t),W((t)-16),W((t)-14);	\
+	xor	W(t),W(t),%r0;			\
+	rotlwi	W(t),W(t),1
+
+#define STEP0LD4(t)				\
+	STEPD0(t);   LOADW((t)+4);		\
+	STEPD0((t)+1); LOADW((t)+5);		\
+	STEPD0((t)+2); LOADW((t)+6);		\
+	STEPD0((t)+3); LOADW((t)+7)
+
+#define STEPUP4(t, fn)				\
+	STEP##fn(t);   UPDATEW((t)+4);		\
+	STEP##fn((t)+1); UPDATEW((t)+5);	\
+	STEP##fn((t)+2); UPDATEW((t)+6);	\
+	STEP##fn((t)+3); UPDATEW((t)+7)
+
+#define STEPUP20(t, fn)				\
+	STEPUP4(t, fn);				\
+	STEPUP4((t)+4, fn);			\
+	STEPUP4((t)+8, fn);			\
+	STEPUP4((t)+12, fn);			\
+	STEPUP4((t)+16, fn)
+
+	.globl	sha1_core
+sha1_core:
+	stwu	%r1,-FS(%r1)
+	stw	%r15,FS-68(%r1)
+	stw	%r16,FS-64(%r1)
+	stw	%r17,FS-60(%r1)
+	stw	%r18,FS-56(%r1)
+	stw	%r19,FS-52(%r1)
+	stw	%r20,FS-48(%r1)
+	stw	%r21,FS-44(%r1)
+	stw	%r22,FS-40(%r1)
+	stw	%r23,FS-36(%r1)
+	stw	%r24,FS-32(%r1)
+	stw	%r25,FS-28(%r1)
+	stw	%r26,FS-24(%r1)
+	stw	%r27,FS-20(%r1)
+	stw	%r28,FS-16(%r1)
+	stw	%r29,FS-12(%r1)
+	stw	%r30,FS-8(%r1)
+	stw	%r31,FS-4(%r1)
+
+	/* Load up A - E */
+	lwz	RA(0),0(%r3)	/* A */
+	lwz	RB(0),4(%r3)	/* B */
+	lwz	RC(0),8(%r3)	/* C */
+	lwz	RD(0),12(%r3)	/* D */
+	lwz	RE(0),16(%r3)	/* E */
+
+	mtctr	%r5
+
+1:	LOADW(0)
+	LOADW(1)
+	LOADW(2)
+	LOADW(3)
+
+	lis	%r15,0x5a82	/* K0-19 */
+	ori	%r15,%r15,0x7999
+	STEP0LD4(0)
+	STEP0LD4(4)
+	STEP0LD4(8)
+	STEPUP4(12, D0)
+	STEPUP4(16, D0)
+
+	lis	%r15,0x6ed9	/* K20-39 */
+	ori	%r15,%r15,0xeba1
+	STEPUP20(20, D1)
+
+	lis	%r15,0x8f1b	/* K40-59 */
+	ori	%r15,%r15,0xbcdc
+	STEPUP20(40, D2)
+
+	lis	%r15,0xca62	/* K60-79 */
+	ori	%r15,%r15,0xc1d6
+	STEPUP4(60, D1)
+	STEPUP4(64, D1)
+	STEPUP4(68, D1)
+	STEPUP4(72, D1)
+	STEPD1(76)
+	STEPD1(77)
+	STEPD1(78)
+	STEPD1(79)
+
+	lwz	%r20,16(%r3)
+	lwz	%r19,12(%r3)
+	lwz	%r18,8(%r3)
+	lwz	%r17,4(%r3)
+	lwz	%r16,0(%r3)
+	add	%r20,RE(80),%r20
+	add	RD(0),RD(80),%r19
+	add	RC(0),RC(80),%r18
+	add	RB(0),RB(80),%r17
+	add	RA(0),RA(80),%r16
+	mr	RE(0),%r20
+	stw	RA(0),0(%r3)
+	stw	RB(0),4(%r3)
+	stw	RC(0),8(%r3)
+	stw	RD(0),12(%r3)
+	stw	RE(0),16(%r3)
+
+	addi	%r4,%r4,64
+	bdnz	1b
+
+	lwz	%r15,FS-68(%r1)
+	lwz	%r16,FS-64(%r1)
+	lwz	%r17,FS-60(%r1)
+	lwz	%r18,FS-56(%r1)
+	lwz	%r19,FS-52(%r1)
+	lwz	%r20,FS-48(%r1)
+	lwz	%r21,FS-44(%r1)
+	lwz	%r22,FS-40(%r1)
+	lwz	%r23,FS-36(%r1)
+	lwz	%r24,FS-32(%r1)
+	lwz	%r25,FS-28(%r1)
+	lwz	%r26,FS-24(%r1)
+	lwz	%r27,FS-20(%r1)
+	lwz	%r28,FS-16(%r1)
+	lwz	%r29,FS-12(%r1)
+	lwz	%r30,FS-8(%r1)
+	lwz	%r31,FS-4(%r1)
+	addi	%r1,%r1,FS
+	blr

^ permalink raw reply	[flat|nested] 7+ messages in thread
* Re: [PATCH] PPC assembly implementation of SHA1
@ 2005-04-23 12:42 linux
  2005-04-23 13:03 ` linux
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: linux @ 2005-04-23 12:42 UTC (permalink / raw)
  To: paulus; +Cc: git, linux

I was working on the same thing, but hindered by lack of access to PPC
hardware.  I notice that you also took advantage of the unaligned load
support and native byte order to do the hash straight from the source.

But I came up with a few additional refinements:

- You are using three temporaries (%r0, %r6, and RT(x)) for your
  round functions.  You only need one temporary (%r0) for all the functions.
  (Plus %r15 for k)

  The core round function is

  e += (a <<< 5) + f(b,c,d) + W[i] + K
  b = (b <<< 30)
followed by renaming the registers for the next round:
  d += (e <<< 5) + f(a,b,c) + W[i+1] + K
  a = (a <<< 30)

That can all be computed with one temporary.

There are three possible f() functions:
f1(x,y,z) = "bitwise x ? y : z"
          = (x & y) | (~x & z)
          = (x & y) + (~x & z)
          = z ^ (x & (y ^ z))
All are three logical instrunctions on PPC.  The second form
lets you add it into the accumulator e in two pieces:

+#define STEPD0(t)				\
+	add	%r0,%r15,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	and	%r0,RC(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	andc	%r0,RD(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;

The XOR version f2(x,y,z) = x^y^z is of course trivial:

+#define STEPD1(t)				\
+	add	%r0,%r15,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	xor	%r0,RC(t),RB(t);		\
+	xor	%r0,%r0,RA(t);			\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;

And the last function, majority(x,y,z), can be written as:
f3(x,y,z) = (x & y) | (y & z) | (z & x)
          = (x & y) | z & (x | y)
          = (x & y) | z & (x ^ y)
          = (x & y) + z & (x ^ y)
The last form again allows evaluation with one temporary and
two adds into RE(t):

+#define STEPD2(t)				\
+	add	%r0,%r15,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	and	%r0,RC(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	xor	%r0,RC(t),RB(t);		\
+	and	%r0,%r0,RA(t);			\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;

Other notes:
- You don't need to decrement %r1 before saving registers.
  The PPC calling convention defines a "red zone" below the
  current stack pointer that is guaranteed never to be touched
  by signal handlers or the like.  This is specifically for
  leaf procedure optimization, and is at least 224 bytes.
- Is that many stw/lwz instructions faster than stmw/lmw?
  The latter is at least more cahce-friendly.
- You can avoid saving and restoring %r15 by recycling %r5 for that
  purpose; it's not used after the mtctr %r5.
- (Note, by the way, that while the PPC supports unaligned loads, using
  lmw to load unaligned data is SLOW onthe G5; multiple lwz instructions
  are faster in that case.)
- The above changes actually save enough registers to cache the whole hash[5]
  in registers as well, eliminating *all* unnecessary load/store traffic.

With all of the above changes, your sha1ppc.S file turns into:

diff -urN git.orig/ppc/sha1ppc.S git/ppc/sha1ppc.S
--- /dev/null	2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1ppc.S	2005-04-22 16:29:19.000000000 +1000
@@ -0,0 +1,142 @@
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+/*
+ * We roll the registers for A, B, C, D, E around on each
+ * iteration; A on iteration t is B on iteration t+1, and so on.
+ * We use registers 6 - 10 for this.  (Registers 27 - 31 hold
+ * the previous values.)
+ */
+#define RA(t)	((((t)+4)%5)+6)
+#define RB(t)	((((t)+3)%5)+6)
+#define RC(t)	((((t)+2)%5)+6)
+#define RD(t)	((((t)+1)%5)+6)
+#define RE(t)	((((t)+0)%5)+6)
+
+/* We use registers 11 - 26 for the W values */
+#define W(t)	(((t)%16)+11)
+
+#define STEPD0(t)				\
+	add	%r0,%r5,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	and	%r0,RC(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	andc	%r0,RD(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;
+
+#define STEPD1(t)				\
+	add	%r0,%r5,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	xor	%r0,RC(t),RB(t);		\
+	xor	%r0,%r0,RA(t);			\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;
+
+#define STEPD2(t)				\
+	add	%r0,%r15,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	and	%r0,RC(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	xor	%r0,RC(t),RB(t);		\
+	and	%r0,%r0,RA(t);			\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;
+
+#define LOADW(t)				\
+	lwz	W(t),(t)*4(%r4)
+
+#define UPDATEW(t)				\
+	xor	%r0,W((t)-3),W((t)-8);		\
+	xor	W(t),W((t)-16),W((t)-14);	\
+	xor	W(t),W(t),%r0;			\
+	rotlwi	W(t),W(t),1
+
+#define STEP0LD4(t)				\
+	STEPD0(t);     LOADW((t)+4);		\
+	STEPD0((t)+1); LOADW((t)+5);		\
+	STEPD0((t)+2); LOADW((t)+6);		\
+	STEPD0((t)+3); LOADW((t)+7)
+
+#define STEPUP4(t, fn)				\
+	STEP##fn(t);     UPDATEW((t)+4);	\
+	STEP##fn((t)+1); UPDATEW((t)+5);	\
+	STEP##fn((t)+2); UPDATEW((t)+6);	\
+	STEP##fn((t)+3); UPDATEW((t)+7)
+
+#define STEPUP20(t, fn)				\
+	STEPUP4(t, fn);				\
+	STEPUP4((t)+4, fn);			\
+	STEPUP4((t)+8, fn);			\
+	STEPUP4((t)+12, fn);			\
+	STEPUP4((t)+16, fn)
+
+	.globl	sha1_core
+sha1_core:
+	stmw	%r13,-76(%r1)
+
+	/* Load up A - E */
+	lmw	%r27,0(%r3)
+
+	mtctr	%r5
+
+1:	mr	RA(0),%r27
+	LOADW(0)
+	mr	RB(0),%r28
+	LOADW(1)
+	mr	RC(0),%r29
+	LOADW(2)
+	mr	RD(0),%r30
+	LOADW(3)
+	mr	RE(0),%r31
+
+	lis	%r5,0x5a82	/* K0-19 */
+	ori	%r5,%r5,0x7999
+	STEP0LD4(0)
+	STEP0LD4(4)
+	STEP0LD4(8)
+	STEPUP4(12, D0)
+	STEPUP4(16, D0)
+
+	lis	%r5,0x6ed9	/* K20-39 */
+	ori	%r5,%r5,0xeba1
+	STEPUP20(20, D1)
+
+	lis	%r5,0x8f1b	/* K40-59 */
+	ori	%r5,%r5,0xbcdc
+	STEPUP20(40, D2)
+
+	lis	%r5,0xca62	/* K60-79 */
+	ori	%r5,%r5,0xc1d6
+	STEPUP4(60, D1)
+	STEPUP4(64, D1)
+	STEPUP4(68, D1)
+	STEPUP4(72, D1)
+	STEPD1(76)
+	STEPD1(77)
+	STEPD1(78)
+	STEPD1(79)
+
+	/* Add results to original values */
+	add	%r27,%r27,RA(0)
+	add	%r28,%r28,RB(0)
+	add	%r29,%r29,RC(0)
+	add	%r30,%r30,RD(0)
+	add	%r31,%r31,RE(0)
+
+	addi	%r4,%r4,64
+	bdnz	1b
+
+	/* Save final hash, restore registers, and return */
+	stmw	%r27,0(%r3)
+	lmw	%r13,-76(%r1)
+	blr

The above changes, if you want them, are placed in the public domain.
It assembles, but I can't test it.  Have fun.

(I might also replace "t" with "i" in the macros, but figured that
would make comparing versions annoying.  Of course, part of the reason
was confusion with the T register, which is now eliminated...)

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2005-04-25  0:12 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-04-23  5:33 [PATCH] PPC assembly implementation of SHA1 Paul Mackerras
  -- strict thread matches above, loose matches on Subject: below --
2005-04-23 12:42 linux
2005-04-23 13:03 ` linux
2005-04-24  2:49 ` Benjamin Herrenschmidt
2005-04-24  4:40 ` Paul Mackerras
2005-04-24 12:04   ` Wayne Scott
2005-04-25  0:16   ` linux

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).