* [PATCH] PPC assembly implementation of SHA1
From: Paul Mackerras @ 2005-04-23  5:33 UTC
  To: torvalds; +Cc: git

Here is a SHA1 implementation with the core written in PPC assembly.
On my 2GHz G5, it does 218MB/s, compared to 135MB/s for the openssl
version or 45MB/s for the mozilla version.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---

diff -urN git.orig/Makefile git/Makefile
--- git.orig/Makefile	2005-04-22 16:23:44.000000000 +1000
+++ git/Makefile	2005-04-22 16:43:31.000000000 +1000
@@ -34,9 +34,14 @@
   SHA1_HEADER="mozilla-sha1/sha1.h"
   LIB_OBJS += mozilla-sha1/sha1.o
 else
+ifdef PPC_SHA1
+  SHA1_HEADER="ppc/sha1.h"
+  LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+else
   SHA1_HEADER=<openssl/sha.h>
   LIBS += -lssl
 endif
+endif
 
 CFLAGS += '-DSHA1_HEADER=$(SHA1_HEADER)'
 
@@ -77,7 +82,7 @@
 write-tree.o: $(LIB_H)
 
 clean:
-	rm -f *.o mozilla-sha1/*.o $(PROG) $(LIB_FILE)
+	rm -f *.o mozilla-sha1/*.o ppc/*.o $(PROG) $(LIB_FILE)
 
 backup: clean
 	cd .. ; tar czvf dircache.tar.gz dir-cache
diff -urN git.orig/ppc/sha1.c git/ppc/sha1.c
--- /dev/null	2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1.c	2005-04-22 16:29:19.000000000 +1000
@@ -0,0 +1,72 @@
+/*
+ * SHA-1 implementation.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ *
+ * This version assumes we are running on a big-endian machine.
+ * It calls an external sha1_core() to process blocks of 64 bytes.
+ */
+#include <stdio.h>
+#include <string.h>
+#include "sha1.h"
+
+extern void sha1_core(uint32_t *hash, const unsigned char *p,
+		      unsigned int nblocks);
+
+int SHA1_Init(SHA_CTX *c)
+{
+	c->hash[0] = 0x67452301;
+	c->hash[1] = 0xEFCDAB89;
+	c->hash[2] = 0x98BADCFE;
+	c->hash[3] = 0x10325476;
+	c->hash[4] = 0xC3D2E1F0;
+	c->len = 0;
+	c->cnt = 0;
+	return 0;
+}
+
+int SHA1_Update(SHA_CTX *c, const void *ptr, unsigned long n)
+{
+	unsigned long nb;
+	const unsigned char *p = ptr;
+
+	c->len += n << 3;
+	while (n != 0) {
+		if (c->cnt || n < 64) {
+			nb = 64 - c->cnt;
+			if (nb > n)
+				nb = n;
+			memcpy(&c->buf.b[c->cnt], p, nb);
+			if ((c->cnt += nb) == 64) {
+				sha1_core(c->hash, c->buf.b, 1);
+				c->cnt = 0;
+			}
+		} else {
+			nb = n >> 6;
+			sha1_core(c->hash, p, nb);
+			nb <<= 6;
+		}
+		n -= nb;
+		p += nb;
+	}
+	return 0;
+}	
+
+int SHA1_Final(unsigned char *hash, SHA_CTX *c)
+{
+	unsigned int cnt = c->cnt;
+
+	c->buf.b[cnt++] = 0x80;
+	if (cnt > 56) {
+		if (cnt < 64)
+			memset(&c->buf.b[cnt], 0, 64 - cnt);
+		sha1_core(c->hash, c->buf.b, 1);
+		cnt = 0;
+	}
+	if (cnt < 56)
+		memset(&c->buf.b[cnt], 0, 56 - cnt);
+	c->buf.l[7] = c->len;
+	sha1_core(c->hash, c->buf.b, 1);
+	memcpy(hash, c->hash, 20);
+	return 0;
+}
diff -urN git.orig/ppc/sha1.h git/ppc/sha1.h
--- /dev/null	2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1.h	2005-04-22 16:45:28.000000000 +1000
@@ -0,0 +1,20 @@
+/*
+ * SHA-1 implementation.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+#include <stdint.h>
+
+typedef struct sha_context {
+	uint32_t hash[5];
+	uint32_t cnt;
+	uint64_t len;
+	union {
+		unsigned char b[64];
+		uint64_t l[8];
+	} buf;
+} SHA_CTX;
+
+int SHA1_Init(SHA_CTX *c);
+int SHA1_Update(SHA_CTX *c, const void *p, unsigned long n);
+int SHA1_Final(unsigned char *hash, SHA_CTX *c);
diff -urN git.orig/ppc/sha1ppc.S git/ppc/sha1ppc.S
--- /dev/null	2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1ppc.S	2005-04-22 16:29:19.000000000 +1000
@@ -0,0 +1,185 @@
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+#define FS	80
+
+/*
+ * We roll the registers for T, A, B, C, D, E around on each
+ * iteration; T on iteration t is A on iteration t+1, and so on.
+ * We use registers 7 - 12 for this.
+ */
+#define RT(t)	((((t)+5)%6)+7)
+#define RA(t)	((((t)+4)%6)+7)
+#define RB(t)	((((t)+3)%6)+7)
+#define RC(t)	((((t)+2)%6)+7)
+#define RD(t)	((((t)+1)%6)+7)
+#define RE(t)	((((t)+0)%6)+7)
+
+/* We use registers 16 - 31 for the W values */
+#define W(t)	(((t)%16)+16)
+
+#define STEPD0(t)				\
+	and	%r6,RB(t),RC(t);		\
+	andc	%r0,RD(t),RB(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	or	%r6,%r6,%r0;			\
+	add	%r0,RE(t),%r15;			\
+	add	RT(t),RT(t),%r6;		\
+	add	%r0,%r0,W(t);			\
+	add	RT(t),RT(t),%r0
+
+#define STEPD1(t)				\
+	xor	%r6,RB(t),RC(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	xor	%r6,%r6,RD(t);			\
+	add	%r0,RE(t),%r15;			\
+	add	RT(t),RT(t),%r6;		\
+	add	%r0,%r0,W(t);			\
+	add	RT(t),RT(t),%r0
+
+#define STEPD2(t)				\
+	and	%r6,RB(t),RC(t);		\
+	and	%r0,RB(t),RD(t);		\
+	rotlwi	RT(t),RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	or	%r6,%r6,%r0;			\
+	and	%r0,RC(t),RD(t);		\
+	or	%r6,%r6,%r0;			\
+	add	%r0,RE(t),%r15;			\
+	add	RT(t),RT(t),%r6;		\
+	add	%r0,%r0,W(t);			\
+	add	RT(t),RT(t),%r0
+
+#define LOADW(t)				\
+	lwz	W(t),(t)*4(%r4)
+
+#define UPDATEW(t)				\
+	xor	%r0,W((t)-3),W((t)-8);		\
+	xor	W(t),W((t)-16),W((t)-14);	\
+	xor	W(t),W(t),%r0;			\
+	rotlwi	W(t),W(t),1
+
+#define STEP0LD4(t)				\
+	STEPD0(t);   LOADW((t)+4);		\
+	STEPD0((t)+1); LOADW((t)+5);		\
+	STEPD0((t)+2); LOADW((t)+6);		\
+	STEPD0((t)+3); LOADW((t)+7)
+
+#define STEPUP4(t, fn)				\
+	STEP##fn(t);   UPDATEW((t)+4);		\
+	STEP##fn((t)+1); UPDATEW((t)+5);	\
+	STEP##fn((t)+2); UPDATEW((t)+6);	\
+	STEP##fn((t)+3); UPDATEW((t)+7)
+
+#define STEPUP20(t, fn)				\
+	STEPUP4(t, fn);				\
+	STEPUP4((t)+4, fn);			\
+	STEPUP4((t)+8, fn);			\
+	STEPUP4((t)+12, fn);			\
+	STEPUP4((t)+16, fn)
+
+	.globl	sha1_core
+sha1_core:
+	stwu	%r1,-FS(%r1)
+	stw	%r15,FS-68(%r1)
+	stw	%r16,FS-64(%r1)
+	stw	%r17,FS-60(%r1)
+	stw	%r18,FS-56(%r1)
+	stw	%r19,FS-52(%r1)
+	stw	%r20,FS-48(%r1)
+	stw	%r21,FS-44(%r1)
+	stw	%r22,FS-40(%r1)
+	stw	%r23,FS-36(%r1)
+	stw	%r24,FS-32(%r1)
+	stw	%r25,FS-28(%r1)
+	stw	%r26,FS-24(%r1)
+	stw	%r27,FS-20(%r1)
+	stw	%r28,FS-16(%r1)
+	stw	%r29,FS-12(%r1)
+	stw	%r30,FS-8(%r1)
+	stw	%r31,FS-4(%r1)
+
+	/* Load up A - E */
+	lwz	RA(0),0(%r3)	/* A */
+	lwz	RB(0),4(%r3)	/* B */
+	lwz	RC(0),8(%r3)	/* C */
+	lwz	RD(0),12(%r3)	/* D */
+	lwz	RE(0),16(%r3)	/* E */
+
+	mtctr	%r5
+
+1:	LOADW(0)
+	LOADW(1)
+	LOADW(2)
+	LOADW(3)
+
+	lis	%r15,0x5a82	/* K0-19 */
+	ori	%r15,%r15,0x7999
+	STEP0LD4(0)
+	STEP0LD4(4)
+	STEP0LD4(8)
+	STEPUP4(12, D0)
+	STEPUP4(16, D0)
+
+	lis	%r15,0x6ed9	/* K20-39 */
+	ori	%r15,%r15,0xeba1
+	STEPUP20(20, D1)
+
+	lis	%r15,0x8f1b	/* K40-59 */
+	ori	%r15,%r15,0xbcdc
+	STEPUP20(40, D2)
+
+	lis	%r15,0xca62	/* K60-79 */
+	ori	%r15,%r15,0xc1d6
+	STEPUP4(60, D1)
+	STEPUP4(64, D1)
+	STEPUP4(68, D1)
+	STEPUP4(72, D1)
+	STEPD1(76)
+	STEPD1(77)
+	STEPD1(78)
+	STEPD1(79)
+
+	lwz	%r20,16(%r3)
+	lwz	%r19,12(%r3)
+	lwz	%r18,8(%r3)
+	lwz	%r17,4(%r3)
+	lwz	%r16,0(%r3)
+	add	%r20,RE(80),%r20
+	add	RD(0),RD(80),%r19
+	add	RC(0),RC(80),%r18
+	add	RB(0),RB(80),%r17
+	add	RA(0),RA(80),%r16
+	mr	RE(0),%r20
+	stw	RA(0),0(%r3)
+	stw	RB(0),4(%r3)
+	stw	RC(0),8(%r3)
+	stw	RD(0),12(%r3)
+	stw	RE(0),16(%r3)
+
+	addi	%r4,%r4,64
+	bdnz	1b
+
+	lwz	%r15,FS-68(%r1)
+	lwz	%r16,FS-64(%r1)
+	lwz	%r17,FS-60(%r1)
+	lwz	%r18,FS-56(%r1)
+	lwz	%r19,FS-52(%r1)
+	lwz	%r20,FS-48(%r1)
+	lwz	%r21,FS-44(%r1)
+	lwz	%r22,FS-40(%r1)
+	lwz	%r23,FS-36(%r1)
+	lwz	%r24,FS-32(%r1)
+	lwz	%r25,FS-28(%r1)
+	lwz	%r26,FS-24(%r1)
+	lwz	%r27,FS-20(%r1)
+	lwz	%r28,FS-16(%r1)
+	lwz	%r29,FS-12(%r1)
+	lwz	%r30,FS-8(%r1)
+	lwz	%r31,FS-4(%r1)
+	addi	%r1,%r1,FS
+	blr
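
For reference, here is a minimal known-answer harness for the API above
(a hypothetical test program, not part of the patch; the expected digest
is the published FIPS 180-1 "abc" test vector, and the build line just
assumes the paths used in this patch):

/* sha1test.c: cc -o sha1test sha1test.c ppc/sha1.c ppc/sha1ppc.S */
#include <stdio.h>
#include <string.h>
#include "sha1.h"

int main(void)
{
	static const unsigned char expect[20] = {
		0xa9, 0x99, 0x3e, 0x36, 0x47, 0x06, 0x81, 0x6a, 0xba, 0x3e,
		0x25, 0x71, 0x78, 0x50, 0xc2, 0x6c, 0x9c, 0xd0, 0xd8, 0x9d
	};
	unsigned char digest[20];
	SHA_CTX ctx;
	int i;

	SHA1_Init(&ctx);
	SHA1_Update(&ctx, "abc", 3);
	SHA1_Final(digest, &ctx);
	for (i = 0; i < 20; i++)
		printf("%02x", digest[i]);
	printf("\n");
	return memcmp(digest, expect, 20) != 0;
}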


* Re: [PATCH] PPC assembly implementation of SHA1
From: linux @ 2005-04-23 12:42 UTC
  To: paulus; +Cc: git, linux

I was working on the same thing, but hindered by lack of access to PPC
hardware.  I notice that you also took advantage of the unaligned load
support and native byte order to do the hash straight from the source.

But I came up with a few additional refinements:

- You are using three temporaries (%r0, %r6, and RT(x)) for your
  round functions.  You only need one temporary (%r0) for all the functions.
  (Plus %r15 for k)

  The core round function is

  e += (a <<< 5) + f(b,c,d) + W[i] + K
  b = (b <<< 30)
followed by renaming the registers for the next round:
  d += (e <<< 5) + f(a,b,c) + W[i+1] + K
  a = (a <<< 30)

That can all be computed with one temporary.
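
In portable C, the renaming trick can be modelled like this (an
illustrative sketch of rounds 0-19 only; rol32 and rounds_0_19 are
made-up names, and the slot arithmetic matches the RE(t)/RA(t) macros
further down):

#include <stdint.h>

static uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/*
 * v[] holds the five working values; the round number selects which
 * slot currently plays A..E, so no value is ever copied between slots.
 */
void rounds_0_19(uint32_t v[5], const uint32_t w[20])
{
	const uint32_t K = 0x5a827999;
	int t;

	for (t = 0; t < 20; t++) {
		uint32_t a = v[(t + 4) % 5];
		uint32_t b = v[(t + 3) % 5];
		uint32_t c = v[(t + 2) % 5];
		uint32_t d = v[(t + 1) % 5];

		/* e += (a <<< 5) + f(b,c,d) + W[t] + K; b = b <<< 30 */
		v[t % 5] += rol32(a, 5) + (d ^ (b & (c ^ d))) + w[t] + K;
		v[(t + 3) % 5] = rol32(b, 30);
	}
}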

There are three possible f() functions:
f1(x,y,z) = "bitwise x ? y : z"
          = (x & y) | (~x & z)
          = (x & y) + (~x & z)
          = z ^ (x & (y ^ z))
All take three logical instructions on PPC.  The second form
lets you add it into the accumulator e in two pieces:

+#define STEPD0(t)				\
+	add	%r0,%r15,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	and	%r0,RC(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	andc	%r0,RD(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;

The XOR version f2(x,y,z) = x^y^z is of course trivial:

+#define STEPD1(t)				\
+	add	%r0,%r15,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	xor	%r0,RC(t),RB(t);		\
+	xor	%r0,%r0,RA(t);			\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;

And the last function, majority(x,y,z), can be written as:
f3(x,y,z) = (x & y) | (y & z) | (z & x)
          = (x & y) | z & (x | y)
          = (x & y) | z & (x ^ y)
          = (x & y) + z & (x ^ y)
The last form again allows evaluation with one temporary and
two adds into RE(t):

+#define STEPD2(t)				\
+	add	%r0,%r15,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	and	%r0,RC(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	xor	%r0,RC(t),RB(t);		\
+	and	%r0,%r0,RA(t);			\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;
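
Since all three functions are bitwise, a throwaway test over the eight
one-bit combinations is enough to machine-check every identity above
(hypothetical C, nothing to do with the patch itself):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t x, y, z;

	for (x = 0; x < 2; x++)
	for (y = 0; y < 2; y++)
	for (z = 0; z < 2; z++) {
		uint32_t f1 = (x & y) | (~x & z);	   /* x ? y : z */
		uint32_t f3 = (x & y) | (y & z) | (z & x); /* majority  */

		assert(f1 == ((x & y) + (~x & z)));
		assert(f1 == (z ^ (x & (y ^ z))));
		assert(f3 == ((x & y) | (z & (x | y))));
		assert(f3 == ((x & y) | (z & (x ^ y))));
		assert(f3 == ((x & y) + (z & (x ^ y))));
	}
	printf("all identities hold\n");
	return 0;
}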

Other notes:
- You don't need to decrement %r1 before saving registers.
  The PPC calling convention defines a "red zone" below the
  current stack pointer that is guaranteed never to be touched
  by signal handlers or the like.  This is specifically for
  leaf procedure optimization, and is at least 224 bytes.
- Is that many stw/lwz instructions faster than stmw/lmw?
  The latter is at least more cache-friendly.
- You can avoid saving and restoring %r15 by recycling %r5 for that
  purpose; it's not used after the mtctr %r5.
- (Note, by the way, that while the PPC supports unaligned loads, using
  lmw to load unaligned data is SLOW on the G5; multiple lwz instructions
  are faster in that case.)
- The above changes actually save enough registers to cache the whole hash[5]
  in registers as well, eliminating *all* unnecessary load/store traffic.

With all of the above changes, your sha1ppc.S file turns into:

diff -urN git.orig/ppc/sha1ppc.S git/ppc/sha1ppc.S
--- /dev/null	2005-04-04 12:56:19.000000000 +1000
+++ git/ppc/sha1ppc.S	2005-04-22 16:29:19.000000000 +1000
@@ -0,0 +1,142 @@
+/*
+ * SHA-1 implementation for PowerPC.
+ *
+ * Copyright (C) 2005 Paul Mackerras <paulus@samba.org>
+ */
+
+/*
+ * We roll the registers for A, B, C, D, E around on each
+ * iteration; A on iteration t is B on iteration t+1, and so on.
+ * We use registers 6 - 10 for this.  (Registers 27 - 31 hold
+ * the previous values.)
+ */
+#define RA(t)	((((t)+4)%5)+6)
+#define RB(t)	((((t)+3)%5)+6)
+#define RC(t)	((((t)+2)%5)+6)
+#define RD(t)	((((t)+1)%5)+6)
+#define RE(t)	((((t)+0)%5)+6)
+
+/* We use registers 11 - 26 for the W values */
+#define W(t)	(((t)%16)+11)
+
+#define STEPD0(t)				\
+	add	%r0,%r5,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	and	%r0,RC(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	andc	%r0,RD(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;
+
+#define STEPD1(t)				\
+	add	%r0,%r5,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	xor	%r0,RC(t),RB(t);		\
+	xor	%r0,%r0,RA(t);			\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;
+
+#define STEPD2(t)				\
+	add	%r0,%r15,W(t);			\
+	add	RE(t),RE(t),%r0;		\
+	and	%r0,RC(t),RB(t);		\
+	add	RE(t),RE(t),%r0;		\
+	xor	%r0,RC(t),RB(t);		\
+	and	%r0,%r0,RA(t);			\
+	add	RE(t),RE(t),%r0;		\
+	rotlwi	%r0,RA(t),5;			\
+	rotlwi	RB(t),RB(t),30;			\
+	add	RE(t),RE(t),%r0;
+
+#define LOADW(t)				\
+	lwz	W(t),(t)*4(%r4)
+
+#define UPDATEW(t)				\
+	xor	%r0,W((t)-3),W((t)-8);		\
+	xor	W(t),W((t)-16),W((t)-14);	\
+	xor	W(t),W(t),%r0;			\
+	rotlwi	W(t),W(t),1
+
+#define STEP0LD4(t)				\
+	STEPD0(t);     LOADW((t)+4);		\
+	STEPD0((t)+1); LOADW((t)+5);		\
+	STEPD0((t)+2); LOADW((t)+6);		\
+	STEPD0((t)+3); LOADW((t)+7)
+
+#define STEPUP4(t, fn)				\
+	STEP##fn(t);     UPDATEW((t)+4);	\
+	STEP##fn((t)+1); UPDATEW((t)+5);	\
+	STEP##fn((t)+2); UPDATEW((t)+6);	\
+	STEP##fn((t)+3); UPDATEW((t)+7)
+
+#define STEPUP20(t, fn)				\
+	STEPUP4(t, fn);				\
+	STEPUP4((t)+4, fn);			\
+	STEPUP4((t)+8, fn);			\
+	STEPUP4((t)+12, fn);			\
+	STEPUP4((t)+16, fn)
+
+	.globl	sha1_core
+sha1_core:
+	stmw	%r13,-76(%r1)
+
+	/* Load up A - E */
+	lmw	%r27,0(%r3)
+
+	mtctr	%r5
+
+1:	mr	RA(0),%r27
+	LOADW(0)
+	mr	RB(0),%r28
+	LOADW(1)
+	mr	RC(0),%r29
+	LOADW(2)
+	mr	RD(0),%r30
+	LOADW(3)
+	mr	RE(0),%r31
+
+	lis	%r5,0x5a82	/* K0-19 */
+	ori	%r5,%r5,0x7999
+	STEP0LD4(0)
+	STEP0LD4(4)
+	STEP0LD4(8)
+	STEPUP4(12, D0)
+	STEPUP4(16, D0)
+
+	lis	%r5,0x6ed9	/* K20-39 */
+	ori	%r5,%r5,0xeba1
+	STEPUP20(20, D1)
+
+	lis	%r5,0x8f1b	/* K40-59 */
+	ori	%r5,%r5,0xbcdc
+	STEPUP20(40, D2)
+
+	lis	%r5,0xca62	/* K60-79 */
+	ori	%r5,%r5,0xc1d6
+	STEPUP4(60, D1)
+	STEPUP4(64, D1)
+	STEPUP4(68, D1)
+	STEPUP4(72, D1)
+	STEPD1(76)
+	STEPD1(77)
+	STEPD1(78)
+	STEPD1(79)
+
+	/* Add results to original values */
+	add	%r27,%r27,RA(0)
+	add	%r28,%r28,RB(0)
+	add	%r29,%r29,RC(0)
+	add	%r30,%r30,RD(0)
+	add	%r31,%r31,RE(0)
+
+	addi	%r4,%r4,64
+	bdnz	1b
+
+	/* Save final hash, restore registers, and return */
+	stmw	%r27,0(%r3)
+	lmw	%r13,-76(%r1)
+	blr

The above changes, if you want them, are placed in the public domain.
It assembles, but I can't test it.  Have fun.

(I might also replace "t" with "i" in the macros, but figured that
would make comparing versions annoying.  Of course, part of the reason
was confusion with the T register, which is now eliminated...)


* Re: [PATCH] PPC assembly implementation of SHA1
From: linux @ 2005-04-23 13:03 UTC
  To: paulus; +Cc: git, linux

One bug I spotted too late: the first line of the STEPD2
macro still references %r15; that should be the new home of k, %r5.

Sorry about that.


* Re: [PATCH] PPC assembly implementation of SHA1
From: Benjamin Herrenschmidt @ 2005-04-24  2:49 UTC
  To: linux; +Cc: Paul Mackerras, git

On Sat, 2005-04-23 at 12:42 +0000, linux@horizon.com wrote:

> - You don't need to decrement %r1 before saving registers.
>   The PPC calling convention defines a "red zone" below the
>   current stack pointer that is guaranteed never to be touched
>   by signal handlers or the like.  This is specifically for
>   leaf procedure optimization, and is at least 224 bytes.

On ELF ppc32 ABI ? Hrm... the SysV ABI document says

"The only permitted references with negative offsets from the stack
pointer are those described here for establishing a stack frame."

I know MacOS had a red zone, but do we ?

> - Is that many stw/lwz instructions faster than stmw/lmw?
>   The latter is at least more cache-friendly.

More cache-friendly? Hrm.. I wouldn't be sure. Also, I remember reading
a while ago that those store/load multiple instructions aren't that
recommended. They aren't very good on some CPU models. I would expect
the bus/cache bandwidth to be the limiting factor here anyway, and as
you point out below, they don't deal with unaligned access very well
in all cases.
 



* Re: [PATCH] PPC assembly implementation of SHA1
From: Paul Mackerras @ 2005-04-24  4:40 UTC
  To: linux; +Cc: git

linux@horizon.com writes:

> I was working on the same thing, but hindered by lack of access to PPC
> hardware.  I notice that you also took advantage of the unaligned load
> support and native byte order to do the hash straight from the source.

Yes. :)  In previous experiments (in the context of trying different
ways to do memcpy) I found that doing unaligned word loads is faster
than doing aligned loads plus extra rotate and mask instructions to
get the bytes you want together.

> But I came up with a few additional refinements:
> 
> - You are using three temporaries (%r0, %r6, and RT(x)) for your
>   round functions.  You only need one temporary (%r0) for all the functions.
>   (Plus %r15 for k)

The reason I used more than one temporary is that I was trying to put
dependent instructions as far apart as reasonably possible, to
minimize the chances of pipeline stalls.  Given that the 970 does
register renaming and out-of-order execution, I don't know how
essential that is, but it can't hurt.

> All are three logical instrunctions on PPC.  The second form
> lets you add it into the accumulator e in two pieces:

A sequence of adds into a single register is going to incur the
2-cycle latency between generation and use of a value; i.e. the adds
will only issue on every second cycle.  I think we are better off
making the dataflow more like a tree than a linear chain where
possible.
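
For example (an illustrative fragment, not code from either version;
chain and tree are made-up names), the same multi-input sum can be
associated two ways:

#include <stdint.h>

/* Four dependent adds: with a 2-cycle result latency, each add must
 * wait for the previous one, so the chain takes roughly 8 cycles. */
uint32_t chain(uint32_t e, uint32_t r5, uint32_t f, uint32_t w, uint32_t k)
{
	e += k;
	e += w;
	e += f;
	e += r5;
	return e;
}

/* Tree shape: (k + w) and (f + r5) are independent and can issue in
 * the same cycle on separate ALUs; the depth is 3 adds, not 4. */
uint32_t tree(uint32_t e, uint32_t r5, uint32_t f, uint32_t w, uint32_t k)
{
	return e + ((k + w) + (f + r5));
}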

> And the last function, majority(x,y,z), can be written as:
> f3(x,y,z) = (x & y) | (y & z) | (z & x)
>           = (x & y) | z & (x | y)
>           = (x & y) | z & (x ^ y)
>           = (x & y) + z & (x ^ y)

That's cute, I hadn't thought of that.

> - You don't need to decrement %r1 before saving registers.
>   The PPC calling convention defines a "red zone" below the
>   current stack pointer that is guaranteed never to be touched
>   by signal handlers or the like.  This is specifically for
>   leaf procedure optimization, and is at least 224 bytes.

Not in the ppc32 ELF ABI - you are not supposed to touch memory below
the stack pointer.  The kernel is more forgiving than that, and in
fact you can currently use the red zone without anything bad
happening, but you really shouldn't.

> - Is that many stw/lwz instructions faster than stmw/lmw?
>   The latter is at least more cache-friendly.

I believe the stw/lwz and the stmw/lmw will actually execute at the
same speed on the 970, but I have seen lwz/stw go faster than lmw/stmw
on other machines.  In any case we aren't executing the prolog and
epilog as often as the instructions in the main loop, hopefully.

> - You can avoid saving and restoring %r15 by recycling %r5 for that
>   purpose; it's not used after the mtctr %r5.

True.

> - The above changes actually save enough registers to cache the whole hash[5]
>   in registers as well, eliminating *all* unnecessary load/store traffic.

That's cool.

> With all of the above changes, your sha1ppc.S file turns into:

I added a stwu and an addi to make a stack frame, and changed %r15 to
%r5 as you mentioned in another message.  I tried it in a little test
program I have that calls SHA1_Update 256,000 times with a buffer of
4096 zero bytes, i.e. it processes 1000MB.  Your version seems to be
about 2% faster; it took 4.53 seconds compared to 4.62 for mine.  But
it also gives the wrong answer; I haven't investigated why.

Thanks,
Paul.


* Re: [PATCH] PPC assembly implementation of SHA1
From: Wayne Scott @ 2005-04-24 12:04 UTC
  To: Paul Mackerras; +Cc: linux, git

_The_ book on expression rewriting tricks like this, especially for
the PPC, is "Hacker's Delight" by Henry Warren.  Great reading!!!
http://www.hackersdelight.org/

-Wayne



* Re: [PATCH] PPC assembly implementation of SHA1
From: linux @ 2005-04-25  0:16 UTC
  To: paulus; +Cc: git, linux

> Yes. :)  In previous experiments (in the context of trying different
> ways to do memcpy) I found that doing unaligned word loads is faster
> than doing aligned loads plus extra rotate and mask instructions to
> get the bytes you want together.

The PPC970, at least, supports unaligned loads within one cache line
(64 bytes for L1 hit; 32 bytes for L1 miss) directly.  If the load
crosses the line, the processor backs up and re-issues it as two
loads and a merge.

Multiple-word loads can really suffer from this, as when the fault
hits, the *entire instruction* is aborted and re-issued as a series
of aligned loads and merges.

But for a single load, it's probably cheaper on average to use the
hardware 15 times out of 16 and take the retry the 16th.  (A misaligned
64-byte block spans at most two cache lines, so at most one of its 16
word loads straddles a boundary.)

> But I came up with a few additional refinements:
> 
>> - You are using three temporaries (%r0, %r6, and RT(x)) for your
>>   round functions.  You only need one temporary (%r0) for all the functions.
>>   (Plus %r15 for k)

> The reason I used more than one temporary is that I was trying to put
> dependent instructions as far apart as reasonably possible, to
> minimize the chances of pipeline stalls.  Given that the 970 does
> register renaming and out-of-order execution, I don't know how
> essential that is, but it can't hurt.

It's a good general idea, but the PPC970 only has two integer ALUs, so
it can't get too clever.

>> All take three logical instructions on PPC.  The second form
>> lets you add it into the accumulator e in two pieces:

> A sequence of adds into a single register is going to incur the
> 2-cycle latency between generation and use of a value; i.e. the adds
> will only issue on every second cycle.  I think we are better off
> making the dataflow more like a tree than a linear chain where
> possible.

Grumble, complain... you're right.  I didn't know it had a 2-cycle
dependency.  Time to reschedule those inner loops.  Still, the multi-input
sum representation gives you a lot of scheduling flexibility.

Given this, it has to be scheduled 4-wide, which is a lot trickier.
The steps are 9, 8, and 10 instructions long (plus 4 instructions
for UPDATEW on most of them), and there's a 5- or 6-input sum to
compute in that time.

I'll stare at the dependency graph a bit and see if I can do better.

>> - You don't need to decrement %r1 before saving registers.
>>   The PPC calling convention defines a "red zone" below the
>>   current stack pointer that is guaranteed never to be touched
>>   by signal handlers or the like.  This is specifically for
>>   leaf procedure optimization, and is at least 224 bytes.

> Not in the ppc32 ELF ABI - you are not supposed to touch memory below
> the stack pointer.  The kernel is more forgiving than that, and in
> fact you can currently use the red zone without anything bad
> happening, but you really shouldn't.

Oh!  I didn't know that!  Thank you for enlightening me!

>> - Is that many stw/lwz instructions faster than stmw/lmw?
>>   The latter is at least more cache-friendly.

> I believe the stw/lwz and the stmw/lmw will actually execute at the
> same speed on the 970, but I have seen lwz/stw go faster than lmw/stmw
> on other machines.  In any case we aren't executing the prolog and
> epilog as often as the instructions in the main loop, hopefully.

Yes, and reducing the I-cache footprint seems useful.

>> With all of the above changes, your sha1ppc.S file turns into:

> I added a stwu and an addi to make a stack frame, and changed %r15 to
> %r5 as you mentioned in another message.  I tried it in a little test
> program I have that calls SHA1_Update 256,000 times with a buffer of
> 4096 zero bytes, i.e. it processes 1000MB.  Your version seems to be
> about 2% faster; it took 4.53 seconds compared to 4.62 for mine.  But
> it also gives the wrong answer; I haven't investigated why.

Grumble, damn.  The standard document has all the register values for
every round, so if you set up the same plaintext they use and step
through the code with that at your side, the problem should jump
out at you.
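
A stand-alone C model that prints the working variables after each
round makes that comparison mechanical.  Something like this
(hypothetical debugging aid, algorithm straight from FIPS 180-1):

#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* Process one padded 64-byte block, printing A..E after every round
 * for comparison with the standard's tables or a register dump. */
static void sha1_block_trace(uint32_t h[5], const unsigned char *p)
{
	uint32_t w[80], a, b, c, d, e, f, k, t;
	int i;

	for (i = 0; i < 16; i++)
		w[i] = ((uint32_t)p[4*i] << 24) | (p[4*i+1] << 16) |
		       (p[4*i+2] << 8) | p[4*i+3];
	for (i = 16; i < 80; i++)
		w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);

	a = h[0]; b = h[1]; c = h[2]; d = h[3]; e = h[4];
	for (i = 0; i < 80; i++) {
		if (i < 20) {
			f = d ^ (b & (c ^ d));		k = 0x5a827999;
		} else if (i < 40) {
			f = b ^ c ^ d;			k = 0x6ed9eba1;
		} else if (i < 60) {
			f = (b & c) + (d & (b ^ c));	k = 0x8f1bbcdc;
		} else {
			f = b ^ c ^ d;			k = 0xca62c1d6;
		}
		t = rol32(a, 5) + f + e + w[i] + k;
		e = d; d = c; c = rol32(b, 30); b = a; a = t;
		printf("t=%2d: %08x %08x %08x %08x %08x\n",
		       i, a, b, c, d, e);
	}
	h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
}

int main(void)
{
	/* "abc" padded to one block: 0x80, zeros, 64-bit bit count. */
	unsigned char blk[64] = { 'a', 'b', 'c', 0x80 };
	uint32_t h[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE,
			  0x10325476, 0xC3D2E1F0 };

	blk[63] = 24;	/* message length in bits */
	sha1_block_trace(h, blk);
	printf("digest: %08x%08x%08x%08x%08x\n",
	       h[0], h[1], h[2], h[3], h[4]);
	return 0;
}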

Anyway, now that I know the pipeline issues better, I'll try to
reschedule it and see if I can improve the situation.  I may have
to understand the dispatch group issues pretty thoroughly, too.

http://www.alphaworks.ibm.com/tech/simppc
might be of interest.

I'll try to find the bug while I'm at it.  Would you be willing to
benchmark some code for me?

Thanks!

