public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] small sha512 cleanup
@ 2004-10-01 19:31 Denis Vlasenko
  2004-10-01 20:38 ` [PATCH] reduce sha512_transform() stack usage, speedup Denis Vlasenko
  0 siblings, 1 reply; 4+ messages in thread
From: Denis Vlasenko @ 2004-10-01 19:31 UTC (permalink / raw)
  To: jmorris, davem; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 299 bytes --]

This looks like an open-coded be64_to_cpu().
GCC produces rather poor code for it.
be64_to_cpu() produces asm()s which are ~4 times shorter.

Compile-tested only.

I am not sure whether input can be 64bit-unaligned.
If it indeed can be, replace:

((u64*)(input))[I]  ->  get_unaligned( ((u64*)(input))+I )
--
vda

[-- Attachment #2: sha512.c.diff --]
[-- Type: text/x-diff, Size: 1004 bytes --]

Replaces tons of horrible GCC-generated code
with a small, clean version.
While we're at it, fix whitespace.

--- linux-2.6.9-rc3/crypto/sha512.c.org	Thu Sep 30 07:09:44 2004
+++ linux-2.6.9-rc3/crypto/sha512.c	Thu Sep 30 07:10:36 2004
@@ -104,27 +104,12 @@
 
 static inline void LOAD_OP(int I, u64 *W, const u8 *input)
 {
-        u64 t1  = input[(8*I)  ] & 0xff;
-        t1 <<= 8;
-        t1 |= input[(8*I)+1] & 0xff;
-        t1 <<= 8;
-        t1 |= input[(8*I)+2] & 0xff;
-        t1 <<= 8;
-        t1 |= input[(8*I)+3] & 0xff;
-        t1 <<= 8;
-        t1 |= input[(8*I)+4] & 0xff;
-        t1 <<= 8;
-        t1 |= input[(8*I)+5] & 0xff;
-        t1 <<= 8;
-        t1 |= input[(8*I)+6] & 0xff;
-        t1 <<= 8;
-        t1 |= input[(8*I)+7] & 0xff;
-        W[I] = t1;
+	W[I] = __be64_to_cpu( ((u64*)(input))[I] );
 }
 
 static inline void BLEND_OP(int I, u64 *W)
 {
-        W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
+	W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16];
 }
 
 static void

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH] reduce sha512_transform() stack usage, speedup
  2004-10-01 19:31 [PATCH] small sha512 cleanup Denis Vlasenko
@ 2004-10-01 20:38 ` Denis Vlasenko
  2004-10-01 20:43   ` David S. Miller
  0 siblings, 1 reply; 4+ messages in thread
From: Denis Vlasenko @ 2004-10-01 20:38 UTC (permalink / raw)
  To: jmorris, davem; +Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1493 bytes --]

On top of previous:

Patch moves large temporary u64 W[80]
from stack to ctx struct:

* reduces stack usage by 640 bytes
* saves one 640-byte memset() per sha512_transform()
  (W is still cleared once, after *all* iterations are done)
* quite unexpectedly saves 1.6k of code on i386,
  because stack offsets now fit into 8 bits
  and many stack-addressing insns got 3 bytes smaller:

# size sha512.o.org sha512.o
text       data     bss     dec     hex filename
8281        372       0    8653    21cd sha512.o.org
6649        372       0    7021    1b6d sha512.o

# objdump -d sha512.o.org | cut -b9- >sha512.d.org
# objdump -d sha512.o | cut -b9- >sha512.d
# diff -u sha512.d.org sha512.d
[snip]
 :      8b 4b 28                mov    0x28(%ebx),%ecx
 :      8b 5b 2c                mov    0x2c(%ebx),%ebx
-:      89 8d 44 fd ff ff       mov    %ecx,0xfffffd44(%ebp)
-:      89 9d 48 fd ff ff       mov    %ebx,0xfffffd48(%ebp)
-:      89 9d f4 fc ff ff       mov    %ebx,0xfffffcf4(%ebp)
+:      89 4d c4                mov    %ecx,0xffffffc4(%ebp)
+:      89 5d c8                mov    %ebx,0xffffffc8(%ebp)
+:      89 9d 64 ff ff ff       mov    %ebx,0xffffff64(%ebp)
 :      8b 5d 08                mov    0x8(%ebp),%ebx
-:      89 8d f0 fc ff ff       mov    %ecx,0xfffffcf0(%ebp)
+:      89 8d 60 ff ff ff       mov    %ecx,0xffffff60(%ebp)
 :      8b 42 30                mov    0x30(%edx),%eax
 :      8b 52 34                mov    0x34(%edx),%edx
[snip]

WARNING: compile tested only.
--
vda

[-- Attachment #2: sha512.c.W.patch --]
[-- Type: text/x-diff, Size: 1198 bytes --]

--- linux-2.6.9-rc3/crypto/sha512.c.org	Fri Oct  1 22:17:14 2004
+++ linux-2.6.9-rc3/crypto/sha512.c	Fri Oct  1 23:20:13 2004
@@ -30,6 +30,7 @@
 	u64 state[8];
 	u32 count[4];
 	u8 buf[128];
+	u64 W[80];
 };
 
 static inline u64 Ch(u64 x, u64 y, u64 z)
@@ -113,10 +114,9 @@
 }
 
 static void
-sha512_transform(u64 *state, const u8 *input)
+sha512_transform(u64 *state, u64 *W, const u8 *input)
 {
 	u64 a, b, c, d, e, f, g, h, t1, t2;
-	u64 W[80];
 
 	int i;
 
@@ -157,7 +157,6 @@
 
 	/* erase our data */
 	a = b = c = d = e = f = g = h = t1 = t2 = 0;
-	memset(W, 0, 80 * sizeof(u64));
 }
 
 static void
@@ -215,10 +214,10 @@
 	/* Transform as many times as possible. */
 	if (len >= part_len) {
 		memcpy(&sctx->buf[index], data, part_len);
-		sha512_transform(sctx->state, sctx->buf);
+		sha512_transform(sctx->state, sctx->W, sctx->buf);
 
 		for (i = part_len; i + 127 < len; i+=128)
-			sha512_transform(sctx->state, &data[i]);
+			sha512_transform(sctx->state, sctx->W, &data[i]);
 
 		index = 0;
 	} else {
@@ -227,6 +226,9 @@
 
 	/* Buffer remaining input */
 	memcpy(&sctx->buf[index], &data[i], len - i);
+
+	/* erase our data */
+	memset(sctx->W, 0, sizeof(sctx->W));
 }
 
 static void

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] reduce sha512_transform() stack usage, speedup
  2004-10-01 20:38 ` [PATCH] reduce sha512_transform() stack usage, speedup Denis Vlasenko
@ 2004-10-01 20:43   ` David S. Miller
  2004-10-01 21:22     ` Denis Vlasenko
  0 siblings, 1 reply; 4+ messages in thread
From: David S. Miller @ 2004-10-01 20:43 UTC (permalink / raw)
  To: Denis Vlasenko; +Cc: jmorris, linux-kernel

On Fri, 1 Oct 2004 23:38:11 +0300
Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua> wrote:

> WARNING: compile tested only.

You can't claim a "speed up" if you only compile test your
changes.  Neither can you expect us to apply patches in
such a case.

It's not that difficult to load the tcrypt module and make
sure all the tests for the module you're changing still
pass.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] reduce sha512_transform() stack usage, speedup
  2004-10-01 20:43   ` David S. Miller
@ 2004-10-01 21:22     ` Denis Vlasenko
  0 siblings, 0 replies; 4+ messages in thread
From: Denis Vlasenko @ 2004-10-01 21:22 UTC (permalink / raw)
  To: David S. Miller; +Cc: jmorris, linux-kernel

On Friday 01 October 2004 23:43, David S. Miller wrote:
> On Fri, 1 Oct 2004 23:38:11 +0300
> Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua> wrote:
> 
> > WARNING: compile tested only.
> 
> You can't claim a "speed up" if you only compile test your
> changes.  Neither can you expect us to apply patches in
> such a case.

The speedup is rather tiny, most probably not measurable.
The patch optimizes out some memsets; otherwise the code
is practically unchanged.
 
> It's not that difficult to load the tcrypt module and make
> sure all the tests for the module you're changing still
> pass.

Done:

testing sha384
test 1:
cb00753f45a35e8bb5a03d699ac65007272c32ab0eded1631a8b605a43ff5bed8086072ba1e7cc2358baeca134c825a7
pass
test 2:
3391fdddfc8dc7393707a65b1b4709397cf8b1d162af05abfe8f450de5f36bc6b0455a8520bc4e6f5fe95b1fe3c8452b
pass
test 3:
09330c33f71147e83d192fc782cd1b4753111b173b3b05d22fa08086e3b0f712fcc7c71a557e2db966c3e9fa91746039
pass
test 4:
3d208973ab3508dbbd7e2c2862ba290ad3010e4978c198dc4d8fd014e582823a89e16f9b2a7bbc1ac938e2d199e8bea4
pass
testing sha384 across pages
test 1:
3d208973ab3508dbbd7e2c2862ba290ad3010e4978c198dc4d8fd014e582823a89e16f9b2a7bbc1ac938e2d199e8bea4
pass

testing sha512
test 1:
ddaf35a193617abacc417349ae20413112e6fa4e89a97ea20a9eeee64b55d39a2192992a274fc1a836ba3c23a3feebbd454d4423643ce80e2a9ac94fa54ca49f
pass
test 2:
204a8fc6dda82f0a0ced7beb8e08a41657c16ef468b228a8279be331a703c33596fd15c13b1b07f9aa1d3bea57789ca031ad85c7a71dd70354ec631238ca3445
pass
test 3:
8e959b75dae313da8cf4f72814fc143f8f7779c6eb9f7fa17299aeadb6889018501d289e4900f7e4331b99dec4b5433ac7d329eeb6dd26545e96e55b874be909
pass
test 4:
930d0cefcb30ff1133b6898121f1cf3d27578afcafe8677c5257cf069911f75d8f5831b56ebfda67b278e66dff8b84fe2b2870f742a580d8edb41987232850c9
pass
testing sha512 across pages
test 1:
930d0cefcb30ff1133b6898121f1cf3d27578afcafe8677c5257cf069911f75d8f5831b56ebfda67b278e66dff8b84fe2b2870f742a580d8edb41987232850c9
pass

Please consider applying.
--
vda


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2004-10-01 21:33 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-10-01 19:31 [PATCH] small sha512 cleanup Denis Vlasenko
2004-10-01 20:38 ` [PATCH] reduce sha512_transform() stack usage, speedup Denis Vlasenko
2004-10-01 20:43   ` David S. Miller
2004-10-01 21:22     ` Denis Vlasenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox