[PATCH] introduce generic 64bit rotations and i386 asm optimized version

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] introduce generic 64bit rotations and i386 asm optimized version
@ 2005-04-19  6:18 Denis Vlasenko
  2005-04-19  6:21 ` [PATCH] sha512: replace open-coded be64 conversions Denis Vlasenko
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Denis Vlasenko @ 2005-04-19  6:18 UTC (permalink / raw)
  To: linux-kernel; +Cc: Matt Mackall

[-- Attachment #1: Type: text/plain, Size: 1994 bytes --]

This is done because on 32bit arch gcc generates suboptimal code
from C implementation:

old:
      40:       83 ea 80                sub    $0xffffff80,%edx
      43:       89 95 50 ff ff ff       mov    %edx,0xffffff50(%ebp)
      49:       8b bd 50 ff ff ff       mov    0xffffff50(%ebp),%edi
      4f:       8b 4f f0                mov    0xfffffff0(%edi),%ecx
      52:       8b 5f f4                mov    0xfffffff4(%edi),%ebx
      55:       89 ce                   mov    %ecx,%esi
      57:       0f ac de 13             shrd   $0x13,%ebx,%esi
      5b:       31 c0                   xor    %eax,%eax
      5d:       89 df                   mov    %ebx,%edi
      5f:       89 ca                   mov    %ecx,%edx
      61:       c1 e2 0d                shl    $0xd,%edx
      64:       09 c6                   or     %eax,%esi
      66:       c1 ef 13                shr    $0x13,%edi
      69:       89 d8                   mov    %ebx,%eax
      6b:       09 d7                   or     %edx,%edi
      6d:       c1 e8 1d                shr    $0x1d,%eax
      70:       31 d2                   xor    %edx,%edx
      72:       89 85 40 ff ff ff       mov    %eax,0xffffff40(%ebp)
      78:       89 95 44 ff ff ff       mov    %edx,0xffffff44(%ebp)
new:
      40:       83 ea 80                sub    $0xffffff80,%edx
      43:       89 95 48 ff ff ff       mov    %edx,0xffffff48(%ebp)
      49:       8b 9d 48 ff ff ff       mov    0xffffff48(%ebp),%ebx
      4f:       8b 4b f0                mov    0xfffffff0(%ebx),%ecx
      52:       8b 5b f4                mov    0xfffffff4(%ebx),%ebx
      55:       89 c8                   mov    %ecx,%eax
      57:       89 da                   mov    %ebx,%edx
      59:       0f ac d0 13             shrd   $0x13,%edx,%eax
      5d:       0f ac ca 13             shrd   $0x13,%ecx,%edx
      61:       89 45 a0                mov    %eax,0xffffffa0(%ebp)
      64:       89 55 a4                mov    %edx,0xffffffa4(%ebp)
--
vda

[-- Attachment #2: 0.ror64.patch --]
[-- Type: text/x-diff, Size: 3980 bytes --]

diff -urpN 2.6.12-rc2.1.be/include/asm-i386/bitops.h 2.6.12-rc2.2.ror/include/asm-i386/bitops.h
--- 2.6.12-rc2.1.be/include/asm-i386/bitops.h	Tue Oct 19 00:54:36 2004
+++ 2.6.12-rc2.2.ror/include/asm-i386/bitops.h	Tue Apr 19 08:20:07 2005
@@ -7,6 +7,7 @@
 
 #include <linux/config.h>
 #include <linux/compiler.h>
+#include <linux/types.h>
 
 /*
  * These have to be done with inline assembly: that way the bit-setting
@@ -431,9 +432,81 @@ static inline int ffs(int x)
 #define hweight16(x) generic_hweight16(x)
 #define hweight8(x) generic_hweight8(x)
 
-#endif /* __KERNEL__ */
+/*
+ * 64bit rotations
+ * (gcc3 seems to be clever enough to do 32bit ones just fine)
+ *
+ * Why "i" and "I" constraints do not work? gcc says:
+ * "warning: asm operand 2 probably doesn't match constraints"
+ * "error: impossible constraint in 'asm'"
+ * Will use "Ic" for now. If gcc will fail to do const propagation
+ * and will try to stuff constant into ecx, shld %3,... will expand
+ * to shld %ecx,... and assembler will moan.
+ * Do not 'fix' by changing to shld %b3,...
+ *
+ * Have to stick to edx,eax pair only because
+ * gcc has limited support for 64bit asm parameters
+ */
+#define constant_rol64(v,c) \
+	({						\
+	u64 vv = (v);					\
+	if(!(c&63)) {					\
+	} else if((c&63)==1) {				\
+		asm (					\
+		"	shldl	$1,%%edx,%%eax	\n"	\
+		"	rcll	$1,%%edx	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv)				\
+		);					\
+	} else if((c&63)==63) {				\
+		asm (					\
+		"	shrdl	$1,%%edx,%%eax	\n"	\
+		"	rcrl	$1,%%edx	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv)				\
+		);					\
+	} else if((c&63)<32) {				\
+		asm (					\
+		"	shldl	%3,%%edx,%%eax	\n"	\
+		"	shldl	%3,%2,%%edx	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv), "r" (vv), "Ic" (c&63)	\
+		);					\
+	} else if((c&63)>32) {				\
+		asm (					\
+		"	shrdl	%3,%%edx,%%eax	\n"	\
+		"	shrdl	%3,%2,%%edx	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv), "r" (vv), "Ic" (64-(c&63))	\
+		);					\
+	} else /* (c&63)==32 */ {			\
+		asm (					\
+		"	xchgl	%%edx,%%eax	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv)				\
+		);					\
+	}						\
+	vv;						\
+	})
+/*
+ * Unfortunately 64bit rotations with non-constant count
+ * have issues with cnt>=32. Using C code instead
+ */
+static inline u64 rol64(u64 x,int num) {
+	if(__builtin_constant_p(num))
+		return constant_rol64(x,num);
+	/* Hmmm... shall we do cnt&=63 here? */
+	return ((x<<num) | (x>>(64-num)));
+}
+static inline u64 ror64(u64 x,int num) {
+	if(__builtin_constant_p(num))
+		return constant_rol64(x,(64-num));
+	return ((x>>num) | (x<<(64-num)));
+}
+
+#define ARCH_HAS_ROL64
+#define ARCH_HAS_ROR64
 
-#ifdef __KERNEL__
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_bit((nr),(unsigned long*)addr)
diff -urpN 2.6.12-rc2.1.be/include/linux/bitops.h 2.6.12-rc2.2.ror/include/linux/bitops.h
--- 2.6.12-rc2.1.be/include/linux/bitops.h	Mon Apr 18 22:55:10 2005
+++ 2.6.12-rc2.2.ror/include/linux/bitops.h	Tue Apr 19 00:25:28 2005
@@ -41,7 +41,7 @@ static inline int generic_ffs(int x)
  * fls: find last bit set.
  */
 
-static __inline__ int generic_fls(int x)
+static inline int generic_fls(int x)
 {
 	int r = 32;
 
@@ -76,7 +76,7 @@ static __inline__ int generic_fls(int x)
  */
 #include <asm/bitops.h>
 
-static __inline__ int get_bitmask_order(unsigned int count)
+static inline int get_bitmask_order(unsigned int count)
 {
 	int order;
 	
@@ -155,5 +155,31 @@ static inline __u32 ror32(__u32 word, un
 {
 	return (word >> shift) | (word << (32 - shift));
 }
+
+/*
+ * rol64 - rotate a 64-bit value left
+ *
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+#ifndef ARCH_HAS_ROL64
+static inline __u64 rol64(__u64 word, unsigned int shift)
+{
+	return (word << shift) | (word >> (64 - shift));
+}
+#endif
+
+/*
+ * ror64 - rotate a 64-bit value right
+ *
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+#ifndef ARCH_HAS_ROR64
+static inline __u64 ror64(__u64 word, unsigned int shift)
+{
+	return (word >> shift) | (word << (64 - shift));
+}
+#endif
 
 #endif

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] sha512: replace open-coded be64 conversions
  2005-04-19  6:18 [PATCH] introduce generic 64bit rotations and i386 asm optimized version Denis Vlasenko
@ 2005-04-19  6:21 ` Denis Vlasenko
  2005-04-19  6:26   ` [PATCH] sha512: use 64bit rotations Denis Vlasenko
  2005-04-19  9:04   ` [PATCH] sha512: replace open-coded be64 conversions Jesper Juhl
  2005-04-19  6:46 ` [PATCH] introduce generic 64bit rotations and i386 asm optimized version YOSHIFUJI Hideaki / 吉藤英明
  2005-04-19 19:15 ` Geert Uytterhoeven
  2 siblings, 2 replies; 14+ messages in thread
From: Denis Vlasenko @ 2005-04-19  6:21 UTC (permalink / raw)
  To: linux-kernel; +Cc: Matt Mackall

[-- Attachment #1: Type: text/plain, Size: 71 bytes --]

This + next patch were "modprobe tcrypt" tested.
See next mail.
--
vda

[-- Attachment #2: 1.be.patch --]
[-- Type: text/x-diff, Size: 2535 bytes --]

diff -urpN 2.6.12-rc2.0.orig/crypto/sha512.c 2.6.12-rc2.1.be/crypto/sha512.c
--- 2.6.12-rc2.0.orig/crypto/sha512.c	Tue Apr 19 00:20:12 2005
+++ 2.6.12-rc2.1.be/crypto/sha512.c	Mon Apr 18 23:31:54 2005
@@ -105,7 +105,7 @@ static const u64 sha512_K[80] = {
 
 static inline void LOAD_OP(int I, u64 *W, const u8 *input)
 {
-	W[I] = __be64_to_cpu( ((__be64*)(input))[I] );
+	W[I] = __be64_to_cpu( ((__be64*)input)[I] );
 }
 
 static inline void BLEND_OP(int I, u64 *W)
@@ -124,9 +124,8 @@ sha512_transform(u64 *state, u64 *W, con
         for (i = 0; i < 16; i++)
                 LOAD_OP(i, W, input);
 
-        for (i = 16; i < 80; i++) {
+        for (i = 16; i < 80; i++)
                 BLEND_OP(i, W);
-        }
 
 	/* load the state into our registers */
 	a=state[0];   b=state[1];   c=state[2];   d=state[3];  
@@ -238,36 +237,17 @@ sha512_final(void *ctx, u8 *hash)
 	
         static u8 padding[128] = { 0x80, };
 
-        u32 t;
-	u64 t2;
         u8 bits[128];
 	unsigned int index, pad_len;
-	int i, j;
+	int i;
 
-        index = pad_len = t = i = j = 0;
-        t2 = 0;
+	index = pad_len = i = 0;
 
 	/* Save number of bits */
-	t = sctx->count[0];
-	bits[15] = t; t>>=8;
-	bits[14] = t; t>>=8;
-	bits[13] = t; t>>=8;
-	bits[12] = t; 
-	t = sctx->count[1];
-	bits[11] = t; t>>=8;
-	bits[10] = t; t>>=8;
-	bits[9 ] = t; t>>=8;
-	bits[8 ] = t; 
-	t = sctx->count[2];
-	bits[7 ] = t; t>>=8;
-	bits[6 ] = t; t>>=8;
-	bits[5 ] = t; t>>=8;
-	bits[4 ] = t; 
-	t = sctx->count[3];
-	bits[3 ] = t; t>>=8;
-	bits[2 ] = t; t>>=8;
-	bits[1 ] = t; t>>=8;
-	bits[0 ] = t; 
+	((__be32*)bits)[3] = __cpu_to_be32(sctx->count[0]);
+	((__be32*)bits)[2] = __cpu_to_be32(sctx->count[1]);
+	((__be32*)bits)[1] = __cpu_to_be32(sctx->count[2]);
+	((__be32*)bits)[0] = __cpu_to_be32(sctx->count[3]);
 
 	/* Pad out to 112 mod 128. */
 	index = (sctx->count[0] >> 3) & 0x7f;
@@ -278,17 +258,8 @@ sha512_final(void *ctx, u8 *hash)
 	sha512_update(sctx, bits, 16);
 
 	/* Store state in digest */
-	for (i = j = 0; i < 8; i++, j += 8) {
-		t2 = sctx->state[i];
-		hash[j+7] = (char)t2 & 0xff; t2>>=8;
-		hash[j+6] = (char)t2 & 0xff; t2>>=8;
-		hash[j+5] = (char)t2 & 0xff; t2>>=8;
-		hash[j+4] = (char)t2 & 0xff; t2>>=8;
-		hash[j+3] = (char)t2 & 0xff; t2>>=8;
-		hash[j+2] = (char)t2 & 0xff; t2>>=8;
-		hash[j+1] = (char)t2 & 0xff; t2>>=8;
-		hash[j  ] = (char)t2 & 0xff;
-	}
+	for (i = 0; i < 8; i++)
+		((__be64*)hash)[i] = __cpu_to_be64(sctx->state[i]);
 	
 	/* Zeroize sensitive information. */
 	memset(sctx, 0, sizeof(struct sha512_ctx));

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] sha512: use 64bit rotations
  2005-04-19  6:21 ` [PATCH] sha512: replace open-coded be64 conversions Denis Vlasenko
@ 2005-04-19  6:26   ` Denis Vlasenko
  2005-04-19  6:28     ` [PATCH] sha512: fix whitespace Denis Vlasenko
  2005-04-19  9:04   ` [PATCH] sha512: replace open-coded be64 conversions Jesper Juhl
  1 sibling, 1 reply; 14+ messages in thread
From: Denis Vlasenko @ 2005-04-19  6:26 UTC (permalink / raw)
  To: linux-kernel; +Cc: Matt Mackall

[-- Attachment #1: Type: text/plain, Size: 1679 bytes --]

this results in following code size changes on i386:

# size sha512_org.o sha512_be.o sha512_ror.o
   text    data     bss     dec     hex filename
   5339     364       0    5703    1647 sha512_org.o
   5221     364       0    5585    15d1 sha512_be.o
   5122     364       0    5486    156e sha512_ror.o

modprobe tcrypt:

testing sha384
test 1:
cb00753f45a35e8bb5a03d699ac65007272c32ab0eded1631a8b605a43ff5bed8086072ba1e7cc2358baeca134c825a7
pass
test 2:
3391fdddfc8dc7393707a65b1b4709397cf8b1d162af05abfe8f450de5f36bc6b0455a8520bc4e6f5fe95b1fe3c8452b
pass
test 3:
09330c33f71147e83d192fc782cd1b4753111b173b3b05d22fa08086e3b0f712fcc7c71a557e2db966c3e9fa91746039
pass
test 4:
3d208973ab3508dbbd7e2c2862ba290ad3010e4978c198dc4d8fd014e582823a89e16f9b2a7bbc1ac938e2d199e8bea4
pass
testing sha384 across pages
test 1:
3d208973ab3508dbbd7e2c2862ba290ad3010e4978c198dc4d8fd014e582823a89e16f9b2a7bbc1ac938e2d199e8bea4
pass

testing sha512
test 1:
ddaf35a193617abacc417349ae20413112e6fa4e89a97ea20a9eeee64b55d39a2192992a274fc1a836ba3c23a3feebbd454d4423643ce80e2a9ac94fa54ca49f
pass
test 2:
204a8fc6dda82f0a0ced7beb8e08a41657c16ef468b228a8279be331a703c33596fd15c13b1b07f9aa1d3bea57789ca031ad85c7a71dd70354ec631238ca3445
pass
test 3:
8e959b75dae313da8cf4f72814fc143f8f7779c6eb9f7fa17299aeadb6889018501d289e4900f7e4331b99dec4b5433ac7d329eeb6dd26545e96e55b874be909
pass
test 4:
930d0cefcb30ff1133b6898121f1cf3d27578afcafe8677c5257cf069911f75d8f5831b56ebfda67b278e66dff8b84fe2b2870f742a580d8edb41987232850c9
pass
testing sha512 across pages
test 1:
930d0cefcb30ff1133b6898121f1cf3d27578afcafe8677c5257cf069911f75d8f5831b56ebfda67b278e66dff8b84fe2b2870f742a580d8edb41987232850c9
pass
--
vda

[-- Attachment #2: 2.ror.patch --]
[-- Type: text/x-diff, Size: 1285 bytes --]

diff -urpN 2.6.12-rc2.1.be/crypto/sha512.c 2.6.12-rc2.2.ror/crypto/sha512.c
--- 2.6.12-rc2.1.be/crypto/sha512.c	Mon Apr 18 23:31:54 2005
+++ 2.6.12-rc2.2.ror/crypto/sha512.c	Mon Apr 18 23:37:20 2005
@@ -43,11 +43,6 @@ static inline u64 Maj(u64 x, u64 y, u64 
         return (x & y) | (z & (x | y));
 }
 
-static inline u64 RORu64(u64 x, u64 y)
-{
-        return (x >> y) | (x << (64 - y));
-}
-
 static const u64 sha512_K[80] = {
         0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
         0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
@@ -78,10 +73,10 @@ static const u64 sha512_K[80] = {
         0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
 };
 
-#define e0(x)       (RORu64(x,28) ^ RORu64(x,34) ^ RORu64(x,39))
-#define e1(x)       (RORu64(x,14) ^ RORu64(x,18) ^ RORu64(x,41))
-#define s0(x)       (RORu64(x, 1) ^ RORu64(x, 8) ^ (x >> 7))
-#define s1(x)       (RORu64(x,19) ^ RORu64(x,61) ^ (x >> 6))
+#define e0(x)       (ror64(x,28) ^ ror64(x,34) ^ ror64(x,39))
+#define e1(x)       (ror64(x,14) ^ ror64(x,18) ^ ror64(x,41))
+#define s0(x)       (ror64(x, 1) ^ ror64(x, 8) ^ (x >> 7))
+#define s1(x)       (ror64(x,19) ^ ror64(x,61) ^ (x >> 6))
 
 /* H* initial state for SHA-512 */
 #define H0         0x6a09e667f3bcc908ULL

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] sha512: fix whitespace
  2005-04-19  6:26   ` [PATCH] sha512: use 64bit rotations Denis Vlasenko
@ 2005-04-19  6:28     ` Denis Vlasenko
  2005-04-19  6:31       ` [PATCH] ia64: use 64bit rotations Denis Vlasenko
  0 siblings, 1 reply; 14+ messages in thread
From: Denis Vlasenko @ 2005-04-19  6:28 UTC (permalink / raw)
  To: linux-kernel; +Cc: Matt Mackall

[-- Attachment #1: Type: text/plain, Size: 78 bytes --]

While we're at it, fix inconsistent tab/space usage.

No code changes.
--
vda

[-- Attachment #2: 3.ws.patch --]
[-- Type: text/x-diff, Size: 11302 bytes --]

diff -urpN 2.6.12-rc2.2.ror/crypto/sha512.c 2.6.12-rc2.3.ws/crypto/sha512.c
--- 2.6.12-rc2.2.ror/crypto/sha512.c	Mon Apr 18 23:37:20 2005
+++ 2.6.12-rc2.3.ws/crypto/sha512.c	Mon Apr 18 23:48:18 2005
@@ -35,42 +35,42 @@ struct sha512_ctx {
 
 static inline u64 Ch(u64 x, u64 y, u64 z)
 {
-        return z ^ (x & (y ^ z));
+	return z ^ (x & (y ^ z));
 }
 
 static inline u64 Maj(u64 x, u64 y, u64 z)
 {
-        return (x & y) | (z & (x | y));
+	return (x & y) | (z & (x | y));
 }
 
 static const u64 sha512_K[80] = {
-        0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
-        0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
-        0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
-        0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
-        0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
-        0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
-        0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
-        0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
-        0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
-        0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
-        0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
-        0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
-        0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
-        0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
-        0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
-        0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
-        0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
-        0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
-        0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
-        0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
-        0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
-        0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
-        0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
-        0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
-        0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
-        0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
-        0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
+	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+	0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+	0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+	0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+	0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+	0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+	0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+	0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+	0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+	0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+	0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+	0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+	0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+	0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL,
 };
 
 #define e0(x)       (ror64(x,28) ^ ror64(x,34) ^ ror64(x,39))
@@ -116,16 +116,16 @@ sha512_transform(u64 *state, u64 *W, con
 	int i;
 
 	/* load the input */
-        for (i = 0; i < 16; i++)
-                LOAD_OP(i, W, input);
+	for (i = 0; i < 16; i++)
+		LOAD_OP(i, W, input);
 
-        for (i = 16; i < 80; i++)
-                BLEND_OP(i, W);
+	for (i = 16; i < 80; i++)
+		BLEND_OP(i, W);
 
 	/* load the state into our registers */
-	a=state[0];   b=state[1];   c=state[2];   d=state[3];  
-	e=state[4];   f=state[5];   g=state[6];   h=state[7];  
-  
+	a=state[0]; b=state[1]; c=state[2]; d=state[3];
+	e=state[4]; f=state[5]; g=state[6]; h=state[7];
+
 	/* now iterate */
 	for (i=0; i<80; i+=8) {
 		t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i  ] + W[i  ];
@@ -145,9 +145,9 @@ sha512_transform(u64 *state, u64 *W, con
 		t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[i+7];
 		t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;
 	}
-  
-	state[0] += a; state[1] += b; state[2] += c; state[3] += d;  
-	state[4] += e; state[5] += f; state[6] += g; state[7] += h;  
+
+	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
+	state[4] += e; state[5] += f; state[6] += g; state[7] += h;
 
 	/* erase our data */
 	a = b = c = d = e = f = g = h = t1 = t2 = 0;
@@ -156,7 +156,7 @@ sha512_transform(u64 *state, u64 *W, con
 static void
 sha512_init(void *ctx)
 {
-        struct sha512_ctx *sctx = ctx;
+	struct sha512_ctx *sctx = ctx;
 	sctx->state[0] = H0;
 	sctx->state[1] = H1;
 	sctx->state[2] = H2;
@@ -172,29 +172,29 @@ sha512_init(void *ctx)
 static void
 sha384_init(void *ctx)
 {
-        struct sha512_ctx *sctx = ctx;
-        sctx->state[0] = HP0;
-        sctx->state[1] = HP1;
-        sctx->state[2] = HP2;
-        sctx->state[3] = HP3;
-        sctx->state[4] = HP4;
-        sctx->state[5] = HP5;
-        sctx->state[6] = HP6;
-        sctx->state[7] = HP7;
-        sctx->count[0] = sctx->count[1] = sctx->count[2] = sctx->count[3] = 0;
-        memset(sctx->buf, 0, sizeof(sctx->buf));
+	struct sha512_ctx *sctx = ctx;
+	sctx->state[0] = HP0;
+	sctx->state[1] = HP1;
+	sctx->state[2] = HP2;
+	sctx->state[3] = HP3;
+	sctx->state[4] = HP4;
+	sctx->state[5] = HP5;
+	sctx->state[6] = HP6;
+	sctx->state[7] = HP7;
+	sctx->count[0] = sctx->count[1] = sctx->count[2] = sctx->count[3] = 0;
+	memset(sctx->buf, 0, sizeof(sctx->buf));
 }
 
 static void
 sha512_update(void *ctx, const u8 *data, unsigned int len)
 {
-        struct sha512_ctx *sctx = ctx;
+	struct sha512_ctx *sctx = ctx;
 
 	unsigned int i, index, part_len;
 
 	/* Compute number of bytes mod 128 */
 	index = (unsigned int)((sctx->count[0] >> 3) & 0x7F);
-	
+
 	/* Update number of bits */
 	if ((sctx->count[0] += (len << 3)) < (len << 3)) {
 		if ((sctx->count[1] += 1) < 1)
@@ -202,9 +202,9 @@ sha512_update(void *ctx, const u8 *data,
 				sctx->count[3]++;
 		sctx->count[1] += (len >> 29);
 	}
-	
-        part_len = 128 - index;
-	
+
+	part_len = 128 - index;
+
 	/* Transform as many times as possible. */
 	if (len >= part_len) {
 		memcpy(&sctx->buf[index], data, part_len);
@@ -228,11 +228,11 @@ sha512_update(void *ctx, const u8 *data,
 static void
 sha512_final(void *ctx, u8 *hash)
 {
-        struct sha512_ctx *sctx = ctx;
-	
-        static u8 padding[128] = { 0x80, };
+	struct sha512_ctx *sctx = ctx;
+
+	static u8 padding[128] = { 0x80, };
 
-        u8 bits[128];
+	u8 bits[128];
 	unsigned int index, pad_len;
 	int i;
 
@@ -255,70 +255,70 @@ sha512_final(void *ctx, u8 *hash)
 	/* Store state in digest */
 	for (i = 0; i < 8; i++)
 		((__be64*)hash)[i] = __cpu_to_be64(sctx->state[i]);
-	
+
 	/* Zeroize sensitive information. */
 	memset(sctx, 0, sizeof(struct sha512_ctx));
 }
 
 static void sha384_final(void *ctx, u8 *hash)
 {
-        struct sha512_ctx *sctx = ctx;
-        u8 D[64];
+	struct sha512_ctx *sctx = ctx;
+	u8 D[64];
 
-        sha512_final(sctx, D);
+	sha512_final(sctx, D);
 
-        memcpy(hash, D, 48);
-        memset(D, 0, 64);
+	memcpy(hash, D, 48);
+	memset(D, 0, 64);
 }
 
 static struct crypto_alg sha512 = {
-        .cra_name       = "sha512",
-        .cra_flags      = CRYPTO_ALG_TYPE_DIGEST,
-        .cra_blocksize  = SHA512_HMAC_BLOCK_SIZE,
-        .cra_ctxsize    = sizeof(struct sha512_ctx),
-        .cra_module     = THIS_MODULE,
-        .cra_list       = LIST_HEAD_INIT(sha512.cra_list),
-        .cra_u          = { .digest = {
-                                .dia_digestsize = SHA512_DIGEST_SIZE,
-                                .dia_init       = sha512_init,
-                                .dia_update     = sha512_update,
-                                .dia_final      = sha512_final }
-        }
+	.cra_name	= "sha512",
+	.cra_flags	= CRYPTO_ALG_TYPE_DIGEST,
+	.cra_blocksize	= SHA512_HMAC_BLOCK_SIZE,
+	.cra_ctxsize	= sizeof(struct sha512_ctx),
+	.cra_module	= THIS_MODULE,
+	.cra_list	= LIST_HEAD_INIT(sha512.cra_list),
+	.cra_u		= { .digest = {
+				.dia_digestsize	= SHA512_DIGEST_SIZE,
+				.dia_init	= sha512_init,
+				.dia_update	= sha512_update,
+				.dia_final	= sha512_final }
+	}
 };
 
 static struct crypto_alg sha384 = {
-        .cra_name       = "sha384",
-        .cra_flags      = CRYPTO_ALG_TYPE_DIGEST,
-        .cra_blocksize  = SHA384_HMAC_BLOCK_SIZE,
-        .cra_ctxsize    = sizeof(struct sha512_ctx),
-        .cra_module     = THIS_MODULE,
-        .cra_list       = LIST_HEAD_INIT(sha384.cra_list),
-        .cra_u          = { .digest = {
-                                .dia_digestsize = SHA384_DIGEST_SIZE,
-                                .dia_init       = sha384_init,
-                                .dia_update     = sha512_update,
-                                .dia_final      = sha384_final }
-        }
+	.cra_name	= "sha384",
+	.cra_flags	= CRYPTO_ALG_TYPE_DIGEST,
+	.cra_blocksize	= SHA384_HMAC_BLOCK_SIZE,
+	.cra_ctxsize	= sizeof(struct sha512_ctx),
+	.cra_module	= THIS_MODULE,
+	.cra_list	= LIST_HEAD_INIT(sha384.cra_list),
+	.cra_u		= { .digest = {
+				.dia_digestsize	= SHA384_DIGEST_SIZE,
+				.dia_init	= sha384_init,
+				.dia_update	= sha512_update,
+				.dia_final	= sha384_final }
+	}
 };
 
 MODULE_ALIAS("sha384");
 
 static int __init init(void)
 {
-        int ret = 0;
+	int ret = 0;
 
-        if ((ret = crypto_register_alg(&sha384)) < 0)
-                goto out;
-        if ((ret = crypto_register_alg(&sha512)) < 0)
-                crypto_unregister_alg(&sha384);
+	if ((ret = crypto_register_alg(&sha384)) < 0)
+		goto out;
+	if ((ret = crypto_register_alg(&sha512)) < 0)
+		crypto_unregister_alg(&sha384);
 out:
-        return ret;
+	return ret;
 }
 
 static void __exit fini(void)
 {
-        crypto_unregister_alg(&sha384);
-        crypto_unregister_alg(&sha512);
+	crypto_unregister_alg(&sha384);
+	crypto_unregister_alg(&sha512);
 }
 
 module_init(init);

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH] ia64: use 64bit rotations
  2005-04-19  6:28     ` [PATCH] sha512: fix whitespace Denis Vlasenko
@ 2005-04-19  6:31       ` Denis Vlasenko
  0 siblings, 0 replies; 14+ messages in thread
From: Denis Vlasenko @ 2005-04-19  6:31 UTC (permalink / raw)
  To: linux-kernel; +Cc: Matt Mackall

[-- Attachment #1: Type: text/plain, Size: 141 bytes --]

Remove local 64bit rotation function, use generic one.

Patch is untested.

I believe there is no more 64bit rotations in the kernel.
--
vda

[-- Attachment #2: 4.ia64.patch --]
[-- Type: text/x-diff, Size: 1318 bytes --]

diff -urpN 2.6.12-rc2.3.ws/arch/ia64/kernel/ptrace.c 2.6.12-rc2.4.ia64/arch/ia64/kernel/ptrace.c
--- 2.6.12-rc2.3.ws/arch/ia64/kernel/ptrace.c	Mon Apr 18 22:54:38 2005
+++ 2.6.12-rc2.4.ia64/arch/ia64/kernel/ptrace.c	Tue Apr 19 00:44:30 2005
@@ -80,7 +80,7 @@ ia64_get_scratch_nat_bits (struct pt_reg
 			dist = 64 + bit - first;			\
 		else							\
 			dist = bit - first;				\
-		ia64_rotr(unat, dist) & mask;				\
+		ror64(unat, dist) & mask;				\
 	})
 	unsigned long val;
 
@@ -119,7 +119,7 @@ ia64_put_scratch_nat_bits (struct pt_reg
 			dist = 64 + bit - first;			\
 		else							\
 			dist = bit - first;				\
-		ia64_rotl(nat & mask, dist);				\
+		rol64(nat & mask, dist);				\
 	})
 	unsigned long scratch_unat;
 
diff -urpN 2.6.12-rc2.3.ws/include/asm-ia64/processor.h 2.6.12-rc2.4.ia64/include/asm-ia64/processor.h
--- 2.6.12-rc2.3.ws/include/asm-ia64/processor.h	Thu Feb  3 11:40:06 2005
+++ 2.6.12-rc2.4.ia64/include/asm-ia64/processor.h	Tue Apr 19 00:43:30 2005
@@ -652,14 +652,6 @@ ia64_get_dbr (__u64 regnum)
 	return retval;
 }
 
-static inline __u64
-ia64_rotr (__u64 w, __u64 n)
-{
-	return (w >> n) | (w << (64 - n));
-}
-
-#define ia64_rotl(w,n)	ia64_rotr((w), (64) - (n))
-
 /*
  * Take a mapped kernel address and return the equivalent address
  * in the region 7 identity mapped virtual area.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] introduce generic 64bit rotations and i386 asm optimized version
  2005-04-19  6:18 [PATCH] introduce generic 64bit rotations and i386 asm optimized version Denis Vlasenko
  2005-04-19  6:21 ` [PATCH] sha512: replace open-coded be64 conversions Denis Vlasenko
@ 2005-04-19  6:46 ` YOSHIFUJI Hideaki / 吉藤英明
  2005-04-19  6:55   ` Denis Vlasenko
  2005-04-19 20:20   ` Domen Puncer
  2005-04-19 19:15 ` Geert Uytterhoeven
  2 siblings, 2 replies; 14+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2005-04-19  6:46 UTC (permalink / raw)
  To: vda; +Cc: linux-kernel, mpm, yoshfuji

In article <200504190918.10279.vda@port.imtp.ilyichevsk.odessa.ua> (at Tue, 19 Apr 2005 09:18:10 +0300), Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua> says:

> diff -urpN 2.6.12-rc2.1.be/include/linux/bitops.h 2.6.12-rc2.2.ror/include/linux/bitops.h
> --- 2.6.12-rc2.1.be/include/linux/bitops.h	Mon Apr 18 22:55:10 2005
> +++ 2.6.12-rc2.2.ror/include/linux/bitops.h	Tue Apr 19 00:25:28 2005
> @@ -41,7 +41,7 @@ static inline int generic_ffs(int x)
>   * fls: find last bit set.
>   */
>  
> -static __inline__ int generic_fls(int x)
> +static inline int generic_fls(int x)
>  {
>  	int r = 32;
>  
> @@ -76,7 +76,7 @@ static __inline__ int generic_fls(int x)
>   */
>  #include <asm/bitops.h>
>  
> -static __inline__ int get_bitmask_order(unsigned int count)
> +static inline int get_bitmask_order(unsigned int count)
>  {
>  	int order;
>  	

Please keep using __inline__, not inline.

> +#ifndef ARCH_HAS_ROL64
> +static inline __u64 rol64(__u64 word, unsigned int shift)
> +{
> +	return (word << shift) | (word >> (64 - shift));
> +}
> +#endif
:
> +#ifndef ARCH_HAS_ROR64
> +static inline __u64 ror64(__u64 word, unsigned int shift)
> +{
> +	return (word >> shift) | (word << (64 - shift));
> +}
> +#endif
>  
>  #endif

please use __inline__, in header files.

-- 
Hideaki YOSHIFUJI @ USAGI Project <yoshfuji@linux-ipv6.org>
Homepage: http://www.yoshifuji.org/~hideaki/
GPG FP  : 9022 65EB 1ECF 3AD1 0BDF  80D8 4807 F894 E062 0EEA

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] introduce generic 64bit rotations and i386 asm optimized version
  2005-04-19  6:46 ` [PATCH] introduce generic 64bit rotations and i386 asm optimized version YOSHIFUJI Hideaki / 吉藤英明
@ 2005-04-19  6:55   ` Denis Vlasenko
  2005-04-19 20:20   ` Domen Puncer
  1 sibling, 0 replies; 14+ messages in thread
From: Denis Vlasenko @ 2005-04-19  6:55 UTC (permalink / raw)
  To: linux-kernel; +Cc: mpm, yoshfuji

[-- Attachment #1: Type: text/plain, Size: 132 bytes --]

On Tuesday 19 April 2005 09:46, YOSHIFUJI Hideaki wrote:
> Please keep using __inline__, not inline.

Updated patch follows.
--
vda

[-- Attachment #2: 0.ror64.patch --]
[-- Type: text/x-diff, Size: 12527 bytes --]

diff -urpN 2.6.12-rc2.1.be/include/asm-i386/bitops.h 2.6.12-rc2.2.ror/include/asm-i386/bitops.h
--- 2.6.12-rc2.1.be/include/asm-i386/bitops.h	Tue Oct 19 00:54:36 2004
+++ 2.6.12-rc2.2.ror/include/asm-i386/bitops.h	Tue Apr 19 09:48:34 2005
@@ -7,6 +7,7 @@
 
 #include <linux/config.h>
 #include <linux/compiler.h>
+#include <linux/types.h>
 
 /*
  * These have to be done with inline assembly: that way the bit-setting
@@ -39,7 +40,7 @@
  * Note that @nr may be almost arbitrarily large; this function is not
  * restricted to acting on a single-word quantity.
  */
-static inline void set_bit(int nr, volatile unsigned long * addr)
+static __inline__ void set_bit(int nr, volatile unsigned long * addr)
 {
 	__asm__ __volatile__( LOCK_PREFIX
 		"btsl %1,%0"
@@ -56,7 +57,7 @@ static inline void set_bit(int nr, volat
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __set_bit(int nr, volatile unsigned long * addr)
+static __inline__ void __set_bit(int nr, volatile unsigned long * addr)
 {
 	__asm__(
 		"btsl %1,%0"
@@ -74,7 +75,7 @@ static inline void __set_bit(int nr, vol
  * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
  * in order to ensure changes are visible on other processors.
  */
-static inline void clear_bit(int nr, volatile unsigned long * addr)
+static __inline__ void clear_bit(int nr, volatile unsigned long * addr)
 {
 	__asm__ __volatile__( LOCK_PREFIX
 		"btrl %1,%0"
@@ -82,7 +83,7 @@ static inline void clear_bit(int nr, vol
 		:"Ir" (nr));
 }
 
-static inline void __clear_bit(int nr, volatile unsigned long * addr)
+static __inline__ void __clear_bit(int nr, volatile unsigned long * addr)
 {
 	__asm__ __volatile__(
 		"btrl %1,%0"
@@ -101,7 +102,7 @@ static inline void __clear_bit(int nr, v
  * If it's called on the same region of memory simultaneously, the effect
  * may be that only one operation succeeds.
  */
-static inline void __change_bit(int nr, volatile unsigned long * addr)
+static __inline__ void __change_bit(int nr, volatile unsigned long * addr)
 {
 	__asm__ __volatile__(
 		"btcl %1,%0"
@@ -119,7 +120,7 @@ static inline void __change_bit(int nr, 
  * Note that @nr may be almost arbitrarily large; this function is not
  * restricted to acting on a single-word quantity.
  */
-static inline void change_bit(int nr, volatile unsigned long * addr)
+static __inline__ void change_bit(int nr, volatile unsigned long * addr)
 {
 	__asm__ __volatile__( LOCK_PREFIX
 		"btcl %1,%0"
@@ -136,7 +137,7 @@ static inline void change_bit(int nr, vo
  * It may be reordered on other architectures than x86.
  * It also implies a memory barrier.
  */
-static inline int test_and_set_bit(int nr, volatile unsigned long * addr)
+static __inline__ int test_and_set_bit(int nr, volatile unsigned long * addr)
 {
 	int oldbit;
 
@@ -156,7 +157,7 @@ static inline int test_and_set_bit(int n
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_set_bit(int nr, volatile unsigned long * addr)
+static __inline__ int __test_and_set_bit(int nr, volatile unsigned long * addr)
 {
 	int oldbit;
 
@@ -176,7 +177,7 @@ static inline int __test_and_set_bit(int
  * It can be reorderdered on other architectures other than x86.
  * It also implies a memory barrier.
  */
-static inline int test_and_clear_bit(int nr, volatile unsigned long * addr)
+static __inline__ int test_and_clear_bit(int nr, volatile unsigned long * addr)
 {
 	int oldbit;
 
@@ -196,7 +197,7 @@ static inline int test_and_clear_bit(int
  * If two examples of this operation race, one can appear to succeed
  * but actually fail.  You must protect multiple accesses with a lock.
  */
-static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
+static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr)
 {
 	int oldbit;
 
@@ -208,7 +209,7 @@ static inline int __test_and_clear_bit(i
 }
 
 /* WARNING: non atomic and it can be reordered! */
-static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
+static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr)
 {
 	int oldbit;
 
@@ -227,7 +228,7 @@ static inline int __test_and_change_bit(
  * This operation is atomic and cannot be reordered.  
  * It also implies a memory barrier.
  */
-static inline int test_and_change_bit(int nr, volatile unsigned long* addr)
+static __inline__ int test_and_change_bit(int nr, volatile unsigned long* addr)
 {
 	int oldbit;
 
@@ -247,12 +248,12 @@ static inline int test_and_change_bit(in
 static int test_bit(int nr, const volatile void * addr);
 #endif
 
-static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
+static __inline__ int constant_test_bit(int nr, const volatile unsigned long *addr)
 {
 	return ((1UL << (nr & 31)) & (addr[nr >> 5])) != 0;
 }
 
-static inline int variable_test_bit(int nr, const volatile unsigned long * addr)
+static __inline__ int variable_test_bit(int nr, const volatile unsigned long * addr)
 {
 	int oldbit;
 
@@ -278,7 +279,7 @@ static inline int variable_test_bit(int 
  * Returns the bit-number of the first zero bit, not the number of the byte
  * containing a bit.
  */
-static inline int find_first_zero_bit(const unsigned long *addr, unsigned size)
+static __inline__ int find_first_zero_bit(const unsigned long *addr, unsigned size)
 {
 	int d0, d1, d2;
 	int res;
@@ -318,7 +319,7 @@ int find_next_zero_bit(const unsigned lo
  * Returns the bit-number of the first set bit, not the number of the byte
  * containing a bit.
  */
-static inline int find_first_bit(const unsigned long *addr, unsigned size)
+static __inline__ int find_first_bit(const unsigned long *addr, unsigned size)
 {
 	int d0, d1;
 	int res;
@@ -352,7 +353,7 @@ int find_next_bit(const unsigned long *a
  *
  * Undefined if no zero exists, so code should check against ~0UL first.
  */
-static inline unsigned long ffz(unsigned long word)
+static __inline__ unsigned long ffz(unsigned long word)
 {
 	__asm__("bsfl %1,%0"
 		:"=r" (word)
@@ -366,7 +367,7 @@ static inline unsigned long ffz(unsigned
  *
  * Undefined if no bit exists, so code should check against 0 first.
  */
-static inline unsigned long __ffs(unsigned long word)
+static __inline__ unsigned long __ffs(unsigned long word)
 {
 	__asm__("bsfl %1,%0"
 		:"=r" (word)
@@ -388,7 +389,7 @@ static inline unsigned long __ffs(unsign
  * unlikely to be set. It's guaranteed that at least one of the 140
  * bits is cleared.
  */
-static inline int sched_find_first_bit(const unsigned long *b)
+static __inline__ int sched_find_first_bit(const unsigned long *b)
 {
 	if (unlikely(b[0]))
 		return __ffs(b[0]);
@@ -409,7 +410,7 @@ static inline int sched_find_first_bit(c
  * the libc and compiler builtin ffs routines, therefore
  * differs in spirit from the above ffz (man ffs).
  */
-static inline int ffs(int x)
+static __inline__ int ffs(int x)
 {
 	int r;
 
@@ -431,9 +432,81 @@ static inline int ffs(int x)
 #define hweight16(x) generic_hweight16(x)
 #define hweight8(x) generic_hweight8(x)
 
-#endif /* __KERNEL__ */
+/*
+ * 64bit rotations
+ * (gcc3 seems to be clever enough to do 32bit ones just fine)
+ *
+ * Why "i" and "I" constraints do not work? gcc says:
+ * "warning: asm operand 2 probably doesn't match constraints"
+ * "error: impossible constraint in 'asm'"
+ * Will use "Ic" for now. If gcc will fail to do const propagation
+ * and will try to stuff constant into ecx, shld %3,... will expand
+ * to shld %ecx,... and assembler will moan.
+ * Do not 'fix' by changing to shld %b3,...
+ *
+ * Have to stick to edx,eax pair only because
+ * gcc has limited support for 64bit asm parameters
+ */
+#define constant_rol64(v,c) \
+	({						\
+	u64 vv = (v);					\
+	if(!(c&63)) {					\
+	} else if((c&63)==1) {				\
+		asm (					\
+		"	shldl	$1,%%edx,%%eax	\n"	\
+		"	rcll	$1,%%edx	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv)				\
+		);					\
+	} else if((c&63)==63) {				\
+		asm (					\
+		"	shrdl	$1,%%edx,%%eax	\n"	\
+		"	rcrl	$1,%%edx	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv)				\
+		);					\
+	} else if((c&63)<32) {				\
+		asm (					\
+		"	shldl	%3,%%edx,%%eax	\n"	\
+		"	shldl	%3,%2,%%edx	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv), "r" (vv), "Ic" (c&63)	\
+		);					\
+	} else if((c&63)>32) {				\
+		asm (					\
+		"	shrdl	%3,%%edx,%%eax	\n"	\
+		"	shrdl	%3,%2,%%edx	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv), "r" (vv), "Ic" (64-(c&63))	\
+		);					\
+	} else /* (c&63)==32 */ {			\
+		asm (					\
+		"	xchgl	%%edx,%%eax	\n"	\
+		: "=&A" (vv)				\
+		: "0" (vv)				\
+		);					\
+	}						\
+	vv;						\
+	})
+/*
+ * Unfortunately 64bit rotations with non-constant count
+ * have issues with cnt>=32. Using C code instead
+ */
+static __inline__ u64 rol64(u64 x,int num) {
+	if(__builtin_constant_p(num))
+		return constant_rol64(x,num);
+	/* Hmmm... shall we do cnt&=63 here? */
+	return ((x<<num) | (x>>(64-num)));
+}
+static __inline__ u64 ror64(u64 x,int num) {
+	if(__builtin_constant_p(num))
+		return constant_rol64(x,(64-num));
+	return ((x>>num) | (x<<(64-num)));
+}
+
+#define ARCH_HAS_ROL64
+#define ARCH_HAS_ROR64
 
-#ifdef __KERNEL__
 
 #define ext2_set_bit(nr,addr) \
 	__test_and_set_bit((nr),(unsigned long*)addr)
diff -urpN 2.6.12-rc2.1.be/include/linux/bitops.h 2.6.12-rc2.2.ror/include/linux/bitops.h
--- 2.6.12-rc2.1.be/include/linux/bitops.h	Mon Apr 18 22:55:10 2005
+++ 2.6.12-rc2.2.ror/include/linux/bitops.h	Tue Apr 19 09:48:50 2005
@@ -8,7 +8,7 @@
  * differs in spirit from the above ffz (man ffs).
  */
 
-static inline int generic_ffs(int x)
+static __inline__ int generic_ffs(int x)
 {
 	int r = 1;
 
@@ -89,7 +89,7 @@ static __inline__ int get_bitmask_order(
  * of bits set) of a N-bit word
  */
 
-static inline unsigned int generic_hweight32(unsigned int w)
+static __inline__ unsigned int generic_hweight32(unsigned int w)
 {
         unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555);
         res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
@@ -98,7 +98,7 @@ static inline unsigned int generic_hweig
         return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF);
 }
 
-static inline unsigned int generic_hweight16(unsigned int w)
+static __inline__ unsigned int generic_hweight16(unsigned int w)
 {
         unsigned int res = (w & 0x5555) + ((w >> 1) & 0x5555);
         res = (res & 0x3333) + ((res >> 2) & 0x3333);
@@ -106,14 +106,14 @@ static inline unsigned int generic_hweig
         return (res & 0x00FF) + ((res >> 8) & 0x00FF);
 }
 
-static inline unsigned int generic_hweight8(unsigned int w)
+static __inline__ unsigned int generic_hweight8(unsigned int w)
 {
         unsigned int res = (w & 0x55) + ((w >> 1) & 0x55);
         res = (res & 0x33) + ((res >> 2) & 0x33);
         return (res & 0x0F) + ((res >> 4) & 0x0F);
 }
 
-static inline unsigned long generic_hweight64(__u64 w)
+static __inline__ unsigned long generic_hweight64(__u64 w)
 {
 #if BITS_PER_LONG < 64
 	return generic_hweight32((unsigned int)(w >> 32)) +
@@ -129,7 +129,7 @@ static inline unsigned long generic_hwei
 #endif
 }
 
-static inline unsigned long hweight_long(unsigned long w)
+static __inline__ unsigned long hweight_long(unsigned long w)
 {
 	return sizeof(w) == 4 ? generic_hweight32(w) : generic_hweight64(w);
 }
@@ -140,7 +140,7 @@ static inline unsigned long hweight_long
  * @word: value to rotate
  * @shift: bits to roll
  */
-static inline __u32 rol32(__u32 word, unsigned int shift)
+static __inline__ __u32 rol32(__u32 word, unsigned int shift)
 {
 	return (word << shift) | (word >> (32 - shift));
 }
@@ -151,9 +151,35 @@ static inline __u32 rol32(__u32 word, un
  * @word: value to rotate
  * @shift: bits to roll
  */
-static inline __u32 ror32(__u32 word, unsigned int shift)
+static __inline__ __u32 ror32(__u32 word, unsigned int shift)
 {
 	return (word >> shift) | (word << (32 - shift));
 }
+
+/*
+ * rol64 - rotate a 64-bit value left
+ *
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+#ifndef ARCH_HAS_ROL64
+static __inline__ __u64 rol64(__u64 word, unsigned int shift)
+{
+	return (word << shift) | (word >> (64 - shift));
+}
+#endif
+
+/*
+ * ror64 - rotate a 64-bit value right
+ *
+ * @word: value to rotate
+ * @shift: bits to roll
+ */
+#ifndef ARCH_HAS_ROR64
+static __inline__ __u64 ror64(__u64 word, unsigned int shift)
+{
+	return (word >> shift) | (word << (64 - shift));
+}
+#endif
 
 #endif

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] sha512: replace open-coded be64 conversions
  2005-04-19  6:21 ` [PATCH] sha512: replace open-coded be64 conversions Denis Vlasenko
  2005-04-19  6:26   ` [PATCH] sha512: use 64bit rotations Denis Vlasenko
@ 2005-04-19  9:04   ` Jesper Juhl
  2005-04-19 14:12     ` Denis Vlasenko
  1 sibling, 1 reply; 14+ messages in thread
From: Jesper Juhl @ 2005-04-19  9:04 UTC (permalink / raw)
  To: Denis Vlasenko; +Cc: linux-kernel

On Tue, 19 Apr 2005, Denis Vlasenko wrote:

> This + next patch were "modprobe tcrypt" tested.
> See next mail.

Could you please send patches inline instead of as attachments. 
Attachments mean there's more work involved in getting at them to read 
them, and they are a pain when you want to reply and quote the patch to 
comment on it.
Thanks.

-- 
Jesper

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] sha512: replace open-coded be64 conversions
  2005-04-19  9:04   ` [PATCH] sha512: replace open-coded be64 conversions Jesper Juhl
@ 2005-04-19 14:12     ` Denis Vlasenko
  2005-04-19 14:41       ` Jesper Juhl
  0 siblings, 1 reply; 14+ messages in thread
From: Denis Vlasenko @ 2005-04-19 14:12 UTC (permalink / raw)
  To: Jesper Juhl; +Cc: linux-kernel, vital

On Tuesday 19 April 2005 12:04, Jesper Juhl wrote:
> On Tue, 19 Apr 2005, Denis Vlasenko wrote:
> 
> > This + next patch were "modprobe tcrypt" tested.
> > See next mail.
> 
> Could you please send patches inline instead of as attachments. 
> Attachments mean there's more work involved in getting at them to read 
> them, and they are a pain when you want to reply and quote the patch to 
> comment on it.
> Thanks.

I'm afraid Kmail tend to subtly mangle inlined patches.
Then people start to complain that patch does not apply, and rightly so. :(

However, I can try.

I noticed that there is many more open-coded cpu<->be and cpu<->le
conversions. This is a new patch on top of previous patchset.
Seems to pass tcrypt testing.

size:
   text    data     bss     dec     hex filename
   8156     108       0    8264    2048 crypto/anubis.o
   8256     108       0    8364    20ac crypto_orig/anubis.o
  14856     108       0   14964    3a74 crypto/cast5.o
  15242     108       0   15350    3bf6 crypto_orig/cast5.o
  16639     108       0   16747    416b crypto/cast6.o
  17297     108       0   17405    43fd crypto_orig/cast6.o
   9395     244       0    9639    25a7 crypto_orig/des.o
  17981     108       0   18089    46a9 crypto/khazad.o
  18334     108       0   18442    480a crypto_orig/khazad.o
    741     108       0     849     351 crypto/michael_mic.o
    908     108       0    1016     3f8 crypto_orig/michael_mic.o
    666     108       0     774     306 crypto/sha1.o
    739     108       0     847     34f crypto_orig/sha1.o
   9535     108       0    9643    25ab crypto/sha256.o
   9579     108       0    9687    25d7 crypto_orig/sha256.o
  10495     364       0   10859    2a6b crypto/tgr192.o
  10902     364       0   11266    2c02 crypto_orig/tgr192.o
  34114     108       0   34222    85ae crypto/twofish.o
  34290     108       0   34398    865e crypto_orig/twofish.o
  23735     364       0   24099    5e23 crypto/wp512.o
  23872     364       0   24236    5eac crypto_orig/wp512.o

BTW, twofish seems to be loop-unrolled far too much for sanity.
Any objections to reduce that? I have a patch in the works.
--
vda

diff -urpN linux-2.6.12-rc2.0.orig/crypto/aes.c linux-2.6.12-rc2.1.be_le/crypto/aes.c
--- linux-2.6.12-rc2.0.orig/crypto/aes.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/aes.c	Tue Apr 19 14:53:25 2005
@@ -448,4 +448,3 @@ module_exit(aes_fini);
 
 MODULE_DESCRIPTION("Rijndael (AES) Cipher Algorithm");
 MODULE_LICENSE("Dual BSD/GPL");
-
diff -urpN linux-2.6.12-rc2.0.orig/crypto/anubis.c linux-2.6.12-rc2.1.be_le/crypto/anubis.c
--- linux-2.6.12-rc2.0.orig/crypto/anubis.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/anubis.c	Tue Apr 19 15:29:53 2005
@@ -462,7 +462,7 @@ static int anubis_setkey(void *ctx_arg, 
 			 unsigned int key_len, u32 *flags)
 {
 
-	int N, R, i, pos, r;
+	int N, R, i, r;
 	u32 kappa[ANUBIS_MAX_N];
 	u32 inter[ANUBIS_MAX_N];
 
@@ -483,12 +483,8 @@ static int anubis_setkey(void *ctx_arg, 
 	ctx->R = R = 8 + N;
 
 	/* * map cipher key to initial key state (mu): */
-		for (i = 0, pos = 0; i < N; i++, pos += 4) {
-		kappa[i] =
-			(in_key[pos    ] << 24) ^
-			(in_key[pos + 1] << 16) ^
-			(in_key[pos + 2] <<  8) ^
-			(in_key[pos + 3]      );
+	for (i = 0; i < N; i++) {
+		kappa[i] = be32_to_cpu( ((__be32*)in_key)[i] );
 	}
 
 	/*
@@ -578,7 +574,7 @@ static int anubis_setkey(void *ctx_arg, 
 static void anubis_crypt(u32 roundKey[ANUBIS_MAX_ROUNDS + 1][4],
 		u8 *ciphertext, const u8 *plaintext, const int R)
 {
-	int i, pos, r;
+	int i;
 	u32 state[4];
 	u32 inter[4];
 
@@ -586,12 +582,8 @@ static void anubis_crypt(u32 roundKey[AN
 	 * map plaintext block to cipher state (mu)
 	 * and add initial round key (sigma[K^0]):
 	 */
-	for (i = 0, pos = 0; i < 4; i++, pos += 4) {
-		state[i] =
-			(plaintext[pos    ] << 24) ^
-			(plaintext[pos + 1] << 16) ^
-			(plaintext[pos + 2] <<  8) ^
-			(plaintext[pos + 3]      ) ^
+	for (i = 0; i < 4; i++) {
+		state[i] = __be32_to_cpu( ((__be32*)plaintext)[i] ) ^
 			roundKey[0][i];
 	}
 
@@ -599,31 +591,31 @@ static void anubis_crypt(u32 roundKey[AN
 	 * R - 1 full rounds:
 	 */
 
-	for (r = 1; r < R; r++) {
+	for (i = 1; i < R; i++) {
 		inter[0] =
 			T0[(state[0] >> 24)       ] ^
 			T1[(state[1] >> 24)       ] ^
 			T2[(state[2] >> 24)       ] ^
 			T3[(state[3] >> 24)       ] ^
-			roundKey[r][0];
+			roundKey[i][0];
 		inter[1] =
 			T0[(state[0] >> 16) & 0xff] ^
 			T1[(state[1] >> 16) & 0xff] ^
 			T2[(state[2] >> 16) & 0xff] ^
 			T3[(state[3] >> 16) & 0xff] ^
-			roundKey[r][1];
+			roundKey[i][1];
 		inter[2] =
 			T0[(state[0] >>  8) & 0xff] ^
 			T1[(state[1] >>  8) & 0xff] ^
 			T2[(state[2] >>  8) & 0xff] ^
 			T3[(state[3] >>  8) & 0xff] ^
-			roundKey[r][2];
+			roundKey[i][2];
 		inter[3] =
 			T0[(state[0]      ) & 0xff] ^
 			T1[(state[1]      ) & 0xff] ^
 			T2[(state[2]      ) & 0xff] ^
 			T3[(state[3]      ) & 0xff] ^
-			roundKey[r][3];
+			roundKey[i][3];
 		state[0] = inter[0];
 		state[1] = inter[1];
 		state[2] = inter[2];
@@ -663,12 +655,8 @@ static void anubis_crypt(u32 roundKey[AN
 	 * map cipher state to ciphertext block (mu^{-1}):
 	 */
 
-	for (i = 0, pos = 0; i < 4; i++, pos += 4) {
-		u32 w = inter[i];
-		ciphertext[pos    ] = (u8)(w >> 24);
-		ciphertext[pos + 1] = (u8)(w >> 16);
-		ciphertext[pos + 2] = (u8)(w >>  8);
-		ciphertext[pos + 3] = (u8)(w      );
+	for (i = 0; i < 4; i++) {
+		((__be32*)ciphertext)[i] = __cpu_to_be32(inter[i]);
 	}
 }
 
@@ -701,10 +689,7 @@ static struct crypto_alg anubis_alg = {
 
 static int __init init(void)
 {
-	int ret = 0;
-	
-	ret = crypto_register_alg(&anubis_alg);
-	return ret;
+	return crypto_register_alg(&anubis_alg);
 }
 
 static void __exit fini(void)
diff -urpN linux-2.6.12-rc2.0.orig/crypto/cast5.c linux-2.6.12-rc2.1.be_le/crypto/cast5.c
--- linux-2.6.12-rc2.0.orig/crypto/cast5.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/cast5.c	Tue Apr 19 14:53:25 2005
@@ -589,8 +589,8 @@ static void cast5_encrypt(void *ctx, u8 
 	/* (L0,R0) <-- (m1...m64).  (Split the plaintext into left and
 	 * right 32-bit halves L0 = m1...m32 and R0 = m33...m64.)
 	 */
-	l = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-	r = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+	l = be32_to_cpu( ((__be32*)inbuf)[0] );
+	r = be32_to_cpu( ((__be32*)inbuf)[1] );
 
 	/* (16 rounds) for i from 1 to 16, compute Li and Ri as follows:
 	 *  Li = Ri-1;
@@ -634,14 +634,8 @@ static void cast5_encrypt(void *ctx, u8 
 
 	/* c1...c64 <-- (R16,L16).  (Exchange final blocks L16, R16 and
 	 *  concatenate to form the ciphertext.) */
-	outbuf[0] = (r >> 24) & 0xff;
-	outbuf[1] = (r >> 16) & 0xff;
-	outbuf[2] = (r >> 8) & 0xff;
-	outbuf[3] = r & 0xff;
-	outbuf[4] = (l >> 24) & 0xff;
-	outbuf[5] = (l >> 16) & 0xff;
-	outbuf[6] = (l >> 8) & 0xff;
-	outbuf[7] = l & 0xff;
+	((__be32*)outbuf)[0] = cpu_to_be32(r);
+	((__be32*)outbuf)[1] = cpu_to_be32(l);
 }
 
 static void cast5_decrypt(void *ctx, u8 * outbuf, const u8 * inbuf)
@@ -655,8 +649,8 @@ static void cast5_decrypt(void *ctx, u8 
 	Km = c->Km;
 	Kr = c->Kr;
 
-	l = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-	r = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+	l = be32_to_cpu( ((__be32*)inbuf)[0] );
+	r = be32_to_cpu( ((__be32*)inbuf)[1] );
 
 	if (!(c->rr)) {
 		t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]);
@@ -690,14 +684,8 @@ static void cast5_decrypt(void *ctx, u8 
 		t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]);
 	}
 
-	outbuf[0] = (r >> 24) & 0xff;
-	outbuf[1] = (r >> 16) & 0xff;
-	outbuf[2] = (r >> 8) & 0xff;
-	outbuf[3] = r & 0xff;
-	outbuf[4] = (l >> 24) & 0xff;
-	outbuf[5] = (l >> 16) & 0xff;
-	outbuf[6] = (l >> 8) & 0xff;
-	outbuf[7] = l & 0xff;
+	((__be32*)outbuf)[0] = cpu_to_be32(r);
+	((__be32*)outbuf)[1] = cpu_to_be32(l);
 }
 
 static void key_schedule(u32 * x, u32 * z, u32 * k)
@@ -795,13 +783,10 @@ cast5_setkey(void *ctx, const u8 * key, 
 	memset(p_key, 0, 16);
 	memcpy(p_key, key, key_len);
 
-
-	x[0] = p_key[0] << 24 | p_key[1] << 16 | p_key[2] << 8 | p_key[3];
-	x[1] = p_key[4] << 24 | p_key[5] << 16 | p_key[6] << 8 | p_key[7];
-	x[2] =
-	    p_key[8] << 24 | p_key[9] << 16 | p_key[10] << 8 | p_key[11];
-	x[3] =
-	    p_key[12] << 24 | p_key[13] << 16 | p_key[14] << 8 | p_key[15];
+	x[0] = be32_to_cpu( ((__be32*)p_key)[0] );
+	x[1] = be32_to_cpu( ((__be32*)p_key)[1] );
+	x[2] = be32_to_cpu( ((__be32*)p_key)[2] );
+	x[3] = be32_to_cpu( ((__be32*)p_key)[3] );
 
 	key_schedule(x, z, k);
 	for (i = 0; i < 16; i++)
@@ -845,4 +830,3 @@ module_exit(fini);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Cast5 Cipher Algorithm");
-
diff -urpN linux-2.6.12-rc2.0.orig/crypto/cast6.c linux-2.6.12-rc2.1.be_le/crypto/cast6.c
--- linux-2.6.12-rc2.0.orig/crypto/cast6.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/cast6.c	Tue Apr 19 14:53:25 2005
@@ -395,16 +395,14 @@ cast6_setkey(void *ctx, const u8 * in_ke
 	memset (p_key, 0, 32);
 	memcpy (p_key, in_key, key_len);
 	
-	key[0] = p_key[0] << 24 | p_key[1] << 16 | p_key[2] << 8 | p_key[3];		/* A */
-	key[1] = p_key[4] << 24 | p_key[5] << 16 | p_key[6] << 8 | p_key[7];		/* B */
-	key[2] = p_key[8] << 24 | p_key[9] << 16 | p_key[10] << 8 | p_key[11];		/* C */
-	key[3] = p_key[12] << 24 | p_key[13] << 16 | p_key[14] << 8 | p_key[15];	/* D */
-	key[4] = p_key[16] << 24 | p_key[17] << 16 | p_key[18] << 8 | p_key[19];	/* E */
-	key[5] = p_key[20] << 24 | p_key[21] << 16 | p_key[22] << 8 | p_key[23];	/* F */
-	key[6] = p_key[24] << 24 | p_key[25] << 16 | p_key[26] << 8 | p_key[27];	/* G */
-	key[7] = p_key[28] << 24 | p_key[29] << 16 | p_key[30] << 8 | p_key[31];	/* H */
-	
-
+	key[0] = be32_to_cpu( ((__be32*)p_key)[0] );	/* A */
+	key[1] = be32_to_cpu( ((__be32*)p_key)[1] );	/* B */
+	key[2] = be32_to_cpu( ((__be32*)p_key)[2] );	/* C */
+	key[3] = be32_to_cpu( ((__be32*)p_key)[3] );	/* D */
+	key[4] = be32_to_cpu( ((__be32*)p_key)[4] );	/* E */
+	key[5] = be32_to_cpu( ((__be32*)p_key)[5] );	/* F */
+	key[6] = be32_to_cpu( ((__be32*)p_key)[6] );	/* G */
+	key[7] = be32_to_cpu( ((__be32*)p_key)[7] );	/* H */
 
 	for (i = 0; i < 12; i++) {
 		W (key, 2 * i);
@@ -448,10 +446,10 @@ static void cast6_encrypt (void * ctx, u
 	u32 * Km; 
 	u8 * Kr;
 
-	block[0] = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-	block[1] = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
-	block[2] = inbuf[8] << 24 | inbuf[9] << 16 | inbuf[10] << 8 | inbuf[11];
-	block[3] = inbuf[12] << 24 | inbuf[13] << 16 | inbuf[14] << 8 | inbuf[15];
+	block[0] = be32_to_cpu( ((__be32*)inbuf)[0] );
+	block[1] = be32_to_cpu( ((__be32*)inbuf)[1] );
+	block[2] = be32_to_cpu( ((__be32*)inbuf)[2] );
+	block[3] = be32_to_cpu( ((__be32*)inbuf)[3] );
 
 	Km = c->Km[0]; Kr = c->Kr[0]; Q (block, Kr, Km);
 	Km = c->Km[1]; Kr = c->Kr[1]; Q (block, Kr, Km);
@@ -465,24 +463,12 @@ static void cast6_encrypt (void * ctx, u
 	Km = c->Km[9]; Kr = c->Kr[9]; QBAR (block, Kr, Km);
 	Km = c->Km[10]; Kr = c->Kr[10]; QBAR (block, Kr, Km);
 	Km = c->Km[11]; Kr = c->Kr[11]; QBAR (block, Kr, Km);
-	
-	outbuf[0] = (block[0] >> 24) & 0xff;
-	outbuf[1] = (block[0] >> 16) & 0xff;
-	outbuf[2] = (block[0] >> 8) & 0xff;
-	outbuf[3] = block[0] & 0xff;
-	outbuf[4] = (block[1] >> 24) & 0xff;
-	outbuf[5] = (block[1] >> 16) & 0xff;
-	outbuf[6] = (block[1] >> 8) & 0xff;
-	outbuf[7] = block[1] & 0xff;
-	outbuf[8] = (block[2] >> 24) & 0xff;
-	outbuf[9] = (block[2] >> 16) & 0xff;
-	outbuf[10] = (block[2] >> 8) & 0xff;
-	outbuf[11] = block[2] & 0xff;
-	outbuf[12] = (block[3] >> 24) & 0xff;
-	outbuf[13] = (block[3] >> 16) & 0xff;
-	outbuf[14] = (block[3] >> 8) & 0xff;
-	outbuf[15] = block[3] & 0xff;	
-}	
+
+	((__be32*)outbuf)[0] = cpu_to_be32(block[0]);
+	((__be32*)outbuf)[1] = cpu_to_be32(block[1]);
+	((__be32*)outbuf)[2] = cpu_to_be32(block[2]);
+	((__be32*)outbuf)[3] = cpu_to_be32(block[3]);
+}
 
 static void cast6_decrypt (void * ctx, u8 * outbuf, const u8 * inbuf) {
 	struct cast6_ctx * c = (struct cast6_ctx *)ctx;
@@ -490,10 +476,10 @@ static void cast6_decrypt (void * ctx, u
 	u32 * Km; 
 	u8 * Kr;
 
-	block[0] = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-	block[1] = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
-	block[2] = inbuf[8] << 24 | inbuf[9] << 16 | inbuf[10] << 8 | inbuf[11];
-	block[3] = inbuf[12] << 24 | inbuf[13] << 16 | inbuf[14] << 8 | inbuf[15];
+	block[0] = be32_to_cpu( ((__be32*)inbuf)[0] );
+	block[1] = be32_to_cpu( ((__be32*)inbuf)[1] );
+	block[2] = be32_to_cpu( ((__be32*)inbuf)[2] );
+	block[3] = be32_to_cpu( ((__be32*)inbuf)[3] );
 
 	Km = c->Km[11]; Kr = c->Kr[11]; Q (block, Kr, Km);
 	Km = c->Km[10]; Kr = c->Kr[10]; Q (block, Kr, Km);
@@ -507,24 +493,12 @@ static void cast6_decrypt (void * ctx, u
 	Km = c->Km[2]; Kr = c->Kr[2]; QBAR (block, Kr, Km);
 	Km = c->Km[1]; Kr = c->Kr[1]; QBAR (block, Kr, Km);
 	Km = c->Km[0]; Kr = c->Kr[0]; QBAR (block, Kr, Km);
-	
-	outbuf[0] = (block[0] >> 24) & 0xff;
-	outbuf[1] = (block[0] >> 16) & 0xff;
-	outbuf[2] = (block[0] >> 8) & 0xff;
-	outbuf[3] = block[0] & 0xff;
-	outbuf[4] = (block[1] >> 24) & 0xff;
-	outbuf[5] = (block[1] >> 16) & 0xff;
-	outbuf[6] = (block[1] >> 8) & 0xff;
-	outbuf[7] = block[1] & 0xff;
-	outbuf[8] = (block[2] >> 24) & 0xff;
-	outbuf[9] = (block[2] >> 16) & 0xff;
-	outbuf[10] = (block[2] >> 8) & 0xff;
-	outbuf[11] = block[2] & 0xff;
-	outbuf[12] = (block[3] >> 24) & 0xff;
-	outbuf[13] = (block[3] >> 16) & 0xff;
-	outbuf[14] = (block[3] >> 8) & 0xff;
-	outbuf[15] = block[3] & 0xff;	
-}	
+
+	((__be32*)outbuf)[0] = cpu_to_be32(block[0]);
+	((__be32*)outbuf)[1] = cpu_to_be32(block[1]);
+	((__be32*)outbuf)[2] = cpu_to_be32(block[2]);
+	((__be32*)outbuf)[3] = cpu_to_be32(block[3]);
+}
 
 static struct crypto_alg alg = {
 	.cra_name = "cast6",
diff -urpN linux-2.6.12-rc2.0.orig/crypto/deflate.c linux-2.6.12-rc2.1.be_le/crypto/deflate.c
--- linux-2.6.12-rc2.0.orig/crypto/deflate.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/deflate.c	Tue Apr 19 14:53:25 2005
@@ -220,4 +220,3 @@ module_exit(fini);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Deflate Compression Algorithm for IPCOMP");
 MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
-
diff -urpN linux-2.6.12-rc2.0.orig/crypto/des.c linux-2.6.12-rc2.1.be_le/crypto/des.c
--- linux-2.6.12-rc2.0.orig/crypto/des.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/des.c	Tue Apr 19 16:32:14 2005
@@ -35,8 +35,6 @@
 #define DES3_EDE_EXPKEY_WORDS	(3 * DES_EXPKEY_WORDS)
 #define DES3_EDE_BLOCK_SIZE	DES_BLOCK_SIZE
 
-#define ROR(d,c,o)	((d) = (d) >> (c) | (d) << (o))
-
 struct des_ctx {
 	u8 iv[DES_BLOCK_SIZE];
 	u32 expkey[DES_EXPKEY_WORDS];
@@ -282,21 +280,10 @@ static const u8 parity[] = {
 static void des_small_fips_encrypt(u32 *expkey, u8 *dst, const u8 *src)
 {
 	u32 x, y, z;
-	
-	x  = src[7];
-	x <<= 8;
-	x |= src[6];
-	x <<= 8;
-	x |= src[5];
-	x <<= 8;
-	x |= src[4];
-	y  = src[3];
-	y <<= 8;
-	y |= src[2];
-	y <<= 8;
-	y |= src[1];
-	y <<= 8;
-	y |= src[0];
+
+	x  = le32_to_cpu( ((__le32*)src)[1] );
+	y  = le32_to_cpu( ((__le32*)src)[0] );
+
 	z  = ((x >> 004) ^ y) & 0x0F0F0F0FL;
 	x ^= z << 004;
 	y ^= z;
@@ -635,40 +622,18 @@ static void des_small_fips_encrypt(u32 *
 	z  = ((y >> 004) ^ x) & 0x0F0F0F0FL;
 	y ^= z << 004;
 	x ^= z;
-	dst[0] = x;
-	x >>= 8;
-	dst[1] = x;
-	x >>= 8;
-	dst[2] = x;
-	x >>= 8;
-	dst[3] = x;
-	dst[4] = y;
-	y >>= 8;
-	dst[5] = y;
-	y >>= 8;
-	dst[6] = y;
-	y >>= 8;
-	dst[7] = y;
+
+	((__le32*)dst)[0] = cpu_to_le32(x);
+	((__le32*)dst)[1] = cpu_to_le32(y);
 }
 
 static void des_small_fips_decrypt(u32 *expkey, u8 *dst, const u8 *src)
 {
 	u32 x, y, z;
 	
-	x  = src[7];
-	x <<= 8;
-	x |= src[6];
-	x <<= 8;
-	x |= src[5];
-	x <<= 8;
-	x |= src[4];
-	y  = src[3];
-	y <<= 8;
-	y |= src[2];
-	y <<= 8;
-	y |= src[1];
-	y <<= 8;
-	y |= src[0];
+	x  = le32_to_cpu( ((__le32*)src)[1] );
+	y  = le32_to_cpu( ((__le32*)src)[0] );
+
 	z  = ((x >> 004) ^ y) & 0x0F0F0F0FL;
 	x ^= z << 004;
 	y ^= z;
@@ -1007,20 +972,9 @@ static void des_small_fips_decrypt(u32 *
 	z  = ((y >> 004) ^ x) & 0x0F0F0F0FL;
 	y ^= z << 004;
 	x ^= z;
-	dst[0] = x;
-	x >>= 8;
-	dst[1] = x;
-	x >>= 8;
-	dst[2] = x;
-	x >>= 8;
-	dst[3] = x;
-	dst[4] = y;
-	y >>= 8;
-	dst[5] = y;
-	y >>= 8;
-	dst[6] = y;
-	y >>= 8;
-	dst[7] = y;
+
+	((__le32*)dst)[0] = cpu_to_le32(x);
+	((__le32*)dst)[1] = cpu_to_le32(y);
 }
 
 /*
@@ -1159,9 +1113,7 @@ not_weak:
 		w  |= (b1[k[18+24]] | b0[k[19+24]]) << 4;
 		w  |= (b1[k[20+24]] | b0[k[21+24]]) << 2;
 		w  |=  b1[k[22+24]] | b0[k[23+24]];
-		
-		ROR(w, 4, 28);      /* could be eliminated */
-		expkey[1] = w;
+		expkey[1] = ror32(w, 4);	/* could be eliminated */
 
 		k += 48;
 		expkey += 2;
diff -urpN linux-2.6.12-rc2.0.orig/crypto/hmac.c linux-2.6.12-rc2.1.be_le/crypto/hmac.c
--- linux-2.6.12-rc2.0.orig/crypto/hmac.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/hmac.c	Tue Apr 19 14:53:25 2005
@@ -131,4 +131,3 @@ EXPORT_SYMBOL_GPL(crypto_hmac_init);
 EXPORT_SYMBOL_GPL(crypto_hmac_update);
 EXPORT_SYMBOL_GPL(crypto_hmac_final);
 EXPORT_SYMBOL_GPL(crypto_hmac);
-
diff -urpN linux-2.6.12-rc2.0.orig/crypto/internal.h linux-2.6.12-rc2.1.be_le/crypto/internal.h
--- linux-2.6.12-rc2.0.orig/crypto/internal.h	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/internal.h	Tue Apr 19 14:53:25 2005
@@ -89,4 +89,3 @@ void crypto_exit_cipher_ops(struct crypt
 void crypto_exit_compress_ops(struct crypto_tfm *tfm);
 
 #endif	/* _CRYPTO_INTERNAL_H */
-
diff -urpN linux-2.6.12-rc2.0.orig/crypto/khazad.c linux-2.6.12-rc2.1.be_le/crypto/khazad.c
--- linux-2.6.12-rc2.0.orig/crypto/khazad.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/khazad.c	Tue Apr 19 14:53:25 2005
@@ -767,22 +767,8 @@ static int khazad_setkey(void *ctx_arg, 
 		return -EINVAL;
 	}
 
-	K2 = ((u64)in_key[ 0] << 56) ^
-	     ((u64)in_key[ 1] << 48) ^
-	     ((u64)in_key[ 2] << 40) ^
-	     ((u64)in_key[ 3] << 32) ^
-	     ((u64)in_key[ 4] << 24) ^
-	     ((u64)in_key[ 5] << 16) ^
-	     ((u64)in_key[ 6] <<  8) ^
-	     ((u64)in_key[ 7]      );
-	K1 = ((u64)in_key[ 8] << 56) ^
-	     ((u64)in_key[ 9] << 48) ^
-	     ((u64)in_key[10] << 40) ^
-	     ((u64)in_key[11] << 32) ^
-	     ((u64)in_key[12] << 24) ^
-	     ((u64)in_key[13] << 16) ^
-	     ((u64)in_key[14] <<  8) ^
-	     ((u64)in_key[15]      );
+	K2 = be64_to_cpu( ((__be64*)in_key)[0] );
+	K1 = be64_to_cpu( ((__be64*)in_key)[1] );
 
 	/* setup the encrypt key */
 	for (r = 0; r <= KHAZAD_ROUNDS; r++) {
@@ -814,7 +800,6 @@ static int khazad_setkey(void *ctx_arg, 
 	ctx->D[KHAZAD_ROUNDS] = ctx->E[0];
 
 	return 0;
-
 }
 
 static void khazad_crypt(const u64 roundKey[KHAZAD_ROUNDS + 1],
@@ -824,15 +809,7 @@ static void khazad_crypt(const u64 round
 	int r;
 	u64 state;
 
-	state = ((u64)plaintext[0] << 56) ^
-		((u64)plaintext[1] << 48) ^
-		((u64)plaintext[2] << 40) ^
-		((u64)plaintext[3] << 32) ^
-		((u64)plaintext[4] << 24) ^
-		((u64)plaintext[5] << 16) ^
-		((u64)plaintext[6] <<  8) ^
-		((u64)plaintext[7]      ) ^
-		roundKey[0];
+	state = be64_to_cpu( ((__be64*)plaintext)[0] ) ^ roundKey[0];
 
 	for (r = 1; r < KHAZAD_ROUNDS; r++) {
 		state = T0[(int)(state >> 56)       ] ^
@@ -856,15 +833,7 @@ static void khazad_crypt(const u64 round
 		(T7[(int)(state      ) & 0xff] & 0x00000000000000ffULL) ^
 		roundKey[KHAZAD_ROUNDS];
 
-	ciphertext[0] = (u8)(state >> 56);
-	ciphertext[1] = (u8)(state >> 48);
-	ciphertext[2] = (u8)(state >> 40);
-	ciphertext[3] = (u8)(state >> 32);
-	ciphertext[4] = (u8)(state >> 24);
-	ciphertext[5] = (u8)(state >> 16);
-	ciphertext[6] = (u8)(state >>  8);
-	ciphertext[7] = (u8)(state      );
-
+	((__be64*)ciphertext)[0] = cpu_to_be64(state);
 }
 
 static void khazad_encrypt(void *ctx_arg, u8 *dst, const u8 *src)
@@ -906,7 +875,6 @@ static void __exit fini(void)
 {
 	crypto_unregister_alg(&khazad_alg);
 }
-
 
 module_init(init);
 module_exit(fini);
diff -urpN linux-2.6.12-rc2.0.orig/crypto/md4.c linux-2.6.12-rc2.1.be_le/crypto/md4.c
--- linux-2.6.12-rc2.0.orig/crypto/md4.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/md4.c	Tue Apr 19 14:53:25 2005
@@ -37,12 +37,6 @@ struct md4_ctx {
 	u64 byte_count;
 };
 
-static inline u32 lshift(u32 x, unsigned int s)
-{
-	x &= 0xFFFFFFFF;
-	return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s));
-}
-
 static inline u32 F(u32 x, u32 y, u32 z)
 {
 	return (x & y) | ((~x) & z);
@@ -58,9 +52,9 @@ static inline u32 H(u32 x, u32 y, u32 z)
 	return x ^ y ^ z;
 }
                         
-#define ROUND1(a,b,c,d,k,s) (a = lshift(a + F(b,c,d) + k, s))
-#define ROUND2(a,b,c,d,k,s) (a = lshift(a + G(b,c,d) + k + (u32)0x5A827999,s))
-#define ROUND3(a,b,c,d,k,s) (a = lshift(a + H(b,c,d) + k + (u32)0x6ED9EBA1,s))
+#define ROUND1(a,b,c,d,k,s) (a = rol32(a + F(b,c,d) + k, s))
+#define ROUND2(a,b,c,d,k,s) (a = rol32(a + G(b,c,d) + k + (u32)0x5A827999, s))
+#define ROUND3(a,b,c,d,k,s) (a = rol32(a + H(b,c,d) + k + (u32)0x6ED9EBA1, s))
 
 /* XXX: this stuff can be optimized */
 static inline void le32_to_cpu_array(u32 *buf, unsigned int words)
@@ -247,4 +241,3 @@ module_exit(fini);
 
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("MD4 Message Digest Algorithm");
-
diff -urpN linux-2.6.12-rc2.0.orig/crypto/michael_mic.c linux-2.6.12-rc2.1.be_le/crypto/michael_mic.c
--- linux-2.6.12-rc2.0.orig/crypto/michael_mic.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/michael_mic.c	Tue Apr 19 14:53:25 2005
@@ -45,16 +45,13 @@ do {				\
 
 static inline u32 get_le32(const u8 *p)
 {
-	return p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24);
+	return le32_to_cpu( ((__le32*)p)[0] );
 }
 
 
 static inline void put_le32(u8 *p, u32 v)
 {
-	p[0] = v;
-	p[1] = v >> 8;
-	p[2] = v >> 16;
-	p[3] = v >> 24;
+	((__le32*)p)[0] = cpu_to_le32(v);
 }
 
 
diff -urpN linux-2.6.12-rc2.0.orig/crypto/sha1.c linux-2.6.12-rc2.1.be_le/crypto/sha1.c
--- linux-2.6.12-rc2.0.orig/crypto/sha1.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/sha1.c	Tue Apr 19 14:53:25 2005
@@ -67,25 +67,15 @@ static void sha1_update(void *ctx, const
 	memcpy(&sctx->buffer[j], &data[i], len - i);
 }
 
-
 /* Add padding and return the message digest. */
 static void sha1_final(void* ctx, u8 *out)
 {
 	struct sha1_ctx *sctx = ctx;
-	u32 i, j, index, padlen;
-	u64 t;
+	u32 i, index, padlen;
 	u8 bits[8] = { 0, };
 	static const u8 padding[64] = { 0x80, };
 
-	t = sctx->count;
-	bits[7] = 0xff & t; t>>=8;
-	bits[6] = 0xff & t; t>>=8;
-	bits[5] = 0xff & t; t>>=8;
-	bits[4] = 0xff & t; t>>=8;
-	bits[3] = 0xff & t; t>>=8;
-	bits[2] = 0xff & t; t>>=8;
-	bits[1] = 0xff & t; t>>=8;
-	bits[0] = 0xff & t;
+	((__be64*)bits)[0] = cpu_to_be64(sctx->count);
 
 	/* Pad out to 56 mod 64 */
 	index = (sctx->count >> 3) & 0x3f;
@@ -96,12 +86,8 @@ static void sha1_final(void* ctx, u8 *ou
 	sha1_update(sctx, bits, sizeof bits); 
 
 	/* Store state in digest */
-	for (i = j = 0; i < 5; i++, j += 4) {
-		u32 t2 = sctx->state[i];
-		out[j+3] = t2 & 0xff; t2>>=8;
-		out[j+2] = t2 & 0xff; t2>>=8;
-		out[j+1] = t2 & 0xff; t2>>=8;
-		out[j  ] = t2 & 0xff;
+	for (i = 0; i < 5; i++) {
+		((__be32*)out)[i] = cpu_to_be32(sctx->state[i]);
 	}
 
 	/* Wipe context */
diff -urpN linux-2.6.12-rc2.0.orig/crypto/sha256.c linux-2.6.12-rc2.1.be_le/crypto/sha256.c
--- linux-2.6.12-rc2.0.orig/crypto/sha256.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/sha256.c	Tue Apr 19 14:53:25 2005
@@ -280,21 +280,13 @@ static void sha256_final(void* ctx, u8 *
 {
 	struct sha256_ctx *sctx = ctx;
 	u8 bits[8];
-	unsigned int index, pad_len, t;
-	int i, j;
+	unsigned int index, pad_len;
+	int i;
 	static const u8 padding[64] = { 0x80, };
 
 	/* Save number of bits */
-	t = sctx->count[0];
-	bits[7] = t; t >>= 8;
-	bits[6] = t; t >>= 8;
-	bits[5] = t; t >>= 8;
-	bits[4] = t;
-	t = sctx->count[1];
-	bits[3] = t; t >>= 8;
-	bits[2] = t; t >>= 8;
-	bits[1] = t; t >>= 8;
-	bits[0] = t;
+	((__be32*)bits)[1] = cpu_to_be32(sctx->count[0]);
+	((__be32*)bits)[0] = cpu_to_be32(sctx->count[1]);
 
 	/* Pad out to 56 mod 64. */
 	index = (sctx->count[0] >> 3) & 0x3f;
@@ -305,18 +297,13 @@ static void sha256_final(void* ctx, u8 *
 	sha256_update(sctx, bits, 8);
 
 	/* Store state in digest */
-	for (i = j = 0; i < 8; i++, j += 4) {
-		t = sctx->state[i];
-		out[j+3] = t; t >>= 8;
-		out[j+2] = t; t >>= 8;
-		out[j+1] = t; t >>= 8;
-		out[j  ] = t;
+	for (i = 0; i < 8; i++) {
+		((__be32*)out)[i] = cpu_to_be32(sctx->state[i]);
 	}
 
 	/* Zeroize sensitive information. */
 	memset(sctx, 0, sizeof(*sctx));
 }
-
 
 static struct crypto_alg alg = {
 	.cra_name	=	"sha256",
diff -urpN linux-2.6.12-rc2.0.orig/crypto/tgr192.c linux-2.6.12-rc2.1.be_le/crypto/tgr192.c
--- linux-2.6.12-rc2.0.orig/crypto/tgr192.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/tgr192.c	Tue Apr 19 15:58:00 2005
@@ -467,17 +467,9 @@ static void tgr192_transform(struct tgr1
 	u64 a, b, c, aa, bb, cc;
 	u64 x[8];
 	int i;
-	const u8 *ptr = data;
 
-	for (i = 0; i < 8; i++, ptr += 8) {
-		x[i] = (((u64)ptr[7] ) << 56) ^
-		(((u64)ptr[6] & 0xffL) << 48) ^
-		(((u64)ptr[5] & 0xffL) << 40) ^
-		(((u64)ptr[4] & 0xffL) << 32) ^
-		(((u64)ptr[3] & 0xffL) << 24) ^
-		(((u64)ptr[2] & 0xffL) << 16) ^
-		(((u64)ptr[1] & 0xffL) <<  8) ^
-		(((u64)ptr[0] & 0xffL)      );
+	for (i = 0; i < 8; i++) {
+		x[i] = le64_to_cpu( ((__le64*)data)[i] );
 	}
 
 	/* save */
@@ -559,8 +551,6 @@ static void tgr192_final(void *ctx, u8 *
 {
 	struct tgr192_ctx *tctx = ctx;
 	u32 t, msb, lsb;
-	u8 *p;
-	int i, j;
 
 	tgr192_update(tctx, NULL, 0); /* flush */ ;
 
@@ -594,41 +584,14 @@ static void tgr192_final(void *ctx, u8 *
 		memset(tctx->hash, 0, 56);    /* fill next block with zeroes */
 	}
 	/* append the 64 bit count */
-	tctx->hash[56] = lsb;
-	tctx->hash[57] = lsb >> 8;
-	tctx->hash[58] = lsb >> 16;
-	tctx->hash[59] = lsb >> 24;
-	tctx->hash[60] = msb;
-	tctx->hash[61] = msb >> 8;
-	tctx->hash[62] = msb >> 16;
-	tctx->hash[63] = msb >> 24;
+	((__le32*)tctx->hash)[56/4] = cpu_to_le32(lsb);
+	((__le32*)tctx->hash)[60/4] = cpu_to_le32(msb);
 	tgr192_transform(tctx, tctx->hash);
 
-	p = tctx->hash;
-	*p++ = tctx->a >> 56; *p++ = tctx->a >> 48; *p++ = tctx->a >> 40;
-	*p++ = tctx->a >> 32; *p++ = tctx->a >> 24; *p++ = tctx->a >> 16;
-	*p++ = tctx->a >>  8; *p++ = tctx->a;\
-	*p++ = tctx->b >> 56; *p++ = tctx->b >> 48; *p++ = tctx->b >> 40;
-	*p++ = tctx->b >> 32; *p++ = tctx->b >> 24; *p++ = tctx->b >> 16;
-	*p++ = tctx->b >>  8; *p++ = tctx->b;
-	*p++ = tctx->c >> 56; *p++ = tctx->c >> 48; *p++ = tctx->c >> 40;
-	*p++ = tctx->c >> 32; *p++ = tctx->c >> 24; *p++ = tctx->c >> 16;
-	*p++ = tctx->c >>  8; *p++ = tctx->c;
-
-
 	/* unpack the hash */
-	j = 7;
-	for (i = 0; i < 8; i++) {
-		out[j--] = (tctx->a >> 8 * i) & 0xff;
-	}
-	j = 15;
-	for (i = 0; i < 8; i++) {
-		out[j--] = (tctx->b >> 8 * i) & 0xff;
-	}
-	j = 23;
-	for (i = 0; i < 8; i++) {
-		out[j--] = (tctx->c >> 8 * i) & 0xff;
-	}
+	((__be64*)out)[0] = ((__be64*)tctx->hash)[0] = cpu_to_be64(tctx->a);
+	((__be64*)out)[1] = ((__be64*)tctx->hash)[1] = cpu_to_be64(tctx->b);
+	((__be64*)out)[2] = ((__be64*)tctx->hash)[2] = cpu_to_be64(tctx->c);
 }
 
 static void tgr160_final(void *ctx, u8 * out)
diff -urpN linux-2.6.12-rc2.0.orig/crypto/twofish.c linux-2.6.12-rc2.1.be_le/crypto/twofish.c
--- linux-2.6.12-rc2.0.orig/crypto/twofish.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/twofish.c	Tue Apr 19 16:32:42 2005
@@ -540,9 +540,9 @@ static const u8 calc_sb_tbl[512] = {
 #define CALC_K(a, j, k, l, m, n) \
    x = CALC_K_2 (k, l, k, l, 0); \
    y = CALC_K_2 (m, n, m, n, 4); \
-   y = (y << 8) + (y >> 24); \
+   y = rol32(y, 8); \
    x += y; y += x; ctx->a[j] = x; \
-   ctx->a[(j) + 1] = (y << 9) + (y >> 23)
+   ctx->a[(j) + 1] = rol32(y, 9)
 
 #define CALC_K192_2(a, b, c, d, j) \
    CALC_K_2 (q0[a ^ key[(j) + 16]], \
@@ -553,9 +553,9 @@ static const u8 calc_sb_tbl[512] = {
 #define CALC_K192(a, j, k, l, m, n) \
    x = CALC_K192_2 (l, l, k, k, 0); \
    y = CALC_K192_2 (n, n, m, m, 4); \
-   y = (y << 8) + (y >> 24); \
+   y = rol32(y, 8); \
    x += y; y += x; ctx->a[j] = x; \
-   ctx->a[(j) + 1] = (y << 9) + (y >> 23)
+   ctx->a[(j) + 1] = rol32(y, 9)
 
 #define CALC_K256_2(a, b, j) \
    CALC_K192_2 (q1[b ^ key[(j) + 24]], \
@@ -566,9 +566,9 @@ static const u8 calc_sb_tbl[512] = {
 #define CALC_K256(a, j, k, l, m, n) \
    x = CALC_K256_2 (k, l, 0); \
    y = CALC_K256_2 (m, n, 4); \
-   y = (y << 8) + (y >> 24); \
+   y = rol32(y, 8); \
    x += y; y += x; ctx->a[j] = x; \
-   ctx->a[(j) + 1] = (y << 9) + (y >> 23)
+   ctx->a[(j) + 1] = rol32(y, 9)
 
 
 /* Macros to compute the g() function in the encryption and decryption
@@ -621,13 +621,11 @@ static const u8 calc_sb_tbl[512] = {
  * whitening subkey number m. */
 
 #define INPACK(n, x, m) \
-   x = in[4 * (n)] ^ (in[4 * (n) + 1] << 8) \
-     ^ (in[4 * (n) + 2] << 16) ^ (in[4 * (n) + 3] << 24) ^ ctx->w[m]
+   x = le32_to_cpu( ((__le32*)in)[n] )
 
 #define OUTUNPACK(n, x, m) \
    x ^= ctx->w[m]; \
-   out[4 * (n)] = x; out[4 * (n) + 1] = x >> 8; \
-   out[4 * (n) + 2] = x >> 16; out[4 * (n) + 3] = x >> 24
+   ((__le32*)out)[n] = cpu_to_le32(x)
 
 #define TF_MIN_KEY_SIZE 16
 #define TF_MAX_KEY_SIZE 32
@@ -867,7 +865,6 @@ static void twofish_decrypt(void *cx, u8
 	OUTUNPACK (1, b, 1);
 	OUTUNPACK (2, c, 2);
 	OUTUNPACK (3, d, 3);
-
 }
 
 static struct crypto_alg alg = {
diff -urpN linux-2.6.12-rc2.0.orig/crypto/wp512.c linux-2.6.12-rc2.1.be_le/crypto/wp512.c
--- linux-2.6.12-rc2.0.orig/crypto/wp512.c	Tue Apr 19 14:56:09 2005
+++ linux-2.6.12-rc2.1.be_le/crypto/wp512.c	Tue Apr 19 16:40:12 2005
@@ -778,18 +778,10 @@ static void wp512_process_buffer(struct 
 	u64 block[8];    /* mu(buffer) */
 	u64 state[8];    /* the cipher state */
 	u64 L[8];
-	u8 *buffer = wctx->buffer;
+	//u8 *buffer = wctx->buffer;
 
-	for (i = 0; i < 8; i++, buffer += 8) {
-		block[i] =
-		(((u64)buffer[0]        ) << 56) ^
-		(((u64)buffer[1] & 0xffL) << 48) ^
-		(((u64)buffer[2] & 0xffL) << 40) ^
-		(((u64)buffer[3] & 0xffL) << 32) ^
-		(((u64)buffer[4] & 0xffL) << 24) ^
-		(((u64)buffer[5] & 0xffL) << 16) ^
-		(((u64)buffer[6] & 0xffL) <<  8) ^
-		(((u64)buffer[7] & 0xffL)      );
+	for (i = 0; i < 8; i++) {
+		block[i] = be64_to_cpu( ((__be64*)wctx->buffer)[i] );
 	}
 
 	state[0] = block[0] ^ (K[0] = wctx->hash[0]);
@@ -985,7 +977,6 @@ static void wp512_process_buffer(struct 
 	wctx->hash[5] ^= state[5] ^ block[5];
 	wctx->hash[6] ^= state[6] ^ block[6];
 	wctx->hash[7] ^= state[7] ^ block[7];
-
 }
 
 static void wp512_init (void *ctx) {
@@ -1058,7 +1049,6 @@ static void wp512_update(void *ctx, cons
 
 	wctx->bufferBits   = bufferBits;
 	wctx->bufferPos    = bufferPos;
-
 }
 
 static void wp512_final(void *ctx, u8 *out)
@@ -1089,14 +1079,7 @@ static void wp512_final(void *ctx, u8 *o
 		   bitLength, WP512_LENGTHBYTES);
    	wp512_process_buffer(wctx);
    	for (i = 0; i < WP512_DIGEST_SIZE/8; i++) {
-		digest[0] = (u8)(wctx->hash[i] >> 56);
-		digest[1] = (u8)(wctx->hash[i] >> 48);
-		digest[2] = (u8)(wctx->hash[i] >> 40);
-		digest[3] = (u8)(wctx->hash[i] >> 32);
-		digest[4] = (u8)(wctx->hash[i] >> 24);
-		digest[5] = (u8)(wctx->hash[i] >> 16);
-		digest[6] = (u8)(wctx->hash[i] >>  8);
-		digest[7] = (u8)(wctx->hash[i]      );
+		((__be64*)digest)[0] = cpu_to_be64(wctx->hash[i]);
 		digest += 8;
    	}
    	wctx->bufferBits   = bufferBits;


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] sha512: replace open-coded be64 conversions
  2005-04-19 14:12     ` Denis Vlasenko
@ 2005-04-19 14:41       ` Jesper Juhl
  2005-04-19 14:49         ` Denis Vlasenko
  0 siblings, 1 reply; 14+ messages in thread
From: Jesper Juhl @ 2005-04-19 14:41 UTC (permalink / raw)
  To: Denis Vlasenko; +Cc: linux-kernel, vital

On Tue, 19 Apr 2005, Denis Vlasenko wrote:

> On Tuesday 19 April 2005 12:04, Jesper Juhl wrote:
> > On Tue, 19 Apr 2005, Denis Vlasenko wrote:
> > 
> > > This + next patch were "modprobe tcrypt" tested.
> > > See next mail.
> > 
> > Could you please send patches inline instead of as attachments. 
> > Attachments mean there's more work involved in getting at them to read 
> > them, and they are a pain when you want to reply and quote the patch to 
> > comment on it.
> > Thanks.
> 
> I'm afraid Kmail tend to subtly mangle inlined patches.
> Then people start to complain that patch does not apply, and rightly so. :(
> 
> However, I can try.
> 
Thanks.

A few small comments below.


[...]
> @@ -483,12 +483,8 @@ static int anubis_setkey(void *ctx_arg, 
>  	ctx->R = R = 8 + N;
>  
>  	/* * map cipher key to initial key state (mu): */
> -		for (i = 0, pos = 0; i < N; i++, pos += 4) {
> -		kappa[i] =
> -			(in_key[pos    ] << 24) ^
> -			(in_key[pos + 1] << 16) ^
> -			(in_key[pos + 2] <<  8) ^
> -			(in_key[pos + 3]      );
> +	for (i = 0; i < N; i++) {
> +		kappa[i] = be32_to_cpu( ((__be32*)in_key)[i] );
                                         ^^^^^^^            ^
                 is this cast really nessesary ?      No space there.
>  	}
        ^
     Those braces are superfluous.

[...]


Same comments for the rest of it. There seems to be a lot of unneeded 
casts (I could be wrong, but I don't see the need for them, and if they 
are not needed all they do is mask potential type errors). And the 
whitespace thing also goes for the rest, don't put extra spaces after the 
opening parenthesis and before the closing one in function calls.


-- 
Jesper Juhl



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] sha512: replace open-coded be64 conversions
  2005-04-19 14:41       ` Jesper Juhl
@ 2005-04-19 14:49         ` Denis Vlasenko
  0 siblings, 0 replies; 14+ messages in thread
From: Denis Vlasenko @ 2005-04-19 14:49 UTC (permalink / raw)
  To: Jesper Juhl; +Cc: linux-kernel, vital

On Tuesday 19 April 2005 17:41, Jesper Juhl wrote:
> On Tue, 19 Apr 2005, Denis Vlasenko wrote:
> 
> > On Tuesday 19 April 2005 12:04, Jesper Juhl wrote:
> > > On Tue, 19 Apr 2005, Denis Vlasenko wrote:
> > > 
> > > > This + next patch were "modprobe tcrypt" tested.
> > > > See next mail.
> > > 
> > > Could you please send patches inline instead of as attachments. 
> > > Attachments mean there's more work involved in getting at them to read 
> > > them, and they are a pain when you want to reply and quote the patch to 
> > > comment on it.
> > > Thanks.
> > 
> > I'm afraid Kmail tend to subtly mangle inlined patches.
> > Then people start to complain that patch does not apply, and rightly so. :(
> > 
> > However, I can try.
> > 
> Thanks.
> 
> A few small comments below.
> 
> 
> [...]
> > @@ -483,12 +483,8 @@ static int anubis_setkey(void *ctx_arg, 
> >  	ctx->R = R = 8 + N;
> >  
> >  	/* * map cipher key to initial key state (mu): */
> > -		for (i = 0, pos = 0; i < N; i++, pos += 4) {
> > -		kappa[i] =
> > -			(in_key[pos    ] << 24) ^
> > -			(in_key[pos + 1] << 16) ^
> > -			(in_key[pos + 2] <<  8) ^
> > -			(in_key[pos + 3]      );
> > +	for (i = 0; i < N; i++) {
> > +		kappa[i] = be32_to_cpu( ((__be32*)in_key)[i] );
>                                          ^^^^^^^            ^
>                  is this cast really nessesary ?      No space there.

Yes, it is necessary. It (a) casts pointer to u32*,
this allows index [i] to work as inteneded.
Adn (b) it marks value as big-endian. IIRC this is needed
for sparse checking.

Spaces are there because otherwise it is a bit hard to visually
check for correct number and placement of three pairs of ()'s.

> >  	}
>         ^
>      Those braces are superfluous.

Yes. Since they were there from the start, I decided to not touch 'em.
--
vda


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] introduce generic 64bit rotations and i386 asm optimized version
  2005-04-19  6:18 [PATCH] introduce generic 64bit rotations and i386 asm optimized version Denis Vlasenko
  2005-04-19  6:21 ` [PATCH] sha512: replace open-coded be64 conversions Denis Vlasenko
  2005-04-19  6:46 ` [PATCH] introduce generic 64bit rotations and i386 asm optimized version YOSHIFUJI Hideaki / 吉藤英明
@ 2005-04-19 19:15 ` Geert Uytterhoeven
  2 siblings, 0 replies; 14+ messages in thread
From: Geert Uytterhoeven @ 2005-04-19 19:15 UTC (permalink / raw)
  To: Denis Vlasenko; +Cc: Linux Kernel Development, Matt Mackall

On Tue, 19 Apr 2005, Denis Vlasenko wrote:

[ Please inline patches, so it's easier to comment on them ]

> +static inline u64 rol64(u64 x,int num) {
                                     ^^^
> +	if(__builtin_constant_p(num))
> +		return constant_rol64(x,num);
> +	/* Hmmm... shall we do cnt&=63 here? */
                               ^^^
			       num?

> +	return ((x<<num) | (x>>(64-num)));
> +}

Gr{oetje,eeting}s,

						Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
							    -- Linus Torvalds

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] introduce generic 64bit rotations and i386 asm optimized version
  2005-04-19  6:46 ` [PATCH] introduce generic 64bit rotations and i386 asm optimized version YOSHIFUJI Hideaki / 吉藤英明
  2005-04-19  6:55   ` Denis Vlasenko
@ 2005-04-19 20:20   ` Domen Puncer
  2005-04-20 20:47     ` Ralf Baechle
  1 sibling, 1 reply; 14+ messages in thread
From: Domen Puncer @ 2005-04-19 20:20 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki / ?$B5HF#1QL@; +Cc: vda, linux-kernel, mpm

On 19/04/05 15:46 +0900, YOSHIFUJI Hideaki / ?$B5HF#1QL@ wrote:
> In article <200504190918.10279.vda@port.imtp.ilyichevsk.odessa.ua> (at Tue, 19 Apr 2005 09:18:10 +0300), Denis Vlasenko <vda@port.imtp.ilyichevsk.odessa.ua> says:
> 
> > diff -urpN 2.6.12-rc2.1.be/include/linux/bitops.h 2.6.12-rc2.2.ror/include/linux/bitops.h
> > --- 2.6.12-rc2.1.be/include/linux/bitops.h	Mon Apr 18 22:55:10 2005
> > +++ 2.6.12-rc2.2.ror/include/linux/bitops.h	Tue Apr 19 00:25:28 2005
...
> > -static __inline__ int get_bitmask_order(unsigned int count)
> > +static inline int get_bitmask_order(unsigned int count)
> >  {
> >  	int order;
> >  	
> 
> Please keep using __inline__, not inline.

Why?

Couldn't find any threads about this, and even SubmittingPatches has:
"'static inline' is preferred over 'static __inline__'..."


	Domen

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] introduce generic 64bit rotations and i386 asm optimized version
  2005-04-19 20:20   ` Domen Puncer
@ 2005-04-20 20:47     ` Ralf Baechle
  0 siblings, 0 replies; 14+ messages in thread
From: Ralf Baechle @ 2005-04-20 20:47 UTC (permalink / raw)
  To: Domen Puncer; +Cc: YOSHIFUJI Hideaki / ?$B5HF#1QL@, vda, linux-kernel, mpm

On Tue, Apr 19, 2005 at 10:20:38PM +0200, Domen Puncer wrote:

> > Please keep using __inline__, not inline.
> 
> Why?
> 
> Couldn't find any threads about this, and even SubmittingPatches has:
> "'static inline' is preferred over 'static __inline__'..."

Unlike inline __inline__ will be recogniced by gcc even in -ansi mode.
And inline is namespace pollution.  Where there's no technical reason I
prefer the non-underscore version, __inline__ is just looking too 1337
and eye-insulting.

Of course all these thoughts are useless because these days usespace is
not supposed to use kernel headers directly and the kernel isn't ANSI-clean
anyway.

  Ralf

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2005-04-21 10:02 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-04-19  6:18 [PATCH] introduce generic 64bit rotations and i386 asm optimized version Denis Vlasenko
2005-04-19  6:21 ` [PATCH] sha512: replace open-coded be64 conversions Denis Vlasenko
2005-04-19  6:26   ` [PATCH] sha512: use 64bit rotations Denis Vlasenko
2005-04-19  6:28     ` [PATCH] sha512: fix whitespace Denis Vlasenko
2005-04-19  6:31       ` [PATCH] ia64: use 64bit rotations Denis Vlasenko
2005-04-19  9:04   ` [PATCH] sha512: replace open-coded be64 conversions Jesper Juhl
2005-04-19 14:12     ` Denis Vlasenko
2005-04-19 14:41       ` Jesper Juhl
2005-04-19 14:49         ` Denis Vlasenko
2005-04-19  6:46 ` [PATCH] introduce generic 64bit rotations and i386 asm optimized version YOSHIFUJI Hideaki / 吉藤英明
2005-04-19  6:55   ` Denis Vlasenko
2005-04-19 20:20   ` Domen Puncer
2005-04-20 20:47     ` Ralf Baechle
2005-04-19 19:15 ` Geert Uytterhoeven

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox