From: Benjamin Gilbert <bgilbert@cs.cmu.edu>
To: jengelh@computergmbh.de
Cc: akpm@linux-foundation.org, herbert@gondor.apana.org.au,
linux-crypto@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH] [CRYPTO] Add optimized SHA-1 implementation for i486+
Date: Mon, 11 Jun 2007 15:50:52 -0400 [thread overview]
Message-ID: <20070611195051.25068.16858.stgit@dev> (raw)
In-Reply-To: <466DA6DF.1080403@cs.cmu.edu>
Add x86-optimized implementation of the SHA-1 hash function, taken from
Nettle under the LGPL. This code will be enabled on kernels compiled for
486es or better; kernels which support 386es will use the generic
implementation (since we need BSWAP).
We disable building lib/sha1.o when an optimized implementation is
available, as the library link order for x86 (and x86_64) would otherwise
ignore the optimized version. The existing optimized implementation for ARM
does not do this; the library link order for that architecture appears to
favor the arch/arm/ version automatically. I've left this situation alone
since I'm not familiar with the ARM code, but a !ARM condition could be
added to CONFIG_SHA1_GENERIC if it makes sense.
The code has been tested with tcrypt and the NIST test vectors.
Signed-off-by: Benjamin Gilbert <bgilbert@cs.cmu.edu>
---
arch/i386/kernel/i386_ksyms.c | 5 +
arch/i386/lib/Makefile | 1
arch/i386/lib/sha1.S | 299 +++++++++++++++++++++++++++++++++++++++++
include/linux/cryptohash.h | 9 +
lib/Kconfig | 13 ++
lib/Makefile | 3
6 files changed, 328 insertions(+), 2 deletions(-)
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index e3d4b73..812bc4e 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -1,4 +1,5 @@
#include <linux/module.h>
+#include <linux/cryptohash.h>
#include <asm/checksum.h>
#include <asm/desc.h>
@@ -28,3 +29,7 @@ EXPORT_SYMBOL(__read_lock_failed);
#endif
EXPORT_SYMBOL(csum_partial);
+
+#ifdef CONFIG_SHA1_X86
+EXPORT_SYMBOL(sha_transform);
+#endif
diff --git a/arch/i386/lib/Makefile b/arch/i386/lib/Makefile
index 22d8ac5..69f4845 100644
--- a/arch/i386/lib/Makefile
+++ b/arch/i386/lib/Makefile
@@ -6,6 +6,7 @@
lib-y = checksum.o delay.o usercopy.o getuser.o putuser.o memcpy.o strstr.o \
bitops.o semaphore.o
+lib-$(CONFIG_SHA1_X86) += sha1.o
lib-$(CONFIG_X86_USE_3DNOW) += mmx.o
obj-$(CONFIG_SMP) += msr-on-cpu.o
diff --git a/arch/i386/lib/sha1.S b/arch/i386/lib/sha1.S
new file mode 100644
index 0000000..a84d829
--- /dev/null
+++ b/arch/i386/lib/sha1.S
@@ -0,0 +1,299 @@
+/*
+ * x86-optimized SHA1 hash algorithm (i486 and above)
+ *
+ * Originally from Nettle
+ * Ported from M4 to cpp by Benjamin Gilbert <bgilbert@cs.cmu.edu>
+ *
+ * Copyright (C) 2004, Niels Möller
+ * Copyright (C) 2006-2007 Carnegie Mellon University
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of version 2.1 of the GNU Lesser General Public License as
+ * published by the Free Software Foundation. A copy of the GNU Lesser General
+ * Public License should have been distributed along with this library in the
+ * file LICENSE.LGPL.
+ *
+ * This library is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/* Register usage */
+#define SA %eax
+#define SB %ebx
+#define SC %ecx
+#define SD %edx
+#define SE %ebp
+#define DATA %esp
+#define TMP %edi
+#define TMP2 %esi /* Used by SWAP and F3 */
+#define TMP3 64(%esp)
+
+/* Constants */
+#define K1VALUE $0x5A827999 /* Rounds 0-19 */
+#define K2VALUE $0x6ED9EBA1 /* Rounds 20-39 */
+#define K3VALUE $0x8F1BBCDC /* Rounds 40-59 */
+#define K4VALUE $0xCA62C1D6 /* Rounds 60-79 */
+
+/* Convert stack offsets in words to offsets in bytes */
+#define OFFSET(i) 4*(i)
+
+/* Reads the input via TMP2 into register, byteswaps it, and stores it in
+ the DATA array. */
+#define SWAP(index, register) \
+ movl OFFSET(index)(TMP2), register; \
+ bswap register; \
+ movl register, OFFSET(index)(DATA)
+
+/* Sets the workspace word at the given index to TMP. */
+#define CLEAR(index) \
+ movl TMP, OFFSET(index)(DATA)
+
+/* pushl/popl wrappers that update the DWARF unwind table */
+#define PUSH(regname) \
+ pushl %regname; \
+ CFI_ADJUST_CFA_OFFSET 4; \
+ CFI_REL_OFFSET regname, 0
+
+#define POP(regname) \
+ popl %regname; \
+ CFI_ADJUST_CFA_OFFSET -4; \
+ CFI_RESTORE regname
+
+/*
+ * expand(i) is the expansion function
+ *
+ * W[i] = (W[i - 16] ^ W[i - 14] ^ W[i - 8] ^ W[i - 3]) <<< 1
+ *
+ * where W[i] is stored in DATA[i mod 16].
+ *
+ * Result is stored back in W[i], and also left in TMP, the only
+ * register that is used.
+ */
+#define EXPAND(i) \
+ movl OFFSET(i % 16)(DATA), TMP; \
+ xorl OFFSET((i + 2) % 16)(DATA), TMP; \
+ xorl OFFSET((i + 8) % 16)(DATA), TMP; \
+ xorl OFFSET((i + 13) % 16)(DATA), TMP; \
+ roll $1, TMP; \
+ movl TMP, OFFSET(i % 16)(DATA)
+
+/*
+ * The f functions,
+ *
+ * f1(x,y,z) = z ^ (x & (y ^ z))
+ * f2(x,y,z) = x ^ y ^ z
+ * f3(x,y,z) = (x & y) | (z & (x | y))
+ * f4 = f2
+ *
+ * The macro Fk(x,y,z) computes = fk(x,y,z).
+ * Result is left in TMP.
+ */
+#define F1(x,y,z) \
+ movl z, TMP; \
+ xorl y, TMP; \
+ andl x, TMP; \
+ xorl z, TMP
+
+#define F2(x,y,z) \
+ movl x, TMP; \
+ xorl y, TMP; \
+ xorl z, TMP
+
+#define F3(x,y,z) \
+ movl x, TMP2; \
+ andl y, TMP2; \
+ movl x, TMP; \
+ orl y, TMP; \
+ andl z, TMP; \
+ orl TMP2, TMP
+
+/*
+ * The form of one sha1 round is
+ *
+ * a' = e + a <<< 5 + f( b, c, d ) + k + w;
+ * b' = a;
+ * c' = b <<< 30;
+ * d' = c;
+ * e' = d;
+ *
+ * where <<< denotes rotation. We permute our variables, so that we
+ * instead get
+ *
+ * e += a <<< 5 + f( b, c, d ) + k + w;
+ * b <<<= 30
+ *
+ * Using the TMP register for the rotate could be avoided, by rotating
+ * %a in place, adding, and then rotating back.
+ */
+#define ROUND(a,b,c,d,e,f,k,w) \
+ addl k, e; \
+ addl w, e; \
+ f(b,c,d); \
+ addl TMP, e; \
+ movl a, TMP; \
+ roll $5, TMP; \
+ addl TMP, e; \
+ roll $30, b;
+
+/* sha_transform(__u32 *digest, const char *in, __u32 *W) */
+/* The workspace argument is ignored; we don't have enough registers to handle
+ a workspace that isn't on our own stack. */
+.text
+ENTRY(sha_transform)
+ CFI_STARTPROC
+ /* save all registers that need to be saved */
+ PUSH(ebx) /* 80(%esp) */
+ PUSH(ebp) /* 76(%esp) */
+ PUSH(esi) /* 72(%esp) */
+ PUSH(edi) /* 68(%esp) */
+
+ subl $68, %esp /* %esp = W */
+ CFI_ADJUST_CFA_OFFSET 68
+
+ /* Load and byteswap data */
+ movl 92(%esp), TMP2
+
+ SWAP( 0, %eax); SWAP( 1, %ebx); SWAP( 2, %ecx); SWAP( 3, %edx)
+ SWAP( 4, %eax); SWAP( 5, %ebx); SWAP( 6, %ecx); SWAP( 7, %edx)
+ SWAP( 8, %eax); SWAP( 9, %ebx); SWAP(10, %ecx); SWAP(11, %edx)
+ SWAP(12, %eax); SWAP(13, %ebx); SWAP(14, %ecx); SWAP(15, %edx)
+
+ /* load the state vector */
+ movl 88(%esp),TMP
+ movl (TMP), SA
+ movl 4(TMP), SB
+ movl 8(TMP), SC
+ movl 12(TMP), SD
+ movl 16(TMP), SE
+
+ movl K1VALUE, TMP2
+ ROUND(SA, SB, SC, SD, SE, F1, TMP2, OFFSET( 0)(DATA))
+ ROUND(SE, SA, SB, SC, SD, F1, TMP2, OFFSET( 1)(DATA))
+ ROUND(SD, SE, SA, SB, SC, F1, TMP2, OFFSET( 2)(DATA))
+ ROUND(SC, SD, SE, SA, SB, F1, TMP2, OFFSET( 3)(DATA))
+ ROUND(SB, SC, SD, SE, SA, F1, TMP2, OFFSET( 4)(DATA))
+
+ ROUND(SA, SB, SC, SD, SE, F1, TMP2, OFFSET( 5)(DATA))
+ ROUND(SE, SA, SB, SC, SD, F1, TMP2, OFFSET( 6)(DATA))
+ ROUND(SD, SE, SA, SB, SC, F1, TMP2, OFFSET( 7)(DATA))
+ ROUND(SC, SD, SE, SA, SB, F1, TMP2, OFFSET( 8)(DATA))
+ ROUND(SB, SC, SD, SE, SA, F1, TMP2, OFFSET( 9)(DATA))
+
+ ROUND(SA, SB, SC, SD, SE, F1, TMP2, OFFSET(10)(DATA))
+ ROUND(SE, SA, SB, SC, SD, F1, TMP2, OFFSET(11)(DATA))
+ ROUND(SD, SE, SA, SB, SC, F1, TMP2, OFFSET(12)(DATA))
+ ROUND(SC, SD, SE, SA, SB, F1, TMP2, OFFSET(13)(DATA))
+ ROUND(SB, SC, SD, SE, SA, F1, TMP2, OFFSET(14)(DATA))
+
+ ROUND(SA, SB, SC, SD, SE, F1, TMP2, OFFSET(15)(DATA))
+ EXPAND(16); ROUND(SE, SA, SB, SC, SD, F1, TMP2, TMP)
+ EXPAND(17); ROUND(SD, SE, SA, SB, SC, F1, TMP2, TMP)
+ EXPAND(18); ROUND(SC, SD, SE, SA, SB, F1, TMP2, TMP)
+ EXPAND(19); ROUND(SB, SC, SD, SE, SA, F1, TMP2, TMP)
+
+ /* TMP2 is free to use in these rounds */
+ movl K2VALUE, TMP2
+ EXPAND(20); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+ EXPAND(21); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+ EXPAND(22); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+ EXPAND(23); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+ EXPAND(24); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+ EXPAND(25); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+ EXPAND(26); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+ EXPAND(27); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+ EXPAND(28); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+ EXPAND(29); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+ EXPAND(30); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+ EXPAND(31); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+ EXPAND(32); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+ EXPAND(33); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+ EXPAND(34); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+ EXPAND(35); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+ EXPAND(36); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+ EXPAND(37); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+ EXPAND(38); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+ EXPAND(39); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+ /* We have to put this constant on the stack */
+ movl K3VALUE, TMP3
+ EXPAND(40); ROUND(SA, SB, SC, SD, SE, F3, TMP3, TMP)
+ EXPAND(41); ROUND(SE, SA, SB, SC, SD, F3, TMP3, TMP)
+ EXPAND(42); ROUND(SD, SE, SA, SB, SC, F3, TMP3, TMP)
+ EXPAND(43); ROUND(SC, SD, SE, SA, SB, F3, TMP3, TMP)
+ EXPAND(44); ROUND(SB, SC, SD, SE, SA, F3, TMP3, TMP)
+
+ EXPAND(45); ROUND(SA, SB, SC, SD, SE, F3, TMP3, TMP)
+ EXPAND(46); ROUND(SE, SA, SB, SC, SD, F3, TMP3, TMP)
+ EXPAND(47); ROUND(SD, SE, SA, SB, SC, F3, TMP3, TMP)
+ EXPAND(48); ROUND(SC, SD, SE, SA, SB, F3, TMP3, TMP)
+ EXPAND(49); ROUND(SB, SC, SD, SE, SA, F3, TMP3, TMP)
+
+ EXPAND(50); ROUND(SA, SB, SC, SD, SE, F3, TMP3, TMP)
+ EXPAND(51); ROUND(SE, SA, SB, SC, SD, F3, TMP3, TMP)
+ EXPAND(52); ROUND(SD, SE, SA, SB, SC, F3, TMP3, TMP)
+ EXPAND(53); ROUND(SC, SD, SE, SA, SB, F3, TMP3, TMP)
+ EXPAND(54); ROUND(SB, SC, SD, SE, SA, F3, TMP3, TMP)
+
+ EXPAND(55); ROUND(SA, SB, SC, SD, SE, F3, TMP3, TMP)
+ EXPAND(56); ROUND(SE, SA, SB, SC, SD, F3, TMP3, TMP)
+ EXPAND(57); ROUND(SD, SE, SA, SB, SC, F3, TMP3, TMP)
+ EXPAND(58); ROUND(SC, SD, SE, SA, SB, F3, TMP3, TMP)
+ EXPAND(59); ROUND(SB, SC, SD, SE, SA, F3, TMP3, TMP)
+
+ movl K4VALUE, TMP2
+ EXPAND(60); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+ EXPAND(61); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+ EXPAND(62); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+ EXPAND(63); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+ EXPAND(64); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+ EXPAND(65); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+ EXPAND(66); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+ EXPAND(67); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+ EXPAND(68); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+ EXPAND(69); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+ EXPAND(70); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+ EXPAND(71); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+ EXPAND(72); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+ EXPAND(73); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+ EXPAND(74); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+ EXPAND(75); ROUND(SA, SB, SC, SD, SE, F2, TMP2, TMP)
+ EXPAND(76); ROUND(SE, SA, SB, SC, SD, F2, TMP2, TMP)
+ EXPAND(77); ROUND(SD, SE, SA, SB, SC, F2, TMP2, TMP)
+ EXPAND(78); ROUND(SC, SD, SE, SA, SB, F2, TMP2, TMP)
+ EXPAND(79); ROUND(SB, SC, SD, SE, SA, F2, TMP2, TMP)
+
+ /* Update the state vector */
+ movl 88(%esp),TMP
+ addl SA, (TMP)
+ addl SB, 4(TMP)
+ addl SC, 8(TMP)
+ addl SD, 12(TMP)
+ addl SE, 16(TMP)
+
+ /* Clear the workspace for security */
+ xorl TMP, TMP
+ CLEAR( 0); CLEAR( 1); CLEAR( 2); CLEAR( 3);
+ CLEAR( 4); CLEAR( 5); CLEAR( 6); CLEAR( 7);
+ CLEAR( 8); CLEAR( 9); CLEAR(10); CLEAR(11);
+ CLEAR(12); CLEAR(13); CLEAR(14); CLEAR(15);
+
+ addl $68, %esp
+ CFI_ADJUST_CFA_OFFSET -68
+ POP(edi)
+ POP(esi)
+ POP(ebp)
+ POP(ebx)
+ ret
+ CFI_ENDPROC
+ENDPROC(sha_transform)
diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h
index a172401..0da331c 100644
--- a/include/linux/cryptohash.h
+++ b/include/linux/cryptohash.h
@@ -1,8 +1,15 @@
#ifndef __CRYPTOHASH_H
#define __CRYPTOHASH_H
+#include <linux/linkage.h>
+
#define SHA_DIGEST_WORDS 5
+
+#if defined(CONFIG_SHA1_X86)
+#define SHA_WORKSPACE_WORDS 0
+#else
#define SHA_WORKSPACE_WORDS 80
+#endif
/**
* sha_init - initialize the vectors for a SHA1 digest
@@ -17,7 +24,7 @@ static inline void sha_init(__u32 *buf)
buf[4] = 0xc3d2e1f0;
}
-void sha_transform(__u32 *digest, const char *data, __u32 *W);
+asmlinkage void sha_transform(__u32 *digest, const char *data, __u32 *W);
__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]);
diff --git a/lib/Kconfig b/lib/Kconfig
index 2e7ae6b..69fdb64 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -124,4 +124,17 @@ config HAS_DMA
depends on !NO_DMA
default y
+#
+# Optimized SHA-1 support is autoconfigured
+#
+config SHA1_X86
+ bool
+ depends on (X86 || UML_X86) && !64BIT && X86_BSWAP
+ default y
+
+config SHA1_GENERIC
+ bool
+ depends on !SHA1_X86
+ default y
+
endmenu
diff --git a/lib/Makefile b/lib/Makefile
index c8c8e20..f67be29 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -5,10 +5,11 @@
lib-y := ctype.o string.o vsprintf.o cmdline.o \
rbtree.o radix-tree.o dump_stack.o \
idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
- sha1.o irq_regs.o reciprocal_div.o
+ irq_regs.o reciprocal_div.o
lib-$(CONFIG_MMU) += ioremap.o
lib-$(CONFIG_SMP) += cpumask.o
+lib-$(CONFIG_SHA1_GENERIC) += sha1.o
lib-y += kobject.o kref.o kobject_uevent.o klist.o
next prev parent reply other threads:[~2007-06-11 19:51 UTC|newest]
Thread overview: 21+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-06-08 21:42 [PATCH 0/3] Add optimized SHA-1 implementations for x86 and x86_64 Benjamin Gilbert
2007-06-08 21:42 ` [PATCH 1/3] [CRYPTO] Move sha_init() into cryptohash.h Benjamin Gilbert
2007-06-08 21:42 ` [PATCH 2/3] [CRYPTO] Add optimized SHA-1 implementation for i486+ Benjamin Gilbert
2007-06-09 7:32 ` Jan Engelhardt
2007-06-10 1:15 ` Benjamin Gilbert
2007-06-11 19:47 ` Benjamin Gilbert
2007-06-11 19:50 ` Benjamin Gilbert [this message]
2007-06-11 19:52 ` [PATCH] [CRYPTO] Add optimized SHA-1 implementation for x86_64 Benjamin Gilbert
2007-06-09 20:11 ` [PATCH 2/3] [CRYPTO] Add optimized SHA-1 implementation for i486+ Matt Mackall
2007-06-09 20:23 ` Jeff Garzik
2007-06-09 21:34 ` Matt Mackall
2007-06-10 0:33 ` Benjamin Gilbert
2007-06-10 13:59 ` Matt Mackall
2007-06-10 16:47 ` Benjamin Gilbert
2007-06-10 17:33 ` Matt Mackall
2007-06-11 17:39 ` Benjamin Gilbert
2007-06-11 12:04 ` Andi Kleen
2007-06-08 21:42 ` [PATCH 3/3] [CRYPTO] Add optimized SHA-1 implementation for x86_64 Benjamin Gilbert
2007-06-11 12:01 ` Andi Kleen
2007-06-11 19:45 ` Benjamin Gilbert
2007-06-11 20:30 ` [PATCH 0/3] Add optimized SHA-1 implementations for x86 and x86_64 Adrian Bunk
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20070611195051.25068.16858.stgit@dev \
--to=bgilbert@cs.cmu.edu \
--cc=akpm@linux-foundation.org \
--cc=herbert@gondor.apana.org.au \
--cc=jengelh@computergmbh.de \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.