[PATCH 3/4] Twofish cipher

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH  3/4] Twofish cipher - i586 assembler
@ 2006-06-04 13:16 Joachim Fritschi
  2006-06-04 22:49 ` Horst von Brand
                   ` (3 more replies)
  0 siblings, 4 replies; 9+ messages in thread
From: Joachim Fritschi @ 2006-06-04 13:16 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

This patch adds the twofish i586 assembler routine. 

Changes since the last version:
- The keysetup is now handled by the twofish_common.c (see patch 1 )

Correctness was verified with the tcrypt module and automated test scripts.

Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/Makefile linux-2.6.17-rc5.twofish3/arch/i386/crypto/Makefile
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/Makefile	2006-05-30 19:43:48.768000198 +0200
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/Makefile	2006-05-30 20:06:10.880715217 +0200
@@ -5,5 +5,8 @@
 #

 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
+obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o

 aes-i586-y := aes-i586-asm.o aes.o
+twofish-i586-y := twofish-i586-asm.o twofish.o ../../../crypto/twofish_common.o
+
diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish.c linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish.c
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish.c	2006-05-30 20:04:16.279682770 +0200
@@ -0,0 +1,88 @@
+/*
+ *  Glue Code for optimized 586 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+
+asmlinkage void twofish_enc_blk(void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(void *ctx, u8 *dst, const u8 *src);
+
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-i586",
+        .cra_priority       =   200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish-i586-asm.S linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish-i586-asm.S
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish-i586-asm.S	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish-i586-asm.S	2006-05-30 20:00:47.825035584 +0200
@@ -0,0 +1,377 @@
+	/***************************************************************************
+	*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+	*                                                                         *
+	*   This program is free software; you can redistribute it and/or modify  *
+	*   it under the terms of the GNU General Public License as published by  *
+	*   the Free Software Foundation; either version 2 of the License, or     *
+	*   (at your option) any later version.                                   *
+	*                                                                         *
+	*   This program is distributed in the hope that it will be useful,       *
+	*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+	*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+	*   GNU General Public License for more details.                          *
+	*                                                                         *
+	*   You should have received a copy of the GNU General Public License     *
+	*   along with this program; if not, write to the                         *
+	*   Free Software Foundation, Inc.,                                       *
+	*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+	***************************************************************************/
+
+.file "twofish-i586-asm.S"
+.text
+
+/* return address at 0 */
+
+#define in_blk    12  /* input byte array address parameter*/
+#define out_blk   8  /* output byte array address parameter*/
+#define ctx       4  /* Twofish context structure */
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+
+
+
+/* register aliases for better reading */
+
+#define r0  eax
+#define r1  ebx
+#define r2  ecx
+#define r3  edx
+#define r4  esi
+#define r5  edi
+
+
+
+#define eaxl  al
+#define eaxh  ah
+#define ebxl  bl
+#define ebxh  bh
+#define ecxl  cl
+#define ecxh  ch
+#define edxl  dl
+#define edxh  dh
+
+
+#define _h(reg) reg##h
+#define h(reg) _h(reg)
+
+#define _l(reg) reg##l
+#define l(reg) _l(reg)
+
+/*load input word with whitening */
+
+#define get_input(input_adress,offset,dst,context)\
+	load_input(input_adress,offset,dst);\
+	input_whitening(dst,context,offset);
+
+#define get_dec_input(input_adress,offset,dst,context)\
+	load_dec_input(input_adress,offset,dst);\
+	dec_input_whitening(dst,context,offset);
+
+/* perform output whitening and save value. Old value is saved */
+#define process_output(dst,src,tmp,context,offset)\
+	output_whitening(src,tmp,context,offset);\
+	save_output(dst,offset,tmp);
+
+/* perform output whitening and save value. The old value is destroyed */
+#define destructive_process_output(dst,src,context,offset)\
+	destructive_output_whitening(src,context,offset);\
+	save_output(dst,offset,src);
+
+/* perform output whitening and save value. Old value is saved */
+#define process_dec_output(dst,src,tmp,context,offset)\
+	dec_output_whitening(src,tmp,context,offset);\
+	save_output(dst,offset,tmp);
+
+/* perform output whitening and save value. The old value is destroyed */
+#define destructive_process_dec_output(dst,src,context,offset)\
+	destructive_dec_output_whitening(src,context,offset);\
+	save_output(dst,offset,src);
+
+/* load input */
+#define load_input(input_adress,offset,dst)\
+	mov	offset(%input_adress), %dst;
+
+#define load_dec_input(input_adress,offset,dst)\
+	mov	offset(%input_adress), %dst;
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(%context), %src;\
+
+#define dec_input_whitening(src,context,offset)\
+	xor	w+16+offset(%context), %src;
+
+/* performs decryption output whitening */
+/* Result is in dst, the original value is still intact */
+#define dec_output_whitening(src,dst,context,offset)\
+	mov	w+offset(%context), %dst;\
+	xor	%src,%dst;
+
+/* performs encryption output whitening */
+/* Result is in dst, the original value is still intact */
+#define output_whitening(src,dst,context,offset)\
+	mov	w+16+offset(%context), %dst;\
+	xor	%src,%dst;
+
+/* performs encryption output whitening */
+/* Result is in src, the original value is destroyed */
+#define destructive_output_whitening(src,context,offset)\
+	xor	w+16+offset(%context), %src;\
+
+/* performs decryption output whitening */
+/* Result is in src, the original value is destroyed */
+#define destructive_dec_output_whitening(src,context,offset)\
+	xor	w+offset(%context), %src;\
+
+/* save the output values */
+#define save_output(output_adress,offset,src)\
+	mov	%src,offset(%output_adress);\
+
+/* load sbox values */
+#define load_s(context,sbox,index,dst)\
+	xor	sbox(%context,%index,4),%dst;\
+
+/* performs "a" sbox transformation */
+/* input value is still intact but rotated */
+#define g1(context,input,dst,tmp)\
+	xor	%dst,%dst;\
+ 	movzx	%l(input),%tmp;\
+	load_s(context,s0,tmp,dst);\
+	movzx	%h(input),%tmp;\
+	load_s(context,s1,tmp,dst);\
+	ror	$16,%input;\
+	movzx	%l(input),%tmp;\
+	load_s(context,s2,tmp,dst);\
+	movzx	%h(input),%tmp;\
+	load_s(context,s3,tmp,dst);\
+
+/* performs "b" sbox transformation */
+/* input value is still intact but rotated */
+#define g2(context,input,dst,tmp)\
+	xor	%dst,%dst;\
+	movzx	%l(input),%tmp;\
+	load_s(context,s1,tmp,dst);\
+	movzx	%h(input),%tmp;\
+	load_s(context,s2,tmp,dst);\
+	ror	$16,%input;\
+	movzx	%l(input),%tmp;\
+	load_s(context,s3,tmp,dst);\
+	movzx	%h(input),%tmp;\
+	load_s(context,s0,tmp,dst);\
+	;
+
+/* Pseudo-Hadamard Transform (PHT) */
+#define pht(a,b)\
+	add	%b,%a;\
+	add	%a,%b;
+
+/* Adds the round keys to a and b */
+#define round_key(context,a,b,round)\
+	add	k+round(%context),%a;\
+	add	k+4+round(%context),%b;\
+
+
+/* Input in a and b , output in fa fb */
+/* a and b are pre-rotated for the next round */
+#define f_function(context,a,b,fa,fb,tmp3,round)\
+	g1(context,a,fa,tmp3);\
+	g2(context,b,fb,tmp3);\
+	ror	$16,%a;\
+	ror	$15,%b;\
+	pht(fa,fb);\
+	round_key(context,fa,fb,round);
+
+
+/* Input in a and b , output in fa fb */
+/* a and b are pre-rotated for the next round */
+#define reverse_f_function(context,a,b,fa,fb,tmp3,round)\
+	g1(context,a,fa,tmp3);\
+	g2(context,b,fb,tmp3);\
+	ror	$15,%a;\
+	ror	$16,%b;\
+	pht(fa,fb);\
+	round_key(context,fa,fb,round);
+
+
+/* Output in a and b */
+/* olda contains the a of the round before, cuts down stack use to one push / pop per round for the oldb */
+/* b is already pre-rotated (rol 1) in the f function to save one instruction */
+#define round(context,a,b,tmp1,tmp2,tmp3,olda,round)\
+	f_function(context,a,b,tmp1,tmp2,tmp3,round);\
+	mov	%b,%tmp3;\
+	pop	%b;\
+	push	%tmp3;\
+	xor	%tmp2,%b;\
+	xor	%tmp1,%olda;\
+	ror	$1,%olda;
+
+
+/* Output in a and b */
+/* olda contains the a of the round before, cuts down stack use to one push / pop per round for the oldb */
+/* a is already pre-rotated (rol 1) in the f function to save one instruction */
+#define dec_round(context,a,b,tmp1,tmp2,tmp3,olda,round)\
+	reverse_f_function(context,a,b,tmp1,tmp2,tmp3,round);\
+	xor	%tmp1,%olda;\
+	mov	%b,%tmp3;\
+	pop	%b;\
+	push	%tmp3;\
+	xor	%tmp2,%b;\
+	ror	$1,%b;
+
+
+
+.align 4
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+
+
+twofish_enc_blk:
+	push	%ebp			/* save registers according to calling convention*/
+	push    %r1
+	push    %esi
+	push    %edi
+
+
+	mov	ctx + 16(%esp),%ebp	/* abuse the base pointer: set new base pointer to the crypto ctx */
+	mov     in_blk+16(%esp),%r5	/* input adress in r5 */
+
+	get_input(r5,a_offset,r0,ebp);
+	get_input(r5,b_offset,r1,ebp);
+
+	/* To save a few instructions round 0 is unrolled */
+
+	f_function(ebp,r0,r1,r2,r3,r4,0);	//ouput in r2 r3
+	push	%r1;
+
+	get_input(r5,c_offset,r1,ebp);
+	get_input(r5,d_offset,r4,ebp);
+	xor	%r1,%r2;\
+	ror	$1,%r2;\
+	rol	$1,%r4;\
+	xor	%r4,%r3;
+
+
+	round(ebp,r2,r3,r4,r5,r1,r0,1*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,2*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,3*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,4*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,5*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,6*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,7*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,8*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,9*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,10*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,11*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,12*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,13*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,14*8);
+
+	/* To save a few instructions round 15 is unrolled */
+
+	mov	out_blk+20(%esp),%r1;
+	process_output(r1,r2,r4,ebp,a_offset);
+	process_output(r1,r3,r4,ebp,b_offset);
+	g1(ebp,r2,r4,r1);
+	g2(ebp,r3,r5,r1);
+	pht(r4,r5);
+	round_key(ebp,r4,r5,15*8);
+	pop	%r1;
+	xor	%r5,%r1;
+	xor	%r4,%r0;
+	ror	$1,%r0;
+
+	mov	out_blk+16(%esp),%r3
+	destructive_process_output(r3,r0,ebp,c_offset);
+	destructive_process_output(r3,r1,ebp,d_offset);
+
+	pop	%edi
+	pop	%esi
+	pop	%r1
+	pop	%ebp
+	mov	$1,%r0
+	ret
+
+twofish_dec_blk:
+	push	%ebp			/* save  registers according to calling convention*/
+	push    %r1
+	push    %esi
+	push    %edi
+
+
+	mov	ctx + 16(%esp),%ebp	/* abuse the base pointer: set new base pointer to the crypto ctx */
+	mov     in_blk+16(%esp),%r5	/* input address in r5 */
+
+	/* To save a few instructions round 15 is unrolled */
+	get_dec_input(r5,a_offset,r0,ebp);
+	get_dec_input(r5,b_offset,r1,ebp);
+
+	reverse_f_function(ebp,r0,r1,r2,r3,r4,15*8);
+
+        push %r1; /* save oldb for next round */
+
+	get_dec_input(r5,c_offset,r1,ebp);
+	get_dec_input(r5,d_offset,r4,ebp);
+	xor	%r4,%r3;
+	ror	$1,%r3;
+	rol	$1,%r1;
+	xor	%r1,%r2;
+
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,14*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,13*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,12*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,11*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,10*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,9*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,8*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,7*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,6*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,5*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,4*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,3*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,2*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,8);
+
+	/* To save a few instructions round 0 is unrolled */
+	mov	out_blk+20(%esp),%r1;
+	process_dec_output(r1,r2,r4,ebp,a_offset);
+	process_dec_output(r1,r3,r4,ebp,b_offset);
+	g1(ebp,r2,r4,r1);
+	g2(ebp,r3,r5,r1);
+	pht(r4,r5);
+	round_key(ebp,r4,r5,0);
+
+	pop	%r1;
+	xor	%r1,%r5
+	ror	$1,%r5
+	xor	%r0,%r4
+
+
+
+	mov	out_blk+16(%esp),%r3
+	destructive_process_dec_output(r3,r4,ebp,c_offset);
+	destructive_process_dec_output(r3,r5,ebp,d_offset);
+
+	pop	%edi
+	pop	%esi
+	pop	%r1
+	pop	%ebp
+	mov	$1,%r0
+	ret
+
+
+
+
diff -uprN linux-2.6.17-rc5.twofish2/crypto/Kconfig linux-2.6.17-rc5.twofish3/crypto/Kconfig
--- linux-2.6.17-rc5.twofish2/crypto/Kconfig	2006-05-30 19:44:02.607579102 +0200
+++ linux-2.6.17-rc5.twofish3/crypto/Kconfig	2006-05-30 20:00:47.841035197 +0200
@@ -142,6 +142,20 @@ config CRYPTO_TWOFISH
 	  See also:
 	  <http://www.schneier.com/twofish.html>

+config CRYPTO_TWOFISH_586
+	tristate "Twofish cipher algorithms (i586)"
+	depends on CRYPTO && ((X86 || UML_X86) && !64BIT)
+	help
+	  Twofish cipher algorithm.
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/4] Twofish cipher - i586 assembler
  2006-06-04 13:16 [PATCH 3/4] Twofish cipher - i586 assembler Joachim Fritschi
@ 2006-06-04 22:49 ` Horst von Brand
  2006-06-05 10:47   ` Joachim Fritschi
  2006-06-07 19:38 ` Joachim Fritschi
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 9+ messages in thread
From: Horst von Brand @ 2006-06-04 22:49 UTC (permalink / raw)
  To: Joachim Fritschi; +Cc: linux-kernel, linux-crypto, herbert, ak

Joachim Fritschi <jfritschi@freenet.de> wrote:
> This patch adds the twofish i586 assembler routine. 

What performance impact does this have on a variety of machines? Is twofish
used enough for this to be relevant?
-- 
Dr. Horst H. von Brand                   User #22616 counter.li.org
Departamento de Informatica                     Fono: +56 32 654431
Universidad Tecnica Federico Santa Maria              +56 32 654239
Casilla 110-V, Valparaiso, Chile                Fax:  +56 32 797513

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 3/4] Twofish cipher - i586 assembler
  2006-06-04 22:49 ` Horst von Brand
@ 2006-06-05 10:47   ` Joachim Fritschi
  0 siblings, 0 replies; 9+ messages in thread
From: Joachim Fritschi @ 2006-06-05 10:47 UTC (permalink / raw)
  To: Horst von Brand; +Cc: linux-kernel, linux-crypto, herbert, ak

On Monday 05 June 2006 00:49, Horst von Brand wrote:
> Joachim Fritschi <jfritschi@freenet.de> wrote:
> > This patch adds the twofish i586 assembler routine.
>
> What performance impact does this have on a variety of machines? 

Here are the outputs from the tcrypt speedtests. They haven't changed much 
since the last patch:

http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-c-i586.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-asm-i586.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-c-x86_64.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-asm-x86_64.txt

Summary for cycles used for CBC encrypt decrypt (256bit / 8k blocks) assembler 
vs. generic-c:

i586 encrypt:   - 17%
i586 decrypt:   -24%
x86_64 encrypt: -22%
x86_64 decrypt: -17%

The numbers vary a bit with different blocksizes / keylength and per test.

I also did some filesystem benchmarks (bonnie++) with various ciphers. Most 
write tests maxed out my drives writing to disk.  But at least for the read 
speed you can see some notable performance improvements:
(Note: The x86 and x86_64 numbers are not comparable since the tests were done 
on different machines)

http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060531_160442_x86.html

Summary:
Sequential read speed improved between 25-32%
Sequential write speed improved at least 15% but the disk maxed out
Twofish 256 is a little bit faster than AES 128

http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060601_113747_x86_64.html

Summary:
Sequential read speed improved 13%
Sequential write speed maxed out the drives

> Is twofish used enough for this to be relevant?

I don't have hard facts about that, but I have been using it for many years,
ever since Suse included it in their release 7.0 a few years back. I don't know
what the current status in the various distributions is. I guess it is probably
the second most used cipher behind aes, since the performance for 256bit
keylength is better than or close to the aes speed. Some cryptanalysts believe
it to be cryptographically superior to aes, but that is probably a matter of
opinion and of the parameters you consider. In the NIST competition it was third
behind aes and serpent. Serpent itself is probably more secure but lacks
performance. Twofish is probably also one of the ciphers you choose from if
you don't trust the US government to choose the right one for you. :)

Regards,

Joachim

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH  3/4] Twofish cipher - i586 assembler
  2006-06-04 13:16 [PATCH 3/4] Twofish cipher - i586 assembler Joachim Fritschi
  2006-06-04 22:49 ` Horst von Brand
@ 2006-06-07 19:38 ` Joachim Fritschi
  2006-06-16 11:59   ` Joachim Fritschi
  2006-06-17 10:30 ` Joachim Fritschi
  2006-06-19 14:12 ` Joachim Fritschi
  3 siblings, 1 reply; 9+ messages in thread
From: Joachim Fritschi @ 2006-06-07 19:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
> This patch adds the twofish i586 assembler routine. 
> 
> Changes since the last version:
> - The keysetup is now handled by the twofish_common.c (see patch 1 )
> 
> Correctness was verified with the tcrypt module and automated test scripts.
My first mail was wordwrapped. This one should be unwrapped and working.
It also includes minor readability fixes.

Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/Makefile linux-2.6.17-rc5.twofish3/arch/i386/crypto/Makefile
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/Makefile	2006-05-30 19:43:48.768000198 +0200
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/Makefile	2006-05-30 20:06:10.880715217 +0200
@@ -5,5 +5,8 @@
 #

 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
+obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o

 aes-i586-y := aes-i586-asm.o aes.o
+twofish-i586-y := twofish-i586-asm.o twofish.o ../../../crypto/twofish_common.o
+
diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish.c linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish.c
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish.c	2006-05-30 20:04:16.279682770 +0200
@@ -0,0 +1,88 @@
+/*
+ *  Glue Code for optimized 586 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+
+asmlinkage void twofish_enc_blk(void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(void *ctx, u8 *dst, const u8 *src);
+
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-i586",
+        .cra_priority       =   200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish-i586-asm.S linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish-i586-asm.S
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish-i586-asm.S	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish-i586-asm.S	2006-06-07 17:35:12.626818884 +0200
@@ -0,0 +1,381 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+
+.file "twofish-i586-asm.S"
+.text
+
+/* stack offsets of the arguments at function entry; return address at 0 */
+
+#define in_blk    12  /* input byte array address parameter*/
+#define out_blk   8  /* output byte array address parameter*/
+#define ctx       4  /* Twofish context structure */
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+
+
+
+/* register aliases for macro substitution */
+
+#define r0  eax
+#define r1  ebx
+#define r2  ecx
+#define r3  edx
+#define r4  esi
+#define r5  edi
+
+
+
+#define eaxl  al
+#define eaxh  ah
+#define ebxl  bl
+#define ebxh  bh
+#define ecxl  cl
+#define ecxh  ch
+#define edxl  dl
+#define edxh  dh
+
+
+#define _h(reg) reg##h
+#define h(reg) _h(reg)
+
+#define _l(reg) reg##l
+#define l(reg) _l(reg)
+
+/*load input word with whitening */
+
+#define get_input(input_adress,offset,dst,context)\
+	load_input(input_adress,offset,dst);\
+	input_whitening(dst,context,offset);
+
+#define get_dec_input(input_adress,offset,dst,context)\
+	load_dec_input(input_adress,offset,dst);\
+	dec_input_whitening(dst,context,offset);
+
+/* whiten src into tmp and store tmp to the output; src is left unchanged */
+#define process_output(dst,src,tmp,context,offset)\
+	output_whitening(src,tmp,context,offset);\
+	save_output(dst,offset,tmp);
+
+/* perform output whitening and save value. The old value is destroyed */
+#define destructive_process_output(dst,src,context,offset)\
+	destructive_output_whitening(src,context,offset);\
+	save_output(dst,offset,src);
+
+/* decryption variant: whiten src into tmp and store tmp; src is unchanged */
+#define process_dec_output(dst,src,tmp,context,offset)\
+	dec_output_whitening(src,tmp,context,offset);\
+	save_output(dst,offset,tmp);
+
+/* decryption variant: whiten src in place and store it. src is destroyed */
+#define destructive_process_dec_output(dst,src,context,offset)\
+	destructive_dec_output_whitening(src,context,offset);\
+	save_output(dst,offset,src);
+
+/* load input */
+#define load_input(input_adress,offset,dst)\
+	mov	offset(%input_adress), %dst;
+
+#define load_dec_input(input_adress,offset,dst)\
+	mov	offset(%input_adress), %dst;
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(%context), %src;\
+
+#define dec_input_whitening(src,context,offset)\
+	xor	w+16+offset(%context), %src;
+
+/* performs decryption output whitening */
+/* Result is in dst, the original value is still intact */
+#define dec_output_whitening(src,dst,context,offset)\
+	mov	w+offset(%context), %dst;\
+	xor	%src,	%dst;
+
+/* performs encryption output whitening */
+/* Result is in dst, the original value is still intact */
+#define output_whitening(src,dst,context,offset)\
+	mov	w+16+offset(%context), %dst;\
+	xor	%src,	%dst;
+
+/* performs encryption output whitening */
+/* Result is in src, the original value is destroyed */
+#define destructive_output_whitening(src,context,offset)\
+	xor	w+16+offset(%context), %src;\
+
+/* performs decryption output whitening */
+/* Result is in src, the original value is destroyed */
+#define destructive_dec_output_whitening(src,context,offset)\
+	xor	w+offset(%context), %src;\
+
+/* save the output values */
+#define save_output(output_adress,offset,src)\
+	mov	%src,offset(%output_adress);\
+
+/* xor the sbox entry selected by index into dst */
+#define load_s(context,sbox,index,dst)\
+	xor	sbox(%context,%index,4),%dst;\
+
+/* performs "a" sbox transformation */
+/* input value is preserved but left rotated by 16 bits */
+#define g1(context,input,dst,tmp)\
+	xor	%dst,		%dst;\
+ 	movzx	%l(input),	%tmp;\
+	load_s(context,s0,tmp,dst);\
+	movzx	%h(input),	%tmp;\
+	load_s(context,s1,	tmp,dst);\
+	ror	$16,		%input;\
+	movzx	%l(input),	%tmp;\
+	load_s(context,s2,tmp,dst);\
+	movzx	%h(input),	%tmp;\
+	load_s(context,s3,tmp,dst);\
+
+/* performs "b" sbox transformation */
+/* input value is preserved but left rotated by 16 bits */
+#define g2(context,input,dst,tmp)\
+	xor	%dst,		%dst;\
+	movzx	%l(input),	%tmp;\
+	load_s(context,s1,tmp,dst);\
+	movzx	%h(input),	%tmp;\
+	load_s(context,s2,tmp,dst);\
+	ror	$16,		%input;\
+	movzx	%l(input),	%tmp;\
+	load_s(context,s3,tmp,dst);\
+	movzx	%h(input),	%tmp;\
+	load_s(context,s0,tmp,dst);\
+	;
+
+/* Pseudo-Hadamard Transform: a = a + b, then b = a + b (using the new a) */
+#define pht(a,b)\
+	add	%b,	%a;\
+	add	%a,	%b;
+
+/* Adds the round keys to a and b */
+#define round_key(context,a,b,round)\
+	add	k+round(%context),%a;\
+	add	k+4+round(%context),%b;\
+
+
+/* Input in a and b , output in fa fb */
+/* a is restored (ror 16 twice); b is left pre-rotated (rol 1) for the next round */
+#define f_function(context,a,b,fa,fb,tmp3,round)\
+	g1(context,a,fa,tmp3);\
+	g2(context,b,fb,tmp3);\
+	ror	$16,	%a;\
+	ror	$15,	%b;\
+	pht(fa,fb);\
+	round_key(context,fa,fb,round);
+
+
+/* Input in a and b , output in fa fb */
+/* b is restored (ror 16 twice); a is left pre-rotated (rol 1) for the next round */
+#define reverse_f_function(context,a,b,fa,fb,tmp3,round)\
+	g1(context,a,fa,tmp3);\
+	g2(context,b,fb,tmp3);\
+	ror	$15,	%a;\
+	ror	$16,	%b;\
+	pht(fa,fb);\
+	round_key(context,fa,fb,round);
+
+
+/* Output in olda and b (they become the a and b of the next round) */
+/* olda contains the a of the round before, cuts down
+stack use to one push / pop per round for the oldb */
+/* b is already pre-rotated (rol 1) in the f function to save one instruction */
+#define round(context,a,b,tmp1,tmp2,tmp3,olda,round)\
+	f_function(context,a,b,tmp1,tmp2,tmp3,round);\
+	mov	%b,	%tmp3;\
+	pop	%b;\
+	push	%tmp3;\
+	xor	%tmp2,	%b;\
+	xor	%tmp1,	%olda;\
+	ror	$1,	%olda;
+
+
+/* Output in olda and b (they become the a and b of the next round) */
+/* olda contains the a of the round before, cuts down
+stack use to one push / pop per round for the oldb */
+/* a is already pre-rotated (rol 1) in the f function to save one instruction */
+#define dec_round(context,a,b,tmp1,tmp2,tmp3,olda,round)\
+	reverse_f_function(context,a,b,tmp1,tmp2,tmp3,round);\
+	xor	%tmp1,	%olda;\
+	mov	%b,	%tmp3;\
+	pop	%b;\
+	push	%tmp3;\
+	xor	%tmp2,	%b;\
+	ror	$1,	%b;
+
+
+
+.align 4
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+
+
+twofish_enc_blk:
+	/* save registers according to calling convention*/
+	push	%ebp
+	push    %r1
+	push    %esi
+	push    %edi
+
+	/* abuse the base pointer: set new base pointer to the crypto ctx */
+	mov	ctx + 16(%esp),	%ebp
+	mov     in_blk+16(%esp),%r5	/* input address in r5 */
+
+	get_input(r5,a_offset,r0,ebp);
+	get_input(r5,b_offset,r1,ebp);
+
+	/* To save a few instructions the first round (key offset 0) is unrolled */
+
+	f_function(ebp,r0,r1,r2,r3,r4,0);	//output in r2 r3
+	push	%r1;
+
+	get_input(r5,c_offset,r1,ebp);
+	get_input(r5,d_offset,r4,ebp);
+	xor	%r1,	%r2;\
+	ror	$1,	%r2;\
+	rol	$1,	%r4;\
+	xor	%r4,	%r3;
+
+
+	round(ebp,r2,r3,r4,r5,r1,r0,1*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,2*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,3*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,4*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,5*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,6*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,7*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,8*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,9*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,10*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,11*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,12*8);
+	round(ebp,r2,r3,r4,r5,r1,r0,13*8);
+	round(ebp,r0,r3,r4,r5,r1,r2,14*8);
+
+	/* To save a few instructions the last round (key offset 15*8) is unrolled */
+
+	mov	out_blk+20(%esp),%r1;
+	process_output(r1,r2,r4,ebp,a_offset);
+	process_output(r1,r3,r4,ebp,b_offset);
+	g1(ebp,r2,r4,r1);
+	g2(ebp,r3,r5,r1);
+	pht(r4,r5);
+	round_key(ebp,r4,r5,15*8);
+	pop	%r1;
+	xor	%r5,	%r1;
+	xor	%r4,	%r0;
+	ror	$1,	%r0;
+
+	mov	out_blk+16(%esp),%r3
+	destructive_process_output(r3,r0,ebp,c_offset);
+	destructive_process_output(r3,r1,ebp,d_offset);
+
+	pop	%edi
+	pop	%esi
+	pop	%r1
+	pop	%ebp
+	mov	$1,%r0
+	ret
+
+twofish_dec_blk:
+	/* save  registers according to calling convention*/
+	push	%ebp
+	push    %r1
+	push    %esi
+	push    %edi
+
+	/* abuse the base pointer: set new base pointer to the crypto ctx */
+	mov	ctx + 16(%esp),	%ebp
+	mov     in_blk+16(%esp),%r5	/* input address in r5 */
+
+	/* To save a few instructions round 15 is unrolled */
+	get_dec_input(r5,a_offset,r0,ebp);
+	get_dec_input(r5,b_offset,r1,ebp);
+
+	reverse_f_function(ebp,r0,r1,r2,r3,r4,15*8);
+
+        push %r1; /* save oldb for next round */
+
+	get_dec_input(r5,c_offset,r1,ebp);
+	get_dec_input(r5,d_offset,r4,ebp);
+	xor	%r4,	%r3;
+	ror	$1,	%r3;
+	rol	$1,	%r1;
+	xor	%r1,	%r2;
+
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,14*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,13*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,12*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,11*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,10*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,9*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,8*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,7*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,6*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,5*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,4*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,3*8);
+	dec_round(ebp,r2,r3,r4,r5,r1,r0,2*8);
+	dec_round(ebp,r0,r3,r4,r5,r1,r2,8);
+
+	/* To save a few instructions round 0 is unrolled */
+	mov	out_blk+20(%esp),%r1;
+	process_dec_output(r1,r2,r4,ebp,a_offset);
+	process_dec_output(r1,r3,r4,ebp,b_offset);
+	g1(ebp,r2,r4,r1);
+	g2(ebp,r3,r5,r1);
+	pht(r4,r5);
+	round_key(ebp,r4,r5,0);
+
+	pop	%r1;
+	xor	%r1,	%r5
+	ror	$1,	%r5
+	xor	%r0,	%r4
+
+
+
+	mov	out_blk+16(%esp),%r3
+	destructive_process_dec_output(r3,r4,ebp,c_offset);
+	destructive_process_dec_output(r3,r5,ebp,d_offset);
+
+	pop	%edi
+	pop	%esi
+	pop	%r1
+	pop	%ebp
+	mov	$1,%r0
+	ret
+
+
+
+
diff -uprN linux-2.6.17-rc5.twofish2/crypto/Kconfig linux-2.6.17-rc5.twofish3/crypto/Kconfig
--- linux-2.6.17-rc5.twofish2/crypto/Kconfig	2006-05-30 19:44:02.607579102 +0200
+++ linux-2.6.17-rc5.twofish3/crypto/Kconfig	2006-05-30 20:00:47.841035197 +0200
@@ -142,6 +142,20 @@ config CRYPTO_TWOFISH
 	  See also:
 	  <http://www.schneier.com/twofish.html>

+config CRYPTO_TWOFISH_586
+	tristate "Twofish cipher algorithms (i586)"
+	depends on CRYPTO && ((X86 || UML_X86) && !64BIT)
+	help
+	  Twofish cipher algorithm.
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH  3/4] Twofish cipher - i586 assembler
  2006-06-07 19:38 ` Joachim Fritschi
@ 2006-06-16 11:59   ` Joachim Fritschi
  0 siblings, 0 replies; 9+ messages in thread
From: Joachim Fritschi @ 2006-06-16 11:59 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

Update patch for the i586 twofish assembler implementation.

Changes since last version:
-Updated to the new twofish_common setup
-Complete rewrite of the code according to the feedback I received for the
x86_64 patch (thanks linux@horizon.com)

The patch passed the tcrypt tests and automated filesystem tests.
This rewrite resulted in a nice performance increase over my last patch.

Short summary of the tcrypt benchmarks:

Twofish Assembler vs. Twofish C (256bit 8kb block CBC)
encrypt: -33% Cycles
decrypt: -45% Cycles

Twofish Assembler vs. AES Assembler (128bit 8kb block CBC)
encrypt: +3%  Cycles
decrypt: -22% Cycles

Twofish Assembler vs. AES Assembler (256bit 8kb block CBC)
encrypt: -20% Cycles
decrypt: -36% Cycles

Full Output:
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-asm-i586.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-c-i586.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-aes-asm-i586.txt


Here is another bonnie++ benchmark with encrypted filesystems. All runs with
the twofish assembler modules max out the drivespeed. It should give some
idea what the module can do for encrypted filesystem performance even though
you can't see the full numbers.

http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060611_205432_x86.html


Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/Makefile linux-2.6.17-rc5.twofish3/arch/i386/crypto/Makefile
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/Makefile	2006-06-11 15:58:36.991988374 +0200
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/Makefile	2006-06-11 16:05:51.675813834 +0200
@@ -5,5 +5,8 @@
 # 
 
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
+obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 
 aes-i586-y := aes-i586-asm.o aes.o
+twofish-i586-y := twofish-i586-asm.o twofish.o
+
diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish.c linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish.c
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish.c	2006-06-11 16:03:56.669852049 +0200
@@ -0,0 +1,88 @@
+/*
+ *  Glue Code for optimized 586 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+
+asmlinkage void twofish_enc_blk(void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(void *ctx, u8 *dst, const u8 *src);
+
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-i586",
+        .cra_priority       =   200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish-i586-asm.S linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish-i586-asm.S
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish-i586-asm.S	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish-i586-asm.S	2006-06-11 21:49:26.508548778 +0200
@@ -0,0 +1,404 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+
+.file "twofish-i586-asm.S"
+.text
+
+/* stack offsets of the arguments at function entry; return address at 0 */
+
+#define in_blk    12  /* input byte array address parameter*/
+#define out_blk   8  /* output byte array address parameter*/
+#define ctx       4  /* Twofish context structure */
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0D    %eax
+#define R0B    %al
+#define R0H    %ah
+
+#define R1D    %ebx
+#define R1B    %bl
+#define R1H    %bh
+
+#define R2D    %ecx
+#define R2B    %cl
+#define R2H    %ch
+
+#define R3D    %edx
+#define R3B    %dl
+#define R3H    %dh
+
+
+/* performs input whitening (whitening key words 0-3) */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(context),	src;
+
+/* performs output whitening (whitening key words 4-7) */
+#define output_whitening(src,context,offset)\
+	xor	w+16+offset(context),	src;
+
+/*
+a input register containing a (rotated 16)
+b input register containing b
+c input register containing c
+d register receiving the new d; the old d (already rol $1) is popped from the stack
+operations on a and b are interleaved to increase performance
+*/
+#define encrypt_round(a,b,c,d,round)\
+movzx	b ## B,		%edi;\
+mov	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+mov	s2(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	b ## H,		%edi;\
+ror	$15,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+movzx	a ## H,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+round(%ebp),	%esi;\
+pop	%edi;\
+push	b ## D;\
+xor	%esi,		c ## D;\
+rol	$15,		c ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;
+
+
+/*
+a input register containing a
+b input register containing b
+c input register containing c
+d register receiving the new d; the old d (already rol $1) is popped from the stack
+operations on a and b are interleaved to increase performance
+*/
+#define encrypt_first_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%ebp,%edi,4),	%esi;\
+movzx	b ## B,		%edi;\
+mov	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+xor	s2(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$15,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+pop	%edi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+round(%ebp),	%esi;\
+push	b ## D;\
+xor	%esi,		c ## D;\
+rol	$15,		c ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;
+
+#define encrypt_last_round(a,b,c,d,round)\
+movzx	b ## B,		%edi;\
+mov	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+mov	s2(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+movzx	a ## H,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+round(%ebp),	%esi;\
+pop	%edi;\
+xor	%esi,		c ## D;\
+ror	$1,		c ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d register receiving the new d; the old d is popped from the stack
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%ebp,%edi,4),	%esi;\
+movzx	b ## B,		%edi;\
+mov	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+movzx	a ## B,		%edi;\
+xor	s2(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$15,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+xor	s2(%ebp,%edi,4),d ## D;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+4+round(%ebp),d ## D;\
+pop	%edi;\
+push	b ## D;\
+xor	%edi,		d ## D;\
+rol	$15,		d ## D;\
+add	k+round(%ebp),	%esi;\
+xor	%esi,		c ## D;
+
+
+/*
+a input register containing a
+b input register containing b
+c input register containing c (already rol $1)
+d register receiving the new d; the old d is popped from the stack
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_first_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%ebp,%edi,4),	%esi;\
+movzx	b ## B,		%edi;\
+mov	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+xor	s2(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$15,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),d ## D;\
+pop	%edi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;\
+rol	$15,		d ## D;\
+push	b ## D;\
+add	k+round(%ebp),	%esi;\
+xor	%esi,		c ## D;
+
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d register receiving the new d; the old d is popped from the stack
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_last_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%ebp,%edi,4),	%esi;\
+movzx	b ## B,		%edi;\
+mov	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+movzx	a ## B,		%edi;\
+xor	s2(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+xor	s2(%ebp,%edi,4),d ## D;\
+pop	%edi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;\
+ror	$1,		d ## D;\
+add	k+round(%ebp),	%esi;\
+xor	%esi,		c ## D;
+	
+.align 4
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+
+
+twofish_enc_blk:
+	push	%ebp			/* save registers according to calling convention*/
+	push    %ebx
+	push    %esi			
+	push    %edi
+		
+		
+	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto ctx */
+	mov     in_blk+16(%esp),%edi	/* input address in edi */
+
+	mov	(%edi),		%eax
+	mov	b_offset(%edi),	%ebx
+	mov	c_offset(%edi),	%ecx
+	mov	d_offset(%edi),	%edx
+	input_whitening(%eax,%ebp,a_offset)
+	input_whitening(%ebx,%ebp,b_offset)
+	input_whitening(%ecx,%ebp,c_offset)
+	input_whitening(%edx,%ebp,d_offset)
+	rol	$1,	%edx
+	push	%edx
+
+	encrypt_first_round(R0,R1,R2,R3,0);
+	encrypt_round(R2,R3,R0,R1,8);
+	encrypt_round(R0,R1,R2,R3,2*8);
+	encrypt_round(R2,R3,R0,R1,3*8);
+	encrypt_round(R0,R1,R2,R3,4*8);
+	encrypt_round(R2,R3,R0,R1,5*8);
+	encrypt_round(R0,R1,R2,R3,6*8);
+	encrypt_round(R2,R3,R0,R1,7*8);
+	encrypt_round(R0,R1,R2,R3,8*8);
+	encrypt_round(R2,R3,R0,R1,9*8);
+	encrypt_round(R0,R1,R2,R3,10*8);
+	encrypt_round(R2,R3,R0,R1,11*8);
+	encrypt_round(R0,R1,R2,R3,12*8);
+	encrypt_round(R2,R3,R0,R1,13*8);
+	encrypt_round(R0,R1,R2,R3,14*8);
+	encrypt_last_round(R2,R3,R0,R1,15*8);
+
+
+	output_whitening(%eax,%ebp,c_offset)
+	output_whitening(%ebx,%ebp,d_offset)
+	output_whitening(%ecx,%ebp,a_offset)
+	output_whitening(%edx,%ebp,b_offset)
+	mov	out_blk+16(%esp),%edi;
+	mov	%eax,		c_offset(%edi)
+	mov	%ebx,		d_offset(%edi)
+	mov	%ecx,		(%edi)
+	mov	%edx,		b_offset(%edi)
+
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	mov	$1,	%eax
+	ret
+	
+twofish_dec_blk:	
+	push	%ebp			/* save registers according to calling convention*/
+	push    %ebx
+	push    %esi			
+	push    %edi
+		
+		
+	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto ctx */
+	mov     in_blk+16(%esp),%edi	/* input address in edi */
+
+	mov	(%edi),		%eax
+	mov	b_offset(%edi),	%ebx
+	mov	c_offset(%edi),	%ecx
+	mov	d_offset(%edi),	%edx
+	output_whitening(%eax,%ebp,a_offset)
+	output_whitening(%ebx,%ebp,b_offset)
+	output_whitening(%ecx,%ebp,c_offset)
+	output_whitening(%edx,%ebp,d_offset)
+	rol	$1,	%ecx
+	push	%edx
+
+	decrypt_first_round(R0,R1,R2,R3,15*8);
+	decrypt_round(R2,R3,R0,R1,14*8);
+	decrypt_round(R0,R1,R2,R3,13*8);
+	decrypt_round(R2,R3,R0,R1,12*8);
+	decrypt_round(R0,R1,R2,R3,11*8);
+	decrypt_round(R2,R3,R0,R1,10*8);
+	decrypt_round(R0,R1,R2,R3,9*8);
+	decrypt_round(R2,R3,R0,R1,8*8);
+	decrypt_round(R0,R1,R2,R3,7*8);
+	decrypt_round(R2,R3,R0,R1,6*8);
+	decrypt_round(R0,R1,R2,R3,5*8);
+	decrypt_round(R2,R3,R0,R1,4*8);
+	decrypt_round(R0,R1,R2,R3,3*8);
+	decrypt_round(R2,R3,R0,R1,2*8);
+	decrypt_round(R0,R1,R2,R3,1*8);
+	decrypt_last_round(R2,R3,R0,R1,0);
+
+
+	input_whitening(%eax,%ebp,c_offset)
+	input_whitening(%ebx,%ebp,d_offset)
+	input_whitening(%ecx,%ebp,a_offset)
+	input_whitening(%edx,%ebp,b_offset)
+	mov	out_blk+16(%esp),%edi;
+	mov	%eax,		c_offset(%edi)
+	mov	%ebx,		d_offset(%edi)
+	mov	%ecx,		(%edi)
+	mov	%edx,		b_offset(%edi)
+
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	mov	$1,	%eax
+	ret
diff -uprN linux-2.6.17-rc5.twofish2/crypto/Kconfig linux-2.6.17-rc5.twofish3/crypto/Kconfig
--- linux-2.6.17-rc5.twofish2/crypto/Kconfig	2006-06-11 15:58:39.219982140 +0200
+++ linux-2.6.17-rc5.twofish3/crypto/Kconfig	2006-06-11 16:05:19.938782275 +0200
@@ -150,6 +150,21 @@ config CRYPTO_TWOFISH_COMMON
 	  Common parts of the Twofish cipher algorithm.
 	  
 
+config CRYPTO_TWOFISH_586
+	tristate "Twofish cipher algorithms (i586)"
+	depends on CRYPTO && ((X86 || UML_X86) && !64BIT)
+	select CRYPTO_TWOFISH_COMMON
+	help
+	  Twofish cipher algorithm.
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH  3/4] Twofish cipher - i586 assembler
  2006-06-04 13:16 [PATCH 3/4] Twofish cipher - i586 assembler Joachim Fritschi
  2006-06-04 22:49 ` Horst von Brand
  2006-06-07 19:38 ` Joachim Fritschi
@ 2006-06-17 10:30 ` Joachim Fritschi
  2006-06-19 14:12 ` Joachim Fritschi
  3 siblings, 0 replies; 9+ messages in thread
From: Joachim Fritschi @ 2006-06-17 10:30 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

After receiving some more feedback from linux@horizon.com, I have revised my
patch a bit and made some cosmetic changes. The first_round macros are now
eliminated and the pop instruction has been moved forward a bit.

The patch passed the tcrypt tests and automated filesystem tests.


Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/Makefile linux-2.6.17-rc5.twofish3/arch/i386/crypto/Makefile
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/Makefile	2006-06-11 15:58:36.991988374 +0200
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/Makefile	2006-06-11 16:05:51.675813834 +0200
@@ -5,5 +5,8 @@
 # 
 
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
+obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 
 aes-i586-y := aes-i586-asm.o aes.o
+twofish-i586-y := twofish-i586-asm.o twofish.o
+
diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish.c linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish.c
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish.c	2006-06-11 16:03:56.669852049 +0200
@@ -0,0 +1,88 @@
+/*
+ *  Glue Code for optimized 586 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+
+asmlinkage void twofish_enc_blk(void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(void *ctx, u8 *dst, const u8 *src);
+
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-i586",
+        .cra_priority       =   200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
diff -uprN linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish-i586-asm.S linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish-i586-asm.S
--- linux-2.6.17-rc5.twofish2/arch/i386/crypto/twofish-i586-asm.S	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish3/arch/i386/crypto/twofish-i586-asm.S	2006-06-17 11:08:32.536993468 +0200
@@ -0,0 +1,331 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+
+.file "twofish-i586-asm.S"
+.text
+
+/* return address at 0 */
+
+#define in_blk    12  /* input byte array address parameter*/
+#define out_blk   8  /* output byte array address parameter*/
+#define ctx       4  /* Twofish context structure */
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0D    %eax
+#define R0B    %al
+#define R0H    %ah
+
+#define R1D    %ebx
+#define R1B    %bl
+#define R1H    %bh
+
+#define R2D    %ecx
+#define R2B    %cl
+#define R2H    %ch
+
+#define R3D    %edx
+#define R3B    %dl
+#define R3H    %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(context),	src;
+
+/* performs output whitening */
+#define output_whitening(src,context,offset)\
+	xor	w+16+offset(context),	src;
+
+/*
+a input register containing a (rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+*/
+#define encrypt_round(a,b,c,d,round)\
+push	d ## D;\
+movzx	b ## B,		%edi;\
+mov	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+mov	s2(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	b ## H,		%edi;\
+ror	$15,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+movzx	a ## H,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+pop	%edi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+round(%ebp),	%esi;\
+xor	%esi,		c ## D;\
+rol	$15,		c ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;
+
+/*
+a input register containing a (rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+last round has different rotations for the output preparation
+*/
+#define encrypt_last_round(a,b,c,d,round)\
+push	d ## D;\
+movzx	b ## B,		%edi;\
+mov	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+mov	s2(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+movzx	a ## H,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+pop	%edi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+round(%ebp),	%esi;\
+xor	%esi,		c ## D;\
+ror	$1,		c ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d input register containing d
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_round(a,b,c,d,round)\
+push	c ## D;\
+movzx	a ## B,		%edi;\
+mov	(%ebp,%edi,4),	c ## D;\
+movzx	b ## B,		%edi;\
+mov	s3(%ebp,%edi,4),%esi;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%ebp,%edi,4),c ## D;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	a ## B,		%edi;\
+xor	s2(%ebp,%edi,4),c ## D;\
+movzx	b ## B,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+movzx	a ## H,		%edi;\
+ror	$15,		a ## D;\
+xor	s3(%ebp,%edi,4),c ## D;\
+movzx	b ## H,		%edi;\
+xor	s2(%ebp,%edi,4),%esi;\
+pop	%edi;\
+add	%esi,		c ## D;\
+add	c ## D,		%esi;\
+add	k+round(%ebp),	c ## D;\
+xor	%edi,		c ## D;\
+add	k+4+round(%ebp),%esi;\
+xor	%esi,		d ## D;\
+rol	$15,		d ## D;
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d input register containing d
+operations on a and b are interleaved to increase performance
+last round has different rotations for the output preparation
+*/
+#define decrypt_last_round(a,b,c,d,round)\
+push	c ## D;\
+movzx	a ## B,		%edi;\
+mov	(%ebp,%edi,4),	c ## D;\
+movzx	b ## B,		%edi;\
+mov	s3(%ebp,%edi,4),%esi;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%ebp,%edi,4),c ## D;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	a ## B,		%edi;\
+xor	s2(%ebp,%edi,4),c ## D;\
+movzx	b ## B,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),c ## D;\
+movzx	b ## H,		%edi;\
+xor	s2(%ebp,%edi,4),%esi;\
+pop	%edi;\
+add	%esi,		c ## D;\
+add	c ## D,		%esi;\
+add	k+round(%ebp),	c ## D;\
+xor	%edi,		c ## D;\
+add	k+4+round(%ebp),%esi;\
+xor	%esi,		d ## D;\
+ror	$1,		d ## D;
+
+.align 4
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+twofish_enc_blk:
+	push	%ebp			/* save registers according to calling convention*/
+	push    %ebx
+	push    %esi			
+	push    %edi
+
+	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto ctx */
+	mov     in_blk+16(%esp),%edi	/* input address in edi */
+
+	mov	(%edi),		%eax
+	mov	b_offset(%edi),	%ebx
+	mov	c_offset(%edi),	%ecx
+	mov	d_offset(%edi),	%edx
+	input_whitening(%eax,%ebp,a_offset)
+	ror	$16,	%eax
+	input_whitening(%ebx,%ebp,b_offset)
+	input_whitening(%ecx,%ebp,c_offset)
+	input_whitening(%edx,%ebp,d_offset)
+	rol	$1,	%edx
+
+	encrypt_round(R0,R1,R2,R3,0);
+	encrypt_round(R2,R3,R0,R1,8);
+	encrypt_round(R0,R1,R2,R3,2*8);
+	encrypt_round(R2,R3,R0,R1,3*8);
+	encrypt_round(R0,R1,R2,R3,4*8);
+	encrypt_round(R2,R3,R0,R1,5*8);
+	encrypt_round(R0,R1,R2,R3,6*8);
+	encrypt_round(R2,R3,R0,R1,7*8);
+	encrypt_round(R0,R1,R2,R3,8*8);
+	encrypt_round(R2,R3,R0,R1,9*8);
+	encrypt_round(R0,R1,R2,R3,10*8);
+	encrypt_round(R2,R3,R0,R1,11*8);
+	encrypt_round(R0,R1,R2,R3,12*8);
+	encrypt_round(R2,R3,R0,R1,13*8);
+	encrypt_round(R0,R1,R2,R3,14*8);
+	encrypt_last_round(R2,R3,R0,R1,15*8);
+
+	output_whitening(%eax,%ebp,c_offset)
+	output_whitening(%ebx,%ebp,d_offset)
+	output_whitening(%ecx,%ebp,a_offset)
+	output_whitening(%edx,%ebp,b_offset)
+	mov	out_blk+16(%esp),%edi;
+	mov	%eax,		c_offset(%edi)
+	mov	%ebx,		d_offset(%edi)
+	mov	%ecx,		(%edi)
+	mov	%edx,		b_offset(%edi)
+
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	mov	$1,	%eax
+	ret
+
+twofish_dec_blk:	
+	push	%ebp			/* save registers according to calling convention*/
+	push    %ebx
+	push    %esi			
+	push    %edi
+
+
+	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto ctx */
+	mov     in_blk+16(%esp),%edi	/* input address in edi */
+
+	mov	(%edi),		%eax
+	mov	b_offset(%edi),	%ebx
+	mov	c_offset(%edi),	%ecx
+	mov	d_offset(%edi),	%edx
+	output_whitening(%eax,%ebp,a_offset)
+	output_whitening(%ebx,%ebp,b_offset)
+	ror	$16,	%ebx
+	output_whitening(%ecx,%ebp,c_offset)
+	output_whitening(%edx,%ebp,d_offset)
+	rol	$1,	%ecx
+
+	decrypt_round(R0,R1,R2,R3,15*8);
+	decrypt_round(R2,R3,R0,R1,14*8);
+	decrypt_round(R0,R1,R2,R3,13*8);
+	decrypt_round(R2,R3,R0,R1,12*8);
+	decrypt_round(R0,R1,R2,R3,11*8);
+	decrypt_round(R2,R3,R0,R1,10*8);
+	decrypt_round(R0,R1,R2,R3,9*8);
+	decrypt_round(R2,R3,R0,R1,8*8);
+	decrypt_round(R0,R1,R2,R3,7*8);
+	decrypt_round(R2,R3,R0,R1,6*8);
+	decrypt_round(R0,R1,R2,R3,5*8);
+	decrypt_round(R2,R3,R0,R1,4*8);
+	decrypt_round(R0,R1,R2,R3,3*8);
+	decrypt_round(R2,R3,R0,R1,2*8);
+	decrypt_round(R0,R1,R2,R3,1*8);
+	decrypt_last_round(R2,R3,R0,R1,0);
+
+	input_whitening(%eax,%ebp,c_offset)
+	input_whitening(%ebx,%ebp,d_offset)
+	input_whitening(%ecx,%ebp,a_offset)
+	input_whitening(%edx,%ebp,b_offset)
+	mov	out_blk+16(%esp),%edi;
+	mov	%eax,		c_offset(%edi)
+	mov	%ebx,		d_offset(%edi)
+	mov	%ecx,		(%edi)
+	mov	%edx,		b_offset(%edi)
+
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	mov	$1,	%eax
+	ret
diff -uprN linux-2.6.17-rc5.twofish2/crypto/Kconfig linux-2.6.17-rc5.twofish3/crypto/Kconfig
--- linux-2.6.17-rc5.twofish2/crypto/Kconfig	2006-06-11 15:58:39.219982140 +0200
+++ linux-2.6.17-rc5.twofish3/crypto/Kconfig	2006-06-11 16:05:19.938782275 +0200
@@ -150,6 +150,21 @@ config CRYPTO_TWOFISH_COMMON
 	  Common parts of the Twofish cipher algorithm.
 	  
 
+config CRYPTO_TWOFISH_586
+	tristate "Twofish cipher algorithms (i586)"
+	depends on CRYPTO && ((X86 || UML_X86) && !64BIT)
+	select CRYPTO_TWOFISH_COMMON
+	help
+	  Twofish cipher algorithm.
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH  3/4] Twofish cipher - i586 assembler
  2006-06-04 13:16 [PATCH 3/4] Twofish cipher - i586 assembler Joachim Fritschi
                   ` (2 preceding siblings ...)
  2006-06-17 10:30 ` Joachim Fritschi
@ 2006-06-19 14:12 ` Joachim Fritschi
  3 siblings, 0 replies; 9+ messages in thread
From: Joachim Fritschi @ 2006-06-19 14:12 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

This patch is now based on the cryptodev tree using the new cryptoapi (crypto  tfm
 instead of the crypto ctx as parameter).

The module passed the tcrypt tests and testscripts.

Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

 arch/i386/crypto/Makefile           |    3 
 arch/i386/crypto/twofish-i586-asm.S |  335 +++++++++++++++++++++++++++++++++++
 arch/i386/crypto/twofish.c          |   88 +++++++++
 crypto/Kconfig                      |   15 ++
 4 files changed, 441 insertions(+), 0 deletions(-)

diff --git a/arch/i386/crypto/Makefile b/arch/i386/crypto/Makefile
index 103c353..3fd19af 100644
--- a/arch/i386/crypto/Makefile
+++ b/arch/i386/crypto/Makefile
@@ -5,5 +5,8 @@ # Arch-specific CryptoAPI modules.
 # 
 
 obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
+obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
 
 aes-i586-y := aes-i586-asm.o aes.o
+twofish-i586-y := twofish-i586-asm.o twofish.o
+
diff --git a/arch/i386/crypto/twofish-i586-asm.S b/arch/i386/crypto/twofish-i586-asm.S
new file mode 100644
index 0000000..0bae145
--- /dev/null
+++ b/arch/i386/crypto/twofish-i586-asm.S
@@ -0,0 +1,335 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+
+.file "twofish-i586-asm.S"
+.text
+
+#include <asm/asm-offsets.h>
+
+/* return address at 0 */
+
+#define in_blk    12  /* input byte array address parameter*/
+#define out_blk   8  /* output byte array address parameter*/
+#define tfm       4  /* Twofish context structure */
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0D    %eax
+#define R0B    %al
+#define R0H    %ah
+
+#define R1D    %ebx
+#define R1B    %bl
+#define R1H    %bh
+
+#define R2D    %ecx
+#define R2B    %cl
+#define R2H    %ch
+
+#define R3D    %edx
+#define R3B    %dl
+#define R3H    %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(context),	src;
+
+/* performs output whitening */
+#define output_whitening(src,context,offset)\
+	xor	w+16+offset(context),	src;
+
+/*
+a input register containing a (rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+*/
+#define encrypt_round(a,b,c,d,round)\
+push	d ## D;\
+movzx	b ## B,		%edi;\
+mov	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+mov	s2(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	b ## H,		%edi;\
+ror	$15,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+movzx	a ## H,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+pop	%edi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+round(%ebp),	%esi;\
+xor	%esi,		c ## D;\
+rol	$15,		c ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;
+
+/*
+a input register containing a (rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+last round has different rotations for the output preparation
+*/
+#define encrypt_last_round(a,b,c,d,round)\
+push	d ## D;\
+movzx	b ## B,		%edi;\
+mov	s1(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+mov	s2(%ebp,%edi,4),%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%ebp,%edi,4),d ## D;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),%esi;\
+movzx	b ## B,		%edi;\
+xor	s3(%ebp,%edi,4),d ## D;\
+movzx	a ## B,		%edi;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	d ## D;\
+movzx	a ## H,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+pop	%edi;\
+add	d ## D,		%esi;\
+add	%esi,		d ## D;\
+add	k+round(%ebp),	%esi;\
+xor	%esi,		c ## D;\
+ror	$1,		c ## D;\
+add	k+4+round(%ebp),d ## D;\
+xor	%edi,		d ## D;
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d input register containing d
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_round(a,b,c,d,round)\
+push	c ## D;\
+movzx	a ## B,		%edi;\
+mov	(%ebp,%edi,4),	c ## D;\
+movzx	b ## B,		%edi;\
+mov	s3(%ebp,%edi,4),%esi;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%ebp,%edi,4),c ## D;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	a ## B,		%edi;\
+xor	s2(%ebp,%edi,4),c ## D;\
+movzx	b ## B,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+movzx	a ## H,		%edi;\
+ror	$15,		a ## D;\
+xor	s3(%ebp,%edi,4),c ## D;\
+movzx	b ## H,		%edi;\
+xor	s2(%ebp,%edi,4),%esi;\
+pop	%edi;\
+add	%esi,		c ## D;\
+add	c ## D,		%esi;\
+add	k+round(%ebp),	c ## D;\
+xor	%edi,		c ## D;\
+add	k+4+round(%ebp),%esi;\
+xor	%esi,		d ## D;\
+rol	$15,		d ## D;
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d input register containing d
+operations on a and b are interleaved to increase performance
+last round has different rotations for the output preparation
+*/
+#define decrypt_last_round(a,b,c,d,round)\
+push	c ## D;\
+movzx	a ## B,		%edi;\
+mov	(%ebp,%edi,4),	c ## D;\
+movzx	b ## B,		%edi;\
+mov	s3(%ebp,%edi,4),%esi;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%ebp,%edi,4),c ## D;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%ebp,%edi,4),	%esi;\
+movzx	a ## B,		%edi;\
+xor	s2(%ebp,%edi,4),c ## D;\
+movzx	b ## B,		%edi;\
+xor	s1(%ebp,%edi,4),%esi;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%ebp,%edi,4),c ## D;\
+movzx	b ## H,		%edi;\
+xor	s2(%ebp,%edi,4),%esi;\
+pop	%edi;\
+add	%esi,		c ## D;\
+add	c ## D,		%esi;\
+add	k+round(%ebp),	c ## D;\
+xor	%edi,		c ## D;\
+add	k+4+round(%ebp),%esi;\
+xor	%esi,		d ## D;\
+ror	$1,		d ## D;
+
+.align 4
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+twofish_enc_blk:
+	push	%ebp			/* save registers according to calling convention*/
+	push    %ebx
+	push    %esi			
+	push    %edi
+
+	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
+	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
+	mov     in_blk+16(%esp),%edi	/* input address in edi */
+
+	mov	(%edi),		%eax
+	mov	b_offset(%edi),	%ebx
+	mov	c_offset(%edi),	%ecx
+	mov	d_offset(%edi),	%edx
+	input_whitening(%eax,%ebp,a_offset)
+	ror	$16,	%eax
+	input_whitening(%ebx,%ebp,b_offset)
+	input_whitening(%ecx,%ebp,c_offset)
+	input_whitening(%edx,%ebp,d_offset)
+	rol	$1,	%edx
+
+	encrypt_round(R0,R1,R2,R3,0);
+	encrypt_round(R2,R3,R0,R1,8);
+	encrypt_round(R0,R1,R2,R3,2*8);
+	encrypt_round(R2,R3,R0,R1,3*8);
+	encrypt_round(R0,R1,R2,R3,4*8);
+	encrypt_round(R2,R3,R0,R1,5*8);
+	encrypt_round(R0,R1,R2,R3,6*8);
+	encrypt_round(R2,R3,R0,R1,7*8);
+	encrypt_round(R0,R1,R2,R3,8*8);
+	encrypt_round(R2,R3,R0,R1,9*8);
+	encrypt_round(R0,R1,R2,R3,10*8);
+	encrypt_round(R2,R3,R0,R1,11*8);
+	encrypt_round(R0,R1,R2,R3,12*8);
+	encrypt_round(R2,R3,R0,R1,13*8);
+	encrypt_round(R0,R1,R2,R3,14*8);
+	encrypt_last_round(R2,R3,R0,R1,15*8);
+
+	output_whitening(%eax,%ebp,c_offset)
+	output_whitening(%ebx,%ebp,d_offset)
+	output_whitening(%ecx,%ebp,a_offset)
+	output_whitening(%edx,%ebp,b_offset)
+	mov	out_blk+16(%esp),%edi;
+	mov	%eax,		c_offset(%edi)
+	mov	%ebx,		d_offset(%edi)
+	mov	%ecx,		(%edi)
+	mov	%edx,		b_offset(%edi)
+
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	mov	$1,	%eax
+	ret
+
+twofish_dec_blk:	
+	push	%ebp			/* save registers according to calling convention*/
+	push    %ebx
+	push    %esi			
+	push    %edi
+
+
+	mov	tfm + 16(%esp),	%ebp	/* abuse the base pointer: set new base pointer to the crypto tfm */
+	add	$crypto_tfm_ctx_offset, %ebp	/* ctx address */
+	mov     in_blk+16(%esp),%edi	/* input address in edi */
+
+	mov	(%edi),		%eax
+	mov	b_offset(%edi),	%ebx
+	mov	c_offset(%edi),	%ecx
+	mov	d_offset(%edi),	%edx
+	output_whitening(%eax,%ebp,a_offset)
+	output_whitening(%ebx,%ebp,b_offset)
+	ror	$16,	%ebx
+	output_whitening(%ecx,%ebp,c_offset)
+	output_whitening(%edx,%ebp,d_offset)
+	rol	$1,	%ecx
+
+	decrypt_round(R0,R1,R2,R3,15*8);
+	decrypt_round(R2,R3,R0,R1,14*8);
+	decrypt_round(R0,R1,R2,R3,13*8);
+	decrypt_round(R2,R3,R0,R1,12*8);
+	decrypt_round(R0,R1,R2,R3,11*8);
+	decrypt_round(R2,R3,R0,R1,10*8);
+	decrypt_round(R0,R1,R2,R3,9*8);
+	decrypt_round(R2,R3,R0,R1,8*8);
+	decrypt_round(R0,R1,R2,R3,7*8);
+	decrypt_round(R2,R3,R0,R1,6*8);
+	decrypt_round(R0,R1,R2,R3,5*8);
+	decrypt_round(R2,R3,R0,R1,4*8);
+	decrypt_round(R0,R1,R2,R3,3*8);
+	decrypt_round(R2,R3,R0,R1,2*8);
+	decrypt_round(R0,R1,R2,R3,1*8);
+	decrypt_last_round(R2,R3,R0,R1,0);
+
+	input_whitening(%eax,%ebp,c_offset)
+	input_whitening(%ebx,%ebp,d_offset)
+	input_whitening(%ecx,%ebp,a_offset)
+	input_whitening(%edx,%ebp,b_offset)
+	mov	out_blk+16(%esp),%edi;
+	mov	%eax,		c_offset(%edi)
+	mov	%ebx,		d_offset(%edi)
+	mov	%ecx,		(%edi)
+	mov	%edx,		b_offset(%edi)
+
+	pop	%edi
+	pop	%esi
+	pop	%ebx
+	pop	%ebp
+	mov	$1,	%eax
+	ret
diff --git a/arch/i386/crypto/twofish.c b/arch/i386/crypto/twofish.c
new file mode 100644
index 0000000..084d14c
--- /dev/null
+++ b/arch/i386/crypto/twofish.c
@@ -0,0 +1,88 @@
+/*
+ *  Glue Code for optimized 586 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+
+asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-i586",
+        .cra_priority       =   200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, i586 asm optimized");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index f364260..28b203b 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -150,6 +150,21 @@ config CRYPTO_TWOFISH_COMMON
 	  Common parts of the Twofish cipher algorithm shared by the 
 	  generic c and the assembler implementations.
 
+config CRYPTO_TWOFISH_586
+	tristate "Twofish cipher algorithms (i586)"
+	depends on CRYPTO && ((X86 || UML_X86) && !64BIT)
+	select CRYPTO_TWOFISH_COMMON
+	help
+	  Twofish cipher algorithm.
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
@ 2006-06-08 17:35 Joachim Fritschi
  2006-06-16 17:29 ` [PATCH 3/4] Twofish cipher - i586 assembler linux
  0 siblings, 1 reply; 9+ messages in thread
From: Joachim Fritschi @ 2006-06-08 17:35 UTC (permalink / raw)
  To: linux; +Cc: linux-kernel

On Thursday 08 June 2006 09:13, linux@horizon.com wrote:
> The following seem to me to be simplifications, and so
> make the code more readable.  For what it's worth...
>
> - It would be nice to delete the R6 and R7 definitions in order to
>   make clear that %esp and %ebp are NOT used as general-purpose registers.
Sounds reasonable
> - ctx, tmp1, tmp2, new1 and new2 are only used in "## D" form.
>   It would be simpler to include the D suffix at the call site
>   and remove 5 characters per use from the macro expansion.
Could lead to some additional confusion because it masks all the different 
8,32 and 64 bit operations from the users. One could not just look at the 
macro and see what "subregister" is used.
> - That would let you get rid of half of the macro definitions.
>   You only need R0, R1, R2, R3, R8 and R9.
>   The rest could simply be replaced by %r10d, etc.
Would lead to mixed names. Don't know if that would really help.
> - Changing the argument order to (a, b, newa, newb, olda, oldb)
>   would make it clearer what's changing and what's staying the
>   same between rounds.
>
> - You could also get rid of the ## D suffixes on olda and oldb, but that's
>   of more dubious benefit.
>
> - How about making the macros more specific and not passing in the
>   constant arguments ctx, tmp1, tmp2, key1 and key2?

I like to pass all registers I use as parameters. That way you can always keep
track of which registers are used, and it is harder to screw up with
unwanted side effects.
>
>
> Having looked at the code that much, I started seeing potential
> code improvements:
>
> - Why schedule the loading of the round keys so far in advance?
>   They're going to be L1 hits anyway, and anything amd64 can do lots
>   of code reordering.  There are no stores to act as fences.
>   You could just do an xor straight into where you need the values.
I guess you are right. Will be fixed
> - Why copy a -> olda and b->oldb at the end of each round?  Why not
>   just do
>
> +	encrypt_last_round(R0,R1,R8,R9,R2,R3,R5,  0,R10,R11,R12,R13);
> +	encrypt_last_round(R2,R3,R0,R1,R8,R9,R5,  8,R10,R11,R12,R13);
> +	encrypt_last_round(R8,R9,R2,R3,R0,R1,R5,2*8,R10,R11,R12,R13);
> +	encrypt_last_round(R0,R1,R8,R9,R2,R3,R5,3*8,R10,R11,R12,R13);
> +	encrypt_last_round(R2,R3,R0,R1,R8,R9,R5,4*8,R10,R11,R12,R13);
> etc.?
>
> Oh, but wait!  The ONLY inputs, AFAICT, to newa are
> +	mov	olda  ## D,	newa ## D;\
> +	mov	oldb ## D,	newb ## D;\
> +	xor	key1 ## D,	newa ## D;\
> +	xor	key2 ## D,	newb ## D;\
>
> So why not just make newa and olda the same register, thereby removing that
> mov as well, and replace the other uses of newa and newb in the loop
> with appropriate temps?
>
>
> That would make the round function:
> +/*
> + * The twofish round function.
> + * This is very close to a standard Feistel cipher:
> + * (c,d) ^= F(a,b,round_key)
> + * But twofish adds one-bit rotations.
> + * Other registers used:
> + * %rdi points to the context structure including the key schedule
> + * %r9d is a temporary.
> + * %r10d and %r11d hold the F() function output.
> + */
> +#define\
> + encrypt_round(a,b,c,d,round)\
> +	movzx	a ## B,		%r9d;\
> +	mov	s0(%rdi,%r9,4),	%r10d;\
> +	movzx	a ## H,		%r9d;\
> +	ror	$16,		a ## D;\
> +	xor     s1(%rdi,%r9,4),	%r10d;\
> +	movzx	a ## B,		%r9d;\
> +	xor     s2(%rdi,%r9,4),	%r10d;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s3(%rdi,%r9,4),	%r10d;\
> +	ror	$16,		a ## D;\
> +	movzx	b ## B,		%r9d;\
> +	mov     s1(%rdi,%r9,4),	%r11d;\
> +	movzx	b ## H,		%r9d;\
> +	ror	$16,		b ## D;\
> +	xor	s2(%rdi,%r9,4),	%r11d;\
> +	movzx	b ## B,		%r9d;\
> +	xor	s3(%rdi,%r9,4),	%r11d;\
> +	movzx	b ## H,		%r9d;\
> +	xor     s0(%rdi,%r9,4),	%r11d;\
> +	ror	$15,		b ## D;\
> +	add	%r11d,		%r10d;\
> +	add	%r10d,		%r11d;\
> +	add	k+round(%rdi),	%r10d;\
> +	add	k+4+round(%rdi),%r11d;\
> +	xor	%r10d,		c ## D;\
> +	xor	%r11d,		d ## D;\
> +	ror	$1,		c ## D
>
> Notice that this has saved three registers (%r8, %r12, %r13) and
> eliminated six mov instructions.
>
> (Admittedly, unbenchmarked and untested.)
You can't use the 8-bit high registers with REX-prefixed registers (r8+ and any
64-bit register). I guess this could be fixed by moving the crypto ctx or the
output address to a REX register and having %esi or %edi as the temp register for
the sbox index. Somehow I never considered %esi or %edi as a possible target
for the 8-bit high operation and was convinced the only way to avoid using 4 *
8-bit rotates was using temporary registers.
> If I wanted to worry about scheduling, I might move the b-side S-box
> lookups ahead of the a-side to give that "ror $1,c" a smidgen more time
> to complete, and then interleave them:
>
> +#define\
> + encrypt_round(a,b,c,d,round)\
> +	movzx	b ## B,		%r9d;\
> +	mov     s1(%rdi,%r9,4),	%r11d;\
> +	movzx	a ## B,		%r9d;\
> +	mov	s0(%rdi,%r9,4),	%r10d;\
> +	movzx	b ## H,		%r9d;\
> +	xor	s2(%rdi,%r9,4),	%r11d;\
> +	ror	$16,		b ## D;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s1(%rdi,%r9,4),	%r10d;\
> +	ror	$16,		a ## D;\
> +	movzx	b ## B,		%r9d;\
> +	xor	s3(%rdi,%r9,4),	%r11d;\
> +	movzx	a ## B,		%r9d;\
> +	xor     s2(%rdi,%r9,4),	%r10d;\
> +	movzx	b ## H,		%r9d;\
> +	xor     s0(%rdi,%r9,4),	%r11d;\
> +	ror	$15,		b ## D;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s3(%rdi,%r9,4),	%r10d;\
> +	ror	$16,		a ## D;\
> +	add	%r11d,		%r10d;\
> +	add	%r10d,		%r11d;\
> +	add	k+round(%rdi),	%r10d;\
> +	add	k+4+round(%rdi),%r11d;\
> +	xor	%r10d,		c ## D;\
> +	xor	%r11d,		d ## D;\
> +	ror	$1,		c ## D
>
> And you could eliminate one more instruction by re-ordering the a-side
> S box lookups to do the "high half" lookups first, and then merging the
> resultant leading ror $16 with the trailing ror $1:
> (Note that this changes the required loop setup slightly.)
>
> +#define\
> + encrypt_round(a,b,c,d,round)\
> +	rol	$15,		a ## D;\
> +	movzx	b ## B,		%r9d;\
> +	mov     s1(%rdi,%r9,4),	%r11d;\
> +	movzx	a ## B,		%r9d;\
> +	mov	s2(%rdi,%r9,4),	%r10d;\
> +	movzx	b ## H,		%r9d;\
> +	xor	s2(%rdi,%r9,4),	%r11d;\
> +	ror	$16,		b ## D;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s3(%rdi,%r9,4),	%r10d;\
> +	ror	$16,		a ## D;\
> +	movzx	b ## B,		%r9d;\
> +	xor	s3(%rdi,%r9,4),	%r11d;\
> +	movzx	a ## B,		%r9d;\
> +	xor     s0(%rdi,%r9,4),	%r10d;\
> +	movzx	b ## H,		%r9d;\
> +	xor     s0(%rdi,%r9,4),	%r11d;\
> +	ror	$15,		b ## D;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s1(%rdi,%r9,4),	%r10d;\
> +	add	%r11d,		%r10d;\
> +	add	%r10d,		%r11d;\
> +	add	k+round(%rdi),	%r10d;\
> +	add	k+4+round(%rdi),%r11d;\
> +	xor	%r10d,		c ## D;\
> +	xor	%r11d,		d ## D
Very neat. I think I will run with this idea. It needs some fixing for the REX
thing and some modifications in the first and last rounds.
> I haven't looked at the x86_32 code to see how many of these ideas
> could be adapted there.  Unfortunately, even with the reduction, this
> still uses 8 registers, one more than possible on x86_32.
Yes. I think I have some ideas on how to fix it up a bit.
> Probably the best thing to do there would be to de-interleave the
> a->%r10d and b->%r11d computations and spill (push/pop) the
> necessary register around the second block.  Something like:
>
> ctx in %edi
> %ebp and %esi are temps
>
> +#define\
> + encrypt_round(a,b,c,d,round)\
> +	rol	$15,		a ## D;\
> +	movzx	b ## B,		%esi;\
> +	mov     s1(%edi,%esi,4),%ebp;\
> +	movzx	b ## H,		%esi;\
> +	xor	s2(%edi,%esi,4),%ebp;\
> +	ror	$16,		b ## D;\
> +	movzx	b ## B,		%esi;\
> +	xor	s3(%edi,%esi,4),%ebp;\
> +	movzx	b ## H,		%esi;\
> +	xor     s0(%edi,%esi,4),%ebp;\
> +	ror	$15,		b ## D;\
> +	push	%ebp;\
> +	movzx	a ## B,		%esi;\
> +	mov	s2(%edi,%esi,4),%ebp;\
> +	movzx	a ## H,		%esi;\
> +	xor     s3(%edi,%esi,4),%ebp;\
> +	ror	$16,		a ## D;\
> +	movzx	a ## B,		%esi;\
> +	xor     s0(%edi,%esi,4),%ebp;\
> +	movzx	a ## H,		%esi;\
> +	xor     s1(%edi,%esi,4),%ebp;\
> +	pop	%esi;\
> +	add	%esi,		%ebp;\
> +	add	%ebp,		%esi;\
> +	add	k+round(%edi),	%ebp;\
> +	add	k+4+round(%edi),%esi;\
> +	xor	%ebp,		c ## D;\
> +	xor	%esi,		d ## D
>
> (Legalese: These code changes are in the public domain.  All of the code
> modifications presented here are simply the straightforward execution
> of the (uncopyrightable) ideas presented in the text, and therefore
> not protectable.  The only "creative" portions are the comment, the use
> of the variable names "c" and "d", and the choice of registers in the
> 32-bit code, for which copyright is abandoned.)
>
>
> An even bigger hack would be to rearrange the context structure to
> have the key first, then interleave the s0 and s1 boxes and use the
> (%rdi,%r9,8) addressing mode to access them.  That would, if you
> pre-offset %rdi a little bit so the key was at a negative offset,
> allow you to use a byte-offset addressing mode on 6 of the 10 loads in
> each round.  (Compared to 2 of 10 without.)
>
> In fact, on x86_64, you could go to the extreme of dedicating a register
> to point to the base of each of the S-boxes, so there is no displacement
> byte in the opcode at all.  That has to help the I-cache and the
> instruction decoders enough to pay for the additional setup instructions.
I already had that in mind and since you freed up some registers that is 
finally possible.

Seems like I have a lot of work ahead :D
Thanks for your very valuable comments.

-Joachim


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH  3/4] Twofish cipher - i586 assembler
  2006-06-08 17:35 [PATCH 4/4] Twofish cipher - x86_64 assembler Joachim Fritschi
@ 2006-06-16 17:29 ` linux
  2006-06-16 23:22   ` Joachim Fritschi
  0 siblings, 1 reply; 9+ messages in thread
From: linux @ 2006-06-16 17:29 UTC (permalink / raw)
  To: jfritschi, linux; +Cc: linux-kernel

Nice push/pop design!

A couple of questions:

1) Would it be worth moving encrypt_round's pop %edi earlier, like
   encrypt_first_round does?  Scheduling loads as early as possible is
   just good general principles.

2) Is it really worth having special first & last round definitions?

   encrypt_first_round just has one more instruction than encrypt_round
   (ror $16,%eax) that could be moved to the pre-round setup, thereby
   eliminating the entire encrypt_first_round macro.

   And the only difference in encrypt_last_round is the absence of a
   "push b ### D" that could be delayed until the end of the macro and
   moved into the start of the next encrypt_round.

   Oh... and a change from "rol $15, c ## D" to "ror $1, c ## D".
   It might be worth living with the single extra instruction for
   the code simplicity.

   Then you'd have a single encrypt_round of:

/*
a input register containing a (prerotated 16 bits)
b input register containing b
c input register containing c
d input register containing d (prerotated 1 bit left)
operations on a and b are interleaved to increase performance
*/
#define encrypt_round(a,b,c,d,round)\
push	d ## D;\
movzx	b ## B,		%edi;\
mov	s1(%ebp,%edi,4),d ## D;\
movzx	a ## B,		%edi;\
mov	s2(%ebp,%edi,4),%esi;\
movzx	b ## H,		%edi;\
ror	$16,		b ## D;\
xor	s2(%ebp,%edi,4),d ## D;\
movzx	a ## H,		%edi;\
ror	$16,		a ## D;\
xor	s3(%ebp,%edi,4),%esi;\
movzx	b ## B,		%edi;\
xor	s3(%ebp,%edi,4),d ## D;\
movzx	a ## B,		%edi;\
xor	(%ebp,%edi,4),	%esi;\
movzx	b ## H,		%edi;\
ror	$15,		b ## D;\
xor	(%ebp,%edi,4),	d ## D;\
movzx	a ## H,		%edi;\
xor	s1(%ebp,%edi,4),%esi;\
pop	%edi;\
add	d ## D,		%esi;\
add	%esi,		d ## D;\
add	k+round(%ebp),	%esi;\
xor	%esi,		c ## D;\
rol	$15,		c ## D;\
add	k+4+round(%ebp),d ## D;\
xor	%edi,		d ## D;

which would be called by:
twofish_enc_blk:
	push	%ebp			/* save registers according to calling convention*/
	push    %edi
	push    %ebx
	push    %esi			
		
	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base bointer to the crypto ctx */
	mov     in_blk+16(%esp),%edi	/* input adress in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	input_whitening(%eax,%ebp,a_offset)
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	rol	$16,		%eax

	encrypt_round(R0,R1,R2,R3,0)
	encrypt_round(R2,R3,R0,R1,8)
	encrypt_round(R0,R1,R2,R3,2*8)
	encrypt_round(R2,R3,R0,R1,3*8)
	encrypt_round(R0,R1,R2,R3,4*8)
	encrypt_round(R2,R3,R0,R1,5*8)
	encrypt_round(R0,R1,R2,R3,6*8)
	encrypt_round(R2,R3,R0,R1,7*8)
	encrypt_round(R0,R1,R2,R3,8*8)
	encrypt_round(R2,R3,R0,R1,9*8)
	encrypt_round(R0,R1,R2,R3,10*8)
	encrypt_round(R2,R3,R0,R1,11*8)
	encrypt_round(R0,R1,R2,R3,12*8)
	encrypt_round(R2,R3,R0,R1,13*8)
	encrypt_round(R0,R1,R2,R3,14*8)
	encrypt_round(R2,R3,R0,R1,15*8)

	rol	$16,		%ecx
	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)

	mov	out_blk+16(%esp),%edi;
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	ret

I'm also trying to figure out why the encrypt_round and decrypt_round
macros are different.  Normally, a Feistel cipher just requires that
the round subkeys be reversed to reverse the cipher; the F function is
unmodified.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH  3/4] Twofish cipher - i586 assembler
  2006-06-16 17:29 ` [PATCH 3/4] Twofish cipher - i586 assembler linux
@ 2006-06-16 23:22   ` Joachim Fritschi
  0 siblings, 0 replies; 9+ messages in thread
From: Joachim Fritschi @ 2006-06-16 23:22 UTC (permalink / raw)
  To: linux; +Cc: linux-kernel

On Friday 16 June 2006 19:29, linux@horizon.com wrote:
> Nice push/pop design!
>
> A couple of questions:
>
> 1) Would it be worth moving encrypt_round's pop %edi earlier, like
>    encrypt_first_round does?  Scheduling loads as early as possible is
>    just good general principles.
I guess I missed that :/. Will be fixed.
>
> 2) Is it really worth having special first & last round definitions?
>
>    encrypt_first_round just has one more instruction than encrypt_round
>    (ror $16,%eax) that could be moved to the pre-round setup, thereby
>    eliminating the entire encrypt_first_round macro.
Good idea.
>
>    And the only difference in encrypt_last_round is the absence of a
>    "push b ### D" that could be delayed until the end of the macro and
>    moved into the start of the next encrypt_round.
>
>    Oh... and a change from "rol $15, c ## D" to "ror $1, c ## D".
>    It might be worth living with the single extra instruction for
>    the code simplicity.
There are 2 rotate changes (you missed "ror $15, b ## D;" to "ror $16, b ## D;").
That's 2 instructions (expensive ones) vs. a little more simplicity in the
code. Not worth it imho, since this patch is aimed at maximum performance and
adding 2 workarounds won't make it much simpler to understand, it would just
reduce the patch size a little.
>    Then you'd have a single encrypt_round of:
>
> /*
> a input register containing a (prerotated 16 bits)
> b input register containing b
> c input register containing c
> d input register containing d (prerotated 1 bit left)
> operations on a and b are interleaved to increase performance
> */
> #define encrypt_round(a,b,c,d,round)\
> push	d ## D;\
> movzx	b ## B,		%edi;\
> mov	s1(%ebp,%edi,4),d ## D;\
> movzx	a ## B,		%edi;\
> mov	s2(%ebp,%edi,4),%esi;\
> movzx	b ## H,		%edi;\
> ror	$16,		b ## D;\
> xor	s2(%ebp,%edi,4),d ## D;\
> movzx	a ## H,		%edi;\
> ror	$16,		a ## D;\
> xor	s3(%ebp,%edi,4),%esi;\
> movzx	b ## B,		%edi;\
> xor	s3(%ebp,%edi,4),d ## D;\
> movzx	a ## B,		%edi;\
> xor	(%ebp,%edi,4),	%esi;\
> movzx	b ## H,		%edi;\
> ror	$15,		b ## D;\
> xor	(%ebp,%edi,4),	d ## D;\
> movzx	a ## H,		%edi;\
> xor	s1(%ebp,%edi,4),%esi;\
> pop	%edi;\
> add	d ## D,		%esi;\
> add	%esi,		d ## D;\
> add	k+round(%ebp),	%esi;\
> xor	%esi,		c ## D;\
> rol	$15,		c ## D;\
> add	k+4+round(%ebp),d ## D;\
> xor	%edi,		d ## D;
>
> which would be called by:
> twofish_enc_blk:
> 	push	%ebp			/* save registers according to calling convention*/
> 	push    %edi
> 	push    %ebx
> 	push    %esi
>
> 	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base bointer to the crypto ctx */
> 	mov     in_blk+16(%esp),%edi	/* input adress in edi */
>
> 	mov	(%edi),		%eax
> 	mov	b_offset(%edi),	%ebx
> 	mov	c_offset(%edi),	%ecx
> 	mov	d_offset(%edi),	%edx
> 	input_whitening(%eax,%ebp,a_offset)
> 	input_whitening(%ebx,%ebp,b_offset)
> 	input_whitening(%ecx,%ebp,c_offset)
> 	input_whitening(%edx,%ebp,d_offset)
> 	rol	$16,		%eax
>
> 	encrypt_round(R0,R1,R2,R3,0)
> 	encrypt_round(R2,R3,R0,R1,8)
> 	encrypt_round(R0,R1,R2,R3,2*8)
> 	encrypt_round(R2,R3,R0,R1,3*8)
> 	encrypt_round(R0,R1,R2,R3,4*8)
> 	encrypt_round(R2,R3,R0,R1,5*8)
> 	encrypt_round(R0,R1,R2,R3,6*8)
> 	encrypt_round(R2,R3,R0,R1,7*8)
> 	encrypt_round(R0,R1,R2,R3,8*8)
> 	encrypt_round(R2,R3,R0,R1,9*8)
> 	encrypt_round(R0,R1,R2,R3,10*8)
> 	encrypt_round(R2,R3,R0,R1,11*8)
> 	encrypt_round(R0,R1,R2,R3,12*8)
> 	encrypt_round(R2,R3,R0,R1,13*8)
> 	encrypt_round(R0,R1,R2,R3,14*8)
> 	encrypt_round(R2,R3,R0,R1,15*8)
>
> 	rol	$16,		%ecx
> 	output_whitening(%eax,%ebp,c_offset)
> 	output_whitening(%ebx,%ebp,d_offset)
> 	output_whitening(%ecx,%ebp,a_offset)
> 	output_whitening(%edx,%ebp,b_offset)
>
> 	mov	out_blk+16(%esp),%edi;
> 	mov	%ecx,		(%edi)
> 	mov	%edx,		b_offset(%edi)
> 	mov	%eax,		c_offset(%edi)
> 	mov	%ebx,		d_offset(%edi)
>
> 	pop	%edi
> 	pop	%esi
> 	pop	%ebx
> 	pop	%ebp
> 	mov	$1,	%eax
> 	ret
>
> I'm also trying to figure out why the encrypt_round and decrypt_round
> macros are different.  Normally, a Feistel cipher just requires that
> the round subkeys be reversed to reverse the cipher; the F function is
> unmodified.
The rotates (1 left and 1 right) at the end of the round are exchanged, while
the sbox lookups and round keys stay the same. This makes reuse of the
code impossible.



^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2006-06-19 14:13 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-06-04 13:16 [PATCH 3/4] Twofish cipher - i586 assembler Joachim Fritschi
2006-06-04 22:49 ` Horst von Brand
2006-06-05 10:47   ` Joachim Fritschi
2006-06-07 19:38 ` Joachim Fritschi
2006-06-16 11:59   ` Joachim Fritschi
2006-06-17 10:30 ` Joachim Fritschi
2006-06-19 14:12 ` Joachim Fritschi
  -- strict thread matches above, loose matches on Subject: below --
2006-06-08 17:35 [PATCH 4/4] Twofish cipher - x86_64 assembler Joachim Fritschi
2006-06-16 17:29 ` [PATCH 3/4] Twofish cipher - i586 assembler linux
2006-06-16 23:22   ` Joachim Fritschi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox