[PATCH 4/4] Twofish cipher - x86

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH  4/4] Twofish cipher - x86_64 assembler
@ 2006-06-04 13:16 Joachim Fritschi
  2006-06-04 19:10 ` Andi Kleen
                   ` (4 more replies)
  0 siblings, 5 replies; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-04 13:16 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

This patch adds the twofish x86_64 assembler routine. 

Changes since last version:
- The keysetup is now handled by the twofish_common.c (see patch 1 )
- The last round of the encrypt/decrypt routines where optimized saving 5 
instructions. 

Correctness was verified with the tcrypt module and automated test scripts.

Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/Makefile 
linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/Makefile
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/Makefile	2006-05-30 
19:58:05.172677025 +0200
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/Makefile	2006-05-31 
11:56:48.239053258 +0200
@@ -5,5 +5,8 @@
 #

 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o

 aes-x86_64-y := aes-x86_64-asm.o aes.o
+twofish-x86_64-y := twofish-x86_64-asm.o 
twofish.o ../../../crypto/twofish_common.o
+
diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish.c 
linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish.c
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish.c	1970-01-01 
01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish.c	2006-05-31 
11:55:51.492767729 +0200
@@ -0,0 +1,86 @@
+/*
+ * Glue Code for optimized x86_64 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+asmlinkage void twofish_enc_blk(void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(void *ctx, u8 *dst, const u8 *src);
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-x86_64",
+	.cra_priority       =	200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish-x86_64-asm.S 
linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish-x86_64-asm.S
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish-x86_64-asm.S	
1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish-x86_64-asm.S	
2006-05-31 11:58:05.204726048 +0200
@@ -0,0 +1,400 @@
+	/***************************************************************************
+	*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+	*                                                                         *
+	*   This program is free software; you can redistribute it and/or modify  *
+	*   it under the terms of the GNU General Public License as published by  *
+	*   the Free Software Foundation; either version 2 of the License, or     *
+	*   (at your option) any later version.                                   *
+	*                                                                         *
+	*   This program is distributed in the hope that it will be useful,       *
+	*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+	*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+	*   GNU General Public License for more details.                          *
+	*                                                                         *
+	*   You should have received a copy of the GNU General Public License     *
+	*   along with this program; if not, write to the                         *
+	*   Free Software Foundation, Inc.,                                       *
+	*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+	***************************************************************************/
+
+.file "twofish-x86_64-asm.S"
+.text
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+/* Defining a few register aliases for better reading */
+
+#define R0     %rax
+#define R0D    %eax
+#define R0W    %ax
+#define R0B    %al
+#define R0H    %ah
+
+#define R1     %rbx
+#define R1D    %ebx
+#define R1W    %bx
+#define R1B    %bl
+#define R1H    %bh
+
+#define R2     %rcx
+#define R2D    %ecx
+#define R2W    %cx
+#define R2B    %cl
+#define R2H    %ch
+
+#define R3     %rdx
+#define R3D    %edx
+#define R3W    %dx
+#define R3B    %dl
+#define R3H    %dh
+
+#define R4     %rsi
+#define R4D    %esi
+#define R4W    %six
+#define R4B    %sil
+
+#define R5     %rdi
+#define R5D    %edi
+#define R5W    %dix
+#define R5B    %dil
+
+#define R6     %rsp
+#define R6D    %esp
+#define R6W    %spx
+#define R6B    %spl
+
+#define R7     %rbp
+#define R7D    %ebp
+#define R7W    %bpx
+#define R7B    %bpl
+
+#define R8     %r8
+#define R8D    %r8d
+#define R8W    %r8w
+#define R8B    %r8b
+
+#define R9     %r9
+#define R9D    %r9d
+#define R9W    %r9w
+#define R9B    %r9b
+
+#define R10     %r10
+#define R10D    %r10d
+#define R10W    %r10w
+#define R10B    %r10b
+
+#define R11     %r11
+#define R11D    %r11d
+#define R11W    %r11w
+#define R11B    %r11b
+
+#define R12     %r12
+#define R12D    %r12d
+#define R12W    %r12w
+#define R12B    %r12b
+
+#define R13     %r13
+#define R13D    %r13d
+#define R13W    %r13w
+#define R13B    %r13b
+
+#define R14     %r14
+#define R14D    %r14d
+#define R14W    %r14w
+#define R14B    %r14b
+
+#define R15     %r15
+#define R15D    %r15d
+#define R15W    %r15w
+#define R15B    %r15b
+
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(context), src;\
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+	xor	w+16+offset(context),src;\
+
+/* load sbox values */
+#define load_s(context,sbox,index,dst)\
+	xor	sbox(context,index,4),dst ## D;\
+
+/* load both round keys */
+#define load_round_key(dsta,dstb,context,round)\
+	mov	k+round(context),dsta ## D;\
+	mov	k+4+round(context),dstb ## D;
+
+
+#define encrypt_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2);
\
+	xor	tmp1,tmp1;\
+	load_round_key(key1,key2,ctx,round);\
+	movzx	a ## B,newa;\
+	movzx	a ## H,newb ## D;\
+	ror	$16,a ## D;\
+	load_s(ctx,s0,newa,tmp1);\
+	load_s(ctx,s1,newb,tmp1);\
+	movzx	a ## B,newa;\
+	movzx	a ## H,newb ## D;\
+	load_s(ctx,s2,newa,tmp1);\
+	load_s(ctx,s3,newb,tmp1);\
+	ror	$16,a ## D;\
+	xor	tmp2,tmp2;\
+	movzx	b ## B,newa;\
+	movzx	b ## H,newb ## D;\
+	ror	$16,b ## D;\
+	load_s(ctx,s1,newa,tmp2);\
+	load_s(ctx,s2,newb,tmp2);\
+	movzx	b ## B,newa;\
+	movzx	b ## H,newb ## D;\
+	load_s(ctx,s3,newa,tmp2);\
+	load_s(ctx,s0,newb,tmp2);\
+	ror	$15,b ## D;\
+	add	tmp2 ## D,tmp1 ## D;\
+	add	tmp1 ## D,tmp2 ## D;\
+	add	tmp1 ## D,key1 ## D;\
+	add	tmp2 ## D,key2 ## D;\
+	mov	olda  ## D,newa ## D;\
+	mov	oldb ## D,newb ## D;\
+	mov	a ## D,olda ## D;\
+	mov	b ## D,oldb ## D;\
+	xor	key1 ## D,newa ## D;\
+	xor	key2 ## D,newb ## D;\
+	ror	$1,newa ## D
+
+/* Last Round can ignore saving a,b for the next round */
+
+#define 
encrypt_last_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2);\
+	xor	tmp1,tmp1;\
+	load_round_key(key1,key2,ctx,round);\
+	movzx	a ## B,newa;\
+	movzx	a ## H,newb ## D;\
+	ror	$16,a ## D;\
+	load_s(ctx,s0,newa,tmp1);\
+	load_s(ctx,s1,newb,tmp1);\
+	movzx	a ## B,newa;\
+	movzx	a ## H,newb ## D;\
+	load_s(ctx,s2,newa,tmp1);\
+	load_s(ctx,s3,newb,tmp1);\
+	xor	tmp2,tmp2;\
+	movzx	b ## B,newa;\
+	movzx	b ## H,newb ## D;\
+	ror	$16,b ## D;\
+	load_s(ctx,s1,newa,tmp2);\
+	load_s(ctx,s2,newb,tmp2);\
+	movzx	b ## B,newa;\
+	movzx	b ## H,newb ## D;\
+	load_s(ctx,s3,newa,tmp2);\
+	load_s(ctx,s0,newb,tmp2);\
+	add	tmp2 ## D,tmp1 ## D;\
+	add	tmp1 ## D,tmp2 ## D;\
+	add	tmp1 ## D,key1 ## D;\
+	add	tmp2 ## D,key2 ## D;\
+	mov	olda  ## D,newa ## D;\
+	mov	oldb ## D,newb ## D;\
+	xor	key1 ## D,newa ## D;\
+	xor	key2 ## D,newb ## D;\
+	ror	$1,newa ## D
+
+#define decrypt_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2);
\
+	xor	tmp1,tmp1;\
+	load_round_key(key1,key2,ctx,round);\
+	movzx	a ## B,newa;\
+	movzx	a ## H,newb ## D;\
+	ror	$16,a ## D;\
+	load_s(ctx,s0,newa,tmp1);\
+	load_s(ctx,s1,newb,tmp1);\
+	movzx	a ## B,newa;\
+	movzx	a ## H,newb ## D;\
+	load_s(ctx,s2,newa,tmp1);\
+	load_s(ctx,s3,newb,tmp1);\
+	ror	$15,a ## D;\
+	xor	tmp2,tmp2;\
+	movzx	b ## B,newa;\
+	movzx	b ## H,newb ## D;\
+	ror	$16,b ## D;\
+	load_s(ctx,s1,newa,tmp2);\
+	load_s(ctx,s2,newb,tmp2);\
+	movzx	b ## B,newa;\
+	movzx	b ## H,newb ## D;\
+	load_s(ctx,s3,newa,tmp2);\
+	load_s(ctx,s0,newb,tmp2);\
+	ror	$16,b ## D;\
+	add	tmp2 ## D,tmp1 ## D;\
+	add	tmp1 ## D,tmp2 ## D;\
+	add	tmp1 ## D,key1 ## D;\
+	add	tmp2 ## D,key2 ## D;\
+	mov	olda  ## D,newa ## D;\
+	mov	oldb ## D,newb ## D;\
+	mov	a ## D,olda ## D;\
+	mov	b ## D,oldb ## D;\
+	xor	key1 ## D,newa ## D;\
+	xor	key2 ## D,newb ## D;\
+	ror	$1,newb ## D
+
+/* Last Round can ignore saving a,b for the next round */
+
+#define 
decrypt_last_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2);\
+	xor	tmp1,tmp1;\
+	load_round_key(key1,key2,ctx,round);\
+	movzx	a ## B,newa;\
+	movzx	a ## H,newb ## D;\
+	ror	$16,a ## D;\
+	load_s(ctx,s0,newa,tmp1);\
+	load_s(ctx,s1,newb,tmp1);\
+	movzx	a ## B,newa;\
+	movzx	a ## H,newb ## D;\
+	load_s(ctx,s2,newa,tmp1);\
+	load_s(ctx,s3,newb,tmp1);\
+	xor	tmp2,tmp2;\
+	movzx	b ## B,newa;\
+	movzx	b ## H,newb ## D;\
+	ror	$16,b ## D;\
+	load_s(ctx,s1,newa,tmp2);\
+	load_s(ctx,s2,newb,tmp2);\
+	movzx	b ## B,newa;\
+	movzx	b ## H,newb ## D;\
+	load_s(ctx,s3,newa,tmp2);\
+	load_s(ctx,s0,newb,tmp2);\
+	add	tmp2 ## D,tmp1 ## D;\
+	add	tmp1 ## D,tmp2 ## D;\
+	add	tmp1 ## D,key1 ## D;\
+	add	tmp2 ## D,key2 ## D;\
+	mov	olda  ## D,newa ## D;\
+	mov	oldb ## D,newb ## D;\
+	xor	key1 ## D,newa ## D;\
+	xor	key2 ## D,newb ## D;\
+	ror	$1,newb ## D
+
+
+
+
+.align 8
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+
+
+twofish_enc_blk:
+	pushq    R1
+	pushq	 R12
+	pushq	 R13
+
+	/* r5 contains the crypto ctx adress */
+	/* r4 contains the output adress */
+	/* r3 contains the input adress */
+
+	movq	(R3),R1
+	movq	8(R3),R9
+	input_whitening(R1,R5,a_offset)
+	input_whitening(R9,R5,c_offset)
+	mov	R1D,R0D
+	shr	$32,R1
+	mov	R9D,R8D
+	shr	$32,R9
+	rol	$1,R9D
+
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,0,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,2*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,3*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,4*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,5*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,6*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,7*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,8*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,9*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,10*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,11*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,12*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,13*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,14*8,R10,R11,R12,R13);
+
+	mov	R3,R13
+	shl	$32,R13
+	xor	R2,R13
+	output_whitening(R13,R5,a_offset)
+	movq	R13,(R4)
+
+	encrypt_last_round(R2,R3,R8,R9,R0,R1,R5,15*8,R10,R11,R12,R13);
+
+
+	shl	$32,R1
+	xor	R0,R1
+
+	output_whitening(R1,R5,c_offset)
+	movq	R1,8(R4)
+
+	popq	R13
+	popq	R12
+	popq	R1
+	movq	$1,%rax
+	ret
+
+twofish_dec_blk:
+	pushq    R1
+	pushq	 R12
+	pushq	 R13
+
+	/* r5 contains the crypto ctx adress */
+	/* r4 contains the output adress */
+	/* r3 contains the input adress */
+
+	movq	(R3),R1
+	movq	8(R3),R9
+	output_whitening(R1,R5,a_offset)
+	output_whitening(R9,R5,c_offset)
+	mov	R1D,R0D
+	shr	$32,R1
+	mov	R9D,R8D
+	shr	$32,R9
+	rol	$1,R8D
+
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,15*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,14*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,13*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,12*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,11*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,10*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,9*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,8*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,7*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,6*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,5*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,4*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,3*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,2*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,8,R10,R11,R12,R13);
+
+	mov	R3,R13
+	shl	$32,R13
+	xor	R2,R13
+	input_whitening(R13,R5,a_offset)
+	movq	R13,(R4)
+
+	decrypt_last_round(R2,R3,R8,R9,R0,R1,R5,0,R10,R11,R12,R13);
+
+	shl	$32,R1
+	xor	R0,R1
+	input_whitening(R1,R5,c_offset)
+	movq	R1,8(R4)
+
+	popq	R13
+	popq	R12
+	popq	R1
+	movq	$1,%rax
+	ret
diff -uprN linux-2.6.17-rc5.twofish3/crypto/Kconfig 
linux-2.6.17-rc5.twofish4/crypto/Kconfig
--- linux-2.6.17-rc5.twofish3/crypto/Kconfig	2006-05-30 20:00:47.841035197 
+0200
+++ linux-2.6.17-rc5.twofish4/crypto/Kconfig	2006-05-31 11:52:43.234447029 
+0200
@@ -156,6 +156,20 @@ config CRYPTO_TWOFISH_586
 	  See also:
 	  <http://www.schneier.com/twofish.html>

+config CRYPTO_TWOFISH_X86_64
+        tristate "Twofish cipher algorithm (x86_64)"
+        depends on CRYPTO && ((X86 || UML_X86) && 64BIT)
+        help
+          Twofish cipher algorithm (x86_64).
+
+          Twofish was submitted as an AES (Advanced Encryption Standard)
+          candidate cipher by researchers at CounterPane Systems.  It is a
+          16 round block cipher supporting key sizes of 128, 192, and 256
+          bits.
+
+          See also:
+          <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-04 13:16 [PATCH 4/4] Twofish cipher - x86_64 assembler Joachim Fritschi
@ 2006-06-04 19:10 ` Andi Kleen
  2006-06-04 21:01   ` Dag Arne Osvik
  2006-06-05 10:06   ` Joachim Fritschi
  2006-06-07 19:38 ` Joachim Fritschi
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 20+ messages in thread
From: Andi Kleen @ 2006-06-04 19:10 UTC (permalink / raw)
  To: Joachim Fritschi; +Cc: linux-kernel, linux-crypto, herbert

On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
> This patch adds the twofish x86_64 assembler routine.
>
> Changes since last version:
> - The keysetup is now handled by the twofish_common.c (see patch 1 )
> - The last round of the encrypt/decrypt routines where optimized saving 5
> instructions.
>
> Correctness was verified with the tcrypt module and automated test scripts.

Do you have some benchmark numbers that show that it's actually worth
it?

> +/* Defining a few register aliases for better reading */

Maybe you can read it now better, but for everybody else it is extremly 
confusing. It would be better if you just used the original register names.

-andi

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-04 19:10 ` Andi Kleen
@ 2006-06-04 21:01   ` Dag Arne Osvik
  2006-06-05 10:18     ` Joachim Fritschi
  2006-06-05 10:06   ` Joachim Fritschi
  1 sibling, 1 reply; 20+ messages in thread
From: Dag Arne Osvik @ 2006-06-04 21:01 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Joachim Fritschi, linux-kernel, linux-crypto, herbert

Andi Kleen wrote:
> On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
>> This patch adds the twofish x86_64 assembler routine.

>> +/* Defining a few register aliases for better reading */
> 
> Maybe you can read it now better, but for everybody else it is extremly 
> confusing. It would be better if you just used the original register names.

I'd agree if you said this code could benefit from further readability
improvements.  But you're arguing against one.

Too bad AMD kept the old register names when defining AMD64..

-- 
  Dag Arne

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-04 21:01   ` Dag Arne Osvik
@ 2006-06-05 10:18     ` Joachim Fritschi
  2006-06-05 22:28       ` Dag Arne Osvik
  0 siblings, 1 reply; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-05 10:18 UTC (permalink / raw)
  To: Dag Arne Osvik; +Cc: linux-kernel, linux-crypto, herbert

On Sunday 04 June 2006 23:01, Dag Arne Osvik wrote:
> Andi Kleen wrote:
> > On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
> >> This patch adds the twofish x86_64 assembler routine.
> >>
> >> +/* Defining a few register aliases for better reading */
> >
> > Maybe you can read it now better, but for everybody else it is extremly
> > confusing. It would be better if you just used the original register
> > names.
>
> I'd agree if you said this code could benefit from further readability
> improvements.  But you're arguing against one.
>
> Too bad AMD kept the old register names when defining AMD64..

I'd agree that the original register names would only complicate things. 

Can you give me any hint what to improve or maybe provide a suggestion on how 
to improve the overall readabilty.

Thanks,

Joachim


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-05 10:18     ` Joachim Fritschi
@ 2006-06-05 22:28       ` Dag Arne Osvik
  2006-06-05 22:44         ` Dag Arne Osvik
  2006-06-07 19:16         ` Joachim Fritschi
  0 siblings, 2 replies; 20+ messages in thread
From: Dag Arne Osvik @ 2006-06-05 22:28 UTC (permalink / raw)
  To: Joachim Fritschi; +Cc: Dag Arne Osvik, linux-kernel, linux-crypto, herbert

Joachim Fritschi wrote:
> On Sunday 04 June 2006 23:01, Dag Arne Osvik wrote:
>> Andi Kleen wrote:
>>> On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
>>>> This patch adds the twofish x86_64 assembler routine.
>>>>
>>>> +/* Defining a few register aliases for better reading */
>>> Maybe you can read it now better, but for everybody else it is extremly
>>> confusing. It would be better if you just used the original register
>>> names.
>> I'd agree if you said this code could benefit from further readability
>> improvements.  But you're arguing against one.
>>
>> Too bad AMD kept the old register names when defining AMD64..
> 
> I'd agree that the original register names would only complicate things. 
> 
> Can you give me any hint what to improve or maybe provide a suggestion on how 
> to improve the overall readabilty.

It looks better on second reading, but I have some comments:

Remove load_s - it's needless and (slightly) confusing
There are some cases of missing ## D
Why semicolon after closing parenthesis in macro definitions?
Try to align operands in columns
It would be nice to have some explanation of macro parameter names

Btw, why do you keep zeroing tmp registers when you don't need to?
32-bit ops zero the top half of the destination register.

Here's an example of a modified macro (modulo linewrapping by my mail
client):

#define
encrypt_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2) \
        load_round_key(key1,key2,ctx,round);\
        movzx   a ## B,         newa ## D;\
        movzx   a ## H,         newb ## D;\
        ror     $16,            a    ## D;\
        xor     s0(ctx,newa,4), tmp1 ## D;\
        xor     s1(ctx,newb,4), tmp1 ## D;\
        movzx   a ## B,         newa ## D;\
        movzx   a ## H,         newb ## D;\
        xor     s2(ctx,newa,4), tmp1 ## D;\
        xor     s3(ctx,newb,4), tmp1 ## D;\
        ror     $16,            a    ## D;\
        movzx   b ## B,         newa ## D;\
        movzx   b ## H,         newb ## D;\
        ror     $16,            b    ## D;\
        xor     s1(ctx,newa,4), tmp2 ## D;\
        xor     s2(ctx,newb,4), tmp2 ## D;\
        movzx   b ## B,         newa ## D;\
        movzx   b ## H,         newb ## D;\
        xor     s3(ctx,newa,4), tmp2 ## D;\
        xor     s0(ctx,newb,4), tmp2 ## D;\
        ror     $15,            b    ## D;\
        add     tmp2 ## D,      tmp1 ## D;\
        add     tmp1 ## D,      tmp2 ## D;\
        add     tmp1 ## D,      key1 ## D;\
        add     tmp2 ## D,      key2 ## D;\
        mov     olda ## D,      newa ## D;\
        mov     oldb ## D,      newb ## D;\
        mov     a    ## D,      olda ## D;\
        mov     b    ## D,      oldb ## D;\
        xor     key1 ## D,      newa ## D;\
        xor     key2 ## D,      newb ## D;\
        ror     $1,             newa ## D

At least a little bit more readable, right?

-- 
  Dag Arne

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-05 22:28       ` Dag Arne Osvik
@ 2006-06-05 22:44         ` Dag Arne Osvik
  2006-06-07 19:16         ` Joachim Fritschi
  1 sibling, 0 replies; 20+ messages in thread
From: Dag Arne Osvik @ 2006-06-05 22:44 UTC (permalink / raw)
  To: Dag Arne Osvik; +Cc: Joachim Fritschi, linux-kernel, linux-crypto, herbert

Dag Arne Osvik wrote:
> Joachim Fritschi wrote:
>> On Sunday 04 June 2006 23:01, Dag Arne Osvik wrote:
>>> Andi Kleen wrote:
>>>> On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
>>>>> This patch adds the twofish x86_64 assembler routine.
>>>>>
>>>>> +/* Defining a few register aliases for better reading */
>>>> Maybe you can read it now better, but for everybody else it is extremly
>>>> confusing. It would be better if you just used the original register
>>>> names.
>>> I'd agree if you said this code could benefit from further readability
>>> improvements.  But you're arguing against one.
>>>
>>> Too bad AMD kept the old register names when defining AMD64..
>> I'd agree that the original register names would only complicate things. 
>>
>> Can you give me any hint what to improve or maybe provide a suggestion on how 
>> to improve the overall readabilty.
> 
> It looks better on second reading, but I have some comments:
> 
> Remove load_s - it's needless and (slightly) confusing
> There are some cases of missing ## D
> Why semicolon after closing parenthesis in macro definitions?
> Try to align operands in columns
> It would be nice to have some explanation of macro parameter names
> 
> Btw, why do you keep zeroing tmp registers when you don't need to?
> 32-bit ops zero the top half of the destination register.

Sorry.. that zeroing was of course due to load_s only doing xor..

> 
> Here's an example of a modified macro (modulo linewrapping by my mail
> client):
> 
> #define
> encrypt_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2) \
>         load_round_key(key1,key2,ctx,round);\
>         movzx   a ## B,         newa ## D;\
>         movzx   a ## H,         newb ## D;\
>         ror     $16,            a    ## D;\
>         xor     s0(ctx,newa,4), tmp1 ## D;\
          ^^^ change to mov
>         xor     s1(ctx,newb,4), tmp1 ## D;\
>         movzx   a ## B,         newa ## D;\
>         movzx   a ## H,         newb ## D;\
>         xor     s2(ctx,newa,4), tmp1 ## D;\
>         xor     s3(ctx,newb,4), tmp1 ## D;\
>         ror     $16,            a    ## D;\
>         movzx   b ## B,         newa ## D;\
>         movzx   b ## H,         newb ## D;\
>         ror     $16,            b    ## D;\
>         xor     s1(ctx,newa,4), tmp2 ## D;\
          ^^^ change to mov
>         xor     s2(ctx,newb,4), tmp2 ## D;\
>         movzx   b ## B,         newa ## D;\
>         movzx   b ## H,         newb ## D;\
>         xor     s3(ctx,newa,4), tmp2 ## D;\
>         xor     s0(ctx,newb,4), tmp2 ## D;\
>         ror     $15,            b    ## D;\
>         add     tmp2 ## D,      tmp1 ## D;\
>         add     tmp1 ## D,      tmp2 ## D;\
>         add     tmp1 ## D,      key1 ## D;\
>         add     tmp2 ## D,      key2 ## D;\
>         mov     olda ## D,      newa ## D;\
>         mov     oldb ## D,      newb ## D;\
>         mov     a    ## D,      olda ## D;\
>         mov     b    ## D,      oldb ## D;\
>         xor     key1 ## D,      newa ## D;\
>         xor     key2 ## D,      newb ## D;\
>         ror     $1,             newa ## D
> 
> At least a little bit more readable, right?
> 

-- 
  Dag Arne

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-05 22:28       ` Dag Arne Osvik
  2006-06-05 22:44         ` Dag Arne Osvik
@ 2006-06-07 19:16         ` Joachim Fritschi
  1 sibling, 0 replies; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-07 19:16 UTC (permalink / raw)
  To: Dag Arne Osvik; +Cc: linux-kernel, linux-crypto, herbert

On Tuesday 06 June 2006 00:28, Dag Arne Osvik wrote:
> Joachim Fritschi wrote:
> > Can you give me any hint what to improve or maybe provide a suggestion on how 
> > to improve the overall readabilty.
> 
> It looks better on second reading, but I have some comments:
> 
> Remove load_s - it's needless and (slightly) confusing
Done
> There are some cases of missing ## D
Oops, fixed.
> Why semicolon after closing parenthesis in macro definitions?
Braindamage on my part.
> Try to align operands in columns
Done
> It would be nice to have some explanation of macro parameter names
Done for the non trivial stuff
> Btw, why do you keep zeroing tmp registers when you don't need to?
> 32-bit ops zero the top half of the destination register.
Again braindamage :). Fixed it according to your suggestions.

Thanks for the comments. Will post a new patches in a few minutes.

-Joachim


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-04 19:10 ` Andi Kleen
  2006-06-04 21:01   ` Dag Arne Osvik
@ 2006-06-05 10:06   ` Joachim Fritschi
  2006-06-05 17:44     ` dean gaudet
  2006-06-05 23:35     ` Andi Kleen
  1 sibling, 2 replies; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-05 10:06 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-crypto, herbert

On Sunday 04 June 2006 21:10, Andi Kleen wrote:
> On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
> > This patch adds the twofish x86_64 assembler routine.
> >
> > Changes since last version:
> > - The keysetup is now handled by the twofish_common.c (see patch 1 )
> > - The last round of the encrypt/decrypt routines where optimized saving 5
> > instructions.
> >
> > Correctness was verified with the tcrypt module and automated test
> > scripts.
>
> Do you have some benchmark numbers that show that it's actually worth
> it?

Here are the outputs from the tcrypt speedtests. They haven't changed much 
since the last patch:

http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-c-i586.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-asm-i586.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-c-x86_64.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-asm-x86_64.txt

Summary for cycles used for CBC encrypt decrypt (256bit / 8k blocks) assembler 
vs. generic-c:

i586 encrypt:   - 17%
i568 decrypt:   -24%
x86_64 encrypt: -22%
x86_64 decrypt: -17%

The numbers vary a bit with different blocksizes / keylength and per test.

I also did some filesystem benchmarks (bonnie++) with various ciphers. Most 
write tests maxed out my drives writing to disk.  But at least for the read 
speed you can see some notable performance improvements:
(Note: The x86 and x86_64 numbers are not comparable since the tests were done 
on different machines)

http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060531_160442_x86.html

Summary:
Sequential read speed improved between 25-32%
Sequential write speed improved at least 15% but the disk maxed out
Twofish 256 is a little bit faster than AES 128

http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060601_113747_x86_64.html

Summary:
Sequential read speed improved 13%
Seqential write speed maxed out the drives

> > +/* Defining a few register aliases for better reading */
>
> Maybe you can read it now better, but for everybody else it is extremly
> confusing. It would be better if you just used the original register names.

The problem with the registers is that the original naming is really crappy in 
the first place. The lower registers use the old x86 naming scheme plus some 
extensions but the upper registers are totally different. Since i use a lot 
of the lower / higher 8bit and 32 bit parts it would be virtually impossible 
to write simple macros with this naming scheme because there would be no easy 
way to switch from a (unknown) register to its 8bit subregister or the 32bit 
in a macro. While there might be some way to do this, i have not found any 
example in the kernel or any other source code.
As an explanation i can only add that i looked at the aes assembler 
implementation and used it as a example. I know refering to bad coding style 
in the kernel is no excuse for more bad coding but it seems to me that it is 
the only way to deal with the insane original register naming.
Imho using the original names would only complicate the macros and complicate 
understanding the algorithm itself. For the algorithm itself the the actual 
registers are simply irrelevant . It calls no other functions or uses any 
syscalls. It only uses them as storage. That way reffering to them in a 
numbered order with a suffix for 8/16/32bit was an easy way to improve 
readability and easier programming.
There might be some way to further improve readability but i have not found 
any other way. I'm open to suggestions :)

Regards,

Joachim

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-05 10:06   ` Joachim Fritschi
@ 2006-06-05 17:44     ` dean gaudet
  2006-06-05 19:46       ` Joachim Fritschi
  2006-06-05 23:35     ` Andi Kleen
  1 sibling, 1 reply; 20+ messages in thread
From: dean gaudet @ 2006-06-05 17:44 UTC (permalink / raw)
  To: Joachim Fritschi; +Cc: Andi Kleen, linux-kernel, linux-crypto, herbert

On Mon, 5 Jun 2006, Joachim Fritschi wrote:

> Here are the outputs from the tcrypt speedtests. They haven't changed much 
> since the last patch:
> 
> http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-c-i586.txt
> http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-asm-i586.txt
> http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-c-x86_64.txt
> http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-asm-x86_64.txt

when you quote anything related to cpu performance on an x86 processor 
it's absolutely essential to indicate which cpu it is... basically 
vendor_id, cpu family, and model from /proc/cpuinfo.  (for example the 
entire p4 family has incredible model-to-model variation on things like 
shifts and extensions.)


> > > +/* Defining a few register aliases for better reading */
> >
> > Maybe you can read it now better, but for everybody else it is extremly
> > confusing. It would be better if you just used the original register names.

i'd change the comment to:

/* define a few register aliases to simplify macro substitution */

because as you mention, it's totally impossible to write the macros 
otherwise.  (i've used the same trick myself a bunch of times.)

-dean

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-05 17:44     ` dean gaudet
@ 2006-06-05 19:46       ` Joachim Fritschi
  0 siblings, 0 replies; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-05 19:46 UTC (permalink / raw)
  To: dean gaudet; +Cc: Andi Kleen, linux-kernel, linux-crypto, herbert

On Monday 05 June 2006 19:44, dean gaudet wrote:
> On Mon, 5 Jun 2006, Joachim Fritschi wrote:
> > Here are the outputs from the tcrypt speedtests. They haven't changed
> > much since the last patch:
> >
> > http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-c-i586.tx
> >t
> > http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-asm-i586.
> >txt
> > http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-c-x86_64.
> >txt
> > http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-asm-x86_6
> >4.txt
>
> when you quote anything related to cpu performance on an x86 processor
> it's absolutely essential to indicate which cpu it is... basically
> vendor_id, cpu family, and model from /proc/cpuinfo.  (for example the
> entire p4 family has incredible model-to-model variation on things like
> shifts and extensions.)

x86_64 benchmarks  where done on a:
vendor_id       : AuthenticAMD
cpu family      : 15
model           : 35
model name      : AMD Athlon(tm) 64 X2 Dual Core Processor 4400+
stepping        : 2
cpu MHz         : 2200.000
cache size      : 1024 KB
SMP was disabled

x86 benchmarks where done on a:
vendor_id       : AuthenticAMD
cpu family      : 6
model           : 8
model name      : AMD Athlon(tm) XP 2400+
stepping        : 1
cpu MHz         : 1991.695
cache size      : 256 KB

> > > > +/* Defining a few register aliases for better reading */
> > >
> > > Maybe you can read it now better, but for everybody else it is extremly
> > > confusing. It would be better if you just used the original register
> > > names.
>
> i'd change the comment to:
>
> /* define a few register aliases to simplify macro substitution */
>
> because as you mention, it's totally impossible to write the macros
> otherwise.  (i've used the same trick myself a bunch of times.)

Sounds ok to me. It was the main reason to use these aliases. For me it also 
improves readability but as author my view is probably a bit distorted ;)

Joachim


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-05 10:06   ` Joachim Fritschi
  2006-06-05 17:44     ` dean gaudet
@ 2006-06-05 23:35     ` Andi Kleen
  2006-06-07 19:21       ` Joachim Fritschi
  1 sibling, 1 reply; 20+ messages in thread
From: Andi Kleen @ 2006-06-05 23:35 UTC (permalink / raw)
  To: Joachim Fritschi; +Cc: linux-kernel, linux-crypto, herbert

On Monday 05 June 2006 12:06, Joachim Fritschi wrote:
> On Sunday 04 June 2006 21:10, Andi Kleen wrote:
> > On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
> > > This patch adds the twofish x86_64 assembler routine.
> > >
> > > Changes since last version:
> > > - The keysetup is now handled by the twofish_common.c (see patch 1 )
> > > - The last round of the encrypt/decrypt routines where optimized saving 5
> > > instructions.
> > >
> > > Correctness was verified with the tcrypt module and automated test
> > > scripts.
> >
> > Do you have some benchmark numbers that show that it's actually worth
> > it?
> 
> Here are the outputs from the tcrypt speedtests. They haven't changed much 
> since the last patch:

Ok thanks. I've tried to apply the patches, but can't because they're
word wrapped. Can you please resend and do a test send to yourself first,
checking that the patch can be really applied.


> There might be some way to further improve readability but i have not found 
> any other way. I'm open to suggestions :)


Sounds reasonable. Best you just fix the comment to say that this convention is needed 
for the macros.

-Andi

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-05 23:35     ` Andi Kleen
@ 2006-06-07 19:21       ` Joachim Fritschi
  0 siblings, 0 replies; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-07 19:21 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, linux-crypto, herbert

On Tuesday 06 June 2006 01:35, Andi Kleen wrote:
> On Monday 05 June 2006 12:06, Joachim Fritschi wrote:
> > On Sunday 04 June 2006 21:10, Andi Kleen wrote:
> > > On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
> > > > This patch adds the twofish x86_64 assembler routine.
> > > >
> > > > Changes since last version:
> > > > - The keysetup is now handled by the twofish_common.c (see patch 1 )
> > > > - The last round of the encrypt/decrypt routines where optimized saving 5
> > > > instructions.
> > > >
> > > > Correctness was verified with the tcrypt module and automated test
> > > > scripts.
> > >
> > > Do you have some benchmark numbers that show that it's actually worth
> > > it?
> > 
> > Here are the outputs from the tcrypt speedtests. They haven't changed much 
> > since the last patch:
> 
> Ok thanks. I've tried to apply the patches, but can't because they're
> word wrapped. Can you please resend and do a test send to yourself first,
> checking that the patch can be really applied.

Sorry i will send new patches, also including the fixes that were suggested by Dag.

-Joachim


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-04 13:16 [PATCH 4/4] Twofish cipher - x86_64 assembler Joachim Fritschi
  2006-06-04 19:10 ` Andi Kleen
@ 2006-06-07 19:38 ` Joachim Fritschi
  2006-06-16 12:00 ` Joachim Fritschi
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-07 19:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

On Sunday 04 June 2006 15:16, Joachim Fritschi wrote:
> This patch adds the twofish x86_64 assembler routine. 
> 
> Changes since last version:
> - The keysetup is now handled by the twofish_common.c (see patch 1 )
> - The last round of the encrypt/decrypt routines where optimized saving 5 
> instructions. 
> 
> Correctness was verified with the tcrypt module and automated test scripts.
My first mail was wordwrapped. This one should be unwrapped and working.
It also includes some readability fixes, additional comments and a little code cleanup.

Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/Makefile linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/Makefile
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/Makefile	2006-05-30 19:58:05.172677025 +0200
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/Makefile	2006-05-31 11:56:48.239053258 +0200
@@ -5,5 +5,8 @@
 #

 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o

 aes-x86_64-y := aes-x86_64-asm.o aes.o
+twofish-x86_64-y := twofish-x86_64-asm.o twofish.o ../../../crypto/twofish_common.o
+
diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish.c linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish.c
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish.c	2006-05-31 11:55:51.492767729 +0200
@@ -0,0 +1,86 @@
+/*
+ * Glue Code for optimized x86_64 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+asmlinkage void twofish_enc_blk(void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(void *ctx, u8 *dst, const u8 *src);
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-x86_64",
+	.cra_priority       =	200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish-x86_64-asm.S linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish-x86_64-asm.S
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish-x86_64-asm.S	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish-x86_64-asm.S	2006-06-06 10:19:11.462630535 +0200
@@ -0,0 +1,406 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+
+.file "twofish-x86_64-asm.S"
+.text
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0     %rax
+#define R0D    %eax
+#define R0W    %ax
+#define R0B    %al
+#define R0H    %ah
+
+#define R1     %rbx
+#define R1D    %ebx
+#define R1W    %bx
+#define R1B    %bl
+#define R1H    %bh
+
+#define R2     %rcx
+#define R2D    %ecx
+#define R2W    %cx
+#define R2B    %cl
+#define R2H    %ch
+
+#define R3     %rdx
+#define R3D    %edx
+#define R3W    %dx
+#define R3B    %dl
+#define R3H    %dh
+
+#define R4     %rsi
+#define R4D    %esi
+#define R4W    %six
+#define R4B    %sil
+
+#define R5     %rdi
+#define R5D    %edi
+#define R5W    %dix
+#define R5B    %dil
+
+#define R6     %rsp
+#define R6D    %esp
+#define R6W    %spx
+#define R6B    %spl
+
+#define R7     %rbp
+#define R7D    %ebp
+#define R7W    %bpx
+#define R7B    %bpl
+
+#define R8     %r8
+#define R8D    %r8d
+#define R8W    %r8w
+#define R8B    %r8b
+
+#define R9     %r9
+#define R9D    %r9d
+#define R9W    %r9w
+#define R9B    %r9b
+
+#define R10     %r10
+#define R10D    %r10d
+#define R10W    %r10w
+#define R10B    %r10b
+
+#define R11     %r11
+#define R11D    %r11d
+#define R11W    %r11w
+#define R11B    %r11b
+
+#define R12     %r12
+#define R12D    %r12d
+#define R12W    %r12w
+#define R12B    %r12b
+
+#define R13     %r13
+#define R13D    %r13d
+#define R13W    %r13w
+#define R13B    %r13b
+
+#define R14     %r14
+#define R14D    %r14d
+#define R14W    %r14w
+#define R14B    %r14b
+
+#define R15     %r15
+#define R15D    %r15d
+#define R15W    %r15w
+#define R15B    %r15b
+
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(context),	src;
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+	xor	w+16+offset(context),	src;
+
+/* load both round keys */
+#define load_round_key(dsta,dstb,context,round)\
+	mov	k+round(context),	dsta ## D;\
+	mov	k+4+round(context),	dstb ## D;
+
+/* Parameters for all encrypt/decrypt macros:
+a input register containing a
+b input register containing a
+olda register containing the a input of the last round
+oldb register containing the b input of the last round
+newa output register for a
+newb output register for b
+ctx  register cotaining the adress of the crypto context
+round roundoffset for loading the roundkeys
+tmp1 temporary register
+tmp2 temporary register
+key1 register for storing the a roundkey
+key2 register for storing the b roundkey
+*/
+
+#define\
+ encrypt_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2)\
+	load_round_key(key1,key2,ctx,round);\
+	movzx	a ## B,		newa ## D;\
+	movzx	a ## H,		newb ## D;\
+	ror	$16,		a ## D;\
+	mov	s0(ctx,newa,4),	tmp1 ## D;\
+	xor     s1(ctx,newb,4),	tmp1 ## D;\
+	movzx	a ## B,		newa ## D;\
+	movzx	a ## H,		newb ## D;\
+	xor     s2(ctx,newa,4),	tmp1 ## D;\
+	xor     s3(ctx,newb,4),	tmp1 ## D;\
+	ror	$16,		a ## D;\
+	movzx	b ## B,		newa ## D;\
+	movzx	b ## H,		newb ## D;\
+	ror	$16,		b ## D;\
+	mov     s1(ctx,newa,4),	tmp2 ## D;\
+	xor	s2(ctx,newb,4),	tmp2 ## D;\
+	movzx	b ## B,		newa ## D;\
+	movzx	b ## H,		newb ## D;\
+	xor	s3(ctx,newa,4),	tmp2 ## D;\
+	xor     s0(ctx,newb,4),	tmp2 ## D;\
+	ror	$15,		b ## D;\
+	add	tmp2 ## D,	tmp1 ## D;\
+	add	tmp1 ## D,	tmp2 ## D;\
+	add	tmp1 ## D,	key1 ## D;\
+	add	tmp2 ## D,	key2 ## D;\
+	mov	olda  ## D,	newa ## D;\
+	mov	oldb ## D,	newb ## D;\
+	mov	a ## D,		olda ## D;\
+	mov	b ## D,		oldb ## D;\
+	xor	key1 ## D,	newa ## D;\
+	xor	key2 ## D,	newb ## D;\
+	ror	$1,		newa ## D
+
+/* Last Round can ignore saving a,b for the next round */
+
+#define\
+ encrypt_last_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2)\
+	load_round_key(key1,key2,ctx,round);\
+	movzx	a ## B,		newa ## D;\
+	movzx	a ## H,		newb ## D;\
+	ror	$16,		a ## D;\
+	mov	s0(ctx,newa,4),	tmp1 ## D;\
+	xor     s1(ctx,newb,4),	tmp1 ## D;\
+	movzx	a ## B,		newa ## D;\
+	movzx	a ## H,		newb ## D;\
+	xor     s2(ctx,newa,4), tmp1 ## D;\
+	xor     s3(ctx,newb,4), tmp1 ## D;\
+	movzx	b ## B,		newa ## D;\
+	movzx	b ## H,		newb ## D;\
+	ror	$16,		b ## D;\
+	mov     s1(ctx,newa,4),	tmp2 ## D;\
+	xor	s2(ctx,newb,4),	tmp2 ## D;\
+	movzx	b ## B,		newa ## D;\
+	movzx	b ## H,		newb ## D;\
+	xor	s3(ctx,newa,4),	tmp2 ## D;\
+	xor     s0(ctx,newb,4),	tmp2 ## D;\
+	add	tmp2 ## D,	tmp1 ## D;\
+	add	tmp1 ## D,	tmp2 ## D;\
+	add	tmp1 ## D,	key1 ## D;\
+	add	tmp2 ## D,	key2 ## D;\
+	mov	olda  ## D,	newa ## D;\
+	mov	oldb ## D,	newb ## D;\
+	xor	key1 ## D,	newa ## D;\
+	xor	key2 ## D,	newb ## D;\
+	ror	$1,		newa ## D
+
+#define\
+ decrypt_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2)\
+	load_round_key(key1,key2,ctx,round);\
+	movzx	a ## B,		newa ## D;\
+	movzx	a ## H,		newb ## D;\
+	ror	$16,		a ## D;\
+	mov	s0(ctx,newa,4),	tmp1 ## D;\
+	xor	s1(ctx,newb,4),	tmp1 ## D;\
+	movzx	a ## B,		newa;\
+	movzx	a ## H,		newb ## D;\
+	xor	s2(ctx,newa,4),	tmp1 ## D;\
+	xor	s3(ctx,newb,4),	tmp1 ## D;\
+	ror	$15,		a ## D;\
+	movzx	b ## B,		newa ## D;\
+	movzx	b ## H,		newb ## D;\
+	ror	$16,		b ## D;\
+	mov	s1(ctx,newa,4),	tmp2 ## D;\
+	xor	s2(ctx,newb,4),	tmp2 ## D;\
+	movzx	b ## B,		newa ## D;\
+	movzx	b ## H,		newb ## D;\
+	xor	s3(ctx,newa,4),	tmp2 ## D;\
+	xor	s0(ctx,newb,4),	tmp2 ## D;\
+	ror	$16,		b ## D;\
+	add	tmp2 ## D,	tmp1 ## D;\
+	add	tmp1 ## D,	tmp2 ## D;\
+	add	tmp1 ## D,	key1 ## D;\
+	add	tmp2 ## D,	key2 ## D;\
+	mov	olda  ## D,	newa ## D;\
+	mov	oldb ## D,	newb ## D;\
+	mov	a ## D,		olda ## D;\
+	mov	b ## D,		oldb ## D;\
+	xor	key1 ## D,	newa ## D;\
+	xor	key2 ## D,	newb ## D;\
+	ror	$1,		newb ## D
+
+/* Last Round can ignore saving a,b for the next round */
+
+#define\
+ decrypt_last_round(a,b,olda,oldb,newa,newb,ctx,round,tmp1,tmp2,key1,key2)\
+	load_round_key(key1,key2,ctx,round);\
+	movzx	a ## B,		newa ## D;\
+	movzx	a ## H,		newb ## D;\
+	ror	$16,		a ## D;\
+	mov	s0(ctx,newa,4),	tmp1 ## D;\
+	xor	s1(ctx,newb,4),	tmp1 ## D;\
+	movzx	a ## B,		newa ## D;\
+	movzx	a ## H,		newb ## D;\
+	xor	s2(ctx,newa,4),	tmp1 ## D;\
+	xor	s3(ctx,newb,4),	tmp1 ## D;\
+	movzx	b ## B,		newa;\
+	movzx	b ## H,		newb ## D;\
+	ror	$16,b 		## D;\
+	mov	s1(ctx,newa,4),	tmp2 ## D;\
+	xor	s2(ctx,newb,4),	tmp2 ## D;\
+	movzx	b ## B,		newa ## D;\
+	movzx	b ## H,		newb ## D;\
+	xor	s3(ctx,newa,4),	tmp2 ## D;\
+	xor	s0(ctx,newb,4),	tmp2 ## D;\
+	add	tmp2 ## D,	tmp1 ## D;\
+	add	tmp1 ## D,	tmp2 ## D;\
+	add	tmp1 ## D,	key1 ## D;\
+	add	tmp2 ## D,	key2 ## D;\
+	mov	olda  ## D,	newa ## D;\
+	mov	oldb ## D,	newb ## D;\
+	xor	key1 ## D,	newa ## D;\
+	xor	key2 ## D,	newb ## D;\
+	ror	$1,		newb ## D
+
+
+
+
+.align 8
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+
+
+twofish_enc_blk:
+	pushq    R1
+	pushq	 R12
+	pushq	 R13
+
+	/* r5 contains the crypto ctx adress */
+	/* r4 contains the output adress */
+	/* r3 contains the input adress */
+
+	movq	(R3),	R1
+	movq	8(R3),	R9
+	input_whitening(R1,R5,a_offset)
+	input_whitening(R9,R5,c_offset)
+	mov	R1D,	R0D
+	shr	$32,	R1
+	mov	R9D,	R8D
+	shr	$32,	R9
+	rol	$1,	R9D
+
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,0,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,2*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,3*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,4*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,5*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,6*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,7*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,8*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,9*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,10*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,11*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,12*8,R10,R11,R12,R13);
+	encrypt_round(R2,R3,R8,R9,R0,R1,R5,13*8,R10,R11,R12,R13);
+	encrypt_round(R0,R1,R8,R9,R2,R3,R5,14*8,R10,R11,R12,R13);
+
+	mov	R3,	R13
+	shl	$32,	R13
+	xor	R2,	R13
+	output_whitening(R13,R5,a_offset)
+	movq	R13,	(R4)
+
+	encrypt_last_round(R2,R3,R8,R9,R0,R1,R5,15*8,R10,R11,R12,R13);
+
+
+	shl	$32,	R1
+	xor	R0,	R1
+
+	output_whitening(R1,R5,c_offset)
+	movq	R1,	8(R4)
+
+	popq	R13
+	popq	R12
+	popq	R1
+	movq	$1,%rax
+	ret
+
+twofish_dec_blk:
+	pushq    R1
+	pushq	 R12
+	pushq	 R13
+
+	/* r5 contains the crypto ctx adress */
+	/* r4 contains the output adress */
+	/* r3 contains the input adress */
+
+	movq	(R3),	R1
+	movq	8(R3),	R9
+	output_whitening(R1,R5,a_offset)
+	output_whitening(R9,R5,c_offset)
+	mov	R1D,	R0D
+	shr	$32,	R1
+	mov	R9D,	R8D
+	shr	$32,	R9
+	rol	$1,	R8D
+
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,15*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,14*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,13*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,12*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,11*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,10*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,9*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,8*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,7*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,6*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,5*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,4*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,3*8,R10,R11,R12,R13);
+	decrypt_round(R2,R3,R8,R9,R0,R1,R5,2*8,R10,R11,R12,R13);
+	decrypt_round(R0,R1,R8,R9,R2,R3,R5,8,R10,R11,R12,R13);
+
+	mov	R3,	R13
+	shl	$32,	R13
+	xor	R2,	R13
+	input_whitening(R13,R5,a_offset)
+	movq	R13,	(R4)
+
+	decrypt_last_round(R2,R3,R8,R9,R0,R1,R5,0,R10,R11,R12,R13);
+
+	shl	$32,	R1
+	xor	R0,	R1
+	input_whitening(R1,R5,c_offset)
+	movq	R1,	8(R4)
+
+	popq	R13
+	popq	R12
+	popq	R1
+	movq	$1,	%rax
+	ret
diff -uprN linux-2.6.17-rc5.twofish3/crypto/Kconfig linux-2.6.17-rc5.twofish4/crypto/Kconfig
--- linux-2.6.17-rc5.twofish3/crypto/Kconfig	2006-05-30 20:00:47.841035197 +0200
+++ linux-2.6.17-rc5.twofish4/crypto/Kconfig	2006-05-31 11:52:43.234447029 +0200
@@ -156,6 +156,20 @@ config CRYPTO_TWOFISH_586
 	  See also:
 	  <http://www.schneier.com/twofish.html>

+config CRYPTO_TWOFISH_X86_64
+        tristate "Twofish cipher algorithm (x86_64)"
+        depends on CRYPTO && ((X86 || UML_X86) && 64BIT)
+        help
+          Twofish cipher algorithm (x86_64).
+
+          Twofish was submitted as an AES (Advanced Encryption Standard)
+          candidate cipher by researchers at CounterPane Systems.  It is a
+          16 round block cipher supporting key sizes of 128, 192, and 256
+          bits.
+
+          See also:
+          <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-04 13:16 [PATCH 4/4] Twofish cipher - x86_64 assembler Joachim Fritschi
  2006-06-04 19:10 ` Andi Kleen
  2006-06-07 19:38 ` Joachim Fritschi
@ 2006-06-16 12:00 ` Joachim Fritschi
  2006-06-17 10:38 ` Joachim Fritschi
  2006-06-19 14:13 ` Joachim Fritschi
  4 siblings, 0 replies; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-16 12:00 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

Update patch for the x86_64 twofish assembler implementation.

Changes since last version:
-Updated to the new twofish_common setup
-Complete rewrite of the code according to the feedback i recieved 
(thanks linux@horizon.com)

The patch passed the trycpt tests and automated filesystem tests.
This rewrite resulted in some nice perfomance increase over my last patch.

Short summary of the tcrypt benchmarks:

Twofish Assembler vs. Twofish C (256bit 8kb block CBC)
encrypt: -27% Cycles
decrypt: -23% Cycles

Twofish Assembler vs. AES Assembler (128bit 8kb block CBC)
encrypt: +18%  Cycles
decrypt: +15% Cycles

Twofish Assembler vs. AES Assembler (256bit 8kb block CBC)
encrypt: -9% Cycles
decrypt: -8% Cycles

Full Output:
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-c-x86_64.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-twofish-asm-x86_64.txt
http://homepages.tu-darmstadt.de/~fritschi/twofish/tcrypt-speed-aes-asm-x86_64.txt


Here is another bonnie++ benchmark with encrypted filesystems. Most runs maxed
out the hd. It should give some idea what the module can do for encrypted filesystem
performance even though you can't see the full numbers.

http://homepages.tu-darmstadt.de/~fritschi/twofish/output_20060610_130806_x86_64.html


Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/Makefile linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/Makefile
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/Makefile	2006-06-11 16:03:17.716764337 +0200
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/Makefile	2006-06-11 16:11:41.279630413 +0200
@@ -5,5 +5,8 @@
 # 
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 
 aes-x86_64-y := aes-x86_64-asm.o aes.o
+twofish-x86_64-y := twofish-x86_64-asm.o twofish.o
+
diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish.c linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish.c
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish.c	2006-06-11 16:10:49.426288180 +0200
@@ -0,0 +1,86 @@
+/*
+ * Glue Code for optimized x86_64 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+asmlinkage void twofish_enc_blk(void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(void *ctx, u8 *dst, const u8 *src);
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-x86_64",
+	.cra_priority       =	200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish-x86_64-asm.S linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish-x86_64-asm.S
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish-x86_64-asm.S	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish-x86_64-asm.S	2006-06-11 21:50:17.303085147 +0200
@@ -0,0 +1,397 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+
+.file "twofish-x86_64-asm.S"
+.text
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0     %rax
+#define R0D    %eax
+#define R0B    %al
+#define R0H    %ah
+
+#define R1     %rbx
+#define R1D    %ebx
+#define R1B    %bl
+#define R1H    %bh
+
+#define R2     %rcx
+#define R2D    %ecx
+#define R2B    %cl
+#define R2H    %ch
+
+#define R3     %rdx
+#define R3D    %edx
+#define R3B    %dl
+#define R3H    %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(context),	src;
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+	xor	w+16+offset(context),	src;
+
+
+/*
+a input register containing a (rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+*/
+#define encrypt_round(a,b,c,d,round)\
+movzx	b ## B,		%edi;\
+mov	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+mov	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	(%r11,%rdi,4),	%r9d;\
+movzx	b ## H,		%edi;\
+ror	$15,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+xor	s1(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+rol	$15,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;
+
+
+/*
+a input register containing a
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+*/
+#define encrypt_first_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%r11,%rdi,4),	%r9d;\
+movzx	b ## B,		%edi;\
+mov	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	b ## B,		%edi;\
+xor	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$15,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+add	k+4+round(%r11),%r8d;\
+xor	%r9d,		c ## D;\
+rol	$15,		c ## D;\
+xor	%r8d,		d ## D;
+
+/*
+a input register containing a(rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+durimg the round a and b are prepared for the output whitening
+*/
+#define encrypt_last_round(a,b,c,d,round)\
+mov	b ## D,		%r10d;\
+shl	$32,		%r10;\
+movzx	b ## B,		%edi;\
+mov	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+mov	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	(%r11,%rdi,4),	%r9d;\
+xor	a,		%r10;\
+movzx	b ## H,		%edi;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+xor	s1(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+ror	$1,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D
+
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d input register containing d 
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%r11,%rdi,4),	%r9d;\
+movzx	b ## B,		%edi;\
+mov	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## B,		%edi;\
+xor	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$15,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+xor	s2(%r11,%rdi,4),%r8d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;\
+rol	$15,		d ## D;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;
+
+/*
+a input register containing a
+b input register containing b
+c input register containing c (already rol $1)
+d input register containing d
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_first_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%r11,%rdi,4),	%r9d;\
+movzx	b ## B,		%edi;\
+mov	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	b ## B,		%edi;\
+xor	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+ror	$15,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;\
+rol	$15,		d ## D;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;
+
+/*
+a input register containing a
+b input register containing b
+c input register containing c (already rol $1)
+d input register containing d
+operations on a and b are interleaved to increase performance
+durimg the round a and b are prepared for the output whitening
+@ */
+#define decrypt_last_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%r11,%rdi,4),	%r9d;\
+movzx	b ## B,		%edi;\
+mov	s3(%r11,%rdi,4),%r8d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+mov	b ## D,		%r10d;\
+shl	$32,		%r10;\
+xor	a,		%r10;\
+ror	$16,		a ## D;\
+xor	s1(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+xor	s3(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;\
+ror	$1,		d ## D;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;
+
+	
+	
+.align 8
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+
+
+twofish_enc_blk:
+	pushq    R1
+	
+	/* %rdi contains the crypto ctx adress */
+	/* %rsi contains the output adress */
+	/* %rdx contains the input adress */
+	
+	/* ctx adress is moved to free one non-rex register
+	as target for the 8bit high operations */
+	mov	%rdi,		%r11
+
+	movq	(R3),	R1
+	movq	8(R3),	R3
+	input_whitening(R1,%r11,a_offset)
+	input_whitening(R3,%r11,c_offset)
+	mov	R1D,	R0D
+	shr	$32,	R1
+	mov	R3D,	R2D
+	shr	$32,	R3
+	rol	$1,	R3D
+
+	encrypt_first_round(R0,R1,R2,R3,0);
+	encrypt_round(R2,R3,R0,R1,8);
+	encrypt_round(R0,R1,R2,R3,2*8);
+	encrypt_round(R2,R3,R0,R1,3*8);
+	encrypt_round(R0,R1,R2,R3,4*8);
+	encrypt_round(R2,R3,R0,R1,5*8);
+	encrypt_round(R0,R1,R2,R3,6*8);
+	encrypt_round(R2,R3,R0,R1,7*8);
+	encrypt_round(R0,R1,R2,R3,8*8);
+	encrypt_round(R2,R3,R0,R1,9*8);
+	encrypt_round(R0,R1,R2,R3,10*8);
+	encrypt_round(R2,R3,R0,R1,11*8);
+	encrypt_round(R0,R1,R2,R3,12*8);
+	encrypt_round(R2,R3,R0,R1,13*8);
+	encrypt_round(R0,R1,R2,R3,14*8);
+	encrypt_last_round(R2,R3,R0,R1,15*8);
+
+
+	output_whitening(%r10,%r11,a_offset)
+	movq	%r10,	(%rsi)
+
+	shl	$32,	R1
+	xor	R0,	R1
+
+	output_whitening(R1,%r11,c_offset)
+	movq	R1,	8(%rsi)
+
+	popq	R1
+	movq	$1,%rax
+	ret
+	
+twofish_dec_blk:	
+	pushq    R1
+	
+	/* %rdi contains the crypto ctx adress */
+	/* %rsi contains the output adress */
+	/* %rdx contains the input adress */
+	/* ctx adress is moved to free one non-rex register
+	as target for the 8bit high operations */
+	mov	%rdi,		%r11
+
+	movq	(R3),	R1
+	movq	8(R3),	R3
+	output_whitening(R1,%r11,a_offset)
+	output_whitening(R3,%r11,c_offset)
+	mov	R1D,	R0D
+	shr	$32,	R1
+	mov	R3D,	R2D
+	shr	$32,	R3
+	rol	$1,	R2D
+
+	decrypt_first_round(R0,R1,R2,R3,15*8);
+	decrypt_round(R2,R3,R0,R1,14*8);
+	decrypt_round(R0,R1,R2,R3,13*8);
+	decrypt_round(R2,R3,R0,R1,12*8);
+	decrypt_round(R0,R1,R2,R3,11*8);
+	decrypt_round(R2,R3,R0,R1,10*8);
+	decrypt_round(R0,R1,R2,R3,9*8);
+	decrypt_round(R2,R3,R0,R1,8*8);
+	decrypt_round(R0,R1,R2,R3,7*8);
+	decrypt_round(R2,R3,R0,R1,6*8);
+	decrypt_round(R0,R1,R2,R3,5*8);
+	decrypt_round(R2,R3,R0,R1,4*8);
+	decrypt_round(R0,R1,R2,R3,3*8);
+	decrypt_round(R2,R3,R0,R1,2*8);
+	decrypt_round(R0,R1,R2,R3,1*8);
+	decrypt_last_round(R2,R3,R0,R1,0);
+
+	input_whitening(%r10,%r11,a_offset)
+	movq	%r10,	(%rsi)
+
+	shl	$32,	R1
+	xor	R0,	R1
+
+	input_whitening(R1,%r11,c_offset)
+	movq	R1,	8(%rsi)
+
+	popq	R1
+	movq	$1,%rax
+	ret
diff -uprN linux-2.6.17-rc5.twofish3/crypto/Kconfig linux-2.6.17-rc5.twofish4/crypto/Kconfig
--- linux-2.6.17-rc5.twofish3/crypto/Kconfig	2006-06-11 16:05:19.938782275 +0200
+++ linux-2.6.17-rc5.twofish4/crypto/Kconfig	2006-06-11 16:11:17.126755733 +0200
@@ -165,6 +165,21 @@ config CRYPTO_TWOFISH_586
 	  See also:
 	  <http://www.schneier.com/twofish.html>
 
+config CRYPTO_TWOFISH_X86_64
+        tristate "Twofish cipher algorithm (x86_64)"
+        depends on CRYPTO && ((X86 || UML_X86) && 64BIT)
+        select CRYPTO_TWOFISH_COMMON
+	help
+          Twofish cipher algorithm (x86_64).
+
+          Twofish was submitted as an AES (Advanced Encryption Standard)
+          candidate cipher by researchers at CounterPane Systems.  It is a
+          16 round block cipher supporting key sizes of 128, 192, and 256
+          bits.
+
+          See also:
+          <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-04 13:16 [PATCH 4/4] Twofish cipher - x86_64 assembler Joachim Fritschi
                   ` (2 preceding siblings ...)
  2006-06-16 12:00 ` Joachim Fritschi
@ 2006-06-17 10:38 ` Joachim Fritschi
  2006-06-19 14:13 ` Joachim Fritschi
  4 siblings, 0 replies; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-17 10:38 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

After recieving some more feedback from linux@horizon.com, i have revised my
patch a bit and done some cosmetic changes. The first_round macros are now 
eliminated reducing the patchsize.

Correctness was verified with the tcrypt module and automated test scripts.
 
Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/Makefile linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/Makefile
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/Makefile	2006-06-11 16:03:17.716764337 +0200
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/Makefile	2006-06-11 16:11:41.279630413 +0200
@@ -5,5 +5,8 @@
 # 
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 
 aes-x86_64-y := aes-x86_64-asm.o aes.o
+twofish-x86_64-y := twofish-x86_64-asm.o twofish.o
+
diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish.c linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish.c
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish.c	2006-06-11 16:10:49.426288180 +0200
@@ -0,0 +1,86 @@
+/*
+ * Glue Code for optimized x86_64 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+asmlinkage void twofish_enc_blk(void *ctx, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(void *ctx, u8 *dst, const u8 *src);
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-x86_64",
+	.cra_priority       =	200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
diff -uprN linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish-x86_64-asm.S linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish-x86_64-asm.S
--- linux-2.6.17-rc5.twofish3/arch/x86_64/crypto/twofish-x86_64-asm.S	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.17-rc5.twofish4/arch/x86_64/crypto/twofish-x86_64-asm.S	2006-06-17 12:34:00.038152154 +0200
@@ -0,0 +1,321 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+
+.file "twofish-x86_64-asm.S"
+.text
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0     %rax
+#define R0D    %eax
+#define R0B    %al
+#define R0H    %ah
+
+#define R1     %rbx
+#define R1D    %ebx
+#define R1B    %bl
+#define R1H    %bh
+
+#define R2     %rcx
+#define R2D    %ecx
+#define R2B    %cl
+#define R2H    %ch
+
+#define R3     %rdx
+#define R3D    %edx
+#define R3B    %dl
+#define R3H    %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(context),	src;
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+	xor	w+16+offset(context),	src;
+
+
+/*
+a input register containing a (rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+*/
+#define encrypt_round(a,b,c,d,round)\
+movzx	b ## B,		%edi;\
+mov	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+mov	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	(%r11,%rdi,4),	%r9d;\
+movzx	b ## H,		%edi;\
+ror	$15,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+xor	s1(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+rol	$15,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;
+
+/*
+a input register containing a(rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+during the round a and b are prepared for the output whitening
+*/
+#define encrypt_last_round(a,b,c,d,round)\
+mov	b ## D,		%r10d;\
+shl	$32,		%r10;\
+movzx	b ## B,		%edi;\
+mov	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+mov	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	(%r11,%rdi,4),	%r9d;\
+xor	a,		%r10;\
+movzx	b ## H,		%edi;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+xor	s1(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+ror	$1,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d input register containing d 
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%r11,%rdi,4),	%r9d;\
+movzx	b ## B,		%edi;\
+mov	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## B,		%edi;\
+xor	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$15,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+xor	s2(%r11,%rdi,4),%r8d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;\
+rol	$15,		d ## D;
+
+/*
+a input register containing a
+b input register containing b
+c input register containing c (already rol $1)
+d input register containing d
+operations on a and b are interleaved to increase performance
+during the round a and b are prepared for the output whitening
+*/
+#define decrypt_last_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%r11,%rdi,4),	%r9d;\
+movzx	b ## B,		%edi;\
+mov	s3(%r11,%rdi,4),%r8d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+mov	b ## D,		%r10d;\
+shl	$32,		%r10;\
+xor	a,		%r10;\
+ror	$16,		a ## D;\
+xor	s1(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+xor	s3(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;\
+ror	$1,		d ## D;
+
+.align 8
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+twofish_enc_blk:
+	pushq    R1
+
+	/* %rdi contains the crypto ctx adress */
+	/* %rsi contains the output adress */
+	/* %rdx contains the input adress */
+
+	/* ctx adress is moved to free one non-rex register
+	as target for the 8bit high operations */
+	mov	%rdi,		%r11
+
+	movq	(R3),	R1
+	movq	8(R3),	R3
+	input_whitening(R1,%r11,a_offset)
+	input_whitening(R3,%r11,c_offset)
+	mov	R1D,	R0D
+	rol	$16,	R0D
+	shr	$32,	R1
+	mov	R3D,	R2D
+	shr	$32,	R3
+	rol	$1,	R3D
+
+	encrypt_round(R0,R1,R2,R3,0);
+	encrypt_round(R2,R3,R0,R1,8);
+	encrypt_round(R0,R1,R2,R3,2*8);
+	encrypt_round(R2,R3,R0,R1,3*8);
+	encrypt_round(R0,R1,R2,R3,4*8);
+	encrypt_round(R2,R3,R0,R1,5*8);
+	encrypt_round(R0,R1,R2,R3,6*8);
+	encrypt_round(R2,R3,R0,R1,7*8);
+	encrypt_round(R0,R1,R2,R3,8*8);
+	encrypt_round(R2,R3,R0,R1,9*8);
+	encrypt_round(R0,R1,R2,R3,10*8);
+	encrypt_round(R2,R3,R0,R1,11*8);
+	encrypt_round(R0,R1,R2,R3,12*8);
+	encrypt_round(R2,R3,R0,R1,13*8);
+	encrypt_round(R0,R1,R2,R3,14*8);
+	encrypt_last_round(R2,R3,R0,R1,15*8);
+
+
+	output_whitening(%r10,%r11,a_offset)
+	movq	%r10,	(%rsi)
+
+	shl	$32,	R1
+	xor	R0,	R1
+
+	output_whitening(R1,%r11,c_offset)
+	movq	R1,	8(%rsi)
+
+	popq	R1
+	movq	$1,%rax
+	ret
+	
+twofish_dec_blk:	
+	pushq    R1
+
+	/* %rdi contains the crypto ctx adress */
+	/* %rsi contains the output adress */
+	/* %rdx contains the input adress */
+	/* ctx adress is moved to free one non-rex register
+	as target for the 8bit high operations */
+	mov	%rdi,		%r11
+
+	movq	(R3),	R1
+	movq	8(R3),	R3
+	output_whitening(R1,%r11,a_offset)
+	output_whitening(R3,%r11,c_offset)
+	mov	R1D,	R0D
+	shr	$32,	R1
+	rol	$16,	R1D
+	mov	R3D,	R2D
+	shr	$32,	R3
+	rol	$1,	R2D
+
+	decrypt_round(R0,R1,R2,R3,15*8);
+	decrypt_round(R2,R3,R0,R1,14*8);
+	decrypt_round(R0,R1,R2,R3,13*8);
+	decrypt_round(R2,R3,R0,R1,12*8);
+	decrypt_round(R0,R1,R2,R3,11*8);
+	decrypt_round(R2,R3,R0,R1,10*8);
+	decrypt_round(R0,R1,R2,R3,9*8);
+	decrypt_round(R2,R3,R0,R1,8*8);
+	decrypt_round(R0,R1,R2,R3,7*8);
+	decrypt_round(R2,R3,R0,R1,6*8);
+	decrypt_round(R0,R1,R2,R3,5*8);
+	decrypt_round(R2,R3,R0,R1,4*8);
+	decrypt_round(R0,R1,R2,R3,3*8);
+	decrypt_round(R2,R3,R0,R1,2*8);
+	decrypt_round(R0,R1,R2,R3,1*8);
+	decrypt_last_round(R2,R3,R0,R1,0);
+
+	input_whitening(%r10,%r11,a_offset)
+	movq	%r10,	(%rsi)
+
+	shl	$32,	R1
+	xor	R0,	R1
+
+	input_whitening(R1,%r11,c_offset)
+	movq	R1,	8(%rsi)
+
+	popq	R1
+	movq	$1,%rax
+	ret
diff -uprN linux-2.6.17-rc5.twofish3/crypto/Kconfig linux-2.6.17-rc5.twofish4/crypto/Kconfig
--- linux-2.6.17-rc5.twofish3/crypto/Kconfig	2006-06-11 16:05:19.938782275 +0200
+++ linux-2.6.17-rc5.twofish4/crypto/Kconfig	2006-06-11 16:11:17.126755733 +0200
@@ -165,6 +165,21 @@ config CRYPTO_TWOFISH_586
 	  See also:
 	  <http://www.schneier.com/twofish.html>
 
+config CRYPTO_TWOFISH_X86_64
+        tristate "Twofish cipher algorithm (x86_64)"
+        depends on CRYPTO && ((X86 || UML_X86) && 64BIT)
+        select CRYPTO_TWOFISH_COMMON
+	help
+          Twofish cipher algorithm (x86_64).
+
+          Twofish was submitted as an AES (Advanced Encryption Standard)
+          candidate cipher by researchers at CounterPane Systems.  It is a
+          16 round block cipher supporting key sizes of 128, 192, and 256
+          bits.
+
+          See also:
+          <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-04 13:16 [PATCH 4/4] Twofish cipher - x86_64 assembler Joachim Fritschi
                   ` (3 preceding siblings ...)
  2006-06-17 10:38 ` Joachim Fritschi
@ 2006-06-19 14:13 ` Joachim Fritschi
  2006-06-20 11:14   ` Herbert Xu
  4 siblings, 1 reply; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-19 14:13 UTC (permalink / raw)
  To: linux-kernel; +Cc: linux-crypto, herbert, ak

This patch is now based on the cryptodev tree using the new cryptoapi (crypto  tfm
 instead of the crypto ctx as parameter).

The module passed the tcrypt tests and testscripts.

Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

 arch/x86_64/crypto/Makefile             |    3 
 arch/x86_64/crypto/twofish-x86_64-asm.S |  324 +++++++++++++++++++++++++++++++
 arch/x86_64/crypto/twofish.c            |   86 ++++++++
 crypto/Kconfig                          |   15 +
 4 files changed, 428 insertions(+), 0 deletions(-)

diff --git a/arch/x86_64/crypto/Makefile b/arch/x86_64/crypto/Makefile
index 426d20f..15b538a 100644
--- a/arch/x86_64/crypto/Makefile
+++ b/arch/x86_64/crypto/Makefile
@@ -5,5 +5,8 @@ # Arch-specific CryptoAPI modules.
 # 
 
 obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
+obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
 
 aes-x86_64-y := aes-x86_64-asm.o aes.o
+twofish-x86_64-y := twofish-x86_64-asm.o twofish.o
+
diff --git a/arch/x86_64/crypto/twofish-x86_64-asm.S b/arch/x86_64/crypto/twofish-x86_64-asm.S
new file mode 100644
index 0000000..5b694c0
--- /dev/null
+++ b/arch/x86_64/crypto/twofish-x86_64-asm.S
@@ -0,0 +1,324 @@
+/***************************************************************************
+*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
+*                                                                         *
+*   This program is free software; you can redistribute it and/or modify  *
+*   it under the terms of the GNU General Public License as published by  *
+*   the Free Software Foundation; either version 2 of the License, or     *
+*   (at your option) any later version.                                   *
+*                                                                         *
+*   This program is distributed in the hope that it will be useful,       *
+*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+*   GNU General Public License for more details.                          *
+*                                                                         *
+*   You should have received a copy of the GNU General Public License     *
+*   along with this program; if not, write to the                         *
+*   Free Software Foundation, Inc.,                                       *
+*   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+***************************************************************************/
+
+.file "twofish-x86_64-asm.S"
+.text
+
+#include <asm/asm-offsets.h>
+
+#define a_offset	0
+#define b_offset	4
+#define c_offset	8
+#define d_offset	12
+
+/* Structure of the crypto context struct*/
+
+#define s0	0	/* S0 Array 256 Words each */
+#define s1	1024	/* S1 Array */
+#define s2	2048	/* S2 Array */
+#define s3	3072	/* S3 Array */
+#define w	4096	/* 8 whitening keys (word) */
+#define k	4128	/* key 1-32 ( word ) */
+
+/* define a few register aliases to allow macro substitution */
+
+#define R0     %rax
+#define R0D    %eax
+#define R0B    %al
+#define R0H    %ah
+
+#define R1     %rbx
+#define R1D    %ebx
+#define R1B    %bl
+#define R1H    %bh
+
+#define R2     %rcx
+#define R2D    %ecx
+#define R2B    %cl
+#define R2H    %ch
+
+#define R3     %rdx
+#define R3D    %edx
+#define R3B    %dl
+#define R3H    %dh
+
+
+/* performs input whitening */
+#define input_whitening(src,context,offset)\
+	xor	w+offset(context),	src;
+
+/* performs input whitening */
+#define output_whitening(src,context,offset)\
+	xor	w+16+offset(context),	src;
+
+
+/*
+a input register containing a (rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+*/
+#define encrypt_round(a,b,c,d,round)\
+movzx	b ## B,		%edi;\
+mov	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+mov	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	(%r11,%rdi,4),	%r9d;\
+movzx	b ## H,		%edi;\
+ror	$15,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+xor	s1(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+rol	$15,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;
+
+/*
+a input register containing a(rotated 16)
+b input register containing b
+c input register containing c
+d input register containing d (already rol $1)
+operations on a and b are interleaved to increase performance
+during the round a and b are prepared for the output whitening
+*/
+#define encrypt_last_round(a,b,c,d,round)\
+mov	b ## D,		%r10d;\
+shl	$32,		%r10;\
+movzx	b ## B,		%edi;\
+mov	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+mov	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	(%r11,%rdi,4),	%r9d;\
+xor	a,		%r10;\
+movzx	b ## H,		%edi;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+xor	s1(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+ror	$1,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D
+
+/*
+a input register containing a
+b input register containing b (rotated 16)
+c input register containing c (already rol $1)
+d input register containing d 
+operations on a and b are interleaved to increase performance
+*/
+#define decrypt_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%r11,%rdi,4),	%r9d;\
+movzx	b ## B,		%edi;\
+mov	s3(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$16,		a ## D;\
+xor	s1(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## B,		%edi;\
+xor	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+ror	$15,		a ## D;\
+xor	s3(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+xor	s2(%r11,%rdi,4),%r8d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;\
+rol	$15,		d ## D;
+
+/*
+a input register containing a
+b input register containing b
+c input register containing c (already rol $1)
+d input register containing d
+operations on a and b are interleaved to increase performance
+during the round a and b are prepared for the output whitening
+*/
+#define decrypt_last_round(a,b,c,d,round)\
+movzx	a ## B,		%edi;\
+mov	(%r11,%rdi,4),	%r9d;\
+movzx	b ## B,		%edi;\
+mov	s3(%r11,%rdi,4),%r8d;\
+movzx	b ## H,		%edi;\
+ror	$16,		b ## D;\
+xor	(%r11,%rdi,4),	%r8d;\
+movzx	a ## H,		%edi;\
+mov	b ## D,		%r10d;\
+shl	$32,		%r10;\
+xor	a,		%r10;\
+ror	$16,		a ## D;\
+xor	s1(%r11,%rdi,4),%r9d;\
+movzx	b ## B,		%edi;\
+xor	s1(%r11,%rdi,4),%r8d;\
+movzx	a ## B,		%edi;\
+xor	s2(%r11,%rdi,4),%r9d;\
+movzx	b ## H,		%edi;\
+xor	s2(%r11,%rdi,4),%r8d;\
+movzx	a ## H,		%edi;\
+xor	s3(%r11,%rdi,4),%r9d;\
+add	%r8d,		%r9d;\
+add	%r9d,		%r8d;\
+add	k+round(%r11),	%r9d;\
+xor	%r9d,		c ## D;\
+add	k+4+round(%r11),%r8d;\
+xor	%r8d,		d ## D;\
+ror	$1,		d ## D;
+
+.align 8
+.global twofish_enc_blk
+.global twofish_dec_blk
+
+twofish_enc_blk:
+	pushq    R1
+
+	/* %rdi contains the crypto tfm adress */
+	/* %rsi contains the output adress */
+	/* %rdx contains the input adress */
+	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx adress */
+	/* ctx adress is moved to free one non-rex register
+	as target for the 8bit high operations */
+	mov	%rdi,		%r11
+
+	movq	(R3),	R1
+	movq	8(R3),	R3
+	input_whitening(R1,%r11,a_offset)
+	input_whitening(R3,%r11,c_offset)
+	mov	R1D,	R0D
+	rol	$16,	R0D
+	shr	$32,	R1
+	mov	R3D,	R2D
+	shr	$32,	R3
+	rol	$1,	R3D
+
+	encrypt_round(R0,R1,R2,R3,0);
+	encrypt_round(R2,R3,R0,R1,8);
+	encrypt_round(R0,R1,R2,R3,2*8);
+	encrypt_round(R2,R3,R0,R1,3*8);
+	encrypt_round(R0,R1,R2,R3,4*8);
+	encrypt_round(R2,R3,R0,R1,5*8);
+	encrypt_round(R0,R1,R2,R3,6*8);
+	encrypt_round(R2,R3,R0,R1,7*8);
+	encrypt_round(R0,R1,R2,R3,8*8);
+	encrypt_round(R2,R3,R0,R1,9*8);
+	encrypt_round(R0,R1,R2,R3,10*8);
+	encrypt_round(R2,R3,R0,R1,11*8);
+	encrypt_round(R0,R1,R2,R3,12*8);
+	encrypt_round(R2,R3,R0,R1,13*8);
+	encrypt_round(R0,R1,R2,R3,14*8);
+	encrypt_last_round(R2,R3,R0,R1,15*8);
+
+
+	output_whitening(%r10,%r11,a_offset)
+	movq	%r10,	(%rsi)
+
+	shl	$32,	R1
+	xor	R0,	R1
+
+	output_whitening(R1,%r11,c_offset)
+	movq	R1,	8(%rsi)
+
+	popq	R1
+	movq	$1,%rax
+	ret
+	
+twofish_dec_blk:	
+	pushq    R1
+
+	/* %rdi contains the crypto tfm adress */
+	/* %rsi contains the output adress */
+	/* %rdx contains the input adress */
+	add	$crypto_tfm_ctx_offset, %rdi	/* set ctx adress */
+	/* ctx adress is moved to free one non-rex register
+	as target for the 8bit high operations */
+	mov	%rdi,		%r11
+
+	movq	(R3),	R1
+	movq	8(R3),	R3
+	output_whitening(R1,%r11,a_offset)
+	output_whitening(R3,%r11,c_offset)
+	mov	R1D,	R0D
+	shr	$32,	R1
+	rol	$16,	R1D
+	mov	R3D,	R2D
+	shr	$32,	R3
+	rol	$1,	R2D
+
+	decrypt_round(R0,R1,R2,R3,15*8);
+	decrypt_round(R2,R3,R0,R1,14*8);
+	decrypt_round(R0,R1,R2,R3,13*8);
+	decrypt_round(R2,R3,R0,R1,12*8);
+	decrypt_round(R0,R1,R2,R3,11*8);
+	decrypt_round(R2,R3,R0,R1,10*8);
+	decrypt_round(R0,R1,R2,R3,9*8);
+	decrypt_round(R2,R3,R0,R1,8*8);
+	decrypt_round(R0,R1,R2,R3,7*8);
+	decrypt_round(R2,R3,R0,R1,6*8);
+	decrypt_round(R0,R1,R2,R3,5*8);
+	decrypt_round(R2,R3,R0,R1,4*8);
+	decrypt_round(R0,R1,R2,R3,3*8);
+	decrypt_round(R2,R3,R0,R1,2*8);
+	decrypt_round(R0,R1,R2,R3,1*8);
+	decrypt_last_round(R2,R3,R0,R1,0);
+
+	input_whitening(%r10,%r11,a_offset)
+	movq	%r10,	(%rsi)
+
+	shl	$32,	R1
+	xor	R0,	R1
+
+	input_whitening(R1,%r11,c_offset)
+	movq	R1,	8(%rsi)
+
+	popq	R1
+	movq	$1,%rax
+	ret
diff --git a/arch/x86_64/crypto/twofish.c b/arch/x86_64/crypto/twofish.c
new file mode 100644
index 0000000..e38cc95
--- /dev/null
+++ b/arch/x86_64/crypto/twofish.c
@@ -0,0 +1,86 @@
+/*
+ * Glue Code for optimized x86_64 assembler version of TWOFISH
+ *
+ * Originally Twofish for GPG
+ * By Matthew Skala <mskala@ansuz.sooke.bc.ca>, July 26, 1998
+ * 256-bit key length added March 20, 1999
+ * Some modifications to reduce the text size by Werner Koch, April, 1998
+ * Ported to the kerneli patch by Marc Mutz <Marc@Mutz.com>
+ * Ported to CryptoAPI by Colin Slater <hoho@tacomeat.net>
+ *
+ * The original author has disclaimed all copyright interest in this
+ * code and thus put it in the public domain. The subsequent authors
+ * have put this under the GNU General Public License.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
+ * USA
+ *
+ * This code is a "clean room" implementation, written from the paper
+ * _Twofish: A 128-Bit Block Cipher_ by Bruce Schneier, John Kelsey,
+ * Doug Whiting, David Wagner, Chris Hall, and Niels Ferguson, available
+ * through http://www.counterpane.com/twofish.html
+ *
+ * For background information on multiplication in finite fields, used for
+ * the matrix operations in the key schedule, see the book _Contemporary
+ * Abstract Algebra_ by Joseph A. Gallian, especially chapter 22 in the
+ * Third Edition.
+ */
+
+#include <asm/byteorder.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/crypto.h>
+#include <linux/bitops.h>
+#include <crypto/twofish.h>
+
+asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+
+asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src);
+
+static struct crypto_alg alg = {
+	.cra_name           =   "twofish",
+	.cra_driver_name    =	"twofish-x86_64",
+	.cra_priority       =	200,
+	.cra_flags          =   CRYPTO_ALG_TYPE_CIPHER,
+	.cra_blocksize      =   TF_BLOCK_SIZE,
+	.cra_ctxsize        =   sizeof(struct twofish_ctx),
+	.cra_alignmask      =	3,
+	.cra_module         =   THIS_MODULE,
+	.cra_list           =   LIST_HEAD_INIT(alg.cra_list),
+	.cra_u              =   { .cipher = {
+	.cia_min_keysize    =   TF_MIN_KEY_SIZE,
+	.cia_max_keysize    =   TF_MAX_KEY_SIZE,
+	.cia_setkey         =   twofish_setkey,
+	.cia_encrypt        =   twofish_enc_blk,
+	.cia_decrypt        =   twofish_dec_blk } }
+};
+
+static int __init init(void)
+{
+	return crypto_register_alg(&alg);
+}
+
+static void __exit fini(void)
+{
+	crypto_unregister_alg(&alg);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION ("Twofish Cipher Algorithm, x86_64 asm optimized");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index 28b203b..990054d 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -165,6 +165,21 @@ config CRYPTO_TWOFISH_586
 	  See also:
 	  <http://www.schneier.com/twofish.html>
 
+config CRYPTO_TWOFISH_X86_64
+	tristate "Twofish cipher algorithm (x86_64)"
+	depends on CRYPTO && ((X86 || UML_X86) && 64BIT)
+	select CRYPTO_TWOFISH_COMMON
+	help
+	  Twofish cipher algorithm (x86_64).
+
+	  Twofish was submitted as an AES (Advanced Encryption Standard)
+	  candidate cipher by researchers at CounterPane Systems.  It is a
+	  16 round block cipher supporting key sizes of 128, 192, and 256
+	  bits.
+
+	  See also:
+	  <http://www.schneier.com/twofish.html>
+
 config CRYPTO_SERPENT
 	tristate "Serpent cipher algorithm"
 	depends on CRYPTO

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-19 14:13 ` Joachim Fritschi
@ 2006-06-20 11:14   ` Herbert Xu
  0 siblings, 0 replies; 20+ messages in thread
From: Herbert Xu @ 2006-06-20 11:14 UTC (permalink / raw)
  To: Joachim Fritschi; +Cc: linux-kernel, linux-crypto, ak

On Mon, Jun 19, 2006 at 04:13:01PM +0200, Joachim Fritschi wrote:
> This patch is now based on the cryptodev tree using the new cryptoapi (crypto  tfm
>  instead of the crypto ctx as parameter).
> 
> The module passed the tcrypt tests and testscripts.
> 
> Signed-off-by: Joachim Fritschi <jfritschi@freenet.de>

Thanks Joachim.  I've applied all four patches.  I had to add wrappers
around the twofish assembly routines because asmlinkage may differ from
the normal C calling convention.  It should get optimised away to just
a jump if the conventions are identical.

BTW Andi, I think it might be better to have the x86-64 patch sit in the
cryptodev tree rather than x86-64 because it won't even compile without
the previous patches.  If you really want to, I can leave out the x86-64
one in particular for you to merge after the others go upstream.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
@ 2006-06-08  7:13 linux
  2006-06-08 17:35 ` Joachim Fritschi
  0 siblings, 1 reply; 20+ messages in thread
From: linux @ 2006-06-08  7:13 UTC (permalink / raw)
  To: jfritschi, linux-kernel

The following seem to me to be simplifications, and so
make the code more readable.  For what it's worth...

- It would be nice to delete the R6 and R7 definitions in order to
  make clear that %esp and %ebp are NOT used as general-purpose registers.

- ctx, tmp1, tmp2, new1 and new2 are only used in "## D" form.
  It would be simpler to include the D suffix at the call site
  and remove 5 characters per use from the macro expansion.

- That would let you get rid of half of the macro definitions.
  You only need R0, R1, R2, R3, R8 and R9.
  The rest could simply be replaced by %r10d, etc.

- Changing the argument order to (a, b, newa, newb, olda, oldb)
  would make it clearer what's changing and what's staying the
  same between rounds.

- You could also get rid of the ## D sufixes on olda and oldb, but that's
  of more dubious benefit.

- How about making the macros more specific and not passing in the
  constant arguments ctx, tmp1, tmp2, key1 and key2?


Having looked at the code that much, I started seeing potential
code improvements:

- Why schedule the loading of the round keys so far in advance?
  They're going to be L1 hits anyway, and anything amd64 can do lots
  of code reordering.  There are no stores to act as fences.
  You could just to an xor stright into where you need the values.

- Why copy a -> olda and b->oldb at the end of each round?  Why not
  just do

+	encrypt_last_round(R0,R1,R8,R9,R2,R3,R5,  0,R10,R11,R12,R13);
+	encrypt_last_round(R2,R3,R0,R1,R8,R9,R5,  8,R10,R11,R12,R13);
+	encrypt_last_round(R8,R9,R2,R3,R0,R1,R5,2*8,R10,R11,R12,R13);
+	encrypt_last_round(R0,R1,R8,R9,R2,R3,R5,3*8,R10,R11,R12,R13);
+	encrypt_last_round(R2,R3,R0,R1,R8,R9,R5,4*8,R10,R11,R12,R13);
etc.?

Oh, but wait!  The ONLY inputs, AFAICT, to newa are
+	mov	olda  ## D,	newa ## D;\
+	mov	oldb ## D,	newb ## D;\
+	xor	key1 ## D,	newa ## D;\
+	xor	key2 ## D,	newb ## D;\

So why not just make newa and olda the same register, thereby removing that
mov as well, and replace the other uses of newa and newb in the loop
with appropriate temps?


That would make the round function:
+/*
+ * The twofish round function.
+ * This is very close to a standard Feistel cipher:
+ * (c,d) ^= F(a,b,round_key)
+ * But twofish adds one-bit rotations.
+ * Other registers used:
+ * %rdi points to the context structure including the key schedule
* * %r9d is a temporary.
+ * %r10d and %r11d hold the F() function output.
+ */
+#define\
+ encrypt_round(a,b,c,d,round)\
+	movzx	a ## B,		%r9d;\
+	mov	s0(%rdi,%r9,4),	%r10d;\
+	movzx	a ## H,		%r9d;\
+	ror	$16,		a ## D;\
+	xor     s1(%rdi,%r9,4),	%r10d;\
+	movzx	a ## B,		%r9d;\
+	xor     s2(%rdi,%r9,4),	%r10d;\
+	movzx	a ## H,		%r9d;\
+	xor     s3(%rdi,%r9,4),	%r10d;\
+	ror	$16,		a ## D;\
+	movzx	b ## B,		%r9d;\
+	mov     s1(%rdi,%r9,4),	%r11d;\
+	movzx	b ## H,		%r9d;\
+	ror	$16,		b ## D;\
+	xor	s2(%rdi,%r9,4),	%r11d;\
+	movzx	b ## B,		%r9d;\
+	xor	s3(%rdi,%r9,4),	%r11d;\
+	movzx	b ## H,		%r9d;\
+	xor     s0(%rdi,%r9,4),	%r11d;\
+	ror	$15,		b ## D;\
+	add	%r11d,		%r10d;\
+	add	%r10d,		%r11d;\
+	add	k+round(%rdi),	%r10d;\
+	add	k+4+round(%rdi),%r11d;\
+	xor	%r10d,		c ## D;\
+	xor	%r11d,		d ## D;\
+	ror	$1,		c ## D

Notice that this has saved three registers (%r8, %r12, %r13) and
eliminated six mov instructions.

(Admittedly, unbenchmarked and untested.)

If I wanted to worry about scheduling, I might move the b-side S-box
lookups ahead of the a-side to give that "ror $1,c" a smidgen more time
to complete, and then interleave them:

+#define\
+ encrypt_round(a,b,c,d,round)\
+	movzx	b ## B,		%r9d;\
+	mov     s1(%rdi,%r9,4),	%r11d;\
+	movzx	a ## B,		%r9d;\
+	mov	s0(%rdi,%r9,4),	%r10d;\
+	movzx	b ## H,		%r9d;\
+	xor	s2(%rdi,%r9,4),	%r11d;\
+	ror	$16,		b ## D;\
+	movzx	a ## H,		%r9d;\
+	xor     s1(%rdi,%r9,4),	%r10d;\
+	ror	$16,		a ## D;\
+	movzx	b ## B,		%r9d;\
+	xor	s3(%rdi,%r9,4),	%r11d;\
+	movzx	a ## B,		%r9d;\
+	xor     s2(%rdi,%r9,4),	%r10d;\
+	movzx	b ## H,		%r9d;\
+	xor     s0(%rdi,%r9,4),	%r11d;\
+	ror	$15,		b ## D;\
+	movzx	a ## H,		%r9d;\
+	xor     s3(%rdi,%r9,4),	%r10d;\
+	ror	$16,		a ## D;\
+	add	%r11d,		%r10d;\
+	add	%r10d,		%r11d;\
+	add	k+round(%rdi),	%r10d;\
+	add	k+4+round(%rdi),%r11d;\
+	xor	%r10d,		c ## D;\
+	xor	%r11d,		d ## D;\
+	ror	$1,		c ## D

And you could eliminate one more instruction by re-ordering the a-side
S box lookups to do the "high half" lookups first, and then merging the
resultant leading ror $16 with the trailing ror $1:
(Note that this changes the required loop setup slightly.)

+#define\
+ encrypt_round(a,b,c,d,round)\
+	rol	$15,		a ## D;\
+	movzx	b ## B,		%r9d;\
+	mov     s1(%rdi,%r9,4),	%r11d;\
+	movzx	a ## B,		%r9d;\
+	mov	s2(%rdi,%r9,4),	%r10d;\
+	movzx	b ## H,		%r9d;\
+	xor	s2(%rdi,%r9,4),	%r11d;\
+	ror	$16,		b ## D;\
+	movzx	a ## H,		%r9d;\
+	xor     s3(%rdi,%r9,4),	%r10d;\
+	ror	$16,		a ## D;\
+	movzx	b ## B,		%r9d;\
+	xor	s3(%rdi,%r9,4),	%r11d;\
+	movzx	a ## B,		%r9d;\
+	xor     s0(%rdi,%r9,4),	%r10d;\
+	movzx	b ## H,		%r9d;\
+	xor     s0(%rdi,%r9,4),	%r11d;\
+	ror	$15,		b ## D;\
+	movzx	a ## H,		%r9d;\
+	xor     s1(%rdi,%r9,4),	%r10d;\
+	add	%r11d,		%r10d;\
+	add	%r10d,		%r11d;\
+	add	k+round(%rdi),	%r10d;\
+	add	k+4+round(%rdi),%r11d;\
+	xor	%r10d,		c ## D;\
+	xor	%r11d,		d ## D

I haven't looked at the x86_32 code to see how many of these ideas
could be adapted there.  Unfortunately, even with the reduction, this
still uses 8 registers, one more than possible on x86_32.

Probably the best thing to do there would be to de-interleave the
a->%r10d and b->%r11d computations and spill (push/pop) the
necessary register around the second block.  Something like:

ctx in %edi
%ebp and %esi are temps

+#define\
+ encrypt_round(a,b,c,d,round)\
+	rol	$15,		a ## D;\
+	movzx	b ## B,		%esi;\
+	mov     s1(%edi,%esi,4),%ebp;\
+	movzx	b ## H,		%esi;\
+	xor	s2(%edi,%esi,4),%ebp;\
+	ror	$16,		b ## D;\
+	movzx	b ## B,		%esi;\
+	xor	s3(%edi,%esi,4),%ebp;\
+	movzx	b ## H,		%esi;\
+	xor     s0(%edi,%esi,4),%ebp;\
+	ror	$15,		b ## D;\
+	push	%ebp;\
+	movzx	a ## B,		%esi;\
+	mov	s2(%edi,%esi,4),%ebp;\
+	movzx	a ## H,		%esi;\
+	xor     s3(%edi,%esi,4),%ebp;\
+	ror	$16,		a ## D;\
+	movzx	a ## B,		%esi;\
+	xor     s0(%edi,%esi,4),%ebp;\
+	movzx	a ## H,		%esi;\
+	xor     s1(%edi,%esi,4),%ebp;\
+	pop	%esi;\
+	add	%esi,		%ebp;\
+	add	%ebp,		%esi;\
+	add	k+round(%edi),	%ebp;\
+	add	k+4+round(%edi),%esi;\
+	xor	%ebp,		c ## D;\
+	xor	%esi,		d ## D

(Legalese: These code changes are in the public domain.  All of the code
modifications presented here are simply the straightforward execution
of the (uncopyrightable) ideas presented in the text, and therefore
not protectable.  The only "creative" portions are the comment, the use
of the variable names "c" and "d", and the choice of registers in the
32-bit code, for which copyright is abandoned.)


And even bigger hack would be to rearrange the context structure to
have the key first, then interleave the s0 and s1 boxes and use the
(%rdi,%r9,8) addressing mode to access them.  That would, if you
pre-offset %rdi a little bit so the key was at a negative offset,
allow you to use a byte-offset addressing mode on 6 of the 10 loads in
each round.  (Compared to 2 of 10 without.)

In fact, on x86_64, you could go to the extreme of dedicating a register
to point to the base of each of the S-boxes, so there is no displacement
byte in the opcode at all.  That has to help the I-cache and the
instruction decoders enough to pay for the additional setup instructions.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-08  7:13 linux
@ 2006-06-08 17:35 ` Joachim Fritschi
  2006-06-09  1:13   ` linux
  0 siblings, 1 reply; 20+ messages in thread
From: Joachim Fritschi @ 2006-06-08 17:35 UTC (permalink / raw)
  To: linux; +Cc: linux-kernel

On Thursday 08 June 2006 09:13, linux@horizon.com wrote:
> The following seem to me to be simplifications, and so
> make the code more readable.  For what it's worth...
>
> - It would be nice to delete the R6 and R7 definitions in order to
>   make clear that %esp and %ebp are NOT used as general-purpose registers.
Sounds reasonable
> - ctx, tmp1, tmp2, new1 and new2 are only used in "## D" form.
>   It would be simpler to include the D suffix at the call site
>   and remove 5 characters per use from the macro expansion.
Could lead to some additional confusion because it masks all the different 
8,32 and 64 bit operations from the users. One could not just look at the 
macro and see what "subregister" is used.
> - That would let you get rid of half of the macro definitions.
>   You only need R0, R1, R2, R3, R8 and R9.
>   The rest could simply be replaced by %r10d, etc.
Would lead to a mixed names. Don't know if that would really help.
> - Changing the argument order to (a, b, newa, newb, olda, oldb)
>   would make it clearer what's changing and what's staying the
>   same between rounds.
>
> - You could also get rid of the ## D sufixes on olda and oldb, but that's
>   of more dubious benefit.
>
> - How about making the macros more specific and not passing in the
>   constant arguments ctx, tmp1, tmp2, key1 and key2?

I like to pass all registers i use as parameters. That way you can always keep 
track of what registers are used. That way it's harder to screw up with 
unwanted sideeffects.
>
>
> Having looked at the code that much, I started seeing potential
> code improvements:
>
> - Why schedule the loading of the round keys so far in advance?
>   They're going to be L1 hits anyway, and anything amd64 can do lots
>   of code reordering.  There are no stores to act as fences.
>   You could just to an xor stright into where you need the values.
I guess you are right. Will be fixed
> - Why copy a -> olda and b->oldb at the end of each round?  Why not
>   just do
>
> +	encrypt_last_round(R0,R1,R8,R9,R2,R3,R5,  0,R10,R11,R12,R13);
> +	encrypt_last_round(R2,R3,R0,R1,R8,R9,R5,  8,R10,R11,R12,R13);
> +	encrypt_last_round(R8,R9,R2,R3,R0,R1,R5,2*8,R10,R11,R12,R13);
> +	encrypt_last_round(R0,R1,R8,R9,R2,R3,R5,3*8,R10,R11,R12,R13);
> +	encrypt_last_round(R2,R3,R0,R1,R8,R9,R5,4*8,R10,R11,R12,R13);
> etc.?
>
> Oh, but wait!  The ONLY inputs, AFAICT, to newa are
> +	mov	olda  ## D,	newa ## D;\
> +	mov	oldb ## D,	newb ## D;\
> +	xor	key1 ## D,	newa ## D;\
> +	xor	key2 ## D,	newb ## D;\
>
> So why not just make newa and olda the same register, thereby removing that
> mov as well, and replace the other uses of newa and newb in the loop
> with appropriate temps?
>
>
> That would make the round function:
> +/*
> + * The twofish round function.
> + * This is very close to a standard Feistel cipher:
> + * (c,d) ^= F(a,b,round_key)
> + * But twofish adds one-bit rotations.
> + * Other registers used:
> + * %rdi points to the context structure including the key schedule
> * * %r9d is a temporary.
> + * %r10d and %r11d hold the F() function output.
> + */
> +#define\
> + encrypt_round(a,b,c,d,round)\
> +	movzx	a ## B,		%r9d;\
> +	mov	s0(%rdi,%r9,4),	%r10d;\
> +	movzx	a ## H,		%r9d;\
> +	ror	$16,		a ## D;\
> +	xor     s1(%rdi,%r9,4),	%r10d;\
> +	movzx	a ## B,		%r9d;\
> +	xor     s2(%rdi,%r9,4),	%r10d;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s3(%rdi,%r9,4),	%r10d;\
> +	ror	$16,		a ## D;\
> +	movzx	b ## B,		%r9d;\
> +	mov     s1(%rdi,%r9,4),	%r11d;\
> +	movzx	b ## H,		%r9d;\
> +	ror	$16,		b ## D;\
> +	xor	s2(%rdi,%r9,4),	%r11d;\
> +	movzx	b ## B,		%r9d;\
> +	xor	s3(%rdi,%r9,4),	%r11d;\
> +	movzx	b ## H,		%r9d;\
> +	xor     s0(%rdi,%r9,4),	%r11d;\
> +	ror	$15,		b ## D;\
> +	add	%r11d,		%r10d;\
> +	add	%r10d,		%r11d;\
> +	add	k+round(%rdi),	%r10d;\
> +	add	k+4+round(%rdi),%r11d;\
> +	xor	%r10d,		c ## D;\
> +	xor	%r11d,		d ## D;\
> +	ror	$1,		c ## D
>
> Notice that this has saved three registers (%r8, %r12, %r13) and
> eliminated six mov instructions.
>
> (Admittedly, unbenchmarked and untested.)
 You can't use the 8bit high register with rex prefix registers ( r8+ and any 
64bit register). I guess this could be fixed be moving the crypto ctx or the 
output adress to a rex register and have %esi or %edi as temp register for 
the sbox-index. Somehow i never considered %esi or %edi as a possible target 
for the 8bit high operation and was convinced the only way to avoid using 4 * 
8bit rotates was using temporary registers. 
> If I wanted to worry about scheduling, I might move the b-side S-box
> lookups ahead of the a-side to give that "ror $1,c" a smidgen more time
> to complete, and then interleave them:
>
> +#define\
> + encrypt_round(a,b,c,d,round)\
> +	movzx	b ## B,		%r9d;\
> +	mov     s1(%rdi,%r9,4),	%r11d;\
> +	movzx	a ## B,		%r9d;\
> +	mov	s0(%rdi,%r9,4),	%r10d;\
> +	movzx	b ## H,		%r9d;\
> +	xor	s2(%rdi,%r9,4),	%r11d;\
> +	ror	$16,		b ## D;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s1(%rdi,%r9,4),	%r10d;\
> +	ror	$16,		a ## D;\
> +	movzx	b ## B,		%r9d;\
> +	xor	s3(%rdi,%r9,4),	%r11d;\
> +	movzx	a ## B,		%r9d;\
> +	xor     s2(%rdi,%r9,4),	%r10d;\
> +	movzx	b ## H,		%r9d;\
> +	xor     s0(%rdi,%r9,4),	%r11d;\
> +	ror	$15,		b ## D;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s3(%rdi,%r9,4),	%r10d;\
> +	ror	$16,		a ## D;\
> +	add	%r11d,		%r10d;\
> +	add	%r10d,		%r11d;\
> +	add	k+round(%rdi),	%r10d;\
> +	add	k+4+round(%rdi),%r11d;\
> +	xor	%r10d,		c ## D;\
> +	xor	%r11d,		d ## D;\
> +	ror	$1,		c ## D
>
> And you could eliminate one more instruction by re-ordering the a-side
> S box lookups to do the "high half" lookups first, and then merging the
> resultant leading ror $16 with the trailing ror $1:
> (Note that this changes the required loop setup slightly.)
>
> +#define\
> + encrypt_round(a,b,c,d,round)\
> +	rol	$15,		a ## D;\
> +	movzx	b ## B,		%r9d;\
> +	mov     s1(%rdi,%r9,4),	%r11d;\
> +	movzx	a ## B,		%r9d;\
> +	mov	s2(%rdi,%r9,4),	%r10d;\
> +	movzx	b ## H,		%r9d;\
> +	xor	s2(%rdi,%r9,4),	%r11d;\
> +	ror	$16,		b ## D;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s3(%rdi,%r9,4),	%r10d;\
> +	ror	$16,		a ## D;\
> +	movzx	b ## B,		%r9d;\
> +	xor	s3(%rdi,%r9,4),	%r11d;\
> +	movzx	a ## B,		%r9d;\
> +	xor     s0(%rdi,%r9,4),	%r10d;\
> +	movzx	b ## H,		%r9d;\
> +	xor     s0(%rdi,%r9,4),	%r11d;\
> +	ror	$15,		b ## D;\
> +	movzx	a ## H,		%r9d;\
> +	xor     s1(%rdi,%r9,4),	%r10d;\
> +	add	%r11d,		%r10d;\
> +	add	%r10d,		%r11d;\
> +	add	k+round(%rdi),	%r10d;\
> +	add	k+4+round(%rdi),%r11d;\
> +	xor	%r10d,		c ## D;\
> +	xor	%r11d,		d ## D
Very neat. I think i will run with this idea. Need some fixing for the rex 
thing and some modifications in the first and last round.
> I haven't looked at the x86_32 code to see how many of these ideas
> could be adapted there.  Unfortunately, even with the reduction, this
> still uses 8 registers, one more than possible on x86_32.
Yes. I think i got some ideas on how to fix it up a bit. 
> Probably the best thing to do there would be to de-interleave the
> a->%r10d and b->%r11d computations and spill (push/pop) the
> necessary register around the second block.  Something like:
>
> ctx in %edi
> %ebp and %esi are temps
>
> +#define\
> + encrypt_round(a,b,c,d,round)\
> +	rol	$15,		a ## D;\
> +	movzx	b ## B,		%esi;\
> +	mov     s1(%edi,%esi,4),%ebp;\
> +	movzx	b ## H,		%esi;\
> +	xor	s2(%edi,%esi,4),%ebp;\
> +	ror	$16,		b ## D;\
> +	movzx	b ## B,		%esi;\
> +	xor	s3(%edi,%esi,4),%ebp;\
> +	movzx	b ## H,		%esi;\
> +	xor     s0(%edi,%esi,4),%ebp;\
> +	ror	$15,		b ## D;\
> +	push	%ebp;\
> +	movzx	a ## B,		%esi;\
> +	mov	s2(%edi,%esi,4),%ebp;\
> +	movzx	a ## H,		%esi;\
> +	xor     s3(%edi,%esi,4),%ebp;\
> +	ror	$16,		a ## D;\
> +	movzx	a ## B,		%esi;\
> +	xor     s0(%edi,%esi,4),%ebp;\
> +	movzx	a ## H,		%esi;\
> +	xor     s1(%edi,%esi,4),%ebp;\
> +	pop	%esi;\
> +	add	%esi,		%ebp;\
> +	add	%ebp,		%esi;\
> +	add	k+round(%edi),	%ebp;\
> +	add	k+4+round(%edi),%esi;\
> +	xor	%ebp,		c ## D;\
> +	xor	%esi,		d ## D
>
> (Legalese: These code changes are in the public domain.  All of the code
> modifications presented here are simply the straightforward execution
> of the (uncopyrightable) ideas presented in the text, and therefore
> not protectable.  The only "creative" portions are the comment, the use
> of the variable names "c" and "d", and the choice of registers in the
> 32-bit code, for which copyright is abandoned.)
>
>
> And even bigger hack would be to rearrange the context structure to
> have the key first, then interleave the s0 and s1 boxes and use the
> (%rdi,%r9,8) addressing mode to access them.  That would, if you
> pre-offset %rdi a little bit so the key was at a negative offset,
> allow you to use a byte-offset addressing mode on 6 of the 10 loads in
> each round.  (Compared to 2 of 10 without.)
>
> In fact, on x86_64, you could go to the extreme of dedicating a register
> to point to the base of each of the S-boxes, so there is no displacement
> byte in the opcode at all.  That has to help the I-cache and the
> instruction decoders enough to pay for the additional setup instructions.
I already had that in mind and since you freed up some registers that is 
finally possible.

Seems like i have a lot of work ahead :D
Thanks for your very valuable comments.

-Joachim


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH  4/4] Twofish cipher - x86_64 assembler
  2006-06-08 17:35 ` Joachim Fritschi
@ 2006-06-09  1:13   ` linux
  0 siblings, 0 replies; 20+ messages in thread
From: linux @ 2006-06-09  1:13 UTC (permalink / raw)
  To: jfritschi, linux; +Cc: linux-kernel

> You can't use the 8bit high register with rex prefix registers ( r8+
> and any 64bit register). I guess this could be fixed be moving the crypto
> ctx or the output adress to a rex register and have %esi or %edi as temp
> register for the sbox-index. Somehow i never considered %esi or %edi as
> a possible target for the 8bit high operation and was convinced the only
> way to avoid using 4 * 8bit rotates was using temporary registers.

Oh, yeah!  Sorry about that.  You're quite right, but you're also
quite right that it's fixable.

I'd have to refresh my memory of the whole REX prefix business to figure
out how to minimize the code size.  I've forgotten the details.


Anyway, I'm glad to have been of some help.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2006-06-20 11:14 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-06-04 13:16 [PATCH 4/4] Twofish cipher - x86_64 assembler Joachim Fritschi
2006-06-04 19:10 ` Andi Kleen
2006-06-04 21:01   ` Dag Arne Osvik
2006-06-05 10:18     ` Joachim Fritschi
2006-06-05 22:28       ` Dag Arne Osvik
2006-06-05 22:44         ` Dag Arne Osvik
2006-06-07 19:16         ` Joachim Fritschi
2006-06-05 10:06   ` Joachim Fritschi
2006-06-05 17:44     ` dean gaudet
2006-06-05 19:46       ` Joachim Fritschi
2006-06-05 23:35     ` Andi Kleen
2006-06-07 19:21       ` Joachim Fritschi
2006-06-07 19:38 ` Joachim Fritschi
2006-06-16 12:00 ` Joachim Fritschi
2006-06-17 10:38 ` Joachim Fritschi
2006-06-19 14:13 ` Joachim Fritschi
2006-06-20 11:14   ` Herbert Xu
  -- strict thread matches above, loose matches on Subject: below --
2006-06-08  7:13 linux
2006-06-08 17:35 ` Joachim Fritschi
2006-06-09  1:13   ` linux

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox