Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
To: Borislav Petkov <bp@alien8.de>
Cc: "Johannes Goetzfried"
	<Johannes.Goetzfried@informatik.stud.uni-erlangen.de>,
	linux-kernel@vger.kernel.org, linux-crypto@vger.kernel.org,
	"Tilo Müller" <tilo.mueller@informatik.uni-erlangen.de>,
	"Herbert Xu" <herbert@gondor.apana.org.au>
Subject: Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation
Date: Fri, 17 Aug 2012 10:37:10 +0300	[thread overview]
Message-ID: <20120817073048.16720.80328.stgit@localhost6.localdomain6> (raw)
In-Reply-To: <20120816132926.GB12029@x1.osrc.amd.com>

Quoting Borislav Petkov <bp@alien8.de>:

>
> Yep, looks better than the previous run and also a bit better or on par
> with the initial run I did.
>

I made a few further changes, mainly moving/interleaving 'vmovq/vpextrq' ahead
so that they complete before their target registers are needed. This
only gave a 0.5% increase on Sandy Bridge, but might help more on Bulldozer.

-Jussi

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  205 +++++++++++++++++----------
 1 file changed, 130 insertions(+), 75 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..6638a87 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -47,16 +49,21 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RT %xmm14
+
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -65,6 +72,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RGS1  %r8
 #define RGS1d %r8d
 #define RGS2  %r9
@@ -73,40 +87,53 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-	movb		src ## bl,        RID1b;     \
-	movb		src ## bh,        RID2b;     \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+	movzbl		src ## bl,        RID1d;     \
+	movzbl		src ## bh,        RID2d;     \
+	shrq $16,	src;                         \
 	movl		t0(CTX, RID1, 4), dst ## d;  \
 	xorl		t1(CTX, RID2, 4), dst ## d;  \
-	shrq $16,	src;                         \
-	movb		src ## bl,        RID1b;     \
-	movb		src ## bh,        RID2b;     \
+	movzbl		src ## bl,        RID1d;     \
+	movzbl		src ## bh,        RID2d;     \
+	interleave_op(il_reg);			     \
 	xorl		t2(CTX, RID1, 4), dst ## d;  \
 	xorl		t3(CTX, RID2, 4), dst ## d;
 
-#define G(a, x, t0, t1, t2, t3) \
-	vmovq		a,    RGI1;               \
-	vpsrldq $8,	a,    x;                  \
-	vmovq		x,    RGI2;               \
-	\
-	lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-	shrq $16,	RGI1;                     \
-	lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-	shlq $32,	RGS2;                     \
-	orq		RGS1, RGS2;               \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+	shrq $16,	reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
+	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);      \
+	shlq $32,	RGS2;                                        \
+	orq		RGS1, RGS2;                                  \
 	\
-	lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-	shrq $16,	RGI2;                     \
-	lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-	shlq $32,	RGS3;                     \
-	orq		RGS1, RGS3;               \
+	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
+	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);      \
+	shlq $32,	RGS1;                                        \
+	orq		RGS1, RGS3;                                  \
 	\
-	vmovq		RGS2, x;                  \
+	vmovq		RGS2, x;                                     \
 	vpinsrq $1,	RGS3, x, x;
 
-#define encround(a, b, c, d, x, y) \
-	G(a, x, s0, s1, s2, s3);           \
-	G(b, y, s1, s2, s3, s0);           \
+#define encround_head_2(a, b, c, d, x1, y1, x2, y2) \
+	vmovq			b ## 1, RGI3;	\
+	vpextrq $1,		b ## 1, RGI4;   \
+	G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
+	vmovq			a ## 2, RGI1;   \
+	vpextrq $1,		a ## 2, RGI2;   \
+	G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
+	vmovq			b ## 2, RGI3;   \
+	vpextrq $1,		b ## 2, RGI4;   \
+	G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
+	G(RGI3, RGI4, y2, s1, s2, s3, s0);
+
+#define encround_tail(a, b, c, d, x, y) \
+	vpslld $1,		d, RT;     \
+	vpsrld $(32 - 1),	d, d;      \
+	vpor			d, RT,  d; \
 	vpaddd			x, y,   x; \
 	vpaddd			y, x,   y; \
 	vpaddd			x, RK1, x; \
@@ -115,14 +142,24 @@
 	vpsrld $1,		c, x;      \
 	vpslld $(32 - 1),	c, c;      \
 	vpor			c, x,   c; \
-	vpslld $1,		d, x;      \
-	vpsrld $(32 - 1),	d, d;      \
-	vpor			d, x,   d; \
 	vpxor			d, y,   d;
 
-#define decround(a, b, c, d, x, y) \
-	G(a, x, s0, s1, s2, s3);           \
-	G(b, y, s1, s2, s3, s0);           \
+#define decround_head_2(a, b, c, d, x1, y1, x2, y2) \
+	vmovq			b ## 1, RGI3;   \
+	vpextrq $1,		b ## 1, RGI4;   \
+	G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
+	vmovq			a ## 2, RGI1;   \
+	vpextrq $1,		a ## 2, RGI2;   \
+	G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
+	vmovq			b ## 2, RGI3;   \
+	vpextrq $1,		b ## 2, RGI4;   \
+	G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
+	G(RGI3, RGI4, y2, s1, s2, s3, s0);
+
+#define decround_tail(a, b, c, d, x, y) \
+	vpslld $1,		c, RT;     \
+	vpsrld $(32 - 1),	c, c;      \
+	vpor			c, RT,  c; \
 	vpaddd			x, y,   x; \
 	vpaddd			y, x,   y; \
 	vpaddd			y, RK2, y; \
@@ -130,32 +167,44 @@
 	vpsrld $1,		d, y;      \
 	vpslld $(32 - 1),	d, d;      \
 	vpor			d, y,   d; \
-	vpslld $1,		c, y;      \
-	vpsrld $(32 - 1),	c, c;      \
-	vpor			c, y,   c; \
 	vpaddd			x, RK1, x; \
 	vpxor			x, c,   c;
 
-#define encrypt_round(n, a, b, c, d) \
-	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;           \
-	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;           \
-	encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
-	encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
-
-#define decrypt_round(n, a, b, c, d) \
-	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;           \
-	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;           \
-	decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
-	decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
+#define preload_rgi(c) \
+	vmovq			c, RGI1; \
+	vpextrq $1,		c, RGI2;
+
+#define encrypt_round(n, a, b, c, d, preload) \
+	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
+	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
+	encround_head_2(a, b, c, d, RX0, RY0, RX1, RY1);         \
+	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0); \
+	preload(c ## 1);                                         \
+	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1);
+
+#define decrypt_round(n, a, b, c, d, preload) \
+	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
+	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
+	decround_head_2(a, b, c, d, RX0, RY0, RX1, RY1);         \
+	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0); \
+	preload(c ## 1);                                         \
+	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1);
 
 #define encrypt_cycle(n) \
-	encrypt_round((2*n), RA, RB, RC, RD);       \
-	encrypt_round(((2*n) + 1), RC, RD, RA, RB);
+	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi); \
+	encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi);
+
+#define encrypt_cycle_last(n) \
+	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi); \
+	encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy);
 
 #define decrypt_cycle(n) \
-	decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
-	decrypt_round((2*n), RA, RB, RC, RD);
+	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi); \
+	decrypt_round((2*n), RA, RB, RC, RD, preload_rgi);
 
+#define decrypt_cycle_last(n) \
+	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi); \
+	decrypt_round((2*n), RA, RB, RC, RD, dummy);
 
 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	vpunpckldq		x1, x0, t0; \
@@ -216,17 +265,19 @@ __twofish_enc_blk_8way:
 	 *	%rcx: bool, if true: xor output
 	 */
 
+	pushq %rbp;
 	pushq %rbx;
 	pushq %rcx;
 
 	vmovdqu w(CTX), RK1;
 
 	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
+	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	vmovq		RA1, RGI1;
+	vpextrq $1,	RA1, RGI2;
+	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
 
-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;
 
 	encrypt_cycle(0);
 	encrypt_cycle(1);
@@ -235,26 +286,27 @@ __twofish_enc_blk_8way:
 	encrypt_cycle(4);
 	encrypt_cycle(5);
 	encrypt_cycle(6);
-	encrypt_cycle(7);
+	encrypt_cycle_last(7);
 
 	vmovdqu (w+4*4)(CTX), RK1;
 
 	popq %rcx;
 	popq %rbx;
+	popq %rbp;
 
-	leaq (4*4*4)(%rsi), %rax;
+	leaq (4*4*4)(%r11), %rax;
 
 	testb %cl, %cl;
 	jnz __enc_xor8;
 
-	outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
-	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+	outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 
 	ret;
 
 __enc_xor8:
-	outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
-	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+	outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 
 	ret;
 
@@ -269,16 +321,18 @@ twofish_dec_blk_8way:
 	 *	%rdx: src
 	 */
 
+	pushq %rbp;
 	pushq %rbx;
 
 	vmovdqu (w+4*4)(CTX), RK1;
 
 	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
-	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	vmovq		RC1, RGI1;
+	vpextrq $1,	RC1, RGI2;
+	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 
-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;
 
 	decrypt_cycle(7);
 	decrypt_cycle(6);
@@ -287,14 +341,15 @@ twofish_dec_blk_8way:
 	decrypt_cycle(3);
 	decrypt_cycle(2);
 	decrypt_cycle(1);
-	decrypt_cycle(0);
+	decrypt_cycle_last(0);
 
 	vmovdqu (w)(CTX), RK1;
 
 	popq %rbx;
+	popq %rbp;
 
-	leaq (4*4*4)(%rsi), %rax;
-	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
+	leaq (4*4*4)(%r11), %rax;
+	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
 
 	ret;

WARNING: multiple messages have this Message-ID (diff)

From: Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
To: Borislav Petkov <bp@alien8.de>
Cc: "Johannes Goetzfried"
	<Johannes.Goetzfried@informatik.stud.uni-erlangen.de>,
	linux-kernel@vger.kernel.org, linux-crypto@vger.kernel.org,
	"Tilo Müller" <tilo.mueller@informatik.uni-erlangen.de>,
	"Herbert Xu" <herbert@gondor.hengli.com.au>
Subject: Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation
Date: Fri, 17 Aug 2012 10:37:10 +0300	[thread overview]
Message-ID: <20120817073048.16720.80328.stgit@localhost6.localdomain6> (raw)
In-Reply-To: <20120816132926.GB12029@x1.osrc.amd.com>

Quoting Borislav Petkov <bp@alien8.de>:

>
> Yep, looks better than the previous run and also a bit better or on par
> with the initial run I did.
>

I made a few further changes, mainly moving/interleaving 'vmovq/vpextrq' ahead
so that they complete before their target registers are needed. This
only gave a 0.5% increase on Sandy Bridge, but might help more on Bulldozer.

-Jussi

---
 arch/x86/crypto/twofish-avx-x86_64-asm_64.S |  205 +++++++++++++++++----------
 1 file changed, 130 insertions(+), 75 deletions(-)

diff --git a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
index 35f4557..6638a87 100644
--- a/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-avx-x86_64-asm_64.S
@@ -4,6 +4,8 @@
  * Copyright (C) 2012 Johannes Goetzfried
  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
  *
+ * Copyright © 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -47,16 +49,21 @@
 #define RC2 %xmm6
 #define RD2 %xmm7
 
-#define RX %xmm8
-#define RY %xmm9
+#define RX0 %xmm8
+#define RY0 %xmm9
+
+#define RX1 %xmm10
+#define RY1 %xmm11
 
-#define RK1 %xmm10
-#define RK2 %xmm11
+#define RK1 %xmm12
+#define RK2 %xmm13
 
-#define RID1  %rax
-#define RID1b %al
-#define RID2  %rbx
-#define RID2b %bl
+#define RT %xmm14
+
+#define RID1  %rbp
+#define RID1d %ebp
+#define RID2  %rsi
+#define RID2d %esi
 
 #define RGI1   %rdx
 #define RGI1bl %dl
@@ -65,6 +72,13 @@
 #define RGI2bl %cl
 #define RGI2bh %ch
 
+#define RGI3   %rax
+#define RGI3bl %al
+#define RGI3bh %ah
+#define RGI4   %rbx
+#define RGI4bl %bl
+#define RGI4bh %bh
+
 #define RGS1  %r8
 #define RGS1d %r8d
 #define RGS2  %r9
@@ -73,40 +87,53 @@
 #define RGS3d %r10d
 
 
-#define lookup_32bit(t0, t1, t2, t3, src, dst) \
-	movb		src ## bl,        RID1b;     \
-	movb		src ## bh,        RID2b;     \
+#define lookup_32bit(t0, t1, t2, t3, src, dst, interleave_op, il_reg) \
+	movzbl		src ## bl,        RID1d;     \
+	movzbl		src ## bh,        RID2d;     \
+	shrq $16,	src;                         \
 	movl		t0(CTX, RID1, 4), dst ## d;  \
 	xorl		t1(CTX, RID2, 4), dst ## d;  \
-	shrq $16,	src;                         \
-	movb		src ## bl,        RID1b;     \
-	movb		src ## bh,        RID2b;     \
+	movzbl		src ## bl,        RID1d;     \
+	movzbl		src ## bh,        RID2d;     \
+	interleave_op(il_reg);			     \
 	xorl		t2(CTX, RID1, 4), dst ## d;  \
 	xorl		t3(CTX, RID2, 4), dst ## d;
 
-#define G(a, x, t0, t1, t2, t3) \
-	vmovq		a,    RGI1;               \
-	vpsrldq $8,	a,    x;                  \
-	vmovq		x,    RGI2;               \
-	\
-	lookup_32bit(t0, t1, t2, t3, RGI1, RGS1); \
-	shrq $16,	RGI1;                     \
-	lookup_32bit(t0, t1, t2, t3, RGI1, RGS2); \
-	shlq $32,	RGS2;                     \
-	orq		RGS1, RGS2;               \
+#define dummy(d) /* do nothing */
+
+#define shr_next(reg) \
+	shrq $16,	reg;
+
+#define G(gi1, gi2, x, t0, t1, t2, t3) \
+	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS1, shr_next, ##gi1);  \
+	lookup_32bit(t0, t1, t2, t3, ##gi1, RGS2, dummy, none);      \
+	shlq $32,	RGS2;                                        \
+	orq		RGS1, RGS2;                                  \
 	\
-	lookup_32bit(t0, t1, t2, t3, RGI2, RGS1); \
-	shrq $16,	RGI2;                     \
-	lookup_32bit(t0, t1, t2, t3, RGI2, RGS3); \
-	shlq $32,	RGS3;                     \
-	orq		RGS1, RGS3;               \
+	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS3, shr_next, ##gi2);  \
+	lookup_32bit(t0, t1, t2, t3, ##gi2, RGS1, dummy, none);      \
+	shlq $32,	RGS1;                                        \
+	orq		RGS1, RGS3;                                  \
 	\
-	vmovq		RGS2, x;                  \
+	vmovq		RGS2, x;                                     \
 	vpinsrq $1,	RGS3, x, x;
 
-#define encround(a, b, c, d, x, y) \
-	G(a, x, s0, s1, s2, s3);           \
-	G(b, y, s1, s2, s3, s0);           \
+#define encround_head_2(a, b, c, d, x1, y1, x2, y2) \
+	vmovq			b ## 1, RGI3;	\
+	vpextrq $1,		b ## 1, RGI4;   \
+	G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
+	vmovq			a ## 2, RGI1;   \
+	vpextrq $1,		a ## 2, RGI2;   \
+	G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
+	vmovq			b ## 2, RGI3;   \
+	vpextrq $1,		b ## 2, RGI4;   \
+	G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
+	G(RGI3, RGI4, y2, s1, s2, s3, s0);
+
+#define encround_tail(a, b, c, d, x, y) \
+	vpslld $1,		d, RT;     \
+	vpsrld $(32 - 1),	d, d;      \
+	vpor			d, RT,  d; \
 	vpaddd			x, y,   x; \
 	vpaddd			y, x,   y; \
 	vpaddd			x, RK1, x; \
@@ -115,14 +142,24 @@
 	vpsrld $1,		c, x;      \
 	vpslld $(32 - 1),	c, c;      \
 	vpor			c, x,   c; \
-	vpslld $1,		d, x;      \
-	vpsrld $(32 - 1),	d, d;      \
-	vpor			d, x,   d; \
 	vpxor			d, y,   d;
 
-#define decround(a, b, c, d, x, y) \
-	G(a, x, s0, s1, s2, s3);           \
-	G(b, y, s1, s2, s3, s0);           \
+#define decround_head_2(a, b, c, d, x1, y1, x2, y2) \
+	vmovq			b ## 1, RGI3;   \
+	vpextrq $1,		b ## 1, RGI4;   \
+	G(RGI1, RGI2, x1, s0, s1, s2, s3);      \
+	vmovq			a ## 2, RGI1;   \
+	vpextrq $1,		a ## 2, RGI2;   \
+	G(RGI3, RGI4, y1, s1, s2, s3, s0);      \
+	vmovq			b ## 2, RGI3;   \
+	vpextrq $1,		b ## 2, RGI4;   \
+	G(RGI1, RGI2, x2, s0, s1, s2, s3);      \
+	G(RGI3, RGI4, y2, s1, s2, s3, s0);
+
+#define decround_tail(a, b, c, d, x, y) \
+	vpslld $1,		c, RT;     \
+	vpsrld $(32 - 1),	c, c;      \
+	vpor			c, RT,  c; \
 	vpaddd			x, y,   x; \
 	vpaddd			y, x,   y; \
 	vpaddd			y, RK2, y; \
@@ -130,32 +167,44 @@
 	vpsrld $1,		d, y;      \
 	vpslld $(32 - 1),	d, d;      \
 	vpor			d, y,   d; \
-	vpslld $1,		c, y;      \
-	vpsrld $(32 - 1),	c, c;      \
-	vpor			c, y,   c; \
 	vpaddd			x, RK1, x; \
 	vpxor			x, c,   c;
 
-#define encrypt_round(n, a, b, c, d) \
-	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;           \
-	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;           \
-	encround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
-	encround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
-
-#define decrypt_round(n, a, b, c, d) \
-	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;           \
-	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;           \
-	decround(a ## 1, b ## 1, c ## 1, d ## 1, RX, RY); \
-	decround(a ## 2, b ## 2, c ## 2, d ## 2, RX, RY);
+#define preload_rgi(c) \
+	vmovq			c, RGI1; \
+	vpextrq $1,		c, RGI2;
+
+#define encrypt_round(n, a, b, c, d, preload) \
+	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
+	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
+	encround_head_2(a, b, c, d, RX0, RY0, RX1, RY1);         \
+	encround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0); \
+	preload(c ## 1);                                         \
+	encround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1);
+
+#define decrypt_round(n, a, b, c, d, preload) \
+	vbroadcastss (k+4*(2*(n)))(CTX),   RK1;                  \
+	vbroadcastss (k+4*(2*(n)+1))(CTX), RK2;                  \
+	decround_head_2(a, b, c, d, RX0, RY0, RX1, RY1);         \
+	decround_tail(a ## 1, b ## 1, c ## 1, d ## 1, RX0, RY0); \
+	preload(c ## 1);                                         \
+	decround_tail(a ## 2, b ## 2, c ## 2, d ## 2, RX1, RY1);
 
 #define encrypt_cycle(n) \
-	encrypt_round((2*n), RA, RB, RC, RD);       \
-	encrypt_round(((2*n) + 1), RC, RD, RA, RB);
+	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi); \
+	encrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi);
+
+#define encrypt_cycle_last(n) \
+	encrypt_round((2*n), RA, RB, RC, RD, preload_rgi); \
+	encrypt_round(((2*n) + 1), RC, RD, RA, RB, dummy);
 
 #define decrypt_cycle(n) \
-	decrypt_round(((2*n) + 1), RC, RD, RA, RB); \
-	decrypt_round((2*n), RA, RB, RC, RD);
+	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi); \
+	decrypt_round((2*n), RA, RB, RC, RD, preload_rgi);
 
+#define decrypt_cycle_last(n) \
+	decrypt_round(((2*n) + 1), RC, RD, RA, RB, preload_rgi); \
+	decrypt_round((2*n), RA, RB, RC, RD, dummy);
 
 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 	vpunpckldq		x1, x0, t0; \
@@ -216,17 +265,19 @@ __twofish_enc_blk_8way:
 	 *	%rcx: bool, if true: xor output
 	 */
 
+	pushq %rbp;
 	pushq %rbx;
 	pushq %rcx;
 
 	vmovdqu w(CTX), RK1;
 
 	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
-	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
+	inpack_blocks(%rdx, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	vmovq		RA1, RGI1;
+	vpextrq $1,	RA1, RGI2;
+	inpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
 
-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;
 
 	encrypt_cycle(0);
 	encrypt_cycle(1);
@@ -235,26 +286,27 @@ __twofish_enc_blk_8way:
 	encrypt_cycle(4);
 	encrypt_cycle(5);
 	encrypt_cycle(6);
-	encrypt_cycle(7);
+	encrypt_cycle_last(7);
 
 	vmovdqu (w+4*4)(CTX), RK1;
 
 	popq %rcx;
 	popq %rbx;
+	popq %rbp;
 
-	leaq (4*4*4)(%rsi), %rax;
+	leaq (4*4*4)(%r11), %rax;
 
 	testb %cl, %cl;
 	jnz __enc_xor8;
 
-	outunpack_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
-	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+	outunpack_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 
 	ret;
 
 __enc_xor8:
-	outunpack_xor_blocks(%rsi, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
-	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+	outunpack_xor_blocks(%r11, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	outunpack_xor_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 
 	ret;
 
@@ -269,16 +321,18 @@ twofish_dec_blk_8way:
 	 *	%rdx: src
 	 */
 
+	pushq %rbp;
 	pushq %rbx;
 
 	vmovdqu (w+4*4)(CTX), RK1;
 
 	leaq (4*4*4)(%rdx), %rax;
-	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX, RY, RK2);
-	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX, RY, RK2);
+	inpack_blocks(%rdx, RC1, RD1, RA1, RB1, RK1, RX0, RY0, RK2);
+	vmovq		RC1, RGI1;
+	vpextrq $1,	RC1, RGI2;
+	inpack_blocks(%rax, RC2, RD2, RA2, RB2, RK1, RX0, RY0, RK2);
 
-	xorq RID1, RID1;
-	xorq RID2, RID2;
+	movq %rsi, %r11;
 
 	decrypt_cycle(7);
 	decrypt_cycle(6);
@@ -287,14 +341,15 @@ twofish_dec_blk_8way:
 	decrypt_cycle(3);
 	decrypt_cycle(2);
 	decrypt_cycle(1);
-	decrypt_cycle(0);
+	decrypt_cycle_last(0);
 
 	vmovdqu (w)(CTX), RK1;
 
 	popq %rbx;
+	popq %rbp;
 
-	leaq (4*4*4)(%rsi), %rax;
-	outunpack_blocks(%rsi, RA1, RB1, RC1, RD1, RK1, RX, RY, RK2);
-	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX, RY, RK2);
+	leaq (4*4*4)(%r11), %rax;
+	outunpack_blocks(%r11, RA1, RB1, RC1, RD1, RK1, RX0, RY0, RK2);
+	outunpack_blocks(%rax, RA2, RB2, RC2, RD2, RK1, RX0, RY0, RK2);
 
 	ret;

next prev parent reply	other threads:[~2012-08-17  7:37 UTC|newest]

Thread overview: 50+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-05-27 14:49 [PATCH] crypto: twofish - add x86_64/avx assembler implementation Johannes Goetzfried
2012-05-27 14:49 ` Johannes Goetzfried
2012-05-28  6:25 ` Jussi Kivilinna
2012-05-28  6:25   ` Jussi Kivilinna
2012-05-28 13:52   ` Johannes Goetzfried
2012-08-15  8:42 ` Jussi Kivilinna
2012-08-15  8:42   ` Jussi Kivilinna
2012-08-15  9:28   ` Borislav Petkov
2012-08-15  9:28     ` Borislav Petkov
2012-08-15 11:00     ` Jussi Kivilinna
2012-08-15 11:00       ` Jussi Kivilinna
2012-08-15 12:52       ` Borislav Petkov
2012-08-15 12:52         ` Borislav Petkov
2012-08-15 13:48         ` Jussi Kivilinna
2012-08-15 13:48           ` Jussi Kivilinna
2012-08-15 14:03           ` Borislav Petkov
2012-08-15 14:03             ` Borislav Petkov
2012-08-15 14:22             ` Jussi Kivilinna
2012-08-15 14:22               ` Jussi Kivilinna
2012-08-15 15:33               ` Borislav Petkov
2012-08-15 15:33                 ` Borislav Petkov
2012-08-15 17:34             ` Jussi Kivilinna
2012-08-15 17:34               ` Jussi Kivilinna
2012-08-16 13:29               ` Borislav Petkov
2012-08-16 13:29                 ` Borislav Petkov
2012-08-16 14:26                 ` Jussi Kivilinna
2012-08-16 14:26                   ` Jussi Kivilinna
2012-08-17  7:37                 ` Jussi Kivilinna [this message]
2012-08-17  7:37                   ` Jussi Kivilinna
2012-08-20 17:32                   ` Borislav Petkov
2012-08-20 17:32                     ` Borislav Petkov
2012-08-22  4:35                     ` Jussi Kivilinna
2012-08-22  4:35                       ` Jussi Kivilinna
2012-08-22 13:31                       ` Borislav Petkov
2012-08-22 13:31                         ` Borislav Petkov
2012-08-22 19:20                         ` Jussi Kivilinna
2012-08-22 19:20                           ` Jussi Kivilinna
2012-08-23  0:05                           ` Jason Garrett-Glaser
2012-08-23  0:05                             ` Jason Garrett-Glaser
2012-08-23  8:33                             ` Jussi Kivilinna
2012-08-23  8:33                               ` Jussi Kivilinna
2012-08-23 14:36                           ` Borislav Petkov
2012-08-23 14:36                             ` Borislav Petkov
2012-08-28  9:17                             ` Jussi Kivilinna
2012-08-28  9:17                               ` Jussi Kivilinna
2012-08-28 16:25                               ` Borislav Petkov
2012-08-28 16:25                                 ` Borislav Petkov
  -- strict thread matches above, loose matches on Subject: below --
2012-05-28 13:54 Johannes Goetzfried
2012-06-12 10:05 ` Herbert Xu
2012-06-12 10:05   ` Herbert Xu

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:35f4557 dfblob:6638a87 dfblob:35f4557 dfblob:6638a87 )
 OR (
bs:"Re: [PATCH] crypto: twofish - add x86_64/avx assembler implementation" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20120817073048.16720.80328.stgit@localhost6.localdomain6 \
    --to=jussi.kivilinna@mbnet.fi \
    --cc=Johannes.Goetzfried@informatik.stud.uni-erlangen.de \
    --cc=bp@alien8.de \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tilo.mueller@informatik.uni-erlangen.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.