public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH -mm crypto] AES: x86_64 asm implementation optimization
@ 2008-04-09  6:41 Huang, Ying
  2008-04-16  7:31 ` Sebastian Siewior
  0 siblings, 1 reply; 21+ messages in thread
From: Huang, Ying @ 2008-04-09  6:41 UTC (permalink / raw)
  To: Herbert Xu, Adam J. Richter, Alexander Kjeldaas,
	Sebastian Siewior, akpm
  Cc: linux-kernel, linux-crypto

[-- Attachment #1: Type: text/plain, Size: 5847 bytes --]

This patch increases the performance of AES x86-64 implementation. The
average increment is more than 6.3% and the max increment is
more than 10.2% on Intel CORE 2 CPU. The performance increment is
gained via the following methods:

- Two additional temporary registers are used to hold the subset of
  the state, so that the dependency between instructions is reduced.

- The expanded key is loaded via two 64-bit loads instead of four 32-bit loads.

This patch is based on 2.6.25-rc8-mm1.

The file attached is the test data via: modprobe tcrypt mode=200

- dmesg_1_core-stockn:	stock kernel data
- dmesg_1_core-op4n:	patched kernel data
- percent.txt:		(time_patched - time_stock) / time_stock * 100

Signed-off-by: Huang Ying <ying.huang@intel.com>

---
 arch/x86/crypto/aes-x86_64-asm_64.S |  101 ++++++++++++++++++++----------------
 include/crypto/aes.h                |    1 
 2 files changed, 58 insertions(+), 44 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -46,70 +46,81 @@
 #define R7	%rbp
 #define R7E	%ebp
 #define R8	%r8
+#define R8E	%r8d
 #define R9	%r9
+#define R9E	%r9d
 #define R10	%r10
 #define R11	%r11
+#define R12	%r12
+#define R12E	%r12d
+#define R16	%rsp
 
 #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	movq	r1,r2;			\
-	movq	r3,r4;			\
-	leaq	BASE+KEY+48+4(r8),r9;	\
-	movq	r10,r11;		\
-	movl	(r7),r5 ## E;		\
-	movl	4(r7),r1 ## E;		\
-	movl	8(r7),r6 ## E;		\
-	movl	12(r7),r7 ## E;		\
-	movl	BASE+0(r8),r10 ## E;	\
-	xorl	-48(r9),r5 ## E;	\
-	xorl	-44(r9),r1 ## E;	\
-	xorl	-40(r9),r6 ## E;	\
-	xorl	-36(r9),r7 ## E;	\
-	cmpl	$24,r10 ## E;		\
+FUNC:	subq	$24,r11;		\
+	movl	(r6),r4 ## E;		\
+	leaq	BASE+KEY+48+8(r7),r8;	\
+	movq	r1,(r11);		\
+	movq	r9,r10;			\
+	movl	4(r6),r1 ## E;		\
+	movq	r2,8(r11);		\
+	movl	8(r6),r5 ## E;		\
+	movq	r3,16(r11);		\
+	movl	12(r6),r6 ## E;		\
+	movl	BASE+0(r7),r9 ## E;	\
+	xorl	-48(r8),r4 ## E;	\
+	xorl	-44(r8),r1 ## E;	\
+	xorl	-40(r8),r5 ## E;	\
+	xorl	-36(r8),r6 ## E;	\
+	cmpl	$24,r9 ## E;		\
 	jb	B128;			\
-	leaq	32(r9),r9;		\
+	leaq	32(r8),r8;		\
 	je	B192;			\
-	leaq	32(r9),r9;
+	leaq	32(r8),r8;
 
 #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
-	movq	r1,r2;			\
-	movq	r3,r4;			\
-	movl	r5 ## E,(r9);		\
-	movl	r6 ## E,4(r9);		\
-	movl	r7 ## E,8(r9);		\
-	movl	r8 ## E,12(r9);		\
+	movq	(r9),r1;		\
+	movl	r4 ## E,(r8);		\
+	movq	8(r9),r2;		\
+	movl	r5 ## E,4(r8);		\
+	movq	16(r9),r3;		\
+	movl	r6 ## E,8(r8);		\
+	addq	$24,r9;			\
+	movl	r7 ## E,12(r8);		\
 	ret;
 
-#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
+#define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,ra,rb,rc,rd) \
 	movzbl	r2 ## H,r5 ## E;	\
 	movzbl	r2 ## L,r6 ## E;	\
+	movl	r4 ## E,r8 ## E;	\
+	shrl	$16,r4 ## E;		\
 	movl	TAB+1024(,r5,4),r5 ## E;\
-	movw	r4 ## X,r2 ## X;	\
 	movl	TAB(,r6,4),r6 ## E;	\
-	roll	$16,r2 ## E;		\
-	shrl	$16,r4 ## E;		\
 	movzbl	r4 ## H,r7 ## E;	\
 	movzbl	r4 ## L,r4 ## E;	\
-	xorl	OFFSET(r8),ra ## E;	\
-	xorl	OFFSET+4(r8),rb ## E;	\
+	movq	OFFSET(r11),r10;	\
+	shrl	$16,r2 ## E;		\
+	movl	r3 ## E,r9 ## E;	\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
 	xorl	TAB+2048(,r4,4),r6 ## E;\
-	movzbl	r1 ## L,r7 ## E;	\
 	movzbl	r1 ## H,r4 ## E;	\
-	movl	TAB+1024(,r4,4),r4 ## E;\
-	movw	r3 ## X,r1 ## X;	\
-	roll	$16,r1 ## E;		\
+	movzbl	r1 ## L,r7 ## E;	\
 	shrl	$16,r3 ## E;		\
+	movl	TAB+1024(,r4,4),r4 ## E;\
 	xorl	TAB(,r7,4),r5 ## E;	\
+	shrl	$16,r1 ## E;		\
 	movzbl	r3 ## H,r7 ## E;	\
 	movzbl	r3 ## L,r3 ## E;	\
 	xorl	TAB+3072(,r7,4),r4 ## E;\
 	xorl	TAB+2048(,r3,4),r5 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
 	movzbl	r1 ## L,r3 ## E;	\
-	shrl	$16,r1 ## E;		\
+	xorl	r10 ## E,ra ## E;	\
+	movl	r9 ## E,r1 ## E;	\
+	movq	OFFSET+8(r11),r9;	\
+	shrq	$32,r10;		\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
@@ -118,38 +129,40 @@ FUNC:	movq	r1,r2;			\
 	xorl	TAB(,r1,4),r3 ## E;	\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
-	shrl	$16,r2 ## E;		\
+	xorl	r9 ## E, rc ## E;	\
+	movl	r8 ## E,r2 ## E;	\
+	shrq	$32,r9;			\
+	xorl	r10 ## E,rb ## E;	\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\
+	xorl	r9 ## E, rd ## E;	\
 	movzbl	r2 ## L,r2 ## E;	\
-	xorl	OFFSET+8(r8),rc ## E;	\
-	xorl	OFFSET+12(r8),rd ## E;	\
-	xorl	TAB+1024(,r1,4),r3 ## E;\
-	xorl	TAB(,r2,4),r4 ## E;
+	xorl	TAB(,r2,4),r4 ## E;	\
+	xorl	TAB+1024(,r1,4),r3 ## E;
 
 #define move_regs(r1,r2,r3,r4) \
 	movl	r3 ## E,r1 ## E;	\
 	movl	r4 ## E,r2 ## E;
 
 #define entry(FUNC,KEY,B128,B192) \
-	prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11)
+	prologue(FUNC,KEY,B128,B192,R2,R7,R12,R1,R3,R4,R6,R10,R5,R11,R16)
 
-#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11)
+#define return epilogue(R2,R7,R12,R5,R6,R3,R4,R11,R16)
 
 #define encrypt_round(TAB,OFFSET) \
-	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \
+	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R8,R9,R12,R10,R5,R6,R3,R4) \
 	move_regs(R1,R2,R5,R6)
 
 #define encrypt_final(TAB,OFFSET) \
-	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4)
+	round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R8,R9,R12,R10,R5,R6,R3,R4)
 
 #define decrypt_round(TAB,OFFSET) \
-	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4) \
+	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R8,R9,R12,R10,R5,R6,R3,R4) \
 	move_regs(R1,R2,R5,R6)
 
 #define decrypt_final(TAB,OFFSET) \
-	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R10,R5,R6,R3,R4)
+	round(TAB,OFFSET,R2,R1,R4,R3,R6,R5,R7,R8,R9,R12,R10,R5,R6,R3,R4)
 
 /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */
 
--- a/include/crypto/aes.h
+++ b/include/crypto/aes.h
@@ -19,6 +19,7 @@
 
 struct crypto_aes_ctx {
 	u32 key_length;
+	u32 _pad1;
 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
 };


[-- Attachment #2: dmesg_1_core-stockn --]
[-- Type: text/plain, Size: 9946 bytes --]

e1000: eth2: e1000_watchdog: 10/100 speed: disabling TSO

testing speed of ecb(aes) encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 768 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1202 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 3968 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 15065 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 119202 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 552 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1362 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 4655 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 17731 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 141618 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 593 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1522 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 5251 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 20262 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 160605 cycles (8192 bytes)

testing speed of ecb(aes) decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 573 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1226 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 3984 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 14999 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 118126 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 580 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1405 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 4636 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 17604 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 140289 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 619 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1551 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 5297 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 20286 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 160281 cycles (8192 bytes)

testing speed of cbc(aes) encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 649 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1378 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 4333 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 16113 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 126978 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 687 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1550 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 5002 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 18849 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 150723 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 722 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1713 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 5670 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 21587 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 170571 cycles (8192 bytes)

testing speed of cbc(aes) decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 770 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1501 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 4484 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 16368 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 128557 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 811 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1678 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 5160 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 19217 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 151977 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 848 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1843 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 5840 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 21781 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 170436 cycles (8192 bytes)

testing speed of lrw(aes) encryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 745 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1525 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4620 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 16954 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 132816 cycles (8192 bytes)
test 5 (320 bit key, 16 byte blocks): 1 operation in 790 cycles (16 bytes)
test 6 (320 bit key, 64 byte blocks): 1 operation in 1696 cycles (64 bytes)
test 7 (320 bit key, 256 byte blocks): 1 operation in 5301 cycles (256 bytes)
test 8 (320 bit key, 1024 byte blocks): 1 operation in 19672 cycles (1024 bytes)
test 9 (320 bit key, 8192 byte blocks): 1 operation in 156073 cycles (8192 bytes)
test 10 (384 bit key, 16 byte blocks): 1 operation in 833 cycles (16 bytes)
test 11 (384 bit key, 64 byte blocks): 1 operation in 1870 cycles (64 bytes)
test 12 (384 bit key, 256 byte blocks): 1 operation in 5971 cycles (256 bytes)
test 13 (384 bit key, 1024 byte blocks): 1 operation in 22368 cycles (1024 bytes)
test 14 (384 bit key, 8192 byte blocks): 1 operation in 176158 cycles (8192 bytes)

testing speed of lrw(aes) decryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 742 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1528 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4617 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 16949 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 132822 cycles (8192 bytes)
test 5 (320 bit key, 16 byte blocks): 1 operation in 778 cycles (16 bytes)
test 6 (320 bit key, 64 byte blocks): 1 operation in 1701 cycles (64 bytes)
test 7 (320 bit key, 256 byte blocks): 1 operation in 5291 cycles (256 bytes)
test 8 (320 bit key, 1024 byte blocks): 1 operation in 19660 cycles (1024 bytes)
test 9 (320 bit key, 8192 byte blocks): 1 operation in 155871 cycles (8192 bytes)
test 10 (384 bit key, 16 byte blocks): 1 operation in 824 cycles (16 bytes)
test 11 (384 bit key, 64 byte blocks): 1 operation in 1864 cycles (64 bytes)
test 12 (384 bit key, 256 byte blocks): 1 operation in 5978 cycles (256 bytes)
test 13 (384 bit key, 1024 byte blocks): 1 operation in 22370 cycles (1024 bytes)
test 14 (384 bit key, 8192 byte blocks): 1 operation in 176247 cycles (8192 bytes)

testing speed of xts(aes) encryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 770 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1498 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4486 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 16456 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 128552 cycles (8192 bytes)
test 5 (384 bit key, 16 byte blocks): 1 operation in 840 cycles (16 bytes)
test 6 (384 bit key, 64 byte blocks): 1 operation in 1721 cycles (64 bytes)
test 7 (384 bit key, 256 byte blocks): 1 operation in 5195 cycles (256 bytes)
test 8 (384 bit key, 1024 byte blocks): 1 operation in 19166 cycles (1024 bytes)
test 9 (384 bit key, 8192 byte blocks): 1 operation in 150278 cycles (8192 bytes)
test 10 (512 bit key, 16 byte blocks): 1 operation in 921 cycles (16 bytes)
test 11 (512 bit key, 64 byte blocks): 1 operation in 1917 cycles (64 bytes)
test 12 (512 bit key, 256 byte blocks): 1 operation in 5916 cycles (256 bytes)
test 13 (512 bit key, 1024 byte blocks): 1 operation in 21977 cycles (1024 bytes)
test 14 (512 bit key, 8192 byte blocks): 1 operation in 172153 cycles (8192 bytes)

testing speed of xts(aes) decryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 780 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1507 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4486 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 16455 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 128540 cycles (8192 bytes)
test 5 (384 bit key, 16 byte blocks): 1 operation in 853 cycles (16 bytes)
test 6 (384 bit key, 64 byte blocks): 1 operation in 1718 cycles (64 bytes)
test 7 (384 bit key, 256 byte blocks): 1 operation in 5223 cycles (256 bytes)
test 8 (384 bit key, 1024 byte blocks): 1 operation in 19183 cycles (1024 bytes)
test 9 (384 bit key, 8192 byte blocks): 1 operation in 150166 cycles (8192 bytes)
test 10 (512 bit key, 16 byte blocks): 1 operation in 928 cycles (16 bytes)
test 11 (512 bit key, 64 byte blocks): 1 operation in 1925 cycles (64 bytes)
test 12 (512 bit key, 256 byte blocks): 1 operation in 5942 cycles (256 bytes)
test 13 (512 bit key, 1024 byte blocks): 1 operation in 21950 cycles (1024 bytes)
test 14 (512 bit key, 8192 byte blocks): 1 operation in 172112 cycles (8192 bytes)

[-- Attachment #3: dmesg_1_core-op4n --]
[-- Type: text/plain, Size: 9946 bytes --]

e1000: eth2: e1000_watchdog: 10/100 speed: disabling TSO

testing speed of ecb(aes) encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 511 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1153 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 3717 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 14003 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 110386 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 529 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1300 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 4344 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 16576 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 132421 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 568 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1455 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 4969 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 18983 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 151159 cycles (8192 bytes)

testing speed of ecb(aes) decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 588 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1140 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 3650 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 13721 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 108180 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 554 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1301 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 4267 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 16175 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 129410 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 592 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1445 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 4847 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 18501 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 146061 cycles (8192 bytes)

testing speed of cbc(aes) encryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 637 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1326 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 4086 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 15168 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 119998 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 663 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1478 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 4730 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 17692 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 141461 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 702 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1628 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 5321 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 20120 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 159425 cycles (8192 bytes)

testing speed of cbc(aes) decryption
test 0 (128 bit key, 16 byte blocks): 1 operation in 741 cycles (16 bytes)
test 1 (128 bit key, 64 byte blocks): 1 operation in 1422 cycles (64 bytes)
test 2 (128 bit key, 256 byte blocks): 1 operation in 4136 cycles (256 bytes)
test 3 (128 bit key, 1024 byte blocks): 1 operation in 14971 cycles (1024 bytes)
test 4 (128 bit key, 8192 byte blocks): 1 operation in 117321 cycles (8192 bytes)
test 5 (192 bit key, 16 byte blocks): 1 operation in 756 cycles (16 bytes)
test 6 (192 bit key, 64 byte blocks): 1 operation in 1551 cycles (64 bytes)
test 7 (192 bit key, 256 byte blocks): 1 operation in 4728 cycles (256 bytes)
test 8 (192 bit key, 1024 byte blocks): 1 operation in 17419 cycles (1024 bytes)
test 9 (192 bit key, 8192 byte blocks): 1 operation in 138293 cycles (8192 bytes)
test 10 (256 bit key, 16 byte blocks): 1 operation in 810 cycles (16 bytes)
test 11 (256 bit key, 64 byte blocks): 1 operation in 1690 cycles (64 bytes)
test 12 (256 bit key, 256 byte blocks): 1 operation in 5369 cycles (256 bytes)
test 13 (256 bit key, 1024 byte blocks): 1 operation in 19844 cycles (1024 bytes)
test 14 (256 bit key, 8192 byte blocks): 1 operation in 156878 cycles (8192 bytes)

testing speed of lrw(aes) encryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 732 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1459 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4350 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 15880 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 124042 cycles (8192 bytes)
test 5 (320 bit key, 16 byte blocks): 1 operation in 768 cycles (16 bytes)
test 6 (320 bit key, 64 byte blocks): 1 operation in 1639 cycles (64 bytes)
test 7 (320 bit key, 256 byte blocks): 1 operation in 4945 cycles (256 bytes)
test 8 (320 bit key, 1024 byte blocks): 1 operation in 18299 cycles (1024 bytes)
test 9 (320 bit key, 8192 byte blocks): 1 operation in 145070 cycles (8192 bytes)
test 10 (384 bit key, 16 byte blocks): 1 operation in 812 cycles (16 bytes)
test 11 (384 bit key, 64 byte blocks): 1 operation in 1779 cycles (64 bytes)
test 12 (384 bit key, 256 byte blocks): 1 operation in 5580 cycles (256 bytes)
test 13 (384 bit key, 1024 byte blocks): 1 operation in 20790 cycles (1024 bytes)
test 14 (384 bit key, 8192 byte blocks): 1 operation in 163517 cycles (8192 bytes)

testing speed of lrw(aes) decryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 727 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1433 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4231 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 15406 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 120449 cycles (8192 bytes)
test 5 (320 bit key, 16 byte blocks): 1 operation in 762 cycles (16 bytes)
test 6 (320 bit key, 64 byte blocks): 1 operation in 1601 cycles (64 bytes)
test 7 (320 bit key, 256 byte blocks): 1 operation in 4823 cycles (256 bytes)
test 8 (320 bit key, 1024 byte blocks): 1 operation in 17750 cycles (1024 bytes)
test 9 (320 bit key, 8192 byte blocks): 1 operation in 140575 cycles (8192 bytes)
test 10 (384 bit key, 16 byte blocks): 1 operation in 794 cycles (16 bytes)
test 11 (384 bit key, 64 byte blocks): 1 operation in 1725 cycles (64 bytes)
test 12 (384 bit key, 256 byte blocks): 1 operation in 5419 cycles (256 bytes)
test 13 (384 bit key, 1024 byte blocks): 1 operation in 20121 cycles (1024 bytes)
test 14 (384 bit key, 8192 byte blocks): 1 operation in 158320 cycles (8192 bytes)

testing speed of xts(aes) encryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 731 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1432 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4254 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 15536 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 121465 cycles (8192 bytes)
test 5 (384 bit key, 16 byte blocks): 1 operation in 797 cycles (16 bytes)
test 6 (384 bit key, 64 byte blocks): 1 operation in 1626 cycles (64 bytes)
test 7 (384 bit key, 256 byte blocks): 1 operation in 4890 cycles (256 bytes)
test 8 (384 bit key, 1024 byte blocks): 1 operation in 18007 cycles (1024 bytes)
test 9 (384 bit key, 8192 byte blocks): 1 operation in 140970 cycles (8192 bytes)
test 10 (512 bit key, 16 byte blocks): 1 operation in 867 cycles (16 bytes)
test 11 (512 bit key, 64 byte blocks): 1 operation in 1823 cycles (64 bytes)
test 12 (512 bit key, 256 byte blocks): 1 operation in 5551 cycles (256 bytes)
test 13 (512 bit key, 1024 byte blocks): 1 operation in 20474 cycles (1024 bytes)
test 14 (512 bit key, 8192 byte blocks): 1 operation in 160336 cycles (8192 bytes)

testing speed of xts(aes) decryption
test 0 (256 bit key, 16 byte blocks): 1 operation in 736 cycles (16 bytes)
test 1 (256 bit key, 64 byte blocks): 1 operation in 1412 cycles (64 bytes)
test 2 (256 bit key, 256 byte blocks): 1 operation in 4162 cycles (256 bytes)
test 3 (256 bit key, 1024 byte blocks): 1 operation in 15168 cycles (1024 bytes)
test 4 (256 bit key, 8192 byte blocks): 1 operation in 118542 cycles (8192 bytes)
test 5 (384 bit key, 16 byte blocks): 1 operation in 803 cycles (16 bytes)
test 6 (384 bit key, 64 byte blocks): 1 operation in 1602 cycles (64 bytes)
test 7 (384 bit key, 256 byte blocks): 1 operation in 4773 cycles (256 bytes)
test 8 (384 bit key, 1024 byte blocks): 1 operation in 17577 cycles (1024 bytes)
test 9 (384 bit key, 8192 byte blocks): 1 operation in 137579 cycles (8192 bytes)
test 10 (512 bit key, 16 byte blocks): 1 operation in 867 cycles (16 bytes)
test 11 (512 bit key, 64 byte blocks): 1 operation in 1773 cycles (64 bytes)
test 12 (512 bit key, 256 byte blocks): 1 operation in 5405 cycles (256 bytes)
test 13 (512 bit key, 1024 byte blocks): 1 operation in 19925 cycles (1024 bytes)
test 14 (512 bit key, 8192 byte blocks): 1 operation in 155815 cycles (8192 bytes)

[-- Attachment #4: percent.txt --]
[-- Type: text/plain, Size: 2570 bytes --]

ecb1_128_16	-33.46	
ecb1_128_64	-4.08	
ecb1_128_256	-6.33	
ecb1_128_1024	-7.05	
ecb1_128_8192	-7.40	
ecb1_192_16	-4.17	
ecb1_192_64	-4.55	
ecb1_192_256	-6.68	
ecb1_192_1024	-6.51	
ecb1_192_8192	-6.49	
ecb1_256_16	-4.22	
ecb1_256_64	-4.40	
ecb1_256_256	-5.37	
ecb1_256_1024	-6.31	
ecb1_256_8192	-5.88	
ecb0_128_16	2.62	
ecb0_128_64	-7.01	
ecb0_128_256	-8.38	
ecb0_128_1024	-8.52	
ecb0_128_8192	-8.42	
ecb0_192_16	-4.48	
ecb0_192_64	-7.40	
ecb0_192_256	-7.96	
ecb0_192_1024	-8.12	
ecb0_192_8192	-7.75	
ecb0_256_16	-4.36	
ecb0_256_64	-6.83	
ecb0_256_256	-8.50	
ecb0_256_1024	-8.80	
ecb0_256_8192	-8.87	
cbc1_128_16	-1.85	
cbc1_128_64	-3.77	
cbc1_128_256	-5.70	
cbc1_128_1024	-5.86	
cbc1_128_8192	-5.50	
cbc1_192_16	-3.49	
cbc1_192_64	-4.65	
cbc1_192_256	-5.44	
cbc1_192_1024	-6.14	
cbc1_192_8192	-6.15	
cbc1_256_16	-2.77	
cbc1_256_64	-4.96	
cbc1_256_256	-6.16	
cbc1_256_1024	-6.80	
cbc1_256_8192	-6.53	
cbc0_128_16	-3.77	
cbc0_128_64	-5.26	
cbc0_128_256	-7.76	
cbc0_128_1024	-8.53	
cbc0_128_8192	-8.74	
cbc0_192_16	-6.78	
cbc0_192_64	-7.57	
cbc0_192_256	-8.37	
cbc0_192_1024	-9.36	
cbc0_192_8192	-9.00	
cbc0_256_16	-4.48	
cbc0_256_64	-8.30	
cbc0_256_256	-8.07	
cbc0_256_1024	-8.89	
cbc0_256_8192	-7.95	
lrw1_256_16	-1.74	
lrw1_256_64	-4.33	
lrw1_256_256	-5.84	
lrw1_256_1024	-6.33	
lrw1_256_8192	-6.61	
lrw1_320_16	-2.78	
lrw1_320_64	-3.36	
lrw1_320_256	-6.72	
lrw1_320_1024	-6.98	
lrw1_320_8192	-7.05	
lrw1_384_16	-2.52	
lrw1_384_64	-4.87	
lrw1_384_256	-6.55	
lrw1_384_1024	-7.05	
lrw1_384_8192	-7.18	
lrw0_256_16	-2.02	
lrw0_256_64	-6.22	
lrw0_256_256	-8.36	
lrw0_256_1024	-9.10	
lrw0_256_8192	-9.32	
lrw0_320_16	-2.06	
lrw0_320_64	-5.88	
lrw0_320_256	-8.85	
lrw0_320_1024	-9.72	
lrw0_320_8192	-9.81	
lrw0_384_16	-3.64	
lrw0_384_64	-7.46	
lrw0_384_256	-9.35	
lrw0_384_1024	-10.05	
lrw0_384_8192	-10.17	
xts1_256_16	-5.06	
xts1_256_64	-4.41	
xts1_256_256	-5.17	
xts1_256_1024	-5.59	
xts1_256_8192	-5.51	
xts1_384_16	-5.12	
xts1_384_64	-5.52	
xts1_384_256	-5.87	
xts1_384_1024	-6.05	
xts1_384_8192	-6.19	
xts1_512_16	-5.86	
xts1_512_64	-4.90	
xts1_512_256	-6.17	
xts1_512_1024	-6.84	
xts1_512_8192	-6.86	
xts0_256_16	-5.64	
xts0_256_64	-6.30	
xts0_256_256	-7.22	
xts0_256_1024	-7.82	
xts0_256_8192	-7.78	
xts0_384_16	-5.86	
xts0_384_64	-6.75	
xts0_384_256	-8.62	
xts0_384_1024	-8.37	
xts0_384_8192	-8.38	
xts0_512_16	-6.57	
xts0_512_64	-7.90	
xts0_512_256	-9.04	
xts0_512_1024	-9.23	
xts0_512_8192	-9.47	
average: -6.64
min:     -33.46
max:     2.62

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-09  6:41 [PATCH -mm crypto] AES: x86_64 asm implementation optimization Huang, Ying
@ 2008-04-16  7:31 ` Sebastian Siewior
  2008-04-16  8:19   ` Huang, Ying
  0 siblings, 1 reply; 21+ messages in thread
From: Sebastian Siewior @ 2008-04-16  7:31 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Herbert Xu, Adam J. Richter, Alexander Kjeldaas, akpm,
	linux-kernel, linux-crypto, mingo, tglx

* Huang, Ying | 2008-04-09 14:41:02 [+0800]:

>This patch increases the performance of AES x86-64 implementation. The
>average increment is more than 6.3% and the max increment is
>more than 10.2% on Intel CORE 2 CPU. The performance increment is
>gained via the following methods:
>
>- Two additional temporary registers are used to hold the subset of
>  the state, so that the dependency between instructions is reduced.
>
>- The expanded key is loaded via 2 64bit load instead of 4 32-bit load.
>

>From your description I would assume that the performance can only
increase. However, on my
|model name      : AMD Athlon(tm) 64 Processor 3200+
the opposite is the case [1], [2]. I dunno why, and I didn't mix up
patched & unpatched :). I checked this patch on
|model name      : Intel(R) Core(TM)2 CPU         T7200  @ 2.00GHz
and the performance really increases [3], [4].

[1] http://download.breakpoint.cc/aes_patch/patched.txt
[2] http://download.breakpoint.cc/aes_patch/unpatched.txt
[3] http://download.breakpoint.cc/aes_patch/perf_patched.txt
[4] http://download.breakpoint.cc/aes_patch/perf_originall.txt

>---
> arch/x86/crypto/aes-x86_64-asm_64.S |  101 ++++++++++++++++++++----------------
> include/crypto/aes.h                |    1 
> 2 files changed, 58 insertions(+), 44 deletions(-)
>
>--- a/include/crypto/aes.h
>+++ b/include/crypto/aes.h
>@@ -19,6 +19,7 @@
> 
> struct crypto_aes_ctx {
> 	u32 key_length;
>+	u32 _pad1;

Why is this pad required? Do you want special alignment of the keys?

> 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
> 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
> };
>

Sebastian

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-16  7:31 ` Sebastian Siewior
@ 2008-04-16  8:19   ` Huang, Ying
  2008-04-16  8:23     ` Andi Kleen
  2008-04-16 18:40     ` Sebastian Siewior
  0 siblings, 2 replies; 21+ messages in thread
From: Huang, Ying @ 2008-04-16  8:19 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Herbert Xu, Adam J. Richter, Alexander Kjeldaas, akpm,
	linux-kernel, linux-crypto, mingo, tglx


On Wed, 2008-04-16 at 09:31 +0200, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-09 14:41:02 [+0800]:
> 
> >This patch increases the performance of AES x86-64 implementation. The
> >average increment is more than 6.3% and the max increment is
> >more than 10.2% on Intel CORE 2 CPU. The performance increment is
> >gained via the following methods:
> >
> >- Two additional temporary registers are used to hold the subset of
> >  the state, so that the dependency between instructions is reduced.
> >
> >- The expanded key is loaded via 2 64bit load instead of 4 32-bit load.
> >
> 
> From your description I would assume that the performance can only
> increase. However, on my
> |model name      : AMD Athlon(tm) 64 Processor 3200+
> the opposite is the case [1], [2]. I dunno why and I didn't mix up
> patched & unpatched :). I checked this patch on

En. I have no AMD machine. So I have not tested the patch on it. Maybe
there are some pipeline or load/store unit difference between Intel and
AMD CPUs. Tomorrow I can split the patch into a set of small patches,
with one patch for one small step. Can you help me to test these patches
to find out the reason for degradation on AMD CPU.

> |model name      : Intel(R) Core(TM)2 CPU         T7200  @ 2.00GHz
> and the performance really increases [3], [4].
> 
> [1] http://download.breakpoint.cc/aes_patch/patched.txt
> [2] http://download.breakpoint.cc/aes_patch/unpatched.txt
> [3] http://download.breakpoint.cc/aes_patch/perf_patched.txt
> [4] http://download.breakpoint.cc/aes_patch/perf_originall.txt
> 
> >---
> > arch/x86/crypto/aes-x86_64-asm_64.S |  101 ++++++++++++++++++++----------------
> > include/crypto/aes.h                |    1 
> > 2 files changed, 58 insertions(+), 44 deletions(-)
> >
> >--- a/include/crypto/aes.h
> >+++ b/include/crypto/aes.h
> >@@ -19,6 +19,7 @@
> > 
> > struct crypto_aes_ctx {
> > 	u32 key_length;
> >+	u32 _pad1;
> 
> Why is this pad required? Do you want special alignment of the keys?

Because the key is loaded in 64bit in this patch, I want to align the
key with 64bit address.

> > 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
> > 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
> > };
> >

Best Regards,
Huang Ying


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-16  8:19   ` Huang, Ying
@ 2008-04-16  8:23     ` Andi Kleen
  2008-04-16  9:50       ` Herbert Xu
  2008-04-16 18:40     ` Sebastian Siewior
  1 sibling, 1 reply; 21+ messages in thread
From: Andi Kleen @ 2008-04-16  8:23 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Sebastian Siewior, Herbert Xu, Adam J. Richter,
	Alexander Kjeldaas, akpm, linux-kernel, linux-crypto, mingo, tglx

"Huang, Ying" <ying.huang@intel.com> writes:
>
> En. I have no AMD machine. So I have not tested the patch on it. Maybe
> there are some pipeline or load/store unit difference between Intel and
> AMD CPUs. Tomorrow I can split the patch into a set of small patches,
> with one patch for one small step. Can you help me to test these patches
> to find out the reason for degradation on AMD CPU.

It would be also quite possible to use two different implementations,
one for AMD another for Intel.  crypto frame work should have no
problems dealing with that.

-Andi

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-16  8:23     ` Andi Kleen
@ 2008-04-16  9:50       ` Herbert Xu
  0 siblings, 0 replies; 21+ messages in thread
From: Herbert Xu @ 2008-04-16  9:50 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Huang, Ying, Sebastian Siewior, Adam J. Richter,
	Alexander Kjeldaas, akpm, linux-kernel, linux-crypto, mingo, tglx

On Wed, Apr 16, 2008 at 10:23:04AM +0200, Andi Kleen wrote:
> 
> It would be also quite possible to use two different implementations,
> one for AMD another for Intel.  crypto frame work should have no
> problems dealing with that.

Yes that would definitely be an option.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-16  8:19   ` Huang, Ying
  2008-04-16  8:23     ` Andi Kleen
@ 2008-04-16 18:40     ` Sebastian Siewior
  2008-04-17  1:52       ` Huang, Ying
  2008-04-17  3:36       ` Huang, Ying
  1 sibling, 2 replies; 21+ messages in thread
From: Sebastian Siewior @ 2008-04-16 18:40 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

cut Alexander Kjeldaas <astor@fast.no> from CC coz his mails bounce.

* Huang, Ying | 2008-04-16 16:19:09 [+0800]:

>Can you help me to test these patches
>to find out the reason for degradation on AMD CPU.
Sure. 

>> >--- a/include/crypto/aes.h
>> >+++ b/include/crypto/aes.h
>> >@@ -19,6 +19,7 @@
>> > 
>> > struct crypto_aes_ctx {
>> > 	u32 key_length;
>> >+	u32 _pad1;
>> 
>> Why is this pad required? Do you want special alignment of the keys?
>
>Because the key is loaded in 64bit in this patch, I want to align the
>key with 64bit address.

Then this won't work all the time. To make it bulletproof
- set .cra_alignmask in the glue code properly
- use the attribute aligned thing
- retrieve your private struct via crypto_tfm_ctx_aligned()

You might want to take a look on padlock-aes.c. The same thing is done
there but instead of crypto_tfm_ctx_aligned() a private function is
used (to let the compiler optimize most of the code away). Depending on
Herbert's mood you might get away with this as well (what would be
probably the case since you might prefer to do it asm) :)

>> > 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
>> > 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
>> > };
>> >
>
>Best Regards,
>Huang Ying
>

Sebastian

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-16 18:40     ` Sebastian Siewior
@ 2008-04-17  1:52       ` Huang, Ying
  2008-04-17  3:34         ` Herbert Xu
  2008-04-17  3:36       ` Huang, Ying
  1 sibling, 1 reply; 21+ messages in thread
From: Huang, Ying @ 2008-04-17  1:52 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

On Wed, 2008-04-16 at 20:40 +0200, Sebastian Siewior wrote:
[...]
> >> >--- a/include/crypto/aes.h
> >> >+++ b/include/crypto/aes.h
> >> >@@ -19,6 +19,7 @@
> >> > 
> >> > struct crypto_aes_ctx {
> >> > 	u32 key_length;
> >> >+	u32 _pad1;
> >> 
> >> Why is this pad required? Do you want special alignment of the keys?
> >
> >Because the key is loaded in 64bit in this patch, I want to align the
> >key with 64bit address.
> 
> Then this won't work all the time. To make it bulletproof
> - set .cra_alignmask in the glue code properly
> - use the attribute aligned thing
> - retrieve your private struct via crypto_tfm_ctx_aligned()

As far as I know, the CRYPTO_MINALIGN is defined in
include/linux/crypto.h as __alignof__(unsigned long long), and the
__crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
a pad is sufficient for x86_64 implementation.

Best Regards,
Huang Ying


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-17  1:52       ` Huang, Ying
@ 2008-04-17  3:34         ` Herbert Xu
  2008-04-17  4:53           ` Huang, Ying
  2008-04-23 22:28           ` Sebastian Siewior
  0 siblings, 2 replies; 21+ messages in thread
From: Herbert Xu @ 2008-04-17  3:34 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Sebastian Siewior, Adam J. Richter, akpm, linux-kernel,
	linux-crypto, mingo, tglx

On Thu, Apr 17, 2008 at 09:52:03AM +0800, Huang, Ying wrote:
>
> As far as I know, the CRYPTO_MINALIGN is defined in
> include/linux/crypto.h as __alignof__(unsigned long long), and the
> __crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
> a pad is sufficient for x86_64 implementation.

It should be sufficient but it would be better to use an align
attribute to better document the intention.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-16 18:40     ` Sebastian Siewior
  2008-04-17  1:52       ` Huang, Ying
@ 2008-04-17  3:36       ` Huang, Ying
  2008-04-23 22:32         ` Sebastian Siewior
  1 sibling, 1 reply; 21+ messages in thread
From: Huang, Ying @ 2008-04-17  3:36 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 1513 bytes --]

Hi, Sebastian,

The files attached is the separated patches, from step1 to step 7. Thank
you very much for your help.

Best Regards,
Huang Ying

On Wed, 2008-04-16 at 20:40 +0200, Sebastian Siewior wrote:
> cut Alexander Kjeldaas <astor@fast.no> from CC coz his mails bounce.
> 
> * Huang, Ying | 2008-04-16 16:19:09 [+0800]:
> 
> >Can you help me to test these patches
> >to find out the reason for degradation on AMD CPU.
> Sure. 
> 
> >> >--- a/include/crypto/aes.h
> >> >+++ b/include/crypto/aes.h
> >> >@@ -19,6 +19,7 @@
> >> > 
> >> > struct crypto_aes_ctx {
> >> > 	u32 key_length;
> >> >+	u32 _pad1;
> >> 
> >> Why is this pad required? Do you want special alignment of the keys?
> >
> >Because the key is loaded in 64bit in this patch, I want to align the
> >key with 64bit address.
> 
> > Then this won't work all the time. To make it bulletproof
> - set .cra_alignmask in the glue code properly
> - use the attribute aligned thing
> - retrieve your private struct via crypto_tfm_ctx_aligned()
> 
> You might want to take a look on padlock-aes.c. The same thing is done
> there but instead of crypto_tfm_ctx_aligned() a private function is
> used (to let the compiler optimize most of the code away). Depending on
> Herbert's mood you might get away with this as well (what would be
> probably the case since you might prefer to do it asm) :)
> 
> >> > 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
> >> > 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
> >> > };
> >> >
> >
> >Best Regards,
> >Huang Ying
> >
> 
> Sebastian

[-- Attachment #2: step1.patch --]
[-- Type: text/x-vhdl, Size: 907 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |   11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -49,13 +49,17 @@
 #define R9	%r9
 #define R10	%r10
 #define R11	%r11
+#define R12	%r12
+#define R15	%r15
+#define R16	%rsp
 
 #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	movq	r1,r2;			\
-	movq	r3,r4;			\
+FUNC:	subq	$8, R16;		\
+	movq	r3, r4;			\
+	movq	r1, (R16);		\
 	leaq	BASE+KEY+48+4(r8),r9;	\
 	movq	r10,r11;		\
 	movl	(r7),r5 ## E;		\
@@ -74,7 +78,8 @@ FUNC:	movq	r1,r2;			\
 	leaq	32(r9),r9;
 
 #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
-	movq	r1,r2;			\
+	movq	(R16),r2;		\
+	addq	$8,R16;			\
 	movq	r3,r4;			\
 	movl	r5 ## E,(r9);		\
 	movl	r6 ## E,4(r9);		\

[-- Attachment #3: step2.patch --]
[-- Type: text/x-vhdl, Size: 997 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -90,13 +90,13 @@ FUNC:	subq	$8, R16;		\
 #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
 	movzbl	r2 ## H,r5 ## E;	\
 	movzbl	r2 ## L,r6 ## E;	\
+	movq	r4,R8;			\
+	shrl	$16,r4 ## E;		\
 	movl	TAB+1024(,r5,4),r5 ## E;\
-	movw	r4 ## X,r2 ## X;	\
 	movl	TAB(,r6,4),r6 ## E;	\
-	roll	$16,r2 ## E;		\
-	shrl	$16,r4 ## E;		\
 	movzbl	r4 ## H,r7 ## E;	\
 	movzbl	r4 ## L,r4 ## E;	\
+	shrl	$16,r2 ## E;		\
 	xorl	OFFSET(r8),ra ## E;	\
 	xorl	OFFSET+4(r8),rb ## E;	\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
@@ -123,7 +123,7 @@ FUNC:	subq	$8, R16;		\
 	xorl	TAB(,r1,4),r3 ## E;	\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
-	shrl	$16,r2 ## E;		\
+	movq	R8,r2;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\

[-- Attachment #4: step3.patch --]
[-- Type: text/x-vhdl, Size: 951 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -57,13 +57,13 @@
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	subq	$8, R16;		\
-	movq	r3, r4;			\
-	movq	r1, (R16);		\
+FUNC:	subq	$16, R16;		\
+	movl	(r7),r5 ## E;		\
 	leaq	BASE+KEY+48+4(r8),r9;	\
 	movq	r10,r11;		\
-	movl	(r7),r5 ## E;		\
+	movq	r1, (R16);		\
 	movl	4(r7),r1 ## E;		\
+	movq	r3, 8(R16);		\
 	movl	8(r7),r6 ## E;		\
 	movl	12(r7),r7 ## E;		\
 	movl	BASE+0(r8),r10 ## E;	\
@@ -79,11 +79,11 @@ FUNC:	subq	$8, R16;		\
 
 #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
 	movq	(R16),r2;		\
-	addq	$8,R16;			\
-	movq	r3,r4;			\
 	movl	r5 ## E,(r9);		\
+	movq	8(R16),r4;		\
 	movl	r6 ## E,4(r9);		\
 	movl	r7 ## E,8(r9);		\
+	addq	$16,R16;		\
 	movl	r8 ## E,12(r9);		\
 	ret;
 

[-- Attachment #5: step4.patch --]
[-- Type: text/x-vhdl, Size: 1036 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -99,14 +99,14 @@ FUNC:	subq	$16, R16;		\
 	shrl	$16,r2 ## E;		\
 	xorl	OFFSET(r8),ra ## E;	\
 	xorl	OFFSET+4(r8),rb ## E;	\
+	movq	r3,R9;			\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
 	xorl	TAB+2048(,r4,4),r6 ## E;\
-	movzbl	r1 ## L,r7 ## E;	\
 	movzbl	r1 ## H,r4 ## E;	\
-	movl	TAB+1024(,r4,4),r4 ## E;\
-	movw	r3 ## X,r1 ## X;	\
-	roll	$16,r1 ## E;		\
+	movzbl	r1 ## L,r7 ## E;	\
 	shrl	$16,r3 ## E;		\
+	shrl	$16,r1 ## E;		\
+	movl	TAB+1024(,r4,4),r4 ## E;\
 	xorl	TAB(,r7,4),r5 ## E;	\
 	movzbl	r3 ## H,r7 ## E;	\
 	movzbl	r3 ## L,r3 ## E;	\
@@ -114,7 +114,7 @@ FUNC:	subq	$16, R16;		\
 	xorl	TAB+2048(,r3,4),r5 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
 	movzbl	r1 ## L,r3 ## E;	\
-	shrl	$16,r1 ## E;		\
+	movq	R9,r1;			\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\

[-- Attachment #6: step5.patch --]
[-- Type: text/x-vhdl, Size: 1164 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |    8 +++++---
 include/crypto/aes.h                |    1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -57,14 +57,15 @@
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	subq	$16, R16;		\
+FUNC:	subq	$24, R16;		\
 	movl	(r7),r5 ## E;		\
-	leaq	BASE+KEY+48+4(r8),r9;	\
+	leaq	BASE+KEY+48+8(r8),r9;	\
 	movq	r10,r11;		\
 	movq	r1, (R16);		\
 	movl	4(r7),r1 ## E;		\
 	movq	r3, 8(R16);		\
 	movl	8(r7),r6 ## E;		\
+	movq	R12, 16(R16);		\
 	movl	12(r7),r7 ## E;		\
 	movl	BASE+0(r8),r10 ## E;	\
 	xorl	-48(r9),r5 ## E;	\
@@ -82,8 +83,9 @@ FUNC:	subq	$16, R16;		\
 	movl	r5 ## E,(r9);		\
 	movq	8(R16),r4;		\
 	movl	r6 ## E,4(r9);		\
+	movq	16(R16),R12;		\
 	movl	r7 ## E,8(r9);		\
-	addq	$16,R16;		\
+	addq	$24,R16;		\
 	movl	r8 ## E,12(r9);		\
 	ret;
 
--- a/include/crypto/aes.h
+++ b/include/crypto/aes.h
@@ -19,6 +19,7 @@
 
 struct crypto_aes_ctx {
 	u32 key_length;
+	u32 _pad1;
 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
 };

[-- Attachment #7: step6.patch --]
[-- Type: text/x-vhdl, Size: 1058 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -98,9 +98,8 @@ FUNC:	subq	$24, R16;		\
 	movl	TAB(,r6,4),r6 ## E;	\
 	movzbl	r4 ## H,r7 ## E;	\
 	movzbl	r4 ## L,r4 ## E;	\
+	movq	OFFSET(r8),R12;		\
 	shrl	$16,r2 ## E;		\
-	xorl	OFFSET(r8),ra ## E;	\
-	xorl	OFFSET+4(r8),rb ## E;	\
 	movq	r3,R9;			\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
 	xorl	TAB+2048(,r4,4),r6 ## E;\
@@ -116,7 +115,9 @@ FUNC:	subq	$24, R16;		\
 	xorl	TAB+2048(,r3,4),r5 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
 	movzbl	r1 ## L,r3 ## E;	\
+	xorq	R12,ra;			\
 	movq	R9,r1;			\
+	shrq	$32,R12;		\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
@@ -126,6 +127,7 @@ FUNC:	subq	$24, R16;		\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
 	movq	R8,r2;			\
+	xorq	R12,rb;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\

[-- Attachment #8: step7.patch --]
[-- Type: text/x-vhdl, Size: 1040 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |   10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -117,6 +117,7 @@ FUNC:	subq	$24, R16;		\
 	movzbl	r1 ## L,r3 ## E;	\
 	xorq	R12,ra;			\
 	movq	R9,r1;			\
+	movq	OFFSET+8(r8),R9;	\
 	shrq	$32,R12;		\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
@@ -126,16 +127,17 @@ FUNC:	subq	$24, R16;		\
 	xorl	TAB(,r1,4),r3 ## E;	\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
+	xorq	R9,rc;			\
 	movq	R8,r2;			\
+	shrq	$32,R9;			\
 	xorq	R12,rb;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\
+	xorq	R9,rd;			\
 	movzbl	r2 ## L,r2 ## E;	\
-	xorl	OFFSET+8(r8),rc ## E;	\
-	xorl	OFFSET+12(r8),rd ## E;	\
-	xorl	TAB+1024(,r1,4),r3 ## E;\
-	xorl	TAB(,r2,4),r4 ## E;
+	xorl	TAB(,r2,4),r4 ## E;	\
+	xorl	TAB+1024(,r1,4),r3 ## E;
 
 #define move_regs(r1,r2,r3,r4) \
 	movl	r3 ## E,r1 ## E;	\

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-17  3:34         ` Herbert Xu
@ 2008-04-17  4:53           ` Huang, Ying
  2008-04-23 22:28           ` Sebastian Siewior
  1 sibling, 0 replies; 21+ messages in thread
From: Huang, Ying @ 2008-04-17  4:53 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Sebastian Siewior, Adam J. Richter, akpm, linux-kernel,
	linux-crypto, mingo, tglx

On Thu, 2008-04-17 at 11:34 +0800, Herbert Xu wrote:
> On Thu, Apr 17, 2008 at 09:52:03AM +0800, Huang, Ying wrote:
> >
> > As far as I know, the CRYPTO_MINALIGN is defined in
> > include/linux/crypto.h as __alignof__(unsigned long long), and the
> > __crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
> > a pad is sufficient for x86_64 implementation.
> 
> It should be sufficient but it would be better to use an align
> attribute to better document the intention.

OK. I will use the align attribute.

Best Regards,
Huang Ying

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-17  3:34         ` Herbert Xu
  2008-04-17  4:53           ` Huang, Ying
@ 2008-04-23 22:28           ` Sebastian Siewior
  2008-04-24  0:51             ` Herbert Xu
  1 sibling, 1 reply; 21+ messages in thread
From: Sebastian Siewior @ 2008-04-23 22:28 UTC (permalink / raw)
  To: Herbert Xu
  Cc: Huang, Ying, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

* Herbert Xu | 2008-04-17 11:34:02 [+0800]:

>On Thu, Apr 17, 2008 at 09:52:03AM +0800, Huang, Ying wrote:
>>
>> As far as I know, the CRYPTO_MINALIGN is defined in
>> include/linux/crypto.h as __alignof__(unsigned long long), and the
>> __crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
>> a pad is sufficient for x86_64 implementation.
>
>It should be sufficient but it would be better to use an align
Doesn't this imply that kmalloc() returns memory that is always pointer
aligned what isn't the case AFAIK?

>Thanks,

Sebastian

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-17  3:36       ` Huang, Ying
@ 2008-04-23 22:32         ` Sebastian Siewior
  2008-04-25  3:11           ` Huang, Ying
  0 siblings, 1 reply; 21+ messages in thread
From: Sebastian Siewior @ 2008-04-23 22:32 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 830 bytes --]

* Huang, Ying | 2008-04-17 11:36:43 [+0800]:

>Hi, Sebastian,
Hi Huang,

>The files attached is the separated patches, from step1 to step 7. Thank
>you very much for your help.
I've run the following script:

|#!/bin/bash
|check_error()
|{
|        r=$?
|        if [ ! $r -eq 0 ]
|        then
|                exit 1
|        fi
|}
|
|modprobe tcrypt mode=200
|modprobe tcrypt mode=200
|dmesg -c > step-0.txt
|
|for ((i=1; i<=7; i++))
|do
|        quilt push step${i}.patch
|        check_error
|
|        make
|        check_error
|
|        rmmod aes_x86_64
|        check_error
|
|        insmod arch/x86/crypto/aes-x86_64.ko
|        check_error
|
|        modprobe tcrypt mode=200
|        modprobe tcrypt mode=200
|        dmesg -c > step-${i}.txt
|done

and the result is attached.

>Best Regards,
>Huang Ying

Sebastian

[-- Attachment #2: steps.tbz2 --]
[-- Type: application/octet-stream, Size: 13454 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-23 22:28           ` Sebastian Siewior
@ 2008-04-24  0:51             ` Herbert Xu
  0 siblings, 0 replies; 21+ messages in thread
From: Herbert Xu @ 2008-04-24  0:51 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Huang, Ying, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

On Thu, Apr 24, 2008 at 12:28:43AM +0200, Sebastian Siewior wrote:
> >> __crt_ctx in crypto_tfm is aligned in CRYPTO_MINALIGN. So I think adding
> >> a pad is sufficient for x86_64 implementation.
> >
> >It should be sufficient but it would be better to use an align
> Doesn't this imply that kmalloc() returns memory that is always pointer
> aligned what isn't the case AFAIK?

Parse error :)

kmalloc returns memory that should always be aligned to
CRYPTO_MINALIGN and in particular it's always pointer aligned.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-23 22:32         ` Sebastian Siewior
@ 2008-04-25  3:11           ` Huang, Ying
  2008-04-25  7:12             ` Sebastian Siewior
  2008-04-29 22:12             ` Sebastian Siewior
  0 siblings, 2 replies; 21+ messages in thread
From: Huang, Ying @ 2008-04-25  3:11 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 1483 bytes --]

Hi, Sebastian,

Thank you very much for your help. From the result you sent, the biggest
performance degradation is between step 4 and step 5. In that step, one
more register is saved before and restored after encryption/decryption.
So I think the reason maybe the read/write port throughput of CPU.

I changed the patches to group the read or write together instead of
interleaving. Can you help me to test these new patches? The new patches
is attached with the mail.

Best Regards,
Huang Ying

On Thu, 2008-04-24 at 00:32 +0200, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-17 11:36:43 [+0800]:
> 
> >Hi, Sebastian,
> Hi Huang,
> 
> >The files attached is the separated patches, from step1 to step 7. Thank
> >you very much for your help.
> I've run the following script:
> 
> |#!/bin/bash
> |check_error()
> |{
> |        r=$?
> |        if [ ! $r -eq 0 ]
> |        then
> |                exit 1
> |        fi
> |}
> |
> |modprobe tcrypt mode=200
> |modprobe tcrypt mode=200
> |dmesg -c > step-0.txt
> |
> |for ((i=1; i<=7; i++))
> |do
> |        quilt push step${i}.patch
> |        check_error
> |
> |        make
> |        check_error
> |
> |        rmmod aes_x86_64
> |        check_error
> |
> |        insmod arch/x86/crypto/aes-x86_64.ko
> |        check_error
> |
> |        modprobe tcrypt mode=200
> |        modprobe tcrypt mode=200
> |        dmesg -c > step-${i}.txt
> |done
> 
> and the result is attached.
> 
> >Best Regards,
> >Huang Ying
> 
> Sebastian

[-- Attachment #2: patches.tbz2 --]
[-- Type: application/x-bzip-compressed-tar, Size: 1735 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-25  3:11           ` Huang, Ying
@ 2008-04-25  7:12             ` Sebastian Siewior
  2008-04-25  7:21               ` Huang, Ying
  2008-04-29 22:12             ` Sebastian Siewior
  1 sibling, 1 reply; 21+ messages in thread
From: Sebastian Siewior @ 2008-04-25  7:12 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

* Huang, Ying | 2008-04-25 11:11:17 [+0800]:

>Hi, Sebastian,
Hi Huang,

>So I think the reason maybe the read/write port throughput of CPU.
Ah so it is just a local problem you say? I may get my fingers on
another amd box....

>I changed the patches to group the read or write together instead of
>interleaving. Can you help me to test these new patches? The new patches
>is attached with the mail.
Sure.

>Best Regards,
>Huang Ying

Sebastian

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-25  7:12             ` Sebastian Siewior
@ 2008-04-25  7:21               ` Huang, Ying
  2008-04-25  7:37                 ` Sebastian Siewior
  0 siblings, 1 reply; 21+ messages in thread
From: Huang, Ying @ 2008-04-25  7:21 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

Hi, Sebastian,

On Fri, 2008-04-25 at 09:12 +0200, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-25 11:11:17 [+0800]:
> 
> >Hi, Sebastian,
> Hi Huang,
> 
> >So I think the reason maybe the read/write port throughput of CPU.
> Ah so it is just a local problem you say? I may get my fingers on
> another amd box....

I mean the read/write port design difference between CPU
micro-architecture. It is not a local problem. Sorry for my English.

> >I changed the patches to group the read or write together instead of
> >interleaving. Can you help me to test these new patches? The new patches
> >is attached with the mail.
> Sure.
Thank you very much.

Best Regards,
Huang Ying


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-25  7:21               ` Huang, Ying
@ 2008-04-25  7:37                 ` Sebastian Siewior
  0 siblings, 0 replies; 21+ messages in thread
From: Sebastian Siewior @ 2008-04-25  7:37 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

* Huang, Ying | 2008-04-25 15:21:27 [+0800]:

>Hi, Sebastian,
Hi Huang,

>> >So I think the reason maybe the read/write port throughput of CPU.
>> Ah so it is just a local problem you say? I may get my fingers on
>> another amd box....
>
>I mean the read/write port design difference between CPU
>micro-architecture. It is not a local problem. Sorry for my English.
No, that is what I meant somehow. It is possible that AMD improved this
in a later CPU generation. The AMD box I have is a pretty old one in
comparison to Intel's dual core. It is also possible that they are
aiming a different target and nothing changed :)

>Best Regards,
>Huang Ying

Sebastian

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-25  3:11           ` Huang, Ying
  2008-04-25  7:12             ` Sebastian Siewior
@ 2008-04-29 22:12             ` Sebastian Siewior
  2008-05-04  6:25               ` dean gaudet
  2008-05-07  5:26               ` Huang, Ying
  1 sibling, 2 replies; 21+ messages in thread
From: Sebastian Siewior @ 2008-04-29 22:12 UTC (permalink / raw)
  To: Huang, Ying
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 338 bytes --]

* Huang, Ying | 2008-04-25 11:11:17 [+0800]:

>Hi, Sebastian,
Hi Huang,

sorry for the delay.

>I changed the patches to group the read or write together instead of
>interleaving. Can you help me to test these new patches? The new patches
>is attached with the mail.
The new results are attached.

>
>Best Regards,
>Huang Ying

Sebastian

[-- Attachment #2: steps-txt-v2.tbz2 --]
[-- Type: application/octet-stream, Size: 13213 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-29 22:12             ` Sebastian Siewior
@ 2008-05-04  6:25               ` dean gaudet
  2008-05-07  5:12                 ` Huang, Ying
  2008-05-07  5:26               ` Huang, Ying
  1 sibling, 1 reply; 21+ messages in thread
From: dean gaudet @ 2008-05-04  6:25 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Huang, Ying, Herbert Xu, Adam J. Richter, akpm, linux-kernel,
	linux-crypto, mingo, tglx

one of the more important details in evaluating these changes would be the 
family/model/stepping of the processors being microbenchmarked... could 
you folks include /proc/cpuinfo with the results?

also -- please drop the #define for R16 to %rsp ... it obfuscates more 
than it helps anything.

thanks
-dean

On Wed, 30 Apr 2008, Sebastian Siewior wrote:

> * Huang, Ying | 2008-04-25 11:11:17 [+0800]:
> 
> >Hi, Sebastian,
> Hi Huang,
> 
> sorry for the delay.
> 
> >I changed the patches to group the read or write together instead of
> >interleaving. Can you help me to test these new patches? The new patches
> >is attached with the mail.
> The new results are attached.
> 
> >
> >Best Regards,
> >Huang Ying
> 
> Sebastian
> 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-05-04  6:25               ` dean gaudet
@ 2008-05-07  5:12                 ` Huang, Ying
  0 siblings, 0 replies; 21+ messages in thread
From: Huang, Ying @ 2008-05-07  5:12 UTC (permalink / raw)
  To: dean gaudet
  Cc: Sebastian Siewior, Herbert Xu, Adam J. Richter, akpm,
	linux-kernel, linux-crypto, mingo, tglx

[-- Attachment #1: Type: text/plain, Size: 935 bytes --]

Hi,

On Sat, 2008-05-03 at 23:25 -0700, dean gaudet wrote:
> one of the more important details in evaluating these changes would be the 
> family/model/stepping of the processors being microbenchmarked... could 
> you folks include /proc/cpuinfo with the results?

The file attached is /proc/cpuinfo of my testing machine.

Best Regards,
Huang Ying

> also -- please drop the #define for R16 to %rsp ... it obfuscates more 
> than it helps anything.
> 
> thanks
> -dean
> 
> On Wed, 30 Apr 2008, Sebastian Siewior wrote:
> 
> > * Huang, Ying | 2008-04-25 11:11:17 [+0800]:
> > 
> > >Hi, Sebastian,
> > Hi Huang,
> > 
> > sorry for the delay.
> > 
> > >I changed the patches to group the reads or writes together instead of
> > >interleaving them. Can you help me to test these new patches? The new
> > >patches are attached to the mail.
> > The new results are attached.
> > 
> > >
> > >Best Regards,
> > >Huang Ying
> > 
> > Sebastian
> > 

[-- Attachment #2: cpuinfo --]
[-- Type: text/plain, Size: 1374 bytes --]

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 15
model name	: Intel(R) Core(TM)2 CPU          6400  @ 2.13GHz
stepping	: 2
cpu MHz		: 2128.006
cache size	: 2048 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 2
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good pni monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr lahf_lm
bogomips	: 4259.15
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:

processor	: 1
vendor_id	: GenuineIntel
cpu family	: 6
model		: 15
model name	: Intel(R) Core(TM)2 CPU          6400  @ 2.13GHz
stepping	: 2
cpu MHz		: 2128.006
cache size	: 2048 KB
physical id	: 0
siblings	: 2
core id		: 1
cpu cores	: 2
fpu		: yes
fpu_exception	: yes
cpuid level	: 10
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good pni monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr lahf_lm
bogomips	: 4256.08
clflush size	: 64
cache_alignment	: 64
address sizes	: 36 bits physical, 48 bits virtual
power management:


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
  2008-04-29 22:12             ` Sebastian Siewior
  2008-05-04  6:25               ` dean gaudet
@ 2008-05-07  5:26               ` Huang, Ying
  1 sibling, 0 replies; 21+ messages in thread
From: Huang, Ying @ 2008-05-07  5:26 UTC (permalink / raw)
  To: Sebastian Siewior
  Cc: Herbert Xu, Adam J. Richter, akpm, linux-kernel, linux-crypto,
	mingo, tglx

Hi, Sebastian,

On Wed, 2008-04-30 at 00:12 +0200, Sebastian Siewior wrote:
> * Huang, Ying | 2008-04-25 11:11:17 [+0800]:
> 
> >Hi, Sebastian,
> Hi Huang,
> 
> sorry for the delay.
> 
> >I changed the patches to group the reads or writes together instead of
> >interleaving them. Can you help me to test these new patches? The new
> >patches are attached to the mail.
> The new results are attached.

It seems that the performance degradation between step4 and step5 has
decreased. However, the overall performance degradation between step0 and
step7 is still about 5%.

I also tested the patches on Pentium 4 CPUs, and the performance decreased
there too. So I think this optimization is CPU micro-architecture dependent.

While the dependency between instructions is reduced, more registers
(at most 3) are saved/restored before/after encryption/decryption. If
the CPU has no extra execution unit for the newly independent instructions
but more registers must be saved/restored, the performance will decrease.

We should perhaps select different implementations based on the
micro-architecture.

Best Regards,
Huang Ying


^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2008-05-07  5:20 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-04-09  6:41 [PATCH -mm crypto] AES: x86_64 asm implementation optimization Huang, Ying
2008-04-16  7:31 ` Sebastian Siewior
2008-04-16  8:19   ` Huang, Ying
2008-04-16  8:23     ` Andi Kleen
2008-04-16  9:50       ` Herbert Xu
2008-04-16 18:40     ` Sebastian Siewior
2008-04-17  1:52       ` Huang, Ying
2008-04-17  3:34         ` Herbert Xu
2008-04-17  4:53           ` Huang, Ying
2008-04-23 22:28           ` Sebastian Siewior
2008-04-24  0:51             ` Herbert Xu
2008-04-17  3:36       ` Huang, Ying
2008-04-23 22:32         ` Sebastian Siewior
2008-04-25  3:11           ` Huang, Ying
2008-04-25  7:12             ` Sebastian Siewior
2008-04-25  7:21               ` Huang, Ying
2008-04-25  7:37                 ` Sebastian Siewior
2008-04-29 22:12             ` Sebastian Siewior
2008-05-04  6:25               ` dean gaudet
2008-05-07  5:12                 ` Huang, Ying
2008-05-07  5:26               ` Huang, Ying

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox