All of lore.kernel.org
 help / color / mirror / Atom feed
From: "Huang, Ying" <ying.huang@intel.com>
To: Sebastian Siewior <linux-crypto@ml.breakpoint.cc>
Cc: Herbert Xu <herbert@gondor.apana.org.au>,
	"Adam J. Richter" <adam@yggdrasil.com>,
	akpm@linux-foundation.org, linux-kernel@vger.kernel.org,
	linux-crypto@vger.kernel.org, mingo@elte.hu, tglx@linutronix.de
Subject: Re: [PATCH -mm crypto] AES: x86_64 asm implementation optimization
Date: Thu, 17 Apr 2008 11:36:43 +0800	[thread overview]
Message-ID: <1208403403.4322.27.camel@caritas-dev.intel.com> (raw)
In-Reply-To: <20080416184016.GA21365@Chamillionaire.breakpoint.cc>

[-- Attachment #1: Type: text/plain, Size: 1513 bytes --]

Hi, Sebastian,

The attached files are the separated patches, step 1 through step 7. Thank
you very much for your help.

Best Regards,
Huang Ying

On Wed, 2008-04-16 at 20:40 +0200, Sebastian Siewior wrote:
> Cut Alexander Kjeldaas <astor@fast.no> from CC because his mails bounce.
> 
> * Huang, Ying | 2008-04-16 16:19:09 [+0800]:
> 
> >Can you help me test these patches
> >to find out the reason for the degradation on AMD CPUs?
> Sure. 
> 
> >> >--- a/include/crypto/aes.h
> >> >+++ b/include/crypto/aes.h
> >> >@@ -19,6 +19,7 @@
> >> > 
> >> > struct crypto_aes_ctx {
> >> > 	u32 key_length;
> >> >+	u32 _pad1;
> >> 
> >> Why is this pad required? Do you want special alignment of the keys?
> >
> >Because the key is loaded 64 bits at a time in this patch, I want to
> >align the key to a 64-bit address.
> 
> Then this won't work all the time. To make it bulletproof:
> - set .cra_alignmask in the glue code properly
> - use the attribute aligned thing
> - retrieve your private struct via crypto_tfm_ctx_aligned()
> 
> You might want to take a look at padlock-aes.c. The same thing is done
> there, but instead of crypto_tfm_ctx_aligned() a private function is
> used (to let the compiler optimize most of the code away). Depending on
> Herbert's mood you might get away with this as well (which would
> probably be the case, since you might prefer to do it in asm) :)
> 
> >> > 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
> >> > 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
> >> > };
> >> >
> >
> >Best Regards,
> >Huang Ying
> >
> 
> Sebastian

[-- Attachment #2: step1.patch --]
[-- Type: text/x-vhdl, Size: 907 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |   11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -49,13 +49,17 @@
 #define R9	%r9
 #define R10	%r10
 #define R11	%r11
+#define R12	%r12
+#define R15	%r15
+#define R16	%rsp
 
 #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	movq	r1,r2;			\
-	movq	r3,r4;			\
+FUNC:	subq	$8, R16;		\
+	movq	r3, r4;			\
+	movq	r1, (R16);		\
 	leaq	BASE+KEY+48+4(r8),r9;	\
 	movq	r10,r11;		\
 	movl	(r7),r5 ## E;		\
@@ -74,7 +78,8 @@ FUNC:	movq	r1,r2;			\
 	leaq	32(r9),r9;
 
 #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
-	movq	r1,r2;			\
+	movq	(R16),r2;		\
+	addq	$8,R16;			\
 	movq	r3,r4;			\
 	movl	r5 ## E,(r9);		\
 	movl	r6 ## E,4(r9);		\

[-- Attachment #3: step2.patch --]
[-- Type: text/x-vhdl, Size: 997 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -90,13 +90,13 @@ FUNC:	subq	$8, R16;		\
 #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \
 	movzbl	r2 ## H,r5 ## E;	\
 	movzbl	r2 ## L,r6 ## E;	\
+	movq	r4,R8;			\
+	shrl	$16,r4 ## E;		\
 	movl	TAB+1024(,r5,4),r5 ## E;\
-	movw	r4 ## X,r2 ## X;	\
 	movl	TAB(,r6,4),r6 ## E;	\
-	roll	$16,r2 ## E;		\
-	shrl	$16,r4 ## E;		\
 	movzbl	r4 ## H,r7 ## E;	\
 	movzbl	r4 ## L,r4 ## E;	\
+	shrl	$16,r2 ## E;		\
 	xorl	OFFSET(r8),ra ## E;	\
 	xorl	OFFSET+4(r8),rb ## E;	\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
@@ -123,7 +123,7 @@ FUNC:	subq	$8, R16;		\
 	xorl	TAB(,r1,4),r3 ## E;	\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
-	shrl	$16,r2 ## E;		\
+	movq	R8,r2;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\

[-- Attachment #4: step3.patch --]
[-- Type: text/x-vhdl, Size: 951 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |   12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -57,13 +57,13 @@
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	subq	$8, R16;		\
-	movq	r3, r4;			\
-	movq	r1, (R16);		\
+FUNC:	subq	$16, R16;		\
+	movl	(r7),r5 ## E;		\
 	leaq	BASE+KEY+48+4(r8),r9;	\
 	movq	r10,r11;		\
-	movl	(r7),r5 ## E;		\
+	movq	r1, (R16);		\
 	movl	4(r7),r1 ## E;		\
+	movq	r3, 8(R16);		\
 	movl	8(r7),r6 ## E;		\
 	movl	12(r7),r7 ## E;		\
 	movl	BASE+0(r8),r10 ## E;	\
@@ -79,11 +79,11 @@ FUNC:	subq	$8, R16;		\
 
 #define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \
 	movq	(R16),r2;		\
-	addq	$8,R16;			\
-	movq	r3,r4;			\
 	movl	r5 ## E,(r9);		\
+	movq	8(R16),r4;		\
 	movl	r6 ## E,4(r9);		\
 	movl	r7 ## E,8(r9);		\
+	addq	$16,R16;		\
 	movl	r8 ## E,12(r9);		\
 	ret;
 

[-- Attachment #5: step4.patch --]
[-- Type: text/x-vhdl, Size: 1036 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |   10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -99,14 +99,14 @@ FUNC:	subq	$16, R16;		\
 	shrl	$16,r2 ## E;		\
 	xorl	OFFSET(r8),ra ## E;	\
 	xorl	OFFSET+4(r8),rb ## E;	\
+	movq	r3,R9;			\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
 	xorl	TAB+2048(,r4,4),r6 ## E;\
-	movzbl	r1 ## L,r7 ## E;	\
 	movzbl	r1 ## H,r4 ## E;	\
-	movl	TAB+1024(,r4,4),r4 ## E;\
-	movw	r3 ## X,r1 ## X;	\
-	roll	$16,r1 ## E;		\
+	movzbl	r1 ## L,r7 ## E;	\
 	shrl	$16,r3 ## E;		\
+	shrl	$16,r1 ## E;		\
+	movl	TAB+1024(,r4,4),r4 ## E;\
 	xorl	TAB(,r7,4),r5 ## E;	\
 	movzbl	r3 ## H,r7 ## E;	\
 	movzbl	r3 ## L,r3 ## E;	\
@@ -114,7 +114,7 @@ FUNC:	subq	$16, R16;		\
 	xorl	TAB+2048(,r3,4),r5 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
 	movzbl	r1 ## L,r3 ## E;	\
-	shrl	$16,r1 ## E;		\
+	movq	R9,r1;			\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\

[-- Attachment #6: step5.patch --]
[-- Type: text/x-vhdl, Size: 1164 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |    8 +++++---
 include/crypto/aes.h                |    1 +
 2 files changed, 6 insertions(+), 3 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -57,14 +57,15 @@
 	.global	FUNC;			\
 	.type	FUNC,@function;		\
 	.align	8;			\
-FUNC:	subq	$16, R16;		\
+FUNC:	subq	$24, R16;		\
 	movl	(r7),r5 ## E;		\
-	leaq	BASE+KEY+48+4(r8),r9;	\
+	leaq	BASE+KEY+48+8(r8),r9;	\
 	movq	r10,r11;		\
 	movq	r1, (R16);		\
 	movl	4(r7),r1 ## E;		\
 	movq	r3, 8(R16);		\
 	movl	8(r7),r6 ## E;		\
+	movq	R12, 16(R16);		\
 	movl	12(r7),r7 ## E;		\
 	movl	BASE+0(r8),r10 ## E;	\
 	xorl	-48(r9),r5 ## E;	\
@@ -82,8 +83,9 @@ FUNC:	subq	$16, R16;		\
 	movl	r5 ## E,(r9);		\
 	movq	8(R16),r4;		\
 	movl	r6 ## E,4(r9);		\
+	movq	16(R16),R12;		\
 	movl	r7 ## E,8(r9);		\
-	addq	$16,R16;		\
+	addq	$24,R16;		\
 	movl	r8 ## E,12(r9);		\
 	ret;
 
--- a/include/crypto/aes.h
+++ b/include/crypto/aes.h
@@ -19,6 +19,7 @@
 
 struct crypto_aes_ctx {
 	u32 key_length;
+	u32 _pad1;
 	u32 key_enc[AES_MAX_KEYLENGTH_U32];
 	u32 key_dec[AES_MAX_KEYLENGTH_U32];
 };

[-- Attachment #7: step6.patch --]
[-- Type: text/x-vhdl, Size: 1058 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |    6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -98,9 +98,8 @@ FUNC:	subq	$24, R16;		\
 	movl	TAB(,r6,4),r6 ## E;	\
 	movzbl	r4 ## H,r7 ## E;	\
 	movzbl	r4 ## L,r4 ## E;	\
+	movq	OFFSET(r8),R12;		\
 	shrl	$16,r2 ## E;		\
-	xorl	OFFSET(r8),ra ## E;	\
-	xorl	OFFSET+4(r8),rb ## E;	\
 	movq	r3,R9;			\
 	xorl	TAB+3072(,r7,4),r5 ## E;\
 	xorl	TAB+2048(,r4,4),r6 ## E;\
@@ -116,7 +115,9 @@ FUNC:	subq	$24, R16;		\
 	xorl	TAB+2048(,r3,4),r5 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
 	movzbl	r1 ## L,r3 ## E;	\
+	xorq	R12,ra;			\
 	movq	R9,r1;			\
+	shrq	$32,R12;		\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
 	movzbl	r1 ## H,r7 ## E;	\
@@ -126,6 +127,7 @@ FUNC:	subq	$24, R16;		\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
 	movq	R8,r2;			\
+	xorq	R12,rb;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\

[-- Attachment #8: step7.patch --]
[-- Type: text/x-vhdl, Size: 1040 bytes --]

---
 arch/x86/crypto/aes-x86_64-asm_64.S |   10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

--- a/arch/x86/crypto/aes-x86_64-asm_64.S
+++ b/arch/x86/crypto/aes-x86_64-asm_64.S
@@ -117,6 +117,7 @@ FUNC:	subq	$24, R16;		\
 	movzbl	r1 ## L,r3 ## E;	\
 	xorq	R12,ra;			\
 	movq	R9,r1;			\
+	movq	OFFSET+8(r8),R9;	\
 	shrq	$32,R12;		\
 	xorl	TAB+3072(,r7,4),r6 ## E;\
 	movl	TAB+2048(,r3,4),r3 ## E;\
@@ -126,16 +127,17 @@ FUNC:	subq	$24, R16;		\
 	xorl	TAB(,r1,4),r3 ## E;	\
 	movzbl	r2 ## H,r1 ## E;	\
 	movzbl	r2 ## L,r7 ## E;	\
+	xorq	R9,rc;			\
 	movq	R8,r2;			\
+	shrq	$32,R9;			\
 	xorq	R12,rb;			\
 	xorl	TAB+3072(,r1,4),r3 ## E;\
 	xorl	TAB+2048(,r7,4),r4 ## E;\
 	movzbl	r2 ## H,r1 ## E;	\
+	xorq	R9,rd;			\
 	movzbl	r2 ## L,r2 ## E;	\
-	xorl	OFFSET+8(r8),rc ## E;	\
-	xorl	OFFSET+12(r8),rd ## E;	\
-	xorl	TAB+1024(,r1,4),r3 ## E;\
-	xorl	TAB(,r2,4),r4 ## E;
+	xorl	TAB(,r2,4),r4 ## E;	\
+	xorl	TAB+1024(,r1,4),r3 ## E;
 
 #define move_regs(r1,r2,r3,r4) \
 	movl	r3 ## E,r1 ## E;	\

  parent reply	other threads:[~2008-04-17  3:32 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-04-09  6:41 [PATCH -mm crypto] AES: x86_64 asm implementation optimization Huang, Ying
2008-04-09  6:41 ` Huang, Ying
2008-04-16  7:31 ` Sebastian Siewior
2008-04-16  8:19   ` Huang, Ying
2008-04-16  8:23     ` Andi Kleen
2008-04-16  9:50       ` Herbert Xu
2008-04-16 18:40     ` Sebastian Siewior
2008-04-17  1:52       ` Huang, Ying
2008-04-17  3:34         ` Herbert Xu
2008-04-17  4:53           ` Huang, Ying
2008-04-23 22:28           ` Sebastian Siewior
2008-04-24  0:51             ` Herbert Xu
2008-04-17  3:36       ` Huang, Ying [this message]
2008-04-23 22:32         ` Sebastian Siewior
2008-04-25  3:11           ` Huang, Ying
2008-04-25  7:12             ` Sebastian Siewior
2008-04-25  7:21               ` Huang, Ying
2008-04-25  7:37                 ` Sebastian Siewior
2008-04-29 22:12             ` Sebastian Siewior
2008-05-04  6:25               ` dean gaudet
2008-05-07  5:12                 ` Huang, Ying
2008-05-07  5:26               ` Huang, Ying

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1208403403.4322.27.camel@caritas-dev.intel.com \
    --to=ying.huang@intel.com \
    --cc=adam@yggdrasil.com \
    --cc=akpm@linux-foundation.org \
    --cc=herbert@gondor.apana.org.au \
    --cc=linux-crypto@ml.breakpoint.cc \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.