* [PATCH] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.
From: Danny Tsen @ 2023-08-29 15:37 UTC
To: linux-crypto
Cc: herbert, dtsen, nayna, linux-kernel, Danny Tsen, appro, ltcgcw,
leitao, linuxppc-dev
Improve AES/XTS performance of the 6-way unrolling code for PowerPC by up
to 17%, as measured with tcrypt. This is done by using a single
instruction, vpermxor, to replace the xor and vsldoi pair.
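
For reference, a minimal scalar sketch of the per-block tweak update this
code computes (illustrative only, not part of the patch; byte numbering
follows the XTS little-endian convention, byte 0 being the least-significant
byte, rather than the exact vector-register layout). The final loop is the
step vpermxor now performs in a single instruction, using the new
0x0f102132..cbdcedfe control vector to select tweak[i] ^ tmp[(i-1) mod 16]:

  #include <stdint.h>

  /* Multiply the 128-bit tweak by x in GF(2^128), reduced by
   * x^128 + x^7 + x^2 + x + 1 -- the XTS per-block tweak update. */
  static void xts_next_tweak(uint8_t t[16])
  {
          uint8_t mask[16], dbl[16];
          int i;

          /* vsrab tmp,tweak,seven: 0xff in every byte whose MSB is set */
          for (i = 0; i < 16; i++)
                  mask[i] = (t[i] & 0x80) ? 0xff : 0x00;
          /* vaddubm tweak,tweak,tweak: per-byte shift left by 1, carries dropped */
          for (i = 0; i < 16; i++)
                  dbl[i] = (uint8_t)(t[i] << 1);
          /* vand tmp,tmp,eighty7 with the 0x010101..87 constant: the carry out
           * of byte 15 folds back as 0x87, carries out of bytes 0..14 become 0x01 */
          for (i = 0; i < 16; i++)
                  mask[i] &= (i == 15) ? 0x87 : 0x01;
          /* vpermxor tweak,tweak,tmp,ctrl: the one instruction replacing the
           * old vsldoi (rotate tmp by one byte) + vxor pair */
          for (i = 0; i < 16; i++)
                  t[i] = dbl[i] ^ mask[(i + 15) & 15];
  }
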
This patch has been tested with the kernel crypto module tcrypt.ko and
passes the selftests. The patch has also been tested with
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
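
For completeness, the tcrypt run can be reproduced along these lines (the
mode number is an assumption; check crypto/tcrypt.c for the exact mapping in
the tree being tested):

  # AES speed tests, including xts(aes); tcrypt intentionally fails to stay
  # loaded once the tests have run
  modprobe tcrypt mode=200 sec=1

The xts(aes) self-tests run when the driver registers, and
CONFIG_CRYPTO_MANAGER_EXTRA_TESTS additionally compares the implementation
against the generic one with randomly generated inputs.
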
Signed-off-by: Danny Tsen <dtsen@linux.ibm.com>
---
drivers/crypto/vmx/aesp8-ppc.pl | 141 +++++++++++++++++++++-----------
1 file changed, 92 insertions(+), 49 deletions(-)
diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl
index 50a0a18f35da..f729589d792e 100644
--- a/drivers/crypto/vmx/aesp8-ppc.pl
+++ b/drivers/crypto/vmx/aesp8-ppc.pl
@@ -132,11 +132,12 @@ rcon:
.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
.long 0,0,0,0 ?asis
+.long 0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
Lconsts:
mflr r0
bcl 20,31,\$+4
mflr $ptr #vvvvv "distance between . and rcon
- addi $ptr,$ptr,-0x48
+ addi $ptr,$ptr,-0x58
mtlr r0
blr
.long 0
@@ -2495,6 +2496,17 @@ _aesp8_xts_encrypt6x:
li $x70,0x70
mtspr 256,r0
+ xxlor 2, 32+$eighty7, 32+$eighty7
+ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
+ xxlor 1, 32+$eighty7, 32+$eighty7
+
+ # Load XOR Lconsts.
+ mr $x70, r6
+ bl Lconsts
+ lxvw4x 0, $x40, r6 # load XOR contents
+ mr r6, $x70
+ li $x70,0x70
+
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key1 # load key schedule
@@ -2537,69 +2549,77 @@ Load_xts_enc_key:
?vperm v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
+ # Switch to use the following codes with 0x010101..87 to generate tweak.
+ # eighty7 = 0x010101..87
+ # vsrab tmp, tweak, seven # next tweak value, right shift 7 bits
+ # vand tmp, tmp, eighty7 # last byte with carry
+ # vaddubm tweak, tweak, tweak # left shift 1 bit (x2)
+ # xxlor vsx, 0, 0
+ # vpermxor tweak, tweak, tmp, vsx
+
vperm $in0,$inout,$inptail,$inpperm
subi $inp,$inp,31 # undo "caller"
vxor $twk0,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $out0,$in0,$twk0
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
lvx_u $in1,$x10,$inp
vxor $twk1,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in1,$in1,$in1,$leperm
vand $tmp,$tmp,$eighty7
vxor $out1,$in1,$twk1
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
lvx_u $in2,$x20,$inp
andi. $taillen,$len,15
vxor $twk2,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in2,$in2,$in2,$leperm
vand $tmp,$tmp,$eighty7
vxor $out2,$in2,$twk2
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
lvx_u $in3,$x30,$inp
sub $len,$len,$taillen
vxor $twk3,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in3,$in3,$in3,$leperm
vand $tmp,$tmp,$eighty7
vxor $out3,$in3,$twk3
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
lvx_u $in4,$x40,$inp
subi $len,$len,0x60
vxor $twk4,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in4,$in4,$in4,$leperm
vand $tmp,$tmp,$eighty7
vxor $out4,$in4,$twk4
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in5, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in5
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
vxor $twk5,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in5,$in5,$in5,$leperm
vand $tmp,$tmp,$eighty7
vxor $out5,$in5,$twk5
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in0, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in0
vxor v31,v31,$rndkey0
mtctr $rounds
@@ -2625,6 +2645,8 @@ Loop_xts_enc6x:
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_enc6x
+ xxlor 32+$eighty7, 1, 1 # 0x010101..87
+
subic $len,$len,96 # $len-=96
vxor $in0,$twk0,v31 # xor with last round key
vcipher $out0,$out0,v24
@@ -2634,7 +2656,6 @@ Loop_xts_enc6x:
vaddubm $tweak,$tweak,$tweak
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
- vsldoi $tmp,$tmp,$tmp,15
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
@@ -2642,7 +2663,8 @@ Loop_xts_enc6x:
vand $tmp,$tmp,$eighty7
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vxor $in1,$twk1,v31
@@ -2653,13 +2675,13 @@ Loop_xts_enc6x:
and r0,r0,$len
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vcipher $out0,$out0,v26
vcipher $out1,$out1,v26
vand $tmp,$tmp,$eighty7
vcipher $out2,$out2,v26
vcipher $out3,$out3,v26
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
vcipher $out4,$out4,v26
vcipher $out5,$out5,v26
@@ -2673,7 +2695,6 @@ Loop_xts_enc6x:
vaddubm $tweak,$tweak,$tweak
vcipher $out0,$out0,v27
vcipher $out1,$out1,v27
- vsldoi $tmp,$tmp,$tmp,15
vcipher $out2,$out2,v27
vcipher $out3,$out3,v27
vand $tmp,$tmp,$eighty7
@@ -2681,7 +2702,8 @@ Loop_xts_enc6x:
vcipher $out5,$out5,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
vcipher $out0,$out0,v28
vcipher $out1,$out1,v28
vxor $in3,$twk3,v31
@@ -2690,7 +2712,6 @@ Loop_xts_enc6x:
vcipher $out2,$out2,v28
vcipher $out3,$out3,v28
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vcipher $out4,$out4,v28
vcipher $out5,$out5,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
@@ -2698,7 +2719,8 @@ Loop_xts_enc6x:
vcipher $out0,$out0,v29
vcipher $out1,$out1,v29
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
vcipher $out2,$out2,v29
vcipher $out3,$out3,v29
vxor $in4,$twk4,v31
@@ -2708,14 +2730,14 @@ Loop_xts_enc6x:
vcipher $out5,$out5,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vcipher $out0,$out0,v30
vcipher $out1,$out1,v30
vand $tmp,$tmp,$eighty7
vcipher $out2,$out2,v30
vcipher $out3,$out3,v30
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in5, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in5
vcipher $out4,$out4,v30
vcipher $out5,$out5,v30
vxor $in5,$twk5,v31
@@ -2725,7 +2747,6 @@ Loop_xts_enc6x:
vcipherlast $out0,$out0,$in0
lvx_u $in0,$x00,$inp # load next input block
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vcipherlast $out1,$out1,$in1
lvx_u $in1,$x10,$inp
vcipherlast $out2,$out2,$in2
@@ -2738,7 +2759,10 @@ Loop_xts_enc6x:
vcipherlast $out4,$out4,$in4
le?vperm $in2,$in2,$in2,$leperm
lvx_u $in4,$x40,$inp
- vxor $tweak,$tweak,$tmp
+ xxlor 10, 32+$in0, 32+$in0
+ xxlor 32+$in0, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in0
+ xxlor 32+$in0, 10, 10
vcipherlast $tmp,$out5,$in5 # last block might be needed
# in stealing mode
le?vperm $in3,$in3,$in3,$leperm
@@ -2771,6 +2795,8 @@ Loop_xts_enc6x:
mtctr $rounds
beq Loop_xts_enc6x # did $len-=96 borrow?
+ xxlor 32+$eighty7, 2, 2 # 0x010101..87
+
addic. $len,$len,0x60
beq Lxts_enc6x_zero
cmpwi $len,0x20
@@ -3147,6 +3173,17 @@ _aesp8_xts_decrypt6x:
li $x70,0x70
mtspr 256,r0
+ xxlor 2, 32+$eighty7, 32+$eighty7
+ vsldoi $eighty7,$tmp,$eighty7,1 # 0x010101..87
+ xxlor 1, 32+$eighty7, 32+$eighty7
+
+ # Load XOR Lconsts.
+ mr $x70, r6
+ bl Lconsts
+ lxvw4x 0, $x40, r6 # load XOR contents
+ mr r6, $x70
+ li $x70,0x70
+
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key1 # load key schedule
@@ -3194,64 +3231,64 @@ Load_xts_dec_key:
vxor $twk0,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $out0,$in0,$twk0
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
lvx_u $in1,$x10,$inp
vxor $twk1,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in1,$in1,$in1,$leperm
vand $tmp,$tmp,$eighty7
vxor $out1,$in1,$twk1
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
lvx_u $in2,$x20,$inp
andi. $taillen,$len,15
vxor $twk2,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in2,$in2,$in2,$leperm
vand $tmp,$tmp,$eighty7
vxor $out2,$in2,$twk2
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
lvx_u $in3,$x30,$inp
sub $len,$len,$taillen
vxor $twk3,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in3,$in3,$in3,$leperm
vand $tmp,$tmp,$eighty7
vxor $out3,$in3,$twk3
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
lvx_u $in4,$x40,$inp
subi $len,$len,0x60
vxor $twk4,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in4,$in4,$in4,$leperm
vand $tmp,$tmp,$eighty7
vxor $out4,$in4,$twk4
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in5, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in5
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
vxor $twk5,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
le?vperm $in5,$in5,$in5,$leperm
vand $tmp,$tmp,$eighty7
vxor $out5,$in5,$twk5
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in0, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in0
vxor v31,v31,$rndkey0
mtctr $rounds
@@ -3277,6 +3314,8 @@ Loop_xts_dec6x:
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_dec6x
+ xxlor 32+$eighty7, 1, 1 # 0x010101..87
+
subic $len,$len,96 # $len-=96
vxor $in0,$twk0,v31 # xor with last round key
vncipher $out0,$out0,v24
@@ -3286,7 +3325,6 @@ Loop_xts_dec6x:
vaddubm $tweak,$tweak,$tweak
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
- vsldoi $tmp,$tmp,$tmp,15
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
@@ -3294,7 +3332,8 @@ Loop_xts_dec6x:
vand $tmp,$tmp,$eighty7
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in1, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in1
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vxor $in1,$twk1,v31
@@ -3305,13 +3344,13 @@ Loop_xts_dec6x:
and r0,r0,$len
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vncipher $out0,$out0,v26
vncipher $out1,$out1,v26
vand $tmp,$tmp,$eighty7
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in2, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in2
vncipher $out4,$out4,v26
vncipher $out5,$out5,v26
@@ -3325,7 +3364,6 @@ Loop_xts_dec6x:
vaddubm $tweak,$tweak,$tweak
vncipher $out0,$out0,v27
vncipher $out1,$out1,v27
- vsldoi $tmp,$tmp,$tmp,15
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vand $tmp,$tmp,$eighty7
@@ -3333,7 +3371,8 @@ Loop_xts_dec6x:
vncipher $out5,$out5,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in3, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in3
vncipher $out0,$out0,v28
vncipher $out1,$out1,v28
vxor $in3,$twk3,v31
@@ -3342,7 +3381,6 @@ Loop_xts_dec6x:
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vncipher $out4,$out4,v28
vncipher $out5,$out5,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
@@ -3350,7 +3388,8 @@ Loop_xts_dec6x:
vncipher $out0,$out0,v29
vncipher $out1,$out1,v29
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in4, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in4
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vxor $in4,$twk4,v31
@@ -3360,14 +3399,14 @@ Loop_xts_dec6x:
vncipher $out5,$out5,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vncipher $out0,$out0,v30
vncipher $out1,$out1,v30
vand $tmp,$tmp,$eighty7
vncipher $out2,$out2,v30
vncipher $out3,$out3,v30
- vxor $tweak,$tweak,$tmp
+ xxlor 32+$in5, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in5
vncipher $out4,$out4,v30
vncipher $out5,$out5,v30
vxor $in5,$twk5,v31
@@ -3377,7 +3416,6 @@ Loop_xts_dec6x:
vncipherlast $out0,$out0,$in0
lvx_u $in0,$x00,$inp # load next input block
vaddubm $tweak,$tweak,$tweak
- vsldoi $tmp,$tmp,$tmp,15
vncipherlast $out1,$out1,$in1
lvx_u $in1,$x10,$inp
vncipherlast $out2,$out2,$in2
@@ -3390,7 +3428,10 @@ Loop_xts_dec6x:
vncipherlast $out4,$out4,$in4
le?vperm $in2,$in2,$in2,$leperm
lvx_u $in4,$x40,$inp
- vxor $tweak,$tweak,$tmp
+ xxlor 10, 32+$in0, 32+$in0
+ xxlor 32+$in0, 0, 0
+ vpermxor $tweak, $tweak, $tmp, $in0
+ xxlor 32+$in0, 10, 10
vncipherlast $out5,$out5,$in5
le?vperm $in3,$in3,$in3,$leperm
lvx_u $in5,$x50,$inp
@@ -3421,6 +3462,8 @@ Loop_xts_dec6x:
mtctr $rounds
beq Loop_xts_dec6x # did $len-=96 borrow?
+ xxlor 32+$eighty7, 2, 2 # 0x010101..87
+
addic. $len,$len,0x60
beq Lxts_dec6x_zero
cmpwi $len,0x20
--
2.31.1
* Re: [PATCH] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.
From: Michael Ellerman @ 2023-08-30 4:37 UTC
To: Danny Tsen, linux-crypto
Cc: herbert, dtsen, nayna, linux-kernel, Danny Tsen, appro, ltcgcw,
leitao, linuxppc-dev
Danny Tsen <dtsen@linux.ibm.com> writes:
> Improve AES/XTS performance of the 6-way unrolling code for PowerPC by up
> to 17%, as measured with tcrypt. This is done by using a single
> instruction, vpermxor, to replace the xor and vsldoi pair.
>
> This patch has been tested with the kernel crypto module tcrypt.ko and
> passes the selftests. The patch has also been tested with
> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>
> Signed-off-by: Danny Tsen <dtsen@linux.ibm.com>
> ---
> drivers/crypto/vmx/aesp8-ppc.pl | 141 +++++++++++++++++++++-----------
> 1 file changed, 92 insertions(+), 49 deletions(-)
That's CRYPTOGAMS code, and is so far largely unchanged from the
original. I see you've sent the same change to openssl, but it's not
merged yet. Please document that in the change log; we want to keep the
code in sync as much as possible, and document any divergences.
cheers
* Re: [PATCH] crypto: vmx: Improved AES/XTS performance of 6-way unrolling for ppc.
From: Danny Tsen @ 2023-08-30 13:50 UTC
To: Michael Ellerman, linux-crypto
Cc: herbert, dtsen, nayna, linux-kernel, appro, ltcgcw, leitao,
linuxppc-dev
Hi Michael,
I just submitted the v2 patch.
Thanks.
-Danny
On 8/29/23 11:37 PM, Michael Ellerman wrote:
> Danny Tsen <dtsen@linux.ibm.com> writes:
>> Improve AES/XTS performance of the 6-way unrolling code for PowerPC by up
>> to 17%, as measured with tcrypt. This is done by using a single
>> instruction, vpermxor, to replace the xor and vsldoi pair.
>>
>> This patch has been tested with the kernel crypto module tcrypt.ko and
>> passes the selftests. The patch has also been tested with
>> CONFIG_CRYPTO_MANAGER_EXTRA_TESTS enabled.
>>
>> Signed-off-by: Danny Tsen <dtsen@linux.ibm.com>
>> ---
>> drivers/crypto/vmx/aesp8-ppc.pl | 141 +++++++++++++++++++++-----------
>> 1 file changed, 92 insertions(+), 49 deletions(-)
> That's CRYPTOGAMS code, and is so far largely unchanged from the
> original. I see you've sent the same change to openssl, but it's not
> merged yet. Please document that in the change log; we want to keep the
> code in sync as much as possible, and document any divergences.
>
> cheers