[PATCH] LoongArch: vDSO: Tune the chacha20 implementation

public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH] LoongArch: vDSO: Tune the chacha20 implementation
@ 2024-09-19  9:13 Xi Ruoyao
  2024-09-20 15:11 ` Jason A. Donenfeld
  2024-09-23  7:15 ` Huacai Chen
  0 siblings, 2 replies; 6+ messages in thread
From: Xi Ruoyao @ 2024-09-19  9:13 UTC (permalink / raw)
  To: Jason A . Donenfeld, Huacai Chen, WANG Xuerui, Christophe Leroy
  Cc: linux-crypto, loongarch, linux-kernel, Jinyang He, Tiezhu Yang,
	Arnd Bergmann, Xi Ruoyao

As Christophe pointed out, tuning the chacha20 implementation by
scheduling the instructions like what GCC does can improve the
performance.

The tuning does not introduce too much complexity (basically it's just
reordering some instructions).  And the tuning does not hurt readability
too much: actually the tuned code looks even more similar to a
textbook-style implementation based on 128-bit vectors.  So overall it's
a good deal to me.

Tested with vdso_test_getchacha and benched with vdso_test_getrandom.
On a LA664 the speedup is 5%, and I expect a larger speedup on LA[2-4]64
with a lower issue rate.

Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Link: https://lore.kernel.org/all/77655d9e-fc05-4300-8f0d-7b2ad840d091@csgroup.eu/
Signed-off-by: Xi Ruoyao <xry111@xry111.site>
---
 arch/loongarch/vdso/vgetrandom-chacha.S | 92 +++++++++++++++----------
 1 file changed, 55 insertions(+), 37 deletions(-)

diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
index 7e86a50f6e85..0c5f1183c480 100644
--- a/arch/loongarch/vdso/vgetrandom-chacha.S
+++ b/arch/loongarch/vdso/vgetrandom-chacha.S
@@ -9,23 +9,11 @@
 
 .text
 
-/* Salsa20 quarter-round */
-.macro	QR	a b c d
-	add.w		\a, \a, \b
-	xor		\d, \d, \a
-	rotri.w		\d, \d, 16
-
-	add.w		\c, \c, \d
-	xor		\b, \b, \c
-	rotri.w		\b, \b, 20
-
-	add.w		\a, \a, \b
-	xor		\d, \d, \a
-	rotri.w		\d, \d, 24
-
-	add.w		\c, \c, \d
-	xor		\b, \b, \c
-	rotri.w		\b, \b, 25
+.macro	OP_4REG	op d0 d1 d2 d3 s0 s1 s2 s3
+	\op	\d0, \d0, \s0
+	\op	\d1, \d1, \s1
+	\op	\d2, \d2, \s2
+	\op	\d3, \d3, \s3
 .endm
 
 /*
@@ -74,6 +62,23 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
 /* Reuse i as copy3 */
 #define copy3		i
 
+/* Packs to be used with OP_4REG */
+#define line0		state0, state1, state2, state3
+#define line1		state4, state5, state6, state7
+#define line2		state8, state9, state10, state11
+#define line3		state12, state13, state14, state15
+
+#define	line1_perm	state5, state6, state7, state4
+#define	line2_perm	state10, state11, state8, state9
+#define	line3_perm	state15, state12, state13, state14
+
+#define	copy		copy0, copy1, copy2, copy3
+
+#define _16		16, 16, 16, 16
+#define _20		20, 20, 20, 20
+#define _24		24, 24, 24, 24
+#define _25		25, 25, 25, 25
+
 	/*
 	 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
 	 * This does not violate the stack-less requirement: no sensitive data
@@ -126,16 +131,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
 	li.w		i, 10
 .Lpermute:
 	/* odd round */
-	QR		state0, state4, state8, state12
-	QR		state1, state5, state9, state13
-	QR		state2, state6, state10, state14
-	QR		state3, state7, state11, state15
+	OP_4REG	add.w	line0, line1
+	OP_4REG	xor	line3, line0
+	OP_4REG	rotri.w	line3, _16
+
+	OP_4REG	add.w	line2, line3
+	OP_4REG	xor	line1, line2
+	OP_4REG	rotri.w	line1, _20
+
+	OP_4REG	add.w	line0, line1
+	OP_4REG	xor	line3, line0
+	OP_4REG	rotri.w	line3, _24
+
+	OP_4REG	add.w	line2, line3
+	OP_4REG	xor	line1, line2
+	OP_4REG	rotri.w	line1, _25
 
 	/* even round */
-	QR		state0, state5, state10, state15
-	QR		state1, state6, state11, state12
-	QR		state2, state7, state8, state13
-	QR		state3, state4, state9, state14
+	OP_4REG	add.w	line0, line1_perm
+	OP_4REG	xor	line3_perm, line0
+	OP_4REG	rotri.w	line3_perm, _16
+
+	OP_4REG	add.w	line2_perm, line3_perm
+	OP_4REG	xor	line1_perm, line2_perm
+	OP_4REG	rotri.w	line1_perm, _20
+
+	OP_4REG	add.w	line0, line1_perm
+	OP_4REG	xor	line3_perm, line0
+	OP_4REG	rotri.w	line3_perm, _24
+
+	OP_4REG	add.w	line2_perm, line3_perm
+	OP_4REG	xor	line1_perm, line2_perm
+	OP_4REG	rotri.w	line1_perm, _25
 
 	addi.w		i, i, -1
 	bnez		i, .Lpermute
@@ -147,10 +174,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
 	li.w		copy3, 0x6b206574
 
 	/* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
-	add.w		state0, state0, copy0
-	add.w		state1, state1, copy1
-	add.w		state2, state2, copy2
-	add.w		state3, state3, copy3
+	OP_4REG	add.w	line0, copy
 	st.w		state0, output, 0
 	st.w		state1, output, 4
 	st.w		state2, output, 8
@@ -165,10 +189,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
 	ld.w		state3, key, 12
 
 	/* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
-	add.w		state4, state4, state0
-	add.w		state5, state5, state1
-	add.w		state6, state6, state2
-	add.w		state7, state7, state3
+	OP_4REG	add.w	line1, line0
 	st.w		state4, output, 16
 	st.w		state5, output, 20
 	st.w		state6, output, 24
@@ -181,10 +202,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
 	ld.w		state3, key, 28
 
 	/* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
-	add.w		state8, state8, state0
-	add.w		state9, state9, state1
-	add.w		state10, state10, state2
-	add.w		state11, state11, state3
+	OP_4REG	add.w	line2, line0
 	st.w		state8, output, 32
 	st.w		state9, output, 36
 	st.w		state10, output, 40
-- 
2.46.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] LoongArch: vDSO: Tune the chacha20 implementation
  2024-09-19  9:13 [PATCH] LoongArch: vDSO: Tune the chacha20 implementation Xi Ruoyao
@ 2024-09-20 15:11 ` Jason A. Donenfeld
  2024-09-23  7:15 ` Huacai Chen
  1 sibling, 0 replies; 6+ messages in thread
From: Jason A. Donenfeld @ 2024-09-20 15:11 UTC (permalink / raw)
  To: Xi Ruoyao
  Cc: Huacai Chen, WANG Xuerui, Christophe Leroy, linux-crypto,
	loongarch, linux-kernel, Jinyang He, Tiezhu Yang, Arnd Bergmann

On Thu, Sep 19, 2024 at 05:13:59PM +0800, Xi Ruoyao wrote:
> As Christophe pointed out, tuning the chacha20 implementation by
> scheduling the instructions like what GCC does can improve the
> performance.
> 
> The tuning does not introduce too much complexity (basically it's just
> reordering some instructions).  And the tuning does not hurt readability
> too much: actually the tuned code looks even more similar to a
> textbook-style implementation based on 128-bit vectors.  So overall it's
> a good deal to me.
> 
> Tested with vdso_test_getchacha and benched with vdso_test_getrandom.
> On a LA664 the speedup is 5%, and I expect a larger speedup on LA[2-4]64
> with a lower issue rate.
> 
> Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> Link: https://lore.kernel.org/all/77655d9e-fc05-4300-8f0d-7b2ad840d091@csgroup.eu/
> Signed-off-by: Xi Ruoyao <xry111@xry111.site>

That seems like a reasonable optimization to me. I'll queue it up in
random.git and send it in my pull next week.

Thanks.

Jason

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] LoongArch: vDSO: Tune the chacha20 implementation
  2024-09-19  9:13 [PATCH] LoongArch: vDSO: Tune the chacha20 implementation Xi Ruoyao
  2024-09-20 15:11 ` Jason A. Donenfeld
@ 2024-09-23  7:15 ` Huacai Chen
  2024-09-23  8:06   ` Xi Ruoyao
  1 sibling, 1 reply; 6+ messages in thread
From: Huacai Chen @ 2024-09-23  7:15 UTC (permalink / raw)
  To: Xi Ruoyao
  Cc: Jason A . Donenfeld, WANG Xuerui, Christophe Leroy, linux-crypto,
	loongarch, linux-kernel, Jinyang He, Tiezhu Yang, Arnd Bergmann

Hi, Ruoyao,

On Thu, Sep 19, 2024 at 5:15 PM Xi Ruoyao <xry111@xry111.site> wrote:
>
> As Christophe pointed out, tuning the chacha20 implementation by
> scheduling the instructions like what GCC does can improve the
> performance.
>
> The tuning does not introduce too much complexity (basically it's just
> reordering some instructions).  And the tuning does not hurt readability
> too much: actually the tuned code looks even more similar to a
> textbook-style implementation based on 128-bit vectors.  So overall it's
> a good deal to me.
>
> Tested with vdso_test_getchacha and benched with vdso_test_getrandom.
> On a LA664 the speedup is 5%, and I expect a larger speedup on LA[2-4]64
> with a lower issue rate.
>
> Suggested-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> Link: https://lore.kernel.org/all/77655d9e-fc05-4300-8f0d-7b2ad840d091@csgroup.eu/
> Signed-off-by: Xi Ruoyao <xry111@xry111.site>
> ---
>  arch/loongarch/vdso/vgetrandom-chacha.S | 92 +++++++++++++++----------
>  1 file changed, 55 insertions(+), 37 deletions(-)
>
> diff --git a/arch/loongarch/vdso/vgetrandom-chacha.S b/arch/loongarch/vdso/vgetrandom-chacha.S
> index 7e86a50f6e85..0c5f1183c480 100644
> --- a/arch/loongarch/vdso/vgetrandom-chacha.S
> +++ b/arch/loongarch/vdso/vgetrandom-chacha.S
> @@ -9,23 +9,11 @@
>
>  .text
>
> -/* Salsa20 quarter-round */
> -.macro QR      a b c d
> -       add.w           \a, \a, \b
> -       xor             \d, \d, \a
> -       rotri.w         \d, \d, 16
> -
> -       add.w           \c, \c, \d
> -       xor             \b, \b, \c
> -       rotri.w         \b, \b, 20
> -
> -       add.w           \a, \a, \b
> -       xor             \d, \d, \a
> -       rotri.w         \d, \d, 24
> -
> -       add.w           \c, \c, \d
> -       xor             \b, \b, \c
> -       rotri.w         \b, \b, 25
> +.macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
> +       \op     \d0, \d0, \s0
> +       \op     \d1, \d1, \s1
> +       \op     \d2, \d2, \s2
> +       \op     \d3, \d3, \s3
>  .endm
>
>  /*
> @@ -74,6 +62,23 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>  /* Reuse i as copy3 */
>  #define copy3          i
>
> +/* Packs to be used with OP_4REG */
> +#define line0          state0, state1, state2, state3
> +#define line1          state4, state5, state6, state7
> +#define line2          state8, state9, state10, state11
> +#define line3          state12, state13, state14, state15
> +
> +#define        line1_perm      state5, state6, state7, state4
> +#define        line2_perm      state10, state11, state8, state9
> +#define        line3_perm      state15, state12, state13, state14
> +
> +#define        copy            copy0, copy1, copy2, copy3
The indentation here is strange, it seems some of them are spaces and
some of them are tabs.

Huacai

> +
> +#define _16            16, 16, 16, 16
> +#define _20            20, 20, 20, 20
> +#define _24            24, 24, 24, 24
> +#define _25            25, 25, 25, 25
> +
>         /*
>          * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
>          * This does not violate the stack-less requirement: no sensitive data
> @@ -126,16 +131,38 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>         li.w            i, 10
>  .Lpermute:
>         /* odd round */
> -       QR              state0, state4, state8, state12
> -       QR              state1, state5, state9, state13
> -       QR              state2, state6, state10, state14
> -       QR              state3, state7, state11, state15
> +       OP_4REG add.w   line0, line1
> +       OP_4REG xor     line3, line0
> +       OP_4REG rotri.w line3, _16
> +
> +       OP_4REG add.w   line2, line3
> +       OP_4REG xor     line1, line2
> +       OP_4REG rotri.w line1, _20
> +
> +       OP_4REG add.w   line0, line1
> +       OP_4REG xor     line3, line0
> +       OP_4REG rotri.w line3, _24
> +
> +       OP_4REG add.w   line2, line3
> +       OP_4REG xor     line1, line2
> +       OP_4REG rotri.w line1, _25
>
>         /* even round */
> -       QR              state0, state5, state10, state15
> -       QR              state1, state6, state11, state12
> -       QR              state2, state7, state8, state13
> -       QR              state3, state4, state9, state14
> +       OP_4REG add.w   line0, line1_perm
> +       OP_4REG xor     line3_perm, line0
> +       OP_4REG rotri.w line3_perm, _16
> +
> +       OP_4REG add.w   line2_perm, line3_perm
> +       OP_4REG xor     line1_perm, line2_perm
> +       OP_4REG rotri.w line1_perm, _20
> +
> +       OP_4REG add.w   line0, line1_perm
> +       OP_4REG xor     line3_perm, line0
> +       OP_4REG rotri.w line3_perm, _24
> +
> +       OP_4REG add.w   line2_perm, line3_perm
> +       OP_4REG xor     line1_perm, line2_perm
> +       OP_4REG rotri.w line1_perm, _25
>
>         addi.w          i, i, -1
>         bnez            i, .Lpermute
> @@ -147,10 +174,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>         li.w            copy3, 0x6b206574
>
>         /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
> -       add.w           state0, state0, copy0
> -       add.w           state1, state1, copy1
> -       add.w           state2, state2, copy2
> -       add.w           state3, state3, copy3
> +       OP_4REG add.w   line0, copy
>         st.w            state0, output, 0
>         st.w            state1, output, 4
>         st.w            state2, output, 8
> @@ -165,10 +189,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>         ld.w            state3, key, 12
>
>         /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
> -       add.w           state4, state4, state0
> -       add.w           state5, state5, state1
> -       add.w           state6, state6, state2
> -       add.w           state7, state7, state3
> +       OP_4REG add.w   line1, line0
>         st.w            state4, output, 16
>         st.w            state5, output, 20
>         st.w            state6, output, 24
> @@ -181,10 +202,7 @@ SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>         ld.w            state3, key, 28
>
>         /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
> -       add.w           state8, state8, state0
> -       add.w           state9, state9, state1
> -       add.w           state10, state10, state2
> -       add.w           state11, state11, state3
> +       OP_4REG add.w   line2, line0
>         st.w            state8, output, 32
>         st.w            state9, output, 36
>         st.w            state10, output, 40
> --
> 2.46.1
>
>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] LoongArch: vDSO: Tune the chacha20 implementation
  2024-09-23  7:15 ` Huacai Chen
@ 2024-09-23  8:06   ` Xi Ruoyao
  2024-09-23 12:48     ` Jason A. Donenfeld
  0 siblings, 1 reply; 6+ messages in thread
From: Xi Ruoyao @ 2024-09-23  8:06 UTC (permalink / raw)
  To: Huacai Chen, Jason A . Donenfeld
  Cc: WANG Xuerui, Christophe Leroy, linux-crypto, loongarch,
	linux-kernel, Jinyang He, Tiezhu Yang, Arnd Bergmann

On Mon, 2024-09-23 at 15:15 +0800, Huacai Chen wrote:
> > +#define line3          state12, state13, state14, state15
> > +
> > +#define        line1_perm      state5, state6, state7, state4
> > +#define        line2_perm      state10, state11, state8, state9
> > +#define        line3_perm      state15, state12, state13, state14
> > +
> > +#define        copy            copy0, copy1, copy2, copy3
> The indentation here is strange, it seems some of them are spaces and
> some of them are tabs.

Oops indeed.  The tabs after "#define" should be a space instead.

Jason: can you edit it for me or do you want a new revision of the patch
to fix it?


-- 
Xi Ruoyao <xry111@xry111.site>
School of Aerospace Science and Technology, Xidian University

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] LoongArch: vDSO: Tune the chacha20 implementation
  2024-09-23  8:06   ` Xi Ruoyao
@ 2024-09-23 12:48     ` Jason A. Donenfeld
  2024-09-24  7:12       ` Huacai Chen
  0 siblings, 1 reply; 6+ messages in thread
From: Jason A. Donenfeld @ 2024-09-23 12:48 UTC (permalink / raw)
  To: Xi Ruoyao
  Cc: Huacai Chen, WANG Xuerui, Christophe Leroy, linux-crypto,
	loongarch, linux-kernel, Jinyang He, Tiezhu Yang, Arnd Bergmann

On Mon, Sep 23, 2024 at 04:06:41PM +0800, Xi Ruoyao wrote:
> On Mon, 2024-09-23 at 15:15 +0800, Huacai Chen wrote:
> > > +#define line3          state12, state13, state14, state15
> > > +
> > > +#define        line1_perm      state5, state6, state7, state4
> > > +#define        line2_perm      state10, state11, state8, state9
> > > +#define        line3_perm      state15, state12, state13, state14
> > > +
> > > +#define        copy            copy0, copy1, copy2, copy3
> > The indentation here is strange, it seems some of them are spaces and
> > some of them are tabs.
> 
> Oops indeed.  The tabs after "#define" should be a space instead.
> 
> Jason: can you edit it for me or do you want a new revision of the patch

I've fixed it in tree.

Jason

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] LoongArch: vDSO: Tune the chacha20 implementation
  2024-09-23 12:48     ` Jason A. Donenfeld
@ 2024-09-24  7:12       ` Huacai Chen
  0 siblings, 0 replies; 6+ messages in thread
From: Huacai Chen @ 2024-09-24  7:12 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Xi Ruoyao, WANG Xuerui, Christophe Leroy, linux-crypto, loongarch,
	linux-kernel, Jinyang He, Tiezhu Yang, Arnd Bergmann

Reviewed-by: Huacai Chen <chenhuacai@loongson.cn>

On Mon, Sep 23, 2024 at 8:48 PM Jason A. Donenfeld <Jason@zx2c4.com> wrote:
>
> On Mon, Sep 23, 2024 at 04:06:41PM +0800, Xi Ruoyao wrote:
> > On Mon, 2024-09-23 at 15:15 +0800, Huacai Chen wrote:
> > > > +#define line3          state12, state13, state14, state15
> > > > +
> > > > +#define        line1_perm      state5, state6, state7, state4
> > > > +#define        line2_perm      state10, state11, state8, state9
> > > > +#define        line3_perm      state15, state12, state13, state14
> > > > +
> > > > +#define        copy            copy0, copy1, copy2, copy3
> > > The indentation here is strange, it seems some of them are spaces and
> > > some of them are tabs.
> >
> > Oops indeed.  The tabs after "#define" should be a space instead.
> >
> > Jason: can you edit it for me or do you want a new revision of the patch
>
> I've fixed it in tree.
>
> Jason

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2024-09-24  7:12 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-09-19  9:13 [PATCH] LoongArch: vDSO: Tune the chacha20 implementation Xi Ruoyao
2024-09-20 15:11 ` Jason A. Donenfeld
2024-09-23  7:15 ` Huacai Chen
2024-09-23  8:06   ` Xi Ruoyao
2024-09-23 12:48     ` Jason A. Donenfeld
2024-09-24  7:12       ` Huacai Chen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox