From: Michael Neuling <mikey@neuling.org>
To: Cyril Bur <cyrilbur@gmail.com>, linuxppc-dev@ozlabs.org
Cc: anton@samba.org
Subject: Re: [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec
Date: Thu, 10 Mar 2016 10:01:07 +1100
Message-ID: <1457564467.17010.1.camel@neuling.org>
In-Reply-To: <1456811735-1289-1-git-send-email-cyrilbur@gmail.com>
On Tue, 2016-03-01 at 16:55 +1100, Cyril Bur wrote:
> Currently the assembly to save and restore Altivec registers boils down to
> a load immediate of the offset of the specific Altivec register in memory
> followed by the load/store which repeats in sequence for each Altivec
> register.
>
> This patch attempts to do better by loading up four registers with
> immediates so that the loads and stores can be batched up and better
> pipelined by the processor.
>
> This patch results in four load/stores in sequence and one add between
> groups of four. Also, by using a pair of base registers it means that the
> result of the add is not needed by the following instruction.
What's the performance improvement?
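FWIW, expanding the macros below (register numbers taken from the tm.S
hunk, so treat this as an untested sketch of my reading rather than the
exact generated code), the save sequence goes roughly from one li
feeding each stvx:

	li	r6,0
	stvx	v0,r7,r6
	li	r6,16
	stvx	v1,r7,r6
	...

to the offsets loaded once up front and the stores issued in groups of
four, alternating between the two base registers so no stvx has to wait
on the addi immediately before it:

	addi	r11,r7,64
	li	r6,0; li r8,16; li r9,32; li r10,48
	stvx	v0,r7,r6
	stvx	v1,r7,r8
	stvx	v2,r7,r9
	stvx	v3,r7,r10
	addi	r7,r7,128
	stvx	v4,r11,r6
	stvx	v5,r11,r8
	...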
> Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> ---
> These patches need to be applied on top of my rework of FPU/VMX/VSX
> switching: https://patchwork.ozlabs.org/patch/589703/
>
> I left in some of my comments indicating if functions are called from C or
> not. Looking at them now, they might be a bit much, let me know what you
> think.
I think that's ok, although they are likely to get stale quickly.
>
> Tested 64 bit BE and LE under KVM, not sure how I can test 32bit.
>
>
> arch/powerpc/include/asm/ppc_asm.h | 63 ++++++++++++++++++++++++++++++--------
> arch/powerpc/kernel/tm.S | 6 ++--
> arch/powerpc/kernel/vector.S | 20 +++++++++---
> 3 files changed, 70 insertions(+), 19 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h
> index 499d9f8..5ba69ed 100644
> --- a/arch/powerpc/include/asm/ppc_asm.h
> +++ b/arch/powerpc/include/asm/ppc_asm.h
> @@ -110,18 +110,57 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> #define REST_16FPRS(n, base) REST_8FPRS(n, base); REST_8FPRS(n+8, base)
> #define REST_32FPRS(n, base) REST_16FPRS(n, base); REST_16FPRS(n+16, base)
>
> -#define SAVE_VR(n,b,base) li b,16*(n); stvx n,base,b
> -#define SAVE_2VRS(n,b,base) SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
> -#define SAVE_4VRS(n,b,base) SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,base)
> -#define SAVE_8VRS(n,b,base) SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,base)
> -#define SAVE_16VRS(n,b,base) SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,base)
> -#define SAVE_32VRS(n,b,base) SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b,base)
> -#define REST_VR(n,b,base) li b,16*(n); lvx n,base,b
> -#define REST_2VRS(n,b,base) REST_VR(n,b,base); REST_VR(n+1,b,base)
> -#define REST_4VRS(n,b,base) REST_2VRS(n,b,base); REST_2VRS(n+2,b,base)
> -#define REST_8VRS(n,b,base) REST_4VRS(n,b,base); REST_4VRS(n+4,b,base)
> -#define REST_16VRS(n,b,base) REST_8VRS(n,b,base); REST_8VRS(n+8,b,base)
> -#define REST_32VRS(n,b,base) REST_16VRS(n,b,base); REST_16VRS(n+16,b,base)
Can you use consistent names between off and reg in the code below?
> +#define __SAVE_4VRS(n,off0,off1,off2,off3,base) \
> + stvx n,base,off0; \
> + stvx n+1,base,off1; \
> + stvx n+2,base,off2; \
> + stvx n+3,base,off3
> +
> +/* Restores the base for the caller */
Can you make this:
/* Base: non-volatile, reg[0-4]: volatile */
> +#define SAVE_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> + addi reg4,base,64; \
> + li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> + __SAVE_4VRS(0,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __SAVE_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __SAVE_4VRS(8,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __SAVE_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __SAVE_4VRS(16,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __SAVE_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
> + __SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> + subi base,base,384
You can swap these last two lines, which will make base available again
sooner for the code after the macro. Although that might not be needed.
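i.e. the tail would become something like this (untested, just to
illustrate; the subi only touches base and the last group of stores only
uses reg4, so reordering them should be safe):

	__SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
	subi	base,base,384; \
	__SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4)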
> +#define __REST_4VRS(n,off0,off1,off2,off3,base) \
> + lvx n,base,off0; \
> + lvx n+1,base,off1; \
> + lvx n+2,base,off2; \
> + lvx n+3,base,off3
> +
> +/* Restores the base for the caller */
> +#define REST_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> + addi reg4,base,64; \
> + li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> + __REST_4VRS(0,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __REST_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __REST_4VRS(8,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __REST_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __REST_4VRS(16,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __REST_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __REST_4VRS(24,reg0,reg1,reg2,reg3,base); \
> + __REST_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> + subi base,base,384
>
> #ifdef __BIG_ENDIAN__
> #define STXVD2X_ROT(n,b,base) STXVD2X(n,b,base)
> diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
> index bf8f34a..81e1305 100644
> --- a/arch/powerpc/kernel/tm.S
> +++ b/arch/powerpc/kernel/tm.S
> @@ -96,6 +96,8 @@ _GLOBAL(tm_abort)
> * they will abort back to the checkpointed state we save out here.
> *
> * Call with IRQs off, stacks get all out of sync for some periods in here!
> + *
> + * Is called from C
> */
> _GLOBAL(tm_reclaim)
> mfcr r6
> @@ -151,7 +153,7 @@ _GLOBAL(tm_reclaim)
> beq dont_backup_vec
>
> addi r7, r3, THREAD_TRANSACT_VRSTATE
> - SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */
> + SAVE_32VRS(r6,r8,r9,r10,r11,r7) /* r6,r8,r9,r10,r11 scratch, r7 transact vr state */
Line wrapping here.
> mfvscr v0
> li r6, VRSTATE_VSCR
> stvx v0, r7, r6
> @@ -361,7 +363,7 @@ _GLOBAL(__tm_recheckpoint)
> li r5, VRSTATE_VSCR
> lvx v0, r8, r5
> mtvscr v0
> - REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */
> + REST_32VRS(r5,r6,r9,r10,r11,r8) /* r5,r6,r9,r10,r11 scratch, r8 ptr */
wrapping here too
> dont_restore_vec:
> ld r5, THREAD_VRSAVE(r3)
> mtspr SPRN_VRSAVE, r5
> diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
> index 1c2e7a3..8d587fb 100644
> --- a/arch/powerpc/kernel/vector.S
> +++ b/arch/powerpc/kernel/vector.S
> @@ -13,6 +13,8 @@
> * This is similar to load_up_altivec but for the transactional version of the
> * vector regs. It doesn't mess with the task MSR or valid flags.
> * Furthermore, VEC laziness is not supported with TM currently.
> + *
> + * Is called from C
> */
> _GLOBAL(do_load_up_transact_altivec)
> mfmsr r6
> @@ -27,7 +29,7 @@ _GLOBAL(do_load_up_transact_altivec)
> lvx v0,r10,r3
> mtvscr v0
> addi r10,r3,THREAD_TRANSACT_VRSTATE
> - REST_32VRS(0,r4,r10)
> + REST_32VRS(r4,r5,r6,r7,r8,r10)
>
> blr
> #endif
> @@ -35,20 +37,24 @@ _GLOBAL(do_load_up_transact_altivec)
> /*
> * Load state from memory into VMX registers including VSCR.
> * Assumes the caller has enabled VMX in the MSR.
> + *
> + * Is called from C
> */
> _GLOBAL(load_vr_state)
> li r4,VRSTATE_VSCR
> lvx v0,r4,r3
> mtvscr v0
> - REST_32VRS(0,r4,r3)
> + REST_32VRS(r4,r5,r6,r7,r8,r3)
> blr
>
> /*
> * Store VMX state into memory, including VSCR.
> * Assumes the caller has enabled VMX in the MSR.
> + *
> + * NOT called from C
> */
> _GLOBAL(store_vr_state)
> - SAVE_32VRS(0, r4, r3)
> + SAVE_32VRS(r4,r5,r6,r7,r8,r3)
> mfvscr v0
> li r4, VRSTATE_VSCR
> stvx v0, r4, r3
> @@ -63,6 +69,8 @@ _GLOBAL(store_vr_state)
> *
> * Note that on 32-bit this can only use registers that will be
> * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
> + *
> + * NOT called from C
> */
> _GLOBAL(load_up_altivec)
> mfmsr r5 /* grab the current MSR */
> @@ -101,13 +109,15 @@ _GLOBAL(load_up_altivec)
> stw r4,THREAD_USED_VR(r5)
> lvx v0,r10,r6
> mtvscr v0
> - REST_32VRS(0,r4,r6)
> + REST_32VRS(r3,r4,r5,r10,r11,r6)
> /* restore registers and return */
> blr
>
> /*
> * save_altivec(tsk)
> * Save the vector registers to its thread_struct
> + *
> + * Is called from C
> */
> _GLOBAL(save_altivec)
> addi r3,r3,THREAD /* want THREAD of task */
> @@ -116,7 +126,7 @@ _GLOBAL(save_altivec)
> PPC_LCMPI 0,r7,0
> bne 2f
> addi r7,r3,THREAD_VRSTATE
> -2: SAVE_32VRS(0,r4,r7)
> +2: SAVE_32VRS(r4,r5,r6,r8,r9,r7)
> mfvscr v0
> li r4,VRSTATE_VSCR
> stvx v0,r4,r7