From: Michael Neuling <mikey@neuling.org>
To: Cyril Bur <cyrilbur@gmail.com>, linuxppc-dev@ozlabs.org
Cc: anton@samba.org
Subject: Re: [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec
Date: Thu, 10 Mar 2016 10:01:07 +1100 [thread overview]
Message-ID: <1457564467.17010.1.camel@neuling.org> (raw)
In-Reply-To: <1456811735-1289-1-git-send-email-cyrilbur@gmail.com>
On Tue, 2016-03-01 at 16:55 +1100, Cyril Bur wrote:
> Currently the assembly to save and restore Altivec registers boils down t=
o
> a load immediate of the offset of the specific Altivec register in memory
> followed by the load/store which repeats in sequence for each Altivec
> register.
>=20
> This patch attempts to do better by loading up four registers with
> immediates so that the loads and stores can be batched up and better
> pipelined by the processor.
>=20
> This patch results in four load/stores in sequence and one add between
> groups of four. Also, by using a pair of base registers it means that the
> result of the add is not needed by the following instruction.
What the performance improvement?
> Signed-off-by: Cyril Bur <cyrilbur@gmail.com>
> ---
> These patches need to be applied on to of my rework of FPU/VMX/VSX
> switching: https://patchwork.ozlabs.org/patch/589703/
>=20
> I left in some of my comments indicating if functions are called from C o=
r
> not. Looking at them now, they might be a bit much, let me know what you
> think.
I think that's ok, although they are likely to get stale quickly.
>=20
> Tested 64 bit BE and LE under KVM, not sure how I can test 32bit.
>=20
>=20
> arch/powerpc/include/asm/ppc_asm.h | 63 ++++++++++++++++++++++++++++++--=
------
> arch/powerpc/kernel/tm.S | 6 ++--
> arch/powerpc/kernel/vector.S | 20 +++++++++---
> 3 files changed, 70 insertions(+), 19 deletions(-)
>=20
> diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/as=
m/ppc_asm.h
> index 499d9f8..5ba69ed 100644
> --- a/arch/powerpc/include/asm/ppc_asm.h
> +++ b/arch/powerpc/include/asm/ppc_asm.h
> @@ -110,18 +110,57 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
> #define REST_16FPRS(n, base) REST_8FPRS(n, base); REST_8FPRS(n+8, ba=
se)
> #define REST_32FPRS(n, base) REST_16FPRS(n, base); REST_16FPRS(n+16,=
base)
> =20
> -#define SAVE_VR(n,b,base) li b,16*(n); stvx n,base,b
> -#define SAVE_2VRS(n,b,base) SAVE_VR(n,b,base); SAVE_VR(n+1,b,base)
> -#define SAVE_4VRS(n,b,base) SAVE_2VRS(n,b,base); SAVE_2VRS(n+2,b,bas=
e)
> -#define SAVE_8VRS(n,b,base) SAVE_4VRS(n,b,base); SAVE_4VRS(n+4,b,bas=
e)
> -#define SAVE_16VRS(n,b,base) SAVE_8VRS(n,b,base); SAVE_8VRS(n+8,b,ba=
se)
> -#define SAVE_32VRS(n,b,base) SAVE_16VRS(n,b,base); SAVE_16VRS(n+16,b=
,base)
> -#define REST_VR(n,b,base) li b,16*(n); lvx n,base,b
> -#define REST_2VRS(n,b,base) REST_VR(n,b,base); REST_VR(n+1,b,base)
> -#define REST_4VRS(n,b,base) REST_2VRS(n,b,base); REST_2VRS(n+2,b,bas=
e)
> -#define REST_8VRS(n,b,base) REST_4VRS(n,b,base); REST_4VRS(n+4,b,bas=
e)
> -#define REST_16VRS(n,b,base) REST_8VRS(n,b,base); REST_8VRS(n+8,b,ba=
se)
> -#define REST_32VRS(n,b,base) REST_16VRS(n,b,base); REST_16VRS(n+16,b=
,base)
Can you use consistent names between off and reg? in the below
> +#define __SAVE_4VRS(n,off0,off1,off2,off3,base) \
> + stvx n,base,off0; \
> + stvx n+1,base,off1; \
> + stvx n+2,base,off2; \
> + stvx n+3,base,off3
> +
> +/* Restores the base for the caller */
Can you make this:
/* Base: non-volatile, reg[0-4]: volatile */
> +#define SAVE_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> + addi reg4,base,64; \
> + li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> + __SAVE_4VRS(0,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __SAVE_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __SAVE_4VRS(8,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __SAVE_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __SAVE_4VRS(16,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __SAVE_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __SAVE_4VRS(24,reg0,reg1,reg2,reg3,base); \
> + __SAVE_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> + subi base,base,384
You can swap these last two lines which will make base reuse quicker
later. Although that might not be needed.
> +#define __REST_4VRS(n,off0,off1,off2,off3,base) \
> + lvx n,base,off0; \
> + lvx n+1,base,off1; \
> + lvx n+2,base,off2; \
> + lvx n+3,base,off3
> +
> +/* Restores the base for the caller */
> +#define REST_32VRS(reg0,reg1,reg2,reg3,reg4,base) \
> + addi reg4,base,64; \
> + li reg0,0; li reg1,16; li reg2,32; li reg3,48; \
> + __REST_4VRS(0,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __REST_4VRS(4,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __REST_4VRS(8,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __REST_4VRS(12,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __REST_4VRS(16,reg0,reg1,reg2,reg3,base); \
> + addi base,base,128; \
> + __REST_4VRS(20,reg0,reg1,reg2,reg3,reg4); \
> + addi reg4,reg4,128; \
> + __REST_4VRS(24,reg0,reg1,reg2,reg3,base); \
> + __REST_4VRS(28,reg0,reg1,reg2,reg3,reg4); \
> + subi base,base,384
> =20
> #ifdef __BIG_ENDIAN__
> #define STXVD2X_ROT(n,b,base) > STXVD2X(n,b,base)
> diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
> index bf8f34a..81e1305 100644
> --- a/arch/powerpc/kernel/tm.S
> +++ b/arch/powerpc/kernel/tm.S
> @@ -96,6 +96,8 @@ _GLOBAL(tm_abort)
> * they will abort back to the checkpointed state we save out here.
> *
> * Call with IRQs off, stacks get all out of sync for some periods in he=
re!
> + *
> + * Is called from C
> */
> _GLOBAL(tm_reclaim)
> mfcr r6
> @@ -151,7 +153,7 @@ _GLOBAL(tm_reclaim)
> beq dont_backup_vec
> =20
> addi r7, r3, THREAD_TRANSACT_VRSTATE
> - SAVE_32VRS(0, r6, r7) /* r6 scratch, r7 transact vr state */
> + SAVE_32VRS(r6,r8,r9,r10,r11,r7) /* r6,r8,r9,r10,r11 scratch, r7=
transact vr state */
Line wrapping here.
> mfvscr v0
> li r6, VRSTATE_VSCR
> stvx v0, r7, r6
> @@ -361,7 +363,7 @@ _GLOBAL(__tm_recheckpoint)
> li r5, VRSTATE_VSCR
> lvx v0, r8, r5
> mtvscr v0
> - REST_32VRS(0, r5, r8) /* r5 scratch, r8 ptr */
> + REST_32VRS(r5,r6,r9,r10,r11,r8) /* r5,r6,r9,r10,r11 scrat=
ch, r8 ptr */
wrapping here too
> dont_restore_vec:
> ld r5, THREAD_VRSAVE(r3)
> mtspr SPRN_VRSAVE, r5
> diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
> index 1c2e7a3..8d587fb 100644
> --- a/arch/powerpc/kernel/vector.S
> +++ b/arch/powerpc/kernel/vector.S
> @@ -13,6 +13,8 @@
> * This is similar to load_up_altivec but for the transactional version =
of the
> * vector regs. It doesn't mess with the task MSR or valid flags.
> * Furthermore, VEC laziness is not supported with TM currently.
> + *
> + * Is called from C
> */
> _GLOBAL(do_load_up_transact_altivec)
> mfmsr r6
> @@ -27,7 +29,7 @@ _GLOBAL(do_load_up_transact_altivec)
> lvx v0,r10,r3
> mtvscr v0
> addi r10,r3,THREAD_TRANSACT_VRSTATE
> - REST_32VRS(0,r4,r10)
> + REST_32VRS(r4,r5,r6,r7,r8,r10)
> =20
> blr
> #endif
> @@ -35,20 +37,24 @@ _GLOBAL(do_load_up_transact_altivec)
> /*
> * Load state from memory into VMX registers including VSCR.
> * Assumes the caller has enabled VMX in the MSR.
> + *
> + * Is called from C
> */
> _GLOBAL(load_vr_state)
> li r4,VRSTATE_VSCR
> lvx v0,r4,r3
> mtvscr v0
> - REST_32VRS(0,r4,r3)
> + REST_32VRS(r4,r5,r6,r7,r8,r3)
> blr
> =20
> /*
> * Store VMX state into memory, including VSCR.
> * Assumes the caller has enabled VMX in the MSR.
> + *
> + * NOT called from C
> */
> _GLOBAL(store_vr_state)
> - SAVE_32VRS(0, r4, r3)
> + SAVE_32VRS(r4,r5,r6,r7,r8,r3)
> mfvscr v0
> li r4, VRSTATE_VSCR
> stvx v0, r4, r3
> @@ -63,6 +69,8 @@ _GLOBAL(store_vr_state)
> *
> * Note that on 32-bit this can only use registers that will be
> * restored by fast_exception_return, i.e. r3 - r6, r10 and r11.
> + *
> + * NOT called from C
> */
> _GLOBAL(load_up_altivec)
> mfmsr r5 /* grab the current MSR */
> @@ -101,13 +109,15 @@ _GLOBAL(load_up_altivec)
> stw r4,THREAD_USED_VR(r5)
> lvx v0,r10,r6
> mtvscr v0
> - REST_32VRS(0,r4,r6)
> + REST_32VRS(r3,r4,r5,r10,r11,r6)
> /* restore registers and return */
> blr
> =20
> /*
> * save_altivec(tsk)
> * Save the vector registers to its thread_struct
> + *
> + * Is called from C
> */
> _GLOBAL(save_altivec)
> addi r3,r3,THREAD > /* want THREAD of task */
> @@ -116,7 +126,7 @@ _GLOBAL(save_altivec)
> PPC_LCMPI 0,r7,0
> bne 2f
> addi r7,r3,THREAD_VRSTATE
> -2: SAVE_32VRS(0,r4,r7)
> +2: SAVE_32VRS(r4,r5,r6,r8,r9,r7)
> mfvscr v0
> li r4,VRSTATE_VSCR
> stvx v0,r4,r7
next prev parent reply other threads:[~2016-03-09 23:01 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-03-01 5:55 [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Cyril Bur
2016-03-01 5:55 ` [PATCH 2/2] powerpc: Batch up loads/stores on saving and restoring VSX Cyril Bur
2016-03-10 0:09 ` Michael Neuling
2016-03-10 1:02 ` Cyril Bur
2016-03-09 23:01 ` Michael Neuling [this message]
2016-03-10 0:56 ` [PATCH 1/2] powerpc: Batch up loads/stores on saving and restoring Altivec Cyril Bur
2016-03-10 5:37 ` Cyril Bur
2016-03-11 4:25 ` Cyril Bur
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1457564467.17010.1.camel@neuling.org \
--to=mikey@neuling.org \
--cc=anton@samba.org \
--cc=cyrilbur@gmail.com \
--cc=linuxppc-dev@ozlabs.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.