Re: [PATCH] MIPS: MSA: misaligned support

All of lore.kernel.org
 help / color / mirror / Atom feed

From: James Hogan <james.hogan@imgtec.com>
To: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>,
	<linux-mips@linux-mips.org>, <wangr@lemote.com>,
	<peterz@infradead.org>, <qais.yousef@imgtec.com>,
	<linux-kernel@vger.kernel.org>, <ralf@linux-mips.org>,
	<davidlohr@hp.com>, <chenhc@lemote.com>, <manuel.lauss@gmail.com>,
	<mingo@kernel.org>
Subject: Re: [PATCH] MIPS: MSA: misaligned support
Date: Wed, 18 Mar 2015 11:27:25 +0000	[thread overview]
Message-ID: <5509611D.80404@imgtec.com> (raw)
In-Reply-To: <20150318011630.2702.28882.stgit@ubuntu-yegoshin>

[-- Attachment #1: Type: text/plain, Size: 13353 bytes --]

Hi Leonid,

On 18/03/15 01:16, Leonid Yegoshin wrote:
> MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address exception
> on unalaigned MSA load/store operations - from none unaligned up to

unaligned

> full support in HW. In practice, it is expected that HW can occasionally
> triggers AdE for non-aligned data access (misalignment). It is usually
> expected on page boundaries because HW handling of two TLBs in single
> data access operation may be complicated and expensive.
> 
> So, this patch handles MSA LD.df and ST.df Address Error exceptions.
> 
> It handles separately two cases - MSA owned by thread and MSA registers
> saved in current->thread.fpu. If thread still ownes MSA unit then it

owns

> loads and stores directly with MSA unit and only one MSA register. Saving
> and restoring the full MSA context (512bytes) on each misalign exception
> is expensive! Preemption is disabled, of course.
> 
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
> ---
>  arch/mips/include/asm/processor.h |    2 +
>  arch/mips/include/uapi/asm/inst.h |   21 +++++
>  arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
>  arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 273 insertions(+)
> 
> diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
> index f1df4cb4a286..af2675060244 100644
> --- a/arch/mips/include/asm/processor.h
> +++ b/arch/mips/include/asm/processor.h
> @@ -104,6 +104,8 @@ extern unsigned int vced_count, vcei_count;
>  #endif
>  
>  union fpureg {
> +	__u8    val8[FPU_REG_WIDTH / 8];
> +	__u16   val16[FPU_REG_WIDTH / 16];
>  	__u32	val32[FPU_REG_WIDTH / 32];
>  	__u64	val64[FPU_REG_WIDTH / 64];
>  };
> diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
> index 89c22433b1c6..7ab6987cb7d5 100644
> --- a/arch/mips/include/uapi/asm/inst.h
> +++ b/arch/mips/include/uapi/asm/inst.h
> @@ -58,6 +58,7 @@ enum spec_op {
>  	dsll_op, spec7_unused_op, dsrl_op, dsra_op,
>  	dsll32_op, spec8_unused_op, dsrl32_op, dsra32_op
>  };
> +#define msa_op  mdmx_op
>  
>  /*
>   * func field of spec2 opcode.
> @@ -217,6 +218,14 @@ enum bshfl_func {
>  };
>  
>  /*
> + * func field for MSA MI10 format
> + */
> +enum msa_mi10_func {
> +	msa_ld_op = 8,
> +	msa_st_op = 9,

Most other opcode enumerations in this file are specified in hexadecimal.

> +};
> +
> +/*
>   * (microMIPS) Major opcodes.
>   */
>  enum mm_major_op {
> @@ -616,6 +625,17 @@ struct spec3_format {   /* SPEC3 */
>  	;)))))
>  };
>  
> +struct msa_mi10_format {        /* MSA */
> +	__BITFIELD_FIELD(unsigned int opcode : 6,
> +	__BITFIELD_FIELD(signed int s10 : 10,
> +	__BITFIELD_FIELD(unsigned int rs : 5,
> +	__BITFIELD_FIELD(unsigned int wd : 5,
> +	__BITFIELD_FIELD(unsigned int func : 4,
> +	__BITFIELD_FIELD(unsigned int df : 2,
> +	;))))))
> +};
> +
> +
>  /*
>   * microMIPS instruction formats (32-bit length)
>   *
> @@ -884,6 +904,7 @@ union mips_instruction {
>  	struct p_format p_format;
>  	struct f_format f_format;
>  	struct ma_format ma_format;
> +	struct msa_mi10_format msa_mi10_format;
>  	struct b_format b_format;
>  	struct ps_format ps_format;
>  	struct v_format v_format;
> diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
> index 6c160c67984c..5f48f45f81e7 100644
> --- a/arch/mips/kernel/r4k_fpu.S
> +++ b/arch/mips/kernel/r4k_fpu.S
> @@ -13,6 +13,7 @@
>   * Copyright (C) 1999, 2001 Silicon Graphics, Inc.
>   */
>  #include <asm/asm.h>
> +#include <asm/asmmacro.h>
>  #include <asm/errno.h>
>  #include <asm/fpregdef.h>
>  #include <asm/mipsregs.h>
> @@ -268,6 +269,112 @@ LEAF(_restore_fp_context32)
>  	END(_restore_fp_context32)
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +
> +	.macro  msa_ld_d    wd, base
> +	ld_d    \wd, 0, \base
> +	jalr    $0, $31

Why not just:
	jr	ra

like every other function in that file? I hope jr would be encoded
correctly on r6 automatically?

> +	  nop

I think a single extra space of indentation for delay slots is the
convention, rather than 2. Same below.

> +	.align  4

Doesn't this mean the first one & label might not be suitably aligned?
Would it be better to put this before the ld_d (no need for it after the
$w31 case) and put another .align 4 before the Lmsa_to and Lmsa_from
labels (so the label itself is aligned)?

> +	.endm
> +
> +	.macro  msa_st_d    wd, base
> +	st_d    \wd, 0, \base
> +	jalr    $0, $31
> +	  nop
> +	.align  4

same comments as above.

> +	.endm
> +
> +LEAF(msa_to_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_to
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0

Likewise here, "jr t0"? and same for msa_from_wd

> +	  nop
> +Lmsa_to:
> +	msa_ld_d    0, a1
> +	msa_ld_d    1, a1
> +	msa_ld_d    2, a1
> +	msa_ld_d    3, a1
> +	msa_ld_d    4, a1
> +	msa_ld_d    5, a1
> +	msa_ld_d    6, a1
> +	msa_ld_d    7, a1
> +	msa_ld_d    8, a1
> +	msa_ld_d    9, a1
> +	msa_ld_d    10, a1
> +	msa_ld_d    11, a1
> +	msa_ld_d    12, a1
> +	msa_ld_d    13, a1
> +	msa_ld_d    14, a1
> +	msa_ld_d    15, a1
> +	msa_ld_d    16, a1
> +	msa_ld_d    17, a1
> +	msa_ld_d    18, a1
> +	msa_ld_d    19, a1
> +	msa_ld_d    20, a1
> +	msa_ld_d    21, a1
> +	msa_ld_d    22, a1
> +	msa_ld_d    23, a1
> +	msa_ld_d    24, a1
> +	msa_ld_d    25, a1
> +	msa_ld_d    26, a1
> +	msa_ld_d    27, a1
> +	msa_ld_d    28, a1
> +	msa_ld_d    29, a1
> +	msa_ld_d    30, a1
> +	msa_ld_d    31, a1
> +	.set    pop
> +	END(msa_to_wd)
> +
> +LEAF(msa_from_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_from
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0
> +	  nop
> +Lmsa_from:
> +	msa_st_d    0, a1
> +	msa_st_d    1, a1
> +	msa_st_d    2, a1
> +	msa_st_d    3, a1
> +	msa_st_d    4, a1
> +	msa_st_d    5, a1
> +	msa_st_d    6, a1
> +	msa_st_d    7, a1
> +	msa_st_d    8, a1
> +	msa_st_d    9, a1
> +	msa_st_d    10, a1
> +	msa_st_d    11, a1
> +	msa_st_d    12, a1
> +	msa_st_d    13, a1
> +	msa_st_d    14, a1
> +	msa_st_d    15, a1
> +	msa_st_d    16, a1
> +	msa_st_d    17, a1
> +	msa_st_d    18, a1
> +	msa_st_d    19, a1
> +	msa_st_d    20, a1
> +	msa_st_d    21, a1
> +	msa_st_d    22, a1
> +	msa_st_d    23, a1
> +	msa_st_d    24, a1
> +	msa_st_d    25, a1
> +	msa_st_d    26, a1
> +	msa_st_d    27, a1
> +	msa_st_d    28, a1
> +	msa_st_d    29, a1
> +	msa_st_d    30, a1
> +	msa_st_d    31, a1
> +	.set    pop
> +	END(msa_from_wd)
> +
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	.set	reorder
>  
>  	.type	fault@function
> diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
> index e11906dff885..558f41fa93c5 100644
> --- a/arch/mips/kernel/unaligned.c
> +++ b/arch/mips/kernel/unaligned.c
> @@ -108,6 +108,11 @@ static u32 unaligned_action;
>  #endif
>  extern void show_registers(struct pt_regs *regs);
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +void msa_to_wd(unsigned int wd, union fpureg *from);
> +void msa_from_wd(unsigned int wd, union fpureg *to);
> +#endif
> +
>  #ifdef __BIG_ENDIAN
>  #define     LoadHW(addr, value, res)  \
>  		__asm__ __volatile__ (".set\tnoat\n"        \
> @@ -422,6 +427,64 @@ extern void show_registers(struct pt_regs *regs);
>  		: "r" (value), "r" (addr), "i" (-EFAULT));
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +#ifdef __BIG_ENDIAN
> +/*
> + * MSA data format conversion.
> + * Only for BIG ENDIAN - LITTLE ENDIAN has register format which matches memory
> + * layout contiguosly.

contiguously

> + *
> + * Conversion is done between two Double words and other formats (W/H/B)
> + * because kernel uses LD.D and ST.D to load/store MSA registers and keeps
> + * MSA registers in this format in current->thread.fpu.fpr
> + */
> +static void msa_convert(union fpureg *to, union fpureg *from, int fmt)
> +{
> +	switch (fmt) {
> +	case 0: /* byte */
> +		to->val8[0] = from->val8[7];
> +		to->val8[1] = from->val8[6];
> +		to->val8[2] = from->val8[5];
> +		to->val8[3] = from->val8[4];
> +		to->val8[4] = from->val8[3];
> +		to->val8[5] = from->val8[2];
> +		to->val8[6] = from->val8[1];
> +		to->val8[7] = from->val8[0];
> +		to->val8[8] = from->val8[15];
> +		to->val8[9] = from->val8[14];
> +		to->val8[10] = from->val8[13];
> +		to->val8[11] = from->val8[12];
> +		to->val8[12] = from->val8[11];
> +		to->val8[13] = from->val8[10];
> +		to->val8[14] = from->val8[9];
> +		to->val8[15] = from->val8[8];
> +		break;
> +
> +	case 1: /* halfword */
> +		to->val16[0] = from->val16[3];
> +		to->val16[1] = from->val16[2];
> +		to->val16[2] = from->val16[1];
> +		to->val16[3] = from->val16[0];
> +		to->val16[4] = from->val16[7];
> +		to->val16[5] = from->val16[6];
> +		to->val16[6] = from->val16[5];
> +		to->val16[7] = from->val16[4];
> +		break;
> +
> +	case 2: /* word */
> +		to->val32[0] = from->val32[1];
> +		to->val32[1] = from->val32[0];
> +		to->val32[2] = from->val32[3];
> +		to->val32[3] = from->val32[2];

FWIW since the FP/MSA patches that Paul submitted, there are also
working endian agnostic accessors created with BUILD_FPR_ACCESS, which
use the FPR_IDX macro (see http://patchwork.linux-mips.org/patch/9169/),
which should work for 8bit and 16bit sizes too.

I wonder if the compiler would unroll/optimise this sort of thing:
	for (i = 0; i < (FPU_REG_WIDTH / 8); ++i)
		to->val8[i] = from->val8[FPR_IDX(8, i)];

No worries if not.

> +		break;
> +
> +	case 3: /* doubleword, no conversion */
> +		break;

don't you still need to copy the value though?

> +	}
> +}
> +#endif
> +#endif
> +
>  static void emulate_load_store_insn(struct pt_regs *regs,
>  	void __user *addr, unsigned int __user *pc)
>  {
> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  #ifdef	CONFIG_EVA
>  	mm_segment_t seg;
>  #endif
> +#ifdef CONFIG_CPU_HAS_MSA
> +	union fpureg msadatabase[2], *msadata;
> +	unsigned int func, df, rs, wd;
> +#endif
>  	origpc = (unsigned long)pc;
>  	orig31 = regs->regs[31];
>  
> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  			break;
>  		return;
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +	case msa_op:
> +		if (cpu_has_mdmx)
> +			goto sigill;
> +
> +		func = insn.msa_mi10_format.func;
> +		switch (func) {
> +		default:
> +			goto sigbus;
> +
> +		case msa_ld_op:
> +		case msa_st_op:
> +			;
> +		}
> +
> +		if (!thread_msa_context_live())
> +			goto sigbus;

Will this ever happen? (I can't see AdE handler enabling interrupts).

If the MSA context genuinely isn't live (i.e. it can be considered
UNPREDICTABLE), then surely a load operation should still succeed?

> +
> +		df = insn.msa_mi10_format.df;
> +		rs = insn.msa_mi10_format.rs;
> +		wd = insn.msa_mi10_format.wd;
> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));

"* (1 << df)"?
why not just "<< df"?

> +		/* align a working space in stack... */
> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);

Maybe you could just use __aligned(16) on a single local union fpureg.

> +		if (func == msa_ld_op) {
> +			if (!access_ok(VERIFY_READ, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);
> +			res = __copy_from_user_inatomic(msadata, addr, 16);
> +			if (res)
> +				goto fault;
> +			preempt_disable();
> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +				msa_to_wd(wd, &current->thread.fpu.fpr[wd]);
> +#else
> +				msa_to_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +#else
> +				current->thread.fpu.fpr[wd] = *msadata;
> +#endif

I'm not a fan of the ifdefs, but I can see it's awkward to abstract
msa_convert without causing extra copies (although I don't think it's a
critical code path).

> +			}
> +		} else {
> +			if (!access_ok(VERIFY_WRITE, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);

forgot to preempt_disable()?

> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				msa_from_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				*msadata = current->thread.fpu.fpr[wd];

hmm, you could cheat and change this to the following?:
				msadata = &current->thread.fpu.fpr[wd];

> +#endif
> +			}
> +			res = __copy_to_user_inatomic(addr, msadata, 16);
> +			if (res)
> +				goto fault;
> +		}
> +
> +		break;
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	/*
>  	 * COP2 is available to implementor for application specific use.
>  	 * It's up to applications to register a notifier chain and do
> 
> 

Cheers
James


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

WARNING: multiple messages have this Message-ID (diff)

From: James Hogan <james.hogan@imgtec.com>
To: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>,
	linux-mips@linux-mips.org, wangr@lemote.com,
	peterz@infradead.org, qais.yousef@imgtec.com,
	linux-kernel@vger.kernel.org, ralf@linux-mips.org,
	davidlohr@hp.com, chenhc@lemote.com, manuel.lauss@gmail.com,
	mingo@kernel.org
Subject: Re: [PATCH] MIPS: MSA: misaligned support
Date: Wed, 18 Mar 2015 11:27:25 +0000	[thread overview]
Message-ID: <5509611D.80404@imgtec.com> (raw)
Message-ID: <20150318112725.tGxHc95bo5ZY4f2Npd-_-L0sW_CIeaFlKc-r_lgFcBQ@z> (raw)
In-Reply-To: <20150318011630.2702.28882.stgit@ubuntu-yegoshin>

[-- Attachment #1: Type: text/plain, Size: 13353 bytes --]

Hi Leonid,

On 18/03/15 01:16, Leonid Yegoshin wrote:
> MIPS R5, MIPS R6 and MSA HW specs allow a broad range of address exception
> on unalaigned MSA load/store operations - from none unaligned up to

unaligned

> full support in HW. In practice, it is expected that HW can occasionally
> triggers AdE for non-aligned data access (misalignment). It is usually
> expected on page boundaries because HW handling of two TLBs in single
> data access operation may be complicated and expensive.
> 
> So, this patch handles MSA LD.df and ST.df Address Error exceptions.
> 
> It handles separately two cases - MSA owned by thread and MSA registers
> saved in current->thread.fpu. If thread still ownes MSA unit then it

owns

> loads and stores directly with MSA unit and only one MSA register. Saving
> and restoring the full MSA context (512bytes) on each misalign exception
> is expensive! Preemption is disabled, of course.
> 
> Signed-off-by: Leonid Yegoshin <Leonid.Yegoshin@imgtec.com>
> ---
>  arch/mips/include/asm/processor.h |    2 +
>  arch/mips/include/uapi/asm/inst.h |   21 +++++
>  arch/mips/kernel/r4k_fpu.S        |  107 ++++++++++++++++++++++++++++
>  arch/mips/kernel/unaligned.c      |  143 +++++++++++++++++++++++++++++++++++++
>  4 files changed, 273 insertions(+)
> 
> diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h
> index f1df4cb4a286..af2675060244 100644
> --- a/arch/mips/include/asm/processor.h
> +++ b/arch/mips/include/asm/processor.h
> @@ -104,6 +104,8 @@ extern unsigned int vced_count, vcei_count;
>  #endif
>  
>  union fpureg {
> +	__u8    val8[FPU_REG_WIDTH / 8];
> +	__u16   val16[FPU_REG_WIDTH / 16];
>  	__u32	val32[FPU_REG_WIDTH / 32];
>  	__u64	val64[FPU_REG_WIDTH / 64];
>  };
> diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
> index 89c22433b1c6..7ab6987cb7d5 100644
> --- a/arch/mips/include/uapi/asm/inst.h
> +++ b/arch/mips/include/uapi/asm/inst.h
> @@ -58,6 +58,7 @@ enum spec_op {
>  	dsll_op, spec7_unused_op, dsrl_op, dsra_op,
>  	dsll32_op, spec8_unused_op, dsrl32_op, dsra32_op
>  };
> +#define msa_op  mdmx_op
>  
>  /*
>   * func field of spec2 opcode.
> @@ -217,6 +218,14 @@ enum bshfl_func {
>  };
>  
>  /*
> + * func field for MSA MI10 format
> + */
> +enum msa_mi10_func {
> +	msa_ld_op = 8,
> +	msa_st_op = 9,

Most other opcode enumerations in this file are specified in hexadecimal.

> +};
> +
> +/*
>   * (microMIPS) Major opcodes.
>   */
>  enum mm_major_op {
> @@ -616,6 +625,17 @@ struct spec3_format {   /* SPEC3 */
>  	;)))))
>  };
>  
> +struct msa_mi10_format {        /* MSA */
> +	__BITFIELD_FIELD(unsigned int opcode : 6,
> +	__BITFIELD_FIELD(signed int s10 : 10,
> +	__BITFIELD_FIELD(unsigned int rs : 5,
> +	__BITFIELD_FIELD(unsigned int wd : 5,
> +	__BITFIELD_FIELD(unsigned int func : 4,
> +	__BITFIELD_FIELD(unsigned int df : 2,
> +	;))))))
> +};
> +
> +
>  /*
>   * microMIPS instruction formats (32-bit length)
>   *
> @@ -884,6 +904,7 @@ union mips_instruction {
>  	struct p_format p_format;
>  	struct f_format f_format;
>  	struct ma_format ma_format;
> +	struct msa_mi10_format msa_mi10_format;
>  	struct b_format b_format;
>  	struct ps_format ps_format;
>  	struct v_format v_format;
> diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
> index 6c160c67984c..5f48f45f81e7 100644
> --- a/arch/mips/kernel/r4k_fpu.S
> +++ b/arch/mips/kernel/r4k_fpu.S
> @@ -13,6 +13,7 @@
>   * Copyright (C) 1999, 2001 Silicon Graphics, Inc.
>   */
>  #include <asm/asm.h>
> +#include <asm/asmmacro.h>
>  #include <asm/errno.h>
>  #include <asm/fpregdef.h>
>  #include <asm/mipsregs.h>
> @@ -268,6 +269,112 @@ LEAF(_restore_fp_context32)
>  	END(_restore_fp_context32)
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +
> +	.macro  msa_ld_d    wd, base
> +	ld_d    \wd, 0, \base
> +	jalr    $0, $31

Why not just:
	jr	ra

like every other function in that file? I hope jr would be encoded
correctly on r6 automatically?

> +	  nop

I think a single extra space of indentation for delay slots is the
convention, rather than 2. Same below.

> +	.align  4

Doesn't this mean the first one & label might not be suitably aligned?
Would it be better to put this before the ld_d (no need for it after the
$w31 case) and put another .align 4 before the Lmsa_to and Lmsa_from
labels (so the label itself is aligned)?

> +	.endm
> +
> +	.macro  msa_st_d    wd, base
> +	st_d    \wd, 0, \base
> +	jalr    $0, $31
> +	  nop
> +	.align  4

same comments as above.

> +	.endm
> +
> +LEAF(msa_to_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_to
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0

Likewise here, "jr t0"? and same for msa_from_wd

> +	  nop
> +Lmsa_to:
> +	msa_ld_d    0, a1
> +	msa_ld_d    1, a1
> +	msa_ld_d    2, a1
> +	msa_ld_d    3, a1
> +	msa_ld_d    4, a1
> +	msa_ld_d    5, a1
> +	msa_ld_d    6, a1
> +	msa_ld_d    7, a1
> +	msa_ld_d    8, a1
> +	msa_ld_d    9, a1
> +	msa_ld_d    10, a1
> +	msa_ld_d    11, a1
> +	msa_ld_d    12, a1
> +	msa_ld_d    13, a1
> +	msa_ld_d    14, a1
> +	msa_ld_d    15, a1
> +	msa_ld_d    16, a1
> +	msa_ld_d    17, a1
> +	msa_ld_d    18, a1
> +	msa_ld_d    19, a1
> +	msa_ld_d    20, a1
> +	msa_ld_d    21, a1
> +	msa_ld_d    22, a1
> +	msa_ld_d    23, a1
> +	msa_ld_d    24, a1
> +	msa_ld_d    25, a1
> +	msa_ld_d    26, a1
> +	msa_ld_d    27, a1
> +	msa_ld_d    28, a1
> +	msa_ld_d    29, a1
> +	msa_ld_d    30, a1
> +	msa_ld_d    31, a1
> +	.set    pop
> +	END(msa_to_wd)
> +
> +LEAF(msa_from_wd)
> +	.set    push
> +	.set    noreorder
> +	sll         t0, a0, 4
> +	PTR_LA      t1, Lmsa_from
> +	PTR_ADDU    t0, t0, t1
> +	jalr        $0, t0
> +	  nop
> +Lmsa_from:
> +	msa_st_d    0, a1
> +	msa_st_d    1, a1
> +	msa_st_d    2, a1
> +	msa_st_d    3, a1
> +	msa_st_d    4, a1
> +	msa_st_d    5, a1
> +	msa_st_d    6, a1
> +	msa_st_d    7, a1
> +	msa_st_d    8, a1
> +	msa_st_d    9, a1
> +	msa_st_d    10, a1
> +	msa_st_d    11, a1
> +	msa_st_d    12, a1
> +	msa_st_d    13, a1
> +	msa_st_d    14, a1
> +	msa_st_d    15, a1
> +	msa_st_d    16, a1
> +	msa_st_d    17, a1
> +	msa_st_d    18, a1
> +	msa_st_d    19, a1
> +	msa_st_d    20, a1
> +	msa_st_d    21, a1
> +	msa_st_d    22, a1
> +	msa_st_d    23, a1
> +	msa_st_d    24, a1
> +	msa_st_d    25, a1
> +	msa_st_d    26, a1
> +	msa_st_d    27, a1
> +	msa_st_d    28, a1
> +	msa_st_d    29, a1
> +	msa_st_d    30, a1
> +	msa_st_d    31, a1
> +	.set    pop
> +	END(msa_from_wd)
> +
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	.set	reorder
>  
>  	.type	fault@function
> diff --git a/arch/mips/kernel/unaligned.c b/arch/mips/kernel/unaligned.c
> index e11906dff885..558f41fa93c5 100644
> --- a/arch/mips/kernel/unaligned.c
> +++ b/arch/mips/kernel/unaligned.c
> @@ -108,6 +108,11 @@ static u32 unaligned_action;
>  #endif
>  extern void show_registers(struct pt_regs *regs);
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +void msa_to_wd(unsigned int wd, union fpureg *from);
> +void msa_from_wd(unsigned int wd, union fpureg *to);
> +#endif
> +
>  #ifdef __BIG_ENDIAN
>  #define     LoadHW(addr, value, res)  \
>  		__asm__ __volatile__ (".set\tnoat\n"        \
> @@ -422,6 +427,64 @@ extern void show_registers(struct pt_regs *regs);
>  		: "r" (value), "r" (addr), "i" (-EFAULT));
>  #endif
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +#ifdef __BIG_ENDIAN
> +/*
> + * MSA data format conversion.
> + * Only for BIG ENDIAN - LITTLE ENDIAN has register format which matches memory
> + * layout contiguosly.

contiguously

> + *
> + * Conversion is done between two Double words and other formats (W/H/B)
> + * because kernel uses LD.D and ST.D to load/store MSA registers and keeps
> + * MSA registers in this format in current->thread.fpu.fpr
> + */
> +static void msa_convert(union fpureg *to, union fpureg *from, int fmt)
> +{
> +	switch (fmt) {
> +	case 0: /* byte */
> +		to->val8[0] = from->val8[7];
> +		to->val8[1] = from->val8[6];
> +		to->val8[2] = from->val8[5];
> +		to->val8[3] = from->val8[4];
> +		to->val8[4] = from->val8[3];
> +		to->val8[5] = from->val8[2];
> +		to->val8[6] = from->val8[1];
> +		to->val8[7] = from->val8[0];
> +		to->val8[8] = from->val8[15];
> +		to->val8[9] = from->val8[14];
> +		to->val8[10] = from->val8[13];
> +		to->val8[11] = from->val8[12];
> +		to->val8[12] = from->val8[11];
> +		to->val8[13] = from->val8[10];
> +		to->val8[14] = from->val8[9];
> +		to->val8[15] = from->val8[8];
> +		break;
> +
> +	case 1: /* halfword */
> +		to->val16[0] = from->val16[3];
> +		to->val16[1] = from->val16[2];
> +		to->val16[2] = from->val16[1];
> +		to->val16[3] = from->val16[0];
> +		to->val16[4] = from->val16[7];
> +		to->val16[5] = from->val16[6];
> +		to->val16[6] = from->val16[5];
> +		to->val16[7] = from->val16[4];
> +		break;
> +
> +	case 2: /* word */
> +		to->val32[0] = from->val32[1];
> +		to->val32[1] = from->val32[0];
> +		to->val32[2] = from->val32[3];
> +		to->val32[3] = from->val32[2];

FWIW since the FP/MSA patches that Paul submitted, there are also
working endian agnostic accessors created with BUILD_FPR_ACCESS, which
use the FPR_IDX macro (see http://patchwork.linux-mips.org/patch/9169/),
which should work for 8bit and 16bit sizes too.

I wonder if the compiler would unroll/optimise this sort of thing:
	for (i = 0; i < (FPU_REG_WIDTH / 8); ++i)
		to->val8[i] = from->val8[FPR_IDX(8, i)];

No worries if not.

> +		break;
> +
> +	case 3: /* doubleword, no conversion */
> +		break;

don't you still need to copy the value though?

> +	}
> +}
> +#endif
> +#endif
> +
>  static void emulate_load_store_insn(struct pt_regs *regs,
>  	void __user *addr, unsigned int __user *pc)
>  {
> @@ -434,6 +497,10 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  #ifdef	CONFIG_EVA
>  	mm_segment_t seg;
>  #endif
> +#ifdef CONFIG_CPU_HAS_MSA
> +	union fpureg msadatabase[2], *msadata;
> +	unsigned int func, df, rs, wd;
> +#endif
>  	origpc = (unsigned long)pc;
>  	orig31 = regs->regs[31];
>  
> @@ -703,6 +770,82 @@ static void emulate_load_store_insn(struct pt_regs *regs,
>  			break;
>  		return;
>  
> +#ifdef CONFIG_CPU_HAS_MSA
> +	case msa_op:
> +		if (cpu_has_mdmx)
> +			goto sigill;
> +
> +		func = insn.msa_mi10_format.func;
> +		switch (func) {
> +		default:
> +			goto sigbus;
> +
> +		case msa_ld_op:
> +		case msa_st_op:
> +			;
> +		}
> +
> +		if (!thread_msa_context_live())
> +			goto sigbus;

Will this ever happen? (I can't see AdE handler enabling interrupts).

If the MSA context genuinely isn't live (i.e. it can be considered
UNPREDICTABLE), then surely a load operation should still succeed?

> +
> +		df = insn.msa_mi10_format.df;
> +		rs = insn.msa_mi10_format.rs;
> +		wd = insn.msa_mi10_format.wd;
> +		addr = (unsigned long *)(regs->regs[rs] + (insn.msa_mi10_format.s10 * (1 << df)));

"* (1 << df)"?
why not just "<< df"?

> +		/* align a working space in stack... */
> +		msadata = (union fpureg *)(((unsigned long)msadatabase + 15) & ~(unsigned long)0xf);

Maybe you could just use __aligned(16) on a single local union fpureg.

> +		if (func == msa_ld_op) {
> +			if (!access_ok(VERIFY_READ, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);
> +			res = __copy_from_user_inatomic(msadata, addr, 16);
> +			if (res)
> +				goto fault;
> +			preempt_disable();
> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +				msa_to_wd(wd, &current->thread.fpu.fpr[wd]);
> +#else
> +				msa_to_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(&current->thread.fpu.fpr[wd], msadata, df);
> +#else
> +				current->thread.fpu.fpr[wd] = *msadata;
> +#endif

I'm not a fan of the ifdefs, but I can see it's awkward to abstract
msa_convert without causing extra copies (although I don't think it's a
critical code path).

> +			}
> +		} else {
> +			if (!access_ok(VERIFY_WRITE, addr, 16))
> +				goto sigbus;
> +			compute_return_epc(regs);

forgot to preempt_disable()?

> +			if (test_thread_flag(TIF_USEDMSA)) {
> +#ifdef __BIG_ENDIAN
> +				msa_from_wd(wd, &current->thread.fpu.fpr[wd]);
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				msa_from_wd(wd, msadata);
> +#endif
> +				preempt_enable();
> +			} else {
> +				preempt_enable();
> +#ifdef __BIG_ENDIAN
> +				msa_convert(msadata, &current->thread.fpu.fpr[wd], df);
> +#else
> +				*msadata = current->thread.fpu.fpr[wd];

hmm, you could cheat and change this to the following?:
				msadata = &current->thread.fpu.fpr[wd];

> +#endif
> +			}
> +			res = __copy_to_user_inatomic(addr, msadata, 16);
> +			if (res)
> +				goto fault;
> +		}
> +
> +		break;
> +#endif /* CONFIG_CPU_HAS_MSA */
> +
>  	/*
>  	 * COP2 is available to implementor for application specific use.
>  	 * It's up to applications to register a notifier chain and do
> 
> 

Cheers
James


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 819 bytes --]

next prev parent reply	other threads:[~2015-03-18 11:27 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-03-18  1:16 [PATCH] MIPS: MSA: misaligned support Leonid Yegoshin
2015-03-18  1:16 ` Leonid Yegoshin
2015-03-18 11:27 ` James Hogan [this message]
2015-03-18 11:27   ` James Hogan
2015-03-18 19:46   ` Leonid Yegoshin
2015-03-18 22:12     ` James Hogan
2015-03-18 23:25       ` Leonid Yegoshin
2015-03-19  9:51         ` James Hogan
2015-03-19 23:23           ` Leonid Yegoshin
2015-03-18 11:41 ` James Hogan
2015-03-18 11:41   ` James Hogan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5509611D.80404@imgtec.com \
    --to=james.hogan@imgtec.com \
    --cc=Leonid.Yegoshin@imgtec.com \
    --cc=chenhc@lemote.com \
    --cc=davidlohr@hp.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mips@linux-mips.org \
    --cc=manuel.lauss@gmail.com \
    --cc=mingo@kernel.org \
    --cc=peterz@infradead.org \
    --cc=qais.yousef@imgtec.com \
    --cc=ralf@linux-mips.org \
    --cc=wangr@lemote.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.