All of lore.kernel.org
 help / color / mirror / Atom feed
From: Antony Pavlov <antonynpavlov@gmail.com>
To: Aleksey Kuleshov <rndfax@yandex.ru>
Cc: barebox@lists.infradead.org
Subject: Re: [PATCH] MIPS: import optimized string functions from Linux
Date: Tue, 24 Nov 2015 17:46:03 +0300	[thread overview]
Message-ID: <20151124174603.7278b5fdee2a8bc1df00a892@gmail.com> (raw)
In-Reply-To: <1448383102-27566-1-git-send-email-rndfax@yandex.ru>

On Tue, 24 Nov 2015 19:38:22 +0300
Aleksey Kuleshov <rndfax@yandex.ru> wrote:

> 10x performance gain according to simple test on QEMU malta:
> barebox:/ time memcpy 0xa0000000 0xa0001000 0x100000
> 
> Signed-off-by: Aleksey Kuleshov <rndfax@yandex.ru>
> ---
>  arch/mips/Kconfig              |   7 ++
>  arch/mips/include/asm/string.h |   9 +-
>  arch/mips/lib/Makefile         |   3 +
>  arch/mips/lib/memcpy.S         | 276 +++++++++++++++++++++++++++++++++++++++++
>  arch/mips/lib/memset.S         | 146 ++++++++++++++++++++++
>  5 files changed, 440 insertions(+), 1 deletion(-)
>  create mode 100644 arch/mips/lib/memcpy.S
>  create mode 100644 arch/mips/lib/memset.S
> 
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index a2d443f..7deb825 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -331,6 +331,13 @@ config NMON_HELP
>  	  Say yes here to get the nmon commands message on
>  	  every nmon start.
>  
> +config MIPS_OPTIMZED_STRING_FUNCTIONS
               ^^^^^^^^
                     The 'OPTIMZED' typo again (it should read 'OPTIMIZED')?
                     AFAIR you have already fixed this, n'est-ce pas?
                     Also please add 'default y'.


> +	bool "use assembler optimized string functions"
> +	help
> +	  Say yes here to use assembler optimized memcpy / memset functions.
> +	  These functions work much faster than the normal versions but
> +	  increase your binary size.
> +
>  endmenu
>  
>  source common/Kconfig
> diff --git a/arch/mips/include/asm/string.h b/arch/mips/include/asm/string.h
> index 3a32d18..5a12b38 100644
> --- a/arch/mips/include/asm/string.h
> +++ b/arch/mips/include/asm/string.h
> @@ -22,6 +22,13 @@
>  #ifndef __ASM_MIPS_STRING_H
>  #define __ASM_MIPS_STRING_H
>  
> -/* nothing special yet */
> +#ifdef CONFIG_MIPS_OPTIMZED_STRING_FUNCTIONS
> +
> +#define __HAVE_ARCH_MEMCPY
> +extern void *memcpy(void *, const void *, __kernel_size_t);
> +#define __HAVE_ARCH_MEMSET
> +extern void *memset(void *, int, __kernel_size_t);
> +
> +#endif
>  
>  #endif
> diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile
> index 0145f35..7970f25 100644
> --- a/arch/mips/lib/Makefile
> +++ b/arch/mips/lib/Makefile
> @@ -7,6 +7,9 @@ obj-y += cpu-probe.o
>  obj-y += traps.o
>  obj-y += genex.o
>  
> +obj-$(CONFIG_MIPS_OPTIMZED_STRING_FUNCTIONS) += memcpy.o
> +obj-$(CONFIG_MIPS_OPTIMZED_STRING_FUNCTIONS) += memset.o
> +
>  obj-$(CONFIG_CPU_MIPS32) += c-r4k.o
>  obj-$(CONFIG_CPU_MIPS64) += c-r4k.o
>  
> diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
> new file mode 100644
> index 0000000..a5af0c9
> --- /dev/null
> +++ b/arch/mips/lib/memcpy.S
> @@ -0,0 +1,276 @@
> +/*
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License.  See the file "COPYING" in the main directory of this archive
> + * for more details.
> + *
> + * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
> + * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
> + * Copyright (C) 2002 Broadcom, Inc.
> + *   memcpy/copy_user author: Mark Vandevoorde
> + * Copyright (C) 2007  Maciej W. Rozycki
> + * Copyright (C) 2014 Imagination Technologies Ltd.
> + *
> + * Kernel-mode memcpy function without exceptions for _some_ MIPS CPUs
> + * by Aleksey Kuleshov (rndfax@yandex.ru), 2015
> + *
> + */
> +
> +#include <asm/asm.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/regdef.h>
> +
> +#define dst a0 /* first argument: destination pointer */
> +#define src a1 /* second argument: source pointer */
> +#define len a2 /* third argument: byte count */
> +
> +#define LOADK lw /* No exception; not referenced by the code below */
> +#define LOAD(reg, addr)		lw reg, addr /* aligned word load */
> +#define LOADL(reg, addr)	lwl reg, addr /* unaligned load, "left" half */
> +#define LOADR(reg, addr)	lwr reg, addr /* unaligned load, "right" half */
> +#define STOREL(reg, addr)	swl reg, addr /* unaligned store, "left" half */
> +#define STORER(reg, addr)	swr reg, addr /* unaligned store, "right" half */
> +#define STORE(reg, addr)	sw reg, addr /* aligned word store */
> +#define ADD    addu /* 32-bit arithmetic and shift mnemonics follow */
> +#define SUB    subu
> +#define SRL    srl
> +#define SLL    sll
> +#define SRA    sra
> +#define SLLV   sllv
> +#define SRLV   srlv
> +#define NBYTES 4 /* bytes per copy unit (one word) */
> +#define LOG_NBYTES 2 /* log2(NBYTES) */
> +
> +#define LOADB(reg, addr)	lb reg, addr /* single byte load */
> +#define STOREB(reg, addr)	sb reg, addr /* single byte store */
> +
> +#ifdef CONFIG_CPU_LITTLE_ENDIAN /* pick the left/right variant per byte order */
> +#define LDFIRST LOADR /* used on the FIRST() address of a unit */
> +#define LDREST	LOADL /* used on the REST() address of a unit */
> +#define STFIRST STORER
> +#define STREST	STOREL
> +#define SHIFT_DISCARD SLLV /* shift used to drop the unwanted bits of a partial word */
> +#else
> +#define LDFIRST LOADL /* used on the FIRST() address of a unit */
> +#define LDREST	LOADR /* used on the REST() address of a unit */
> +#define STFIRST STOREL
> +#define STREST	STORER
> +#define SHIFT_DISCARD SRLV /* shift used to drop the unwanted bits of a partial word */
> +#endif
> +
> +#define FIRST(unit) ((unit)*NBYTES) /* offset of the first byte of unit number 'unit' */
> +#define REST(unit)  (FIRST(unit)+NBYTES-1) /* offset of the last byte of that unit */
> +#define UNIT(unit)  FIRST(unit) /* offset used for aligned accesses to that unit */
> +
> +#define ADDRMASK (NBYTES-1) /* low address bits that must be zero for alignment */
> +
> +	.text
> +	.align	5
> +	.set	noreorder	/* branch delay slots below are filled by hand */
> +LEAF(memcpy)					/* a0=dst a1=src a2=len; returns dst in v0 */
> +	move	v0, dst				/* return value */
> +
> +	/*
> +	 * Note: dst & src may be unaligned, len may be 0
> +	 * Temps: t0-t4, t7 and t8 (rem/match) are used as scratch
> +	 */
> +#define rem t8
> +
> +	/*
> +	 * The "issue break"s below are very approximate.
> +	 * Issue delays for dcache fills will perturb the schedule, as will
> +	 * load queue full replay traps, etc.
> +	 *
> +	 * If len < NBYTES use byte operations.
> +	 */
> +	sltu	t2, len, NBYTES	/* t2 = (len < NBYTES) */
> +	and	t1, dst, ADDRMASK	/* t1 = misalignment of dst */
> +	bnez	t2, .Lcopy_bytes_checklen
> +	 and	t0, src, ADDRMASK	/* delay slot: t0 = misalignment of src */
> +	bnez	t1, .Ldst_unaligned
> +	 nop
> +	bnez	t0, .Lsrc_unaligned_dst_aligned
> +	/*
> +	 * use delay slot for fall-through
> +	 * src and dst are aligned; need to compute rem
> +	 */
> +.Lboth_aligned:
> +	 SRL	t0, len, LOG_NBYTES+3	 # +3 for 8 units/iter
> +	beqz	t0, .Lcleanup_both_aligned # len < 8*NBYTES
> +	 and	rem, len, (8*NBYTES-1)	 # rem = len % (8*NBYTES)
> +	.align	4
> +1:	/* aligned main loop: 8 units per pass */
> +	LOAD(t0, UNIT(0)(src))
> +	LOAD(t1, UNIT(1)(src))
> +	LOAD(t2, UNIT(2)(src))
> +	LOAD(t3, UNIT(3)(src))
> +	SUB	len, len, 8*NBYTES
> +	LOAD(t4, UNIT(4)(src))
> +	LOAD(t7, UNIT(5)(src))
> +	STORE(t0, UNIT(0)(dst))
> +	STORE(t1, UNIT(1)(dst))
> +	LOAD(t0, UNIT(6)(src))
> +	LOAD(t1, UNIT(7)(src))
> +	ADD	src, src, 8*NBYTES
> +	ADD	dst, dst, 8*NBYTES	/* the stores below use negative offsets from the advanced dst */
> +	STORE(t2, UNIT(-6)(dst))
> +	STORE(t3, UNIT(-5)(dst))
> +	STORE(t4, UNIT(-4)(dst))
> +	STORE(t7, UNIT(-3)(dst))
> +	STORE(t0, UNIT(-2)(dst))
> +	STORE(t1, UNIT(-1)(dst))
> +	bne	len, rem, 1b	/* loop until only the remainder is left */
> +	 nop
> +
> +	/*
> +	 * len == rem == the number of bytes left to copy < 8*NBYTES
> +	 */
> +.Lcleanup_both_aligned:
> +	beqz	len, .Ldone
> +	 sltu	t0, len, 4*NBYTES	/* delay slot: t0 = (len < 4*NBYTES) */
> +	bnez	t0, .Lless_than_4units
> +	 and	rem, len, (NBYTES-1)	# rem = len % NBYTES
> +	/*
> +	 * len >= 4*NBYTES
> +	 */
> +	LOAD( t0, UNIT(0)(src))
> +	LOAD( t1, UNIT(1)(src))
> +	LOAD( t2, UNIT(2)(src))
> +	LOAD( t3, UNIT(3)(src))
> +	SUB	len, len, 4*NBYTES
> +	ADD	src, src, 4*NBYTES
> +	STORE(t0, UNIT(0)(dst))
> +	STORE(t1, UNIT(1)(dst))
> +	STORE(t2, UNIT(2)(dst))
> +	STORE(t3, UNIT(3)(dst))
> +	.set	reorder				/* DADDI_WAR */
> +	ADD	dst, dst, 4*NBYTES
> +	beqz	len, .Ldone
> +	.set	noreorder
> +.Lless_than_4units:
> +	/*
> +	 * rem = len % NBYTES
> +	 */
> +	beq	rem, len, .Lcopy_bytes	/* no whole unit left: finish bytewise */
> +	 nop
> +1:	/* aligned loop: one unit per pass */
> +	LOAD(t0, 0(src))
> +	ADD	src, src, NBYTES
> +	SUB	len, len, NBYTES
> +	STORE(t0, 0(dst))
> +	.set	reorder				/* DADDI_WAR */
> +	ADD	dst, dst, NBYTES
> +	bne	rem, len, 1b
> +	.set	noreorder
> +
> +	/*
> +	 * src and dst are aligned, need to copy rem bytes (rem < NBYTES)
> +	 * A loop would do only a byte at a time with possible branch
> +	 * mispredicts.	 Can't do an explicit LOAD dst,mask,or,STORE
> +	 * because can't assume read-access to dst.  Instead, use
> +	 * STREST dst, which doesn't require read access to dst.
> +	 *
> +	 * This code should perform better than a simple loop on modern,
> +	 * wide-issue mips processors because the code has fewer branches and
> +	 * more instruction-level parallelism.
> +	 */
> +#define bits t2
> +	beqz	len, .Ldone
> +	 ADD	t1, dst, len	# t1 is just past last byte of dst
> +	li	bits, 8*NBYTES
> +	SLL	rem, len, 3	# rem = number of bits to keep
> +	LOAD(t0, 0(src))
> +	SUB	bits, bits, rem # bits = number of bits to discard
> +	SHIFT_DISCARD t0, t0, bits
> +	STREST(t0, -1(t1))	/* partial store ending at the last dst byte */
> +	jr	ra
> +	 move	len, zero	/* delay slot: len = 0 on return */
> +.Ldst_unaligned:
> +	/*
> +	 * dst is unaligned
> +	 * t0 = src & ADDRMASK
> +	 * t1 = dst & ADDRMASK; T1 > 0
> +	 * len >= NBYTES
> +	 *
> +	 * Copy enough bytes to align dst
> +	 * Set match = (src and dst have same alignment)
> +	 */
> +#define match rem
> +	LDFIRST(t3, FIRST(0)(src))
> +	ADD	t2, zero, NBYTES
> +	LDREST(t3, REST(0)(src))
> +	SUB	t2, t2, t1	# t2 = number of bytes copied
> +	xor	match, t0, t1	/* zero when src and dst share the same misalignment */
> +	STFIRST(t3, FIRST(0)(dst))
> +	beq	len, t2, .Ldone	/* those bytes were the whole copy */
> +	 SUB	len, len, t2	/* delay slot: account for the bytes just stored */
> +	ADD	dst, dst, t2
> +	beqz	match, .Lboth_aligned
> +	 ADD	src, src, t2	/* delay slot: executed on both paths */
> +
> +.Lsrc_unaligned_dst_aligned:
> +	SRL	t0, len, LOG_NBYTES+2	 # +2 for 4 units/iter
> +	beqz	t0, .Lcleanup_src_unaligned
> +	 and	rem, len, (4*NBYTES-1)	 # rem = len % 4*NBYTES
> +1:
> +/*
> + * Avoid consecutive LD*'s to the same register since some mips
> + * implementations can't issue them in the same cycle.
> + * It's OK to load FIRST(N+1) before REST(N) because the two addresses
> + * are to the same unit (unless src is aligned, but it's not).
> + */
> +	LDFIRST(t0, FIRST(0)(src))
> +	LDFIRST(t1, FIRST(1)(src))
> +	SUB	len, len, 4*NBYTES
> +	LDREST(t0, REST(0)(src))
> +	LDREST(t1, REST(1)(src))
> +	LDFIRST(t2, FIRST(2)(src))
> +	LDFIRST(t3, FIRST(3)(src))
> +	LDREST(t2, REST(2)(src))
> +	LDREST(t3, REST(3)(src))
> +	ADD	src, src, 4*NBYTES
> +	STORE(t0, UNIT(0)(dst))
> +	STORE(t1, UNIT(1)(dst))
> +	STORE(t2, UNIT(2)(dst))
> +	STORE(t3, UNIT(3)(dst))
> +	.set	reorder				/* DADDI_WAR */
> +	ADD	dst, dst, 4*NBYTES
> +	bne	len, rem, 1b
> +	.set	noreorder
> +
> +.Lcleanup_src_unaligned:
> +	beqz	len, .Ldone
> +	 and	rem, len, NBYTES-1  # rem = len % NBYTES
> +	beq	rem, len, .Lcopy_bytes
> +	 nop
> +1:	/* unaligned-source loop: one unit per pass */
> +	LDFIRST(t0, FIRST(0)(src))
> +	LDREST(t0, REST(0)(src))
> +	ADD	src, src, NBYTES
> +	SUB	len, len, NBYTES
> +	STORE(t0, 0(dst))
> +	.set	reorder				/* DADDI_WAR */
> +	ADD	dst, dst, NBYTES
> +	bne	len, rem, 1b
> +	.set	noreorder
> +
> +.Lcopy_bytes_checklen:
> +	beqz	len, .Ldone
> +	 nop
> +.Lcopy_bytes:
> +	/* 0 < len < NBYTES  */
> +#define COPY_BYTE(N)			\
> +	LOADB(t0, N(src));	\
> +	SUB	len, len, 1;		\
> +	beqz	len, .Ldone;		\
> +	STOREB(t0, N(dst))
> +
> +	COPY_BYTE(0)	/* the store sits in the delay slot of the macro's branch */
> +	COPY_BYTE(1)
> +	LOADB(t0, NBYTES-2(src))	/* last possible byte, since len < NBYTES */
> +	SUB	len, len, 1
> +	jr	ra
> +	STOREB(t0, NBYTES-2(dst))	/* executes in the jr delay slot */
> +.Ldone:
> +	jr	ra
> +	 nop
> +	END(memcpy)
> diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S
> new file mode 100644
> index 0000000..d3c1c72
> --- /dev/null
> +++ b/arch/mips/lib/memset.S
> @@ -0,0 +1,146 @@
> +/*
> + * This file is subject to the terms and conditions of the GNU General Public
> + * License.  See the file "COPYING" in the main directory of this archive
> + * for more details.
> + *
> + * Copyright (C) 1998, 1999, 2000 by Ralf Baechle
> + * Copyright (C) 1999, 2000 Silicon Graphics, Inc.
> + * Copyright (C) 2007 by Maciej W. Rozycki
> + * Copyright (C) 2011, 2012 MIPS Technologies, Inc.
> + *
> + * Kernel-mode memset function without exceptions
> + * by Aleksey Kuleshov (rndfax@yandex.ru), 2015
> + */
> +#include <asm/asm.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/regdef.h>
> +
> +#if LONGSIZE == 4 /* pick the partial-store instructions matching the long size */
> +#define LONG_S_L swl
> +#define LONG_S_R swr
> +#else
> +#define LONG_S_L sdl
> +#define LONG_S_R sdr
> +#endif
> +
> +#define STORSIZE LONGSIZE /* bytes written by one LONG_S */
> +#define STORMASK LONGMASK /* mask of the address bits below one long */
> +#define FILL64RG a1 /* register holding the replicated fill pattern */
> +#define FILLPTRG t0 /* register holding the byte count for the partial block */
> +
> +/*
> + * memset(void *s, int c, size_t n) - fill n bytes at s with the byte c
> + *
> + * a0: start of area to clear (advanced while filling)
> + * a1: char to fill with (low 8 bits, replicated into a fill word)
> + * a2: size of area to clear (zero on return); v0 returns the original a0
> + */
> +
> +LEAF(memset)
> +	move		v0, a0			/* result: set before branching so the c == 0 path returns it too */
> +	beqz		a1, 1f			/* no hand-filled delay slot here: .set noreorder only starts further down */
> +
> +	andi		a1, 0xff		/* spread fillword */
> +	LONG_SLL		t1, a1, 8
> +	or		a1, t1			/* a1 = byte repeated twice */
> +	LONG_SLL		t1, a1, 16
> +#if LONGSIZE == 8
> +	or		a1, t1
> +	LONG_SLL		t1, a1, 32
> +#endif
> +	or		a1, t1			/* a1 = byte repeated across the whole long */
> +
> +	.macro	f_fill64 dst, offset, val	/* store \val to the 64 bytes at \offset(\dst) */
> +	LONG_S	\val, (\offset +  0 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset +  1 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset +  2 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset +  3 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset +  4 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset +  5 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset +  6 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset +  7 * STORSIZE)(\dst)
> +#if (LONGSIZE == 4)
> +	LONG_S	\val, (\offset +  8 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset +  9 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset + 10 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset + 11 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset + 12 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset + 13 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset + 14 * STORSIZE)(\dst)
> +	LONG_S	\val, (\offset + 15 * STORSIZE)(\dst)
> +#endif
> +	.endm
> +
> +	.set	noreorder			/* from here on delay slots are filled by hand */
> +	.align	5
> +
> +1:
> +	sltiu		t0, a2, STORSIZE	/* very small region? */
> +	bnez		t0, .Lsmall_memset
> +	andi		t0, a0, STORMASK	/* aligned? */
> +
> +	beqz		t0, 1f
> +	PTR_SUBU	t0, STORSIZE		/* alignment in bytes */
> +
> +#ifdef __MIPSEB__
> +	LONG_S_L	a1, (a0)		/* make word/dword aligned */
> +#else
> +	LONG_S_R	a1, (a0)		/* make word/dword aligned */
> +#endif
> +	PTR_SUBU	a0, t0			/* long align ptr */
> +	PTR_ADDU	a2, t0			/* correct size */
> +
> +1:	ori		t1, a2, 0x3f		/* # of full blocks */
> +	xori		t1, 0x3f		/* t1 = a2 rounded down to a multiple of 64 */
> +	beqz		t1, .Lmemset_partial	/* no block to fill */
> +	andi		t0, a2, 0x40-STORSIZE	/* delay slot: bytes of whole longs in the last partial block */
> +
> +	PTR_ADDU	t1, a0			/* end address */
> +	.set		reorder
> +1:	PTR_ADDIU	a0, 64
> +	f_fill64 a0, -64, FILL64RG		/* fill the 64 bytes just stepped over */
> +	bne		t1, a0, 1b
> +	.set		noreorder
> +
> +.Lmemset_partial:
> +	PTR_LA		t1, 2f			/* where to start */
> +#if LONGSIZE == 4
> +	PTR_SUBU	t1, FILLPTRG		/* back up one store instruction per long to write */
> +#else
> +	.set		noat
> +	LONG_SRL	AT, FILLPTRG, 1		/* halve the byte count to get the code offset */
> +	PTR_SUBU	t1, AT
> +	.set		at
> +#endif
> +	jr		t1			/* jump into the store sequence below */
> +	PTR_ADDU	a0, t0			/* dest ptr */
> +
> +	.set		push
> +	.set		noreorder
> +	.set		nomacro
> +	/* ... but first do longs ... */
> +	f_fill64 a0, -64, FILL64RG
> +2:	.set		pop
> +	andi		a2, STORMASK		/* At most one long to go */
> +
> +	beqz		a2, 1f
> +	PTR_ADDU	a0, a2			/* What's left */
> +#ifdef __MIPSEB__
> +	LONG_S_R	a1, -1(a0)		/* partial store ending at the last byte */
> +#else
> +	LONG_S_L	a1, -1(a0)		/* partial store ending at the last byte */
> +#endif
> +1:	jr		ra
> +	move		a2, zero		/* delay slot: a2 = 0 on return */
> +
> +.Lsmall_memset:
> +	beqz		a2, 2f
> +	PTR_ADDU	t1, a0, a2		/* delay slot: t1 = end address */
> +
> +1:	PTR_ADDIU	a0, 1			/* fill bytewise */
> +	bne		t1, a0, 1b
> +	sb		a1, -1(a0)		/* delay slot: store the byte just stepped over */
> +
> +2:	jr		ra			/* done */
> +	move		a2, zero		/* delay slot: a2 = 0 on return */
> +	END(memset)
> -- 
> 2.5.1
> 
> 
> _______________________________________________
> barebox mailing list
> barebox@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/barebox


-- 
-- 
Best regards,
  Antony Pavlov

_______________________________________________
barebox mailing list
barebox@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/barebox

  reply	other threads:[~2015-11-24 14:22 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-11-24 16:38 [PATCH] MIPS: import optimized string functions from Linux Aleksey Kuleshov
2015-11-24 14:46 ` Antony Pavlov [this message]
  -- strict thread matches above, loose matches on Subject: below --
2015-11-25 12:23 Aleksey Kuleshov
2015-11-25 10:08 ` Antony Pavlov
2015-11-25 12:47 ` Antony Pavlov
2015-11-26  8:38 ` Sascha Hauer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20151124174603.7278b5fdee2a8bc1df00a892@gmail.com \
    --to=antonynpavlov@gmail.com \
    --cc=barebox@lists.infradead.org \
    --cc=rndfax@yandex.ru \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.