From mboxrd@z Thu Jan 1 00:00:00 1970 From: David Mosberger Date: Tue, 22 May 2001 07:39:36 +0000 Subject: [Linux-ia64] libc patch: update of memcpy et al Message-Id: List-Id: MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit To: linux-ia64@vger.kernel.org The patch below brings the performance of memcpy(), memmove(), bcopy(), and bzero() up to more reasonable levels. I haven't run this through "make check", but I believe the changes to be OK. Thanks, --david ChangeLog 2001-05-22 David Mosberger * sysdeps/ia64/memmove.S: Increase MEMLAT from 6 to 21 for better performance. * sysdeps/ia64/memcpy.S: Ditto. * sysdeps/ia64/bcopy.S: New file. * sysdeps/ia64/bzero.S: New file (derived from memset.S). Index: sysdeps/ia64/memcpy.S =================================RCS file: /cvs/glibc/libc/sysdeps/ia64/memcpy.S,v retrieving revision 1.6 diff -u -r1.6 memcpy.S --- sysdeps/ia64/memcpy.S 2001/04/11 19:56:45 1.6 +++ sysdeps/ia64/memcpy.S 2001/05/22 07:32:57 @@ -68,10 +68,12 @@ br.ctop.sptk .loop##shift ; \ br.cond.sptk .cpyfew ; /* deal with the remaining bytes */ +#define MEMLAT 21 +#define Nrot (((2*MEMLAT+3) + 7) & ~7) + ENTRY(memcpy) .prologue - alloc r2 = ar.pfs, 3, 16 - 3, 0, 16 -#include "softpipe.h" + alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot .rotr r[MEMLAT + 2], q[MEMLAT + 1] .rotp p[MEMLAT + 2] mov ret0 = in0 // return value = dest Index: sysdeps/ia64/memmove.S =================================RCS file: /cvs/glibc/libc/sysdeps/ia64/memmove.S,v retrieving revision 1.3 diff -u -r1.3 memmove.S --- sysdeps/ia64/memmove.S 2001/04/11 19:56:45 1.3 +++ sysdeps/ia64/memmove.S 2001/05/22 07:32:57 @@ -67,10 +67,12 @@ br.ctop.sptk .loop##shift ; \ br.cond.sptk .cpyfew ; /* deal with the remaining bytes */ -ENTRY(memmove) +#define MEMLAT 21 +#define Nrot (((2*MEMLAT+3) + 7) & ~7) + +ENTRY(__memmove) .prologue - alloc r2 = ar.pfs, 3, 29, 0, 32 -#include "softpipe.h" + alloc r2 = ar.pfs, 3, Nrot - 3, 0, Nrot .rotr r[MEMLAT + 2], q[MEMLAT + 
1] .rotp p[MEMLAT + 2] mov ret0 = in0 // return value = dest @@ -235,6 +237,8 @@ data8 .loop56 - .loop40 data8 .loop56 - .loop48 data8 .loop56 - .loop56 + +END(__memmove) -END(memmove) +weak_alias(__memmove, memmove) --- /dev/null Sat Mar 24 01:35:12 2001 +++ sysdeps/ia64/bcopy.S Tue May 22 00:14:54 2001 @@ -0,0 +1,10 @@ +#include <sysdep.h> + +ENTRY(bcopy) + .regstk 3, 0, 0, 0 + mov r8 = in0 + mov in0 = in1 + ;; + mov in1 = r8 + br.cond.sptk.many __memmove +END(bcopy) --- /dev/null Sat Mar 24 01:35:12 2001 +++ sysdeps/ia64/bzero.S Tue May 22 00:22:19 2001 @@ -0,0 +1,94 @@ +/* Optimized version of the standard bzero() function. + This file is part of the GNU C Library. + Copyright (C) 2000, 2001 Free Software Foundation, Inc. + Contributed by Dan Pop <Dan.Pop@cern.ch>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* Return: dest + + Inputs: + in0: dest + in1: count + + The algorithm is fairly straightforward: set byte by byte until + we get to a word aligned address, then set word by word as much as + possible; the remaining few bytes are set one by one. 
*/ + +#include <sysdep.h> +#undef ret + +#define dest in0 +#define cnt in1 + +#define save_pfs loc0 +#define ptr1 loc1 +#define ptr2 loc2 +#define tmp loc3 +#define loopcnt loc4 +#define save_lc loc5 + +ENTRY(bzero) + .prologue + alloc save_pfs = ar.pfs, 2, 6, 0, 0 + .save ar.lc, save_lc + mov save_lc = ar.lc + .body + mov ret0 = dest + and tmp = 7, dest + cmp.eq p6, p0 = cnt, r0 +(p6) br.cond.spnt .restore_and_exit ;; + mov ptr1 = dest + sub loopcnt = 8, tmp + cmp.gt p6, p0 = 16, cnt +(p6) br.cond.spnt .set_few;; + cmp.eq p6, p0 = tmp, r0 +(p6) br.cond.sptk .dest_aligned + sub cnt = cnt, loopcnt + adds loopcnt = -1, loopcnt;; + mov ar.lc = loopcnt;; +.l1: + st1 [ptr1] = r0, 1 + br.cloop.dptk .l1 ;; +.dest_aligned: + adds ptr2 = 8, ptr1 + shr.u loopcnt = cnt, 4 ;; // loopcnt = cnt / 16 + cmp.eq p6, p0 = loopcnt, r0 +(p6) br.cond.spnt .one_more + and cnt = 0xf, cnt // compute the remaining cnt + adds loopcnt = -1, loopcnt;; + mov ar.lc = loopcnt;; +.l2: + st8 [ptr1] = r0, 16 + st8 [ptr2] = r0, 16 + br.cloop.dptk .l2 + cmp.le p6, p0 = 8, cnt ;; +.one_more: +(p6) st8 [ptr1] = r0, 8 +(p6) adds cnt = -8, cnt ;; + cmp.eq p6, p0 = cnt, r0 +(p6) br.cond.spnt .restore_and_exit +.set_few: + adds loopcnt = -1, cnt;; + mov ar.lc = loopcnt;; +.l3: + st1 [ptr1] = r0, 1 + br.cloop.dptk .l3 ;; +.restore_and_exit: + mov ar.lc = save_lc + mov ar.pfs = save_pfs + br.ret.sptk.many b0 +END(bzero)