* [Linux-ia64] Update: optimizing __copy_user
@ 2000-10-23 20:04 Jun Nakajima
0 siblings, 0 replies; only message in thread
From: Jun Nakajima @ 2000-10-23 20:04 UTC (permalink / raw)
To: linux-ia64
I have updated __copy_user, reflecting the comments I got. Thanks for
those comments. The major changes are that it uses 'shrp' to make the
software pipeline shorter and more efficient (rahter than using 'shr.u',
'shl', and 'or').
--
Jun U Nakajima
Core OS Development
SCO/Murray Hill, NJ
Email: jun@sco.com, Phone: 908-790-2352 Fax: 908-790-2426
-----------------------------------------------------------------------
*** copy_user.S.org Tue Oct 10 11:31:43 2000
--- copy_user.S Mon Oct 23 12:00:06 2000
***************
*** 65,70 ****
--- 65,76 ----
//
// local registers
//
+ #define t1 r2 // rshift in bytes
+ #define t2 r3 // lshift in bytes
+ #define rshift r14 // right shift in bits
+ #define lshift r15 // left shift in bits
+ #define word1 r16
+ #define word2 r17
#define cnt r18
#define len2 r19
#define saved_lc r20
***************
*** 134,139 ****
--- 140,329 ----
br.ret.sptk.few rp // end of short memcpy
//
+ // Not 8-byte alinged
+ //
+ diff_align_copy_user:
+ // At this point we know we have more than 16 bytes to copy
+ // and also that src and dest do _not_ have the same alignment.
+ and src2=0x7,src1 // src offset
+ and dst2=0x7,dst1 // dst offset
+ ;;
+ // The basic idea is that we copy byte-by-byte at the head so
+ // that we can reach 8-byte alignment for both src1 and dst1.
+ // Then copy the body using software pipelined 8-byte copy,
+ // shifting the two back-to-back words right and left, then copy
+ // the tail by copying byte-by-byte.
+ //
+ // Fault handling. If the byte-by-byte at the head fails on the
+ // load, then restart and finish the pipleline by copying zeros
+ // to the dst1. Then copy zeros for the rest of dst1.
+ // If 8-byte software pipeline fails on the load, do the same as
+ // failure_in3 does. If the byte-by-byte at the tail fails, it
is
+ // handled simply by failure_in_pipe1.
+ //
+ // The case p14 represents the source has more bytes in the
+ // the first word (by the shifted part), whereas the p15 needs
to
+ // copy some bytes from the 2nd word of the source that has the
+ // tail of the 1st of the destination.
+ //
+
+ //
+ // Optimization. If dst1 is 8-byte aligned (not rarely), we
don't need
+ // to copy the head to dst1, to start 8-byte copy software
pipleline.
+ // We know src1 is not 8-byte aligned in this case.
+ //
+ cmp.eq p14,p15=r0,dst2
+ (p15) br.cond.spnt.few 1f
+ ;;
+ sub t1=8,src2
+ mov t2=src2
+ ;;
+ shl rshift=t2,3
+ sub len1=len,t1 // set len1
+ ;;
+ sub lshiftd,rshift
+ ;;
+ br.cond.spnt.few word_copy_user
+ ;;
+ 1:
+ cmp.leu p14,p15=src2,dst2
+ sub t1=dst2,src2
+ ;;
+ .pred.rel "mutex", p14, p15
+ (p14) sub word1=8,src2 // (8 - src
offset)
+ (p15) sub t1=r0,t1 // absolute
value
+ (p15) sub word1=8,dst2 // (8 - dst
offset)
+ ;;
+ // For the case p14, we don't need to copy the shifted part to
+ // the 1st word of destination.
+ sub t2=8,t1
+ (p14) sub word1=word1,t1
+ ;;
+ sub len1=len,word1 // resulting len
+ (p15) shl rshift=t1,3 // in bits
+ (p14) shl rshift=t2,3
+ ;;
+ (p14) sub len1=len1,t1
+ adds cnt=-1,word1
+ ;;
+ sub lshiftd,rshift
+ mov ar.ec=PIPE_DEPTH
+ mov pr.rot=1<<16 // p16=true all others are false
+ mov ar.lc=cnt
+ ;;
+ 2:
+ EX(failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
+ ;;
+ EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+ br.ctop.dptk.few 2b
+ ;;
+ clrrrb
+ ;;
+ word_copy_user:
+ cmp.gtu p9,p0\x16,len1
+ (p9) br.cond.spnt.few 4f // if (16 > len1) skip 8-byte
copy
+ ;;
+ shr.u cnt=len1,3 // number of 64-bit words
+ ;;
+ adds cnt=-1,cnt
+ ;;
+ .pred.rel "mutex", p14, p15
+ (p14) sub src1=src1,t2
+ (p15) sub src1=src1,t1
+ //
+ // Now both src1 and dst1 point to an 8-byte alinged address.
And
+ // we have more than 8 bytes to copy.
+ //
+ mov ar.lc=cnt
+ mov ar.ec=PIPE_DEPTH
+ mov pr.rot=1<<16 // p16=true all others are false
+ ;;
+ 3:
+ //
+ // The pipleline consists of 3 stages:
+ // 1 (p16): Load a word from src1
+ // 2 (EPI_1): Shift right pair, saving to tmp
+ // 3 (EPI): Store tmp to dst1
+ //
+ // To make it simple, use at least 2 (p16) loops to set up
val1[n]
+ // because we need 2 back-to-back val1[] to get tmp.
+ // Note that this implies EPI_2 must be p18 or greater.
+ //
+
+ #define EPI_1 p[PIPE_DEPTH-2]
+ #define SWITCH(pred, shift) cmp.eq pred,p0=shift,rshift
+ #define CASE(pred, shift) \
+ (pred) br.cond.spnt.few copy_user_bit##shift
+ #define BODY(rshift)
\
+ copy_user_bit##rshift:
\
+ 1:
\
+ EX(failure_out,(EPI) st8 [dst1]=tmp,8);
\
+ (EPI_1) shrp tmp=val1[PIPE_DEPTH-3],val1[PIPE_DEPTH-2],rshift;
\
+ EX(failure_in2,(p16) ld8 val1[0]=[src1],8);
\
+ br.ctop.dptk.few 1b;
\
+ ;;
\
+ br.cond.spnt.few .diff_align_do_tail
+
+ //
+ // Since the instruction 'shrp' requires a fixed 128-bit value
+ // specifying the bits to shift, we need to provide 7 cases
+ // below.
+ //
+ SWITCH(p6, 8)
+ SWITCH(p7, 16)
+ SWITCH(p8, 24)
+ SWITCH(p9, 32)
+ SWITCH(p10, 40)
+ SWITCH(p11, 48)
+ SWITCH(p12, 56)
+ ;;
+ CASE(p6, 8)
+ CASE(p7, 16)
+ CASE(p8, 24)
+ CASE(p9, 32)
+ CASE(p10, 40)
+ CASE(p11, 48)
+ CASE(p12, 56)
+ ;;
+ BODY(8)
+ BODY(16)
+ BODY(24)
+ BODY(32)
+ BODY(40)
+ BODY(48)
+ BODY(56)
+ ;;
+ .diff_align_do_tail:
+ .pred.rel "mutex", p14, p15
+ (p14) sub src1=src1,t1
+ (p14) adds dst1=-8,dst1
+ (p15) sub dst1=dst1,t1
+ ;;
+ 4:
+ // Tail correction.
+ //
+ // The problem with this piplelined loop is that the last word
is not
+ // loaded and thus parf of the last word written is not correct.
+ // To fix that, we simply copy the tail byte by byte.
+
+ sub len1=endsrc,src1,1
+ clrrrb
+ ;;
+ mov ar.ec=PIPE_DEPTH
+ mov pr.rot=1<<16 // p16=true all others are false
+ mov ar.lc=len1
+ ;;
+ 5:
+ EX(failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
+
+ EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
+ br.ctop.dptk.few 5b
+ ;;
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.pfs=saved_pfs
+ br.ret.dptk.few rp
+
+ //
// Beginning of long mempcy (i.e. > 16 bytes)
//
long_copy_user:
***************
*** 142,148 ****
;;
cmp.eq p10,p8=r0,tmp
mov len1=len // copy because of rotation
! (p8) br.cond.dpnt.few 1b // XXX Fixme. memcpy_diff_align
;;
// At this point we know we have more than 16 bytes to copy
// and also that both src and dest have the same alignment
--- 332,338 ----
;;
cmp.eq p10,p8=r0,tmp
mov len1=len // copy because of rotation
! (p8) br.cond.dpnt.few diff_align_copy_user
;;
// At this point we know we have more than 16 bytes to copy
// and also that both src and dest have the same alignment
***************
*** 267,272 ****
--- 457,477 ----
mov ar.pfs=saved_pfs
br.ret.dptk.few rp
+ //
+ // This is the case where the byte by byte copy fails on the
load
+ // when we copy the head. We need to finish the pipeline and
copy
+ // zeros for the rest of the destination. Since this happens
+ // at the top we still need to fill the body and tail.
+ failure_in_pipe2:
+ sub ret0=endsrc,src1 // number of bytes to zero, i.e. not
copied
+ 2:
+ (p16) mov val1[0]=r0
+ (EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1
+ br.ctop.dptk.few 2b
+ ;;
+ sub len=enddst,dst1,1 // precompute len
+ br.cond.dptk.few failure_in1bis
+ ;;
//
// Here we handle the head & tail part when we check for
alignment.
***************
*** 395,400 ****
--- 600,622 ----
mov ar.pfs=saved_pfs
br.ret.dptk.few rp
+ failure_in2:
+ sub ret0=endsrc,src1 // number of bytes to zero, i.e. not
copied
+ ;;
+ 3:
+ (p16) mov val1[0]=r0
+ (EPI) st8 [dst1]=val1[PIPE_DEPTH-1],8
+ br.ctop.dptk.few 3b
+ ;;
+ cmp.ne p6,p0=dst1,enddst // Do we need to finish the tail
?
+ sub len=enddst,dst1,1 // precompute len
+ (p6) br.cond.dptk.few failure_in1bis
+ ;;
+ mov pr=saved_pr,0xffffffffffff0000
+ mov ar.lc=saved_lc
+ mov ar.pfs=saved_pfs
+ br.ret.dptk.few rp
+
//
// handling of failures on stores: that's the easy part
//
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2000-10-23 20:04 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2000-10-23 20:04 [Linux-ia64] Update: optimizing __copy_user Jun Nakajima
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox