* [patch] Improve ia64_leave_syscall() for McKinley-type cores.
@ 2005-01-19 5:02 David Mosberger
0 siblings, 0 replies; only message in thread
From: David Mosberger @ 2005-01-19 5:02 UTC (permalink / raw)
To: linux-ia64
Optimize ia64_leave_syscall() a bit better for McKinley-type cores.
The patch looks big, but that's mostly due to renaming r16/r17 to r2/r3.
Good for a 13 cycle improvement.
--david
Signed-off-by: David Mosberger-Tang <davidm@hpl.hp.com>
# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
# 2005/01/18 18:12:07-08:00 davidm@tiger.hpl.hp.com
# ia64: Improve ia64_leave_syscall() for McKinley-type cores. Improves
# (normal) getpid() from 271 to 258 cycles.
#
# arch/ia64/kernel/entry.S
# 2005/01/18 18:11:50-08:00 davidm@tiger.hpl.hp.com +51 -48
# (ia64_leave_syscall): Make it a local function (it's not used anywhere
# else anymore).
# Use r2/r3 as base-pointers instead of r16/r17.
# Load b6 into r18 instead of r22 (frees up a register).
# Read ar.bsp early (_big_ savings!).
# Reschedule for McKinley-type cores.
# Do srlz.i _before_ restoring the stack-pointer or updating
# current->thread.on_ustack.
# (skip_rbs_switch): Clear r2 if pLvSys is TRUE.
#
diff -Nru a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
--- a/arch/ia64/kernel/entry.S 2005-01-18 21:00:24 -08:00
+++ b/arch/ia64/kernel/entry.S 2005-01-18 21:00:24 -08:00
@@ -633,10 +633,12 @@
* r13: restored (user-level thread pointer)
* r14: cleared
* r15: restored (syscall #)
- * r16-r19: cleared
+ * r16-r17: cleared
+ * r18: user-level b6
+ * r19: cleared
* r20: user-level ar.fpsr
* r21: user-level b0
- * r22: user-level b6
+ * r22: cleared
* r23: user-level ar.bspstore
* r24: user-level ar.rnat
* r25: user-level ar.unat
@@ -661,7 +663,7 @@
* ar.csd: cleared
* ar.ssd: cleared
*/
-GLOBAL_ENTRY(ia64_leave_syscall)
+ENTRY(ia64_leave_syscall)
PT_REGS_UNWIND_INFO(0)
/*
* work.need_resched etc. mustn't get changed by this CPU before it returns to
@@ -690,79 +692,80 @@
(pUStk) cmp.eq.unc p6,p0=r0,r0 // p6 <- pUStk
#endif
.work_processed_syscall:
- adds r16=PT(LOADRS)+16,r12
- adds r17=PT(AR_BSPSTORE)+16,r12
+ adds r2=PT(LOADRS)+16,r12
+ adds r3=PT(AR_BSPSTORE)+16,r12
adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
;;
(p6) ld4 r31=[r18] // load current_thread_info()->flags
- ld8 r19=[r16],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs"
- nop.i 0
+ ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs"
+ mov b7=r0 // clear b7
;;
- ld8 r23=[r17],PT(R9)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage)
- ld8 r22=[r16],PT(R8)-PT(B6) // load b6
+ ld8 r23=[r3],PT(R9)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage)
+ ld8 r18=[r2],PT(R8)-PT(B6) // load b6
(p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE?
;;
-
- mov.m ar.ccv=r0 // clear ar.ccv
+ mov r16=ar.bsp // M2 get existing backing store pointer
(p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending?
(p6) br.cond.spnt .work_pending
;;
// start restoring the state saved on the kernel stack (struct pt_regs):
- ld8.fill r8=[r16],16
- ld8.fill r9=[r17],16
+ ld8.fill r8=[r2],16
+ ld8.fill r9=[r3],16
mov f6=f0 // clear f6
;;
- ld8.fill r10=[r16],16
- ld8.fill r11=[r17],16
+ invala // M0|1 invalidate ALAT
+ rsm psr.i | psr.ic // M2 initiate turning off of interrupt and interruption collection
+ mov f9=f0 // clear f9
+
+ ld8.fill r10=[r2],16
+ ld8.fill r11=[r3],16
mov f7=f0 // clear f7
;;
- ld8 r29=[r16],16 // load cr.ipsr
- ld8 r28=[r17],16 // load cr.iip
+ ld8 r29=[r2],16 // load cr.ipsr
+ ld8 r28=[r3],16 // load cr.iip
mov f8=f0 // clear f8
;;
- ld8 r30=[r16],16 // load cr.ifs
- ld8 r25=[r17],16 // load ar.unat
+ ld8 r30=[r2],16 // M0|1 load cr.ifs
+ mov.m ar.ssd=r0 // M2 clear ar.ssd
cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs
;;
- rsm psr.i | psr.ic // initiate turning off of interrupt and interruption collection
- invala // invalidate ALAT
- mov f9=f0 // clear f9
-
- mov.m ar.ssd=r0 // clear ar.ssd
- mov.m ar.csd=r0 // clear ar.csd
+ ld8 r25=[r3],16 // M0|1 load ar.unat
+ mov.m ar.csd=r0 // M2 clear ar.csd
+ mov r22=r0 // clear r22
+ ;;
+ ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs
+ nop.m 0
mov f10=f0 // clear f10
;;
- ld8 r26=[r16],16 // load ar.pfs
- ld8 r27=[r17],PT(PR)-PT(AR_RSC) // load ar.rsc
+ ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0
+ ld8 r27=[r3],PT(PR)-PT(AR_RSC) // load ar.rsc
mov f11=f0 // clear f11
;;
- ld8 r24=[r16],PT(B0)-PT(AR_RNAT) // load ar.rnat (may be garbage)
- ld8 r31=[r17],PT(R1)-PT(PR) // load predicates
+ ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // load ar.rnat (may be garbage)
+ ld8 r31=[r3],PT(R1)-PT(PR) // load predicates
(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13
;;
- ld8 r21=[r16],PT(R12)-PT(B0) // load b0
- ld8.fill r1=[r17],16 // load r1
-(pUStk) mov r3=1
- ;;
- ld8.fill r12=[r16],16
- ld8.fill r13=[r17],16
- mov r2=r0 // clear r2
+ ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // load ar.fpsr
+ ld8.fill r1=[r3],16 // load r1
+(pUStk) mov r17=1
;;
- ld8 r20=[r16] // load ar.fpsr
- ld8.fill r15=[r17] // load r15
- mov b7=r0 // clear b7
+ srlz.i // M0 ensure interruption collection is off
+ ld8.fill r13=[r3],16
+ nop.i 0
;;
-(pUStk) st1 [r14]=r3
- addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0
+ ld8.fill r12=[r2] // restore r12 (sp)
+ ld8.fill r15=[r3] // restore r15
+ addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0
+ ;;
+(pUStk) ld4 r3=[r3] // r3 = cpu_data->phys_stacked_size_p8
+(pUStk) st1 [r14]=r17
+ mov b6=r18 // I0 restore b6
;;
- mov r16=ar.bsp // get existing backing store pointer
- srlz.i // ensure interruption collection is off
+ shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition
mov r14=r0 // clear r14
- ;;
- ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8
- mov b6=r22 // restore b6
- shr.u r18=r19,16 // get byte size of existing "dirty" partition
(pKStk) br.cond.dpnt.many skip_rbs_switch
+
+ mov.m ar.ccv=r0 // clear ar.ccv
(pNonSys) br.cond.dpnt.many dont_preserve_current_frame
br.cond.sptk.many rbs_switch
END(ia64_leave_syscall)
@@ -1054,7 +1057,7 @@
;;
(pUStk) mov ar.rnat=r24 // M2 must happen with RSE in lazy mode
nop 0
- nop 0
+(pLvSys)mov r2=r0
mov ar.rsc=r27 // M2
mov pr=r31,-1 // I0
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2005-01-19 5:02 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-01-19 5:02 [patch] Improve ia64_leave_syscall() for McKinley-type cores David Mosberger
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox