* [PATCH 1/2] arm64: entry: avoid writing lr explicitly for constructing return paths
@ 2014-11-07 13:32 Will Deacon
2014-11-07 13:32 ` [PATCH 2/2] arm64: entry: use ldp/stp instead of push/pop when saving/restoring regs Will Deacon
0 siblings, 1 reply; 2+ messages in thread
From: Will Deacon @ 2014-11-07 13:32 UTC (permalink / raw)
To: linux-arm-kernel
Using an explicit adr instruction to set the link register to point at
ret_fast_syscall/ret_to_user can defeat branch and return stack predictors.
Instead, use the standard calling instructions (bl, blr) and have an
unconditional branch as the following instruction.
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
arch/arm64/kernel/entry.S | 45 +++++++++++++++++++++++++--------------------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 726b910fe6ec..2cebe56d650c 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -455,8 +455,8 @@ el0_da:
bic x0, x26, #(0xff << 56)
mov x1, x25
mov x2, sp
- adr lr, ret_to_user
- b do_mem_abort
+ bl do_mem_abort
+ b ret_to_user
el0_ia:
/*
* Instruction abort handling
@@ -468,8 +468,8 @@ el0_ia:
mov x0, x26
orr x1, x25, #1 << 24 // use reserved ISS bit for instruction aborts
mov x2, sp
- adr lr, ret_to_user
- b do_mem_abort
+ bl do_mem_abort
+ b ret_to_user
el0_fpsimd_acc:
/*
* Floating Point or Advanced SIMD access
@@ -478,8 +478,8 @@ el0_fpsimd_acc:
ct_user_exit
mov x0, x25
mov x1, sp
- adr lr, ret_to_user
- b do_fpsimd_acc
+ bl do_fpsimd_acc
+ b ret_to_user
el0_fpsimd_exc:
/*
* Floating Point or Advanced SIMD exception
@@ -488,8 +488,8 @@ el0_fpsimd_exc:
ct_user_exit
mov x0, x25
mov x1, sp
- adr lr, ret_to_user
- b do_fpsimd_exc
+ bl do_fpsimd_exc
+ b ret_to_user
el0_sp_pc:
/*
* Stack or PC alignment exception handling
@@ -500,8 +500,8 @@ el0_sp_pc:
mov x0, x26
mov x1, x25
mov x2, sp
- adr lr, ret_to_user
- b do_sp_pc_abort
+ bl do_sp_pc_abort
+ b ret_to_user
el0_undef:
/*
* Undefined instruction
@@ -510,8 +510,8 @@ el0_undef:
enable_dbg_and_irq
ct_user_exit
mov x0, sp
- adr lr, ret_to_user
- b do_undefinstr
+ bl do_undefinstr
+ b ret_to_user
el0_dbg:
/*
* Debug exception handling
@@ -530,8 +530,8 @@ el0_inv:
mov x0, sp
mov x1, #BAD_SYNC
mrs x2, esr_el1
- adr lr, ret_to_user
- b bad_mode
+ bl bad_mode
+ b ret_to_user
ENDPROC(el0_sync)
.align 6
@@ -653,14 +653,15 @@ el0_svc_naked: // compat entry point
ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks
tst x16, #_TIF_SYSCALL_WORK
b.ne __sys_trace
- adr lr, ret_fast_syscall // return address
cmp scno, sc_nr // check upper syscall limit
b.hs ni_sys
ldr x16, [stbl, scno, lsl #3] // address in the syscall table
- br x16 // call sys_* routine
+ blr x16 // call sys_* routine
+ b ret_fast_syscall
ni_sys:
mov x0, sp
- b do_ni_syscall
+ bl do_ni_syscall
+ b ret_fast_syscall
ENDPROC(el0_svc)
/*
@@ -670,17 +671,16 @@ ENDPROC(el0_svc)
__sys_trace:
mov x0, sp
bl syscall_trace_enter
- adr lr, __sys_trace_return // return address
uxtw scno, w0 // syscall number (possibly new)
mov x1, sp // pointer to regs
cmp scno, sc_nr // check upper syscall limit
- b.hs ni_sys
+ b.hs __ni_sys_trace
ldp x0, x1, [sp] // restore the syscall args
ldp x2, x3, [sp, #S_X2]
ldp x4, x5, [sp, #S_X4]
ldp x6, x7, [sp, #S_X6]
ldr x16, [stbl, scno, lsl #3] // address in the syscall table
- br x16 // call sys_* routine
+ blr x16 // call sys_* routine
__sys_trace_return:
str x0, [sp] // save returned x0
@@ -688,6 +688,11 @@ __sys_trace_return:
bl syscall_trace_exit
b ret_to_user
+__ni_sys_trace:
+ mov x0, sp
+ bl do_ni_syscall
+ b __sys_trace_return
+
/*
* Special system call wrappers.
*/
--
2.1.1
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH 2/2] arm64: entry: use ldp/stp instead of push/pop when saving/restoring regs
2014-11-07 13:32 [PATCH 1/2] arm64: entry: avoid writing lr explicitly for constructing return paths Will Deacon
@ 2014-11-07 13:32 ` Will Deacon
0 siblings, 0 replies; 2+ messages in thread
From: Will Deacon @ 2014-11-07 13:32 UTC (permalink / raw)
To: linux-arm-kernel
The push/pop instructions can be suboptimal when saving/restoring large
amounts of data to/from the stack, for example on entry/exit from the
kernel. This is because:
(1) They act on descending addresses (i.e. the newly decremented sp),
which may defeat some hardware prefetchers
(2) They introduce an implicit dependency between each instruction, as
the sp has to be updated in order to resolve the address of the
next access.
This patch removes the push/pop instructions from our kernel entry/exit
macros in favour of ldp/stp plus offset.
Signed-off-by: Will Deacon <will.deacon@arm.com>
---
arch/arm64/kernel/entry.S | 75 +++++++++++++++++++++++------------------------
1 file changed, 37 insertions(+), 38 deletions(-)
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2cebe56d650c..622a409916f3 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -64,25 +64,26 @@
#define BAD_ERROR 3
.macro kernel_entry, el, regsize = 64
- sub sp, sp, #S_FRAME_SIZE - S_LR // room for LR, SP, SPSR, ELR
+ sub sp, sp, #S_FRAME_SIZE
.if \regsize == 32
mov w0, w0 // zero upper 32 bits of x0
.endif
- push x28, x29
- push x26, x27
- push x24, x25
- push x22, x23
- push x20, x21
- push x18, x19
- push x16, x17
- push x14, x15
- push x12, x13
- push x10, x11
- push x8, x9
- push x6, x7
- push x4, x5
- push x2, x3
- push x0, x1
+ stp x0, x1, [sp, #16 * 0]
+ stp x2, x3, [sp, #16 * 1]
+ stp x4, x5, [sp, #16 * 2]
+ stp x6, x7, [sp, #16 * 3]
+ stp x8, x9, [sp, #16 * 4]
+ stp x10, x11, [sp, #16 * 5]
+ stp x12, x13, [sp, #16 * 6]
+ stp x14, x15, [sp, #16 * 7]
+ stp x16, x17, [sp, #16 * 8]
+ stp x18, x19, [sp, #16 * 9]
+ stp x20, x21, [sp, #16 * 10]
+ stp x22, x23, [sp, #16 * 11]
+ stp x24, x25, [sp, #16 * 12]
+ stp x26, x27, [sp, #16 * 13]
+ stp x28, x29, [sp, #16 * 14]
+
.if \el == 0
mrs x21, sp_el0
get_thread_info tsk // Ensure MDSCR_EL1.SS is clear,
@@ -118,33 +119,31 @@
.if \el == 0
ct_user_enter
ldr x23, [sp, #S_SP] // load return stack pointer
+ msr sp_el0, x23
.endif
+ msr elr_el1, x21 // set up the return data
+ msr spsr_el1, x22
.if \ret
ldr x1, [sp, #S_X1] // preserve x0 (syscall return)
- add sp, sp, S_X2
.else
- pop x0, x1
- .endif
- pop x2, x3 // load the rest of the registers
- pop x4, x5
- pop x6, x7
- pop x8, x9
- msr elr_el1, x21 // set up the return data
- msr spsr_el1, x22
- .if \el == 0
- msr sp_el0, x23
+ ldp x0, x1, [sp, #16 * 0]
.endif
- pop x10, x11
- pop x12, x13
- pop x14, x15
- pop x16, x17
- pop x18, x19
- pop x20, x21
- pop x22, x23
- pop x24, x25
- pop x26, x27
- pop x28, x29
- ldr lr, [sp], #S_FRAME_SIZE - S_LR // load LR and restore SP
+ ldp x2, x3, [sp, #16 * 1]
+ ldp x4, x5, [sp, #16 * 2]
+ ldp x6, x7, [sp, #16 * 3]
+ ldp x8, x9, [sp, #16 * 4]
+ ldp x10, x11, [sp, #16 * 5]
+ ldp x12, x13, [sp, #16 * 6]
+ ldp x14, x15, [sp, #16 * 7]
+ ldp x16, x17, [sp, #16 * 8]
+ ldp x18, x19, [sp, #16 * 9]
+ ldp x20, x21, [sp, #16 * 10]
+ ldp x22, x23, [sp, #16 * 11]
+ ldp x24, x25, [sp, #16 * 12]
+ ldp x26, x27, [sp, #16 * 13]
+ ldp x28, x29, [sp, #16 * 14]
+ ldr lr, [sp, #S_LR]
+ add sp, sp, #S_FRAME_SIZE // restore sp
eret // return to kernel
.endm
--
2.1.1
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2014-11-07 13:32 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-11-07 13:32 [PATCH 1/2] arm64: entry: avoid writing lr explicitly for constructing return paths Will Deacon
2014-11-07 13:32 ` [PATCH 2/2] arm64: entry: use ldp/stp instead of push/pop when saving/restoring regs Will Deacon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).