* [ABOMINATION] x86: Fast interrupt return to userspace
@ 2014-05-06 20:29 Andy Lutomirski
From: Andy Lutomirski @ 2014-05-06 20:29 UTC
  To: Linus Torvalds, Thomas Gleixner, Linux Kernel Mailing List, x86,
	Steven Rostedt, Gleb Natapov, Paolo Bonzini
  Cc: Andy Lutomirski

This could be even faster if it were written in assembler :)

The only reason it's Signed-off-by is that I agree to the DCO.
That should not be construed to mean that anyone should apply
this patch.  It's an abomination and it will do terrible,
terrible things.

It boots, though :)  I haven't tested it beyond that.
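
For anyone who'd rather not reverse-engineer the diff: when returning to
64-bit userspace, the interrupt exit path now calls
install_sysret_trampoline(), which stashes the interrupted RCX, R11, and
RIP on the user stack just below the red zone and points the saved RIP at
a little trampoline at the end of the vsyscall page.  The exit path can
then sysretq (which clobbers rcx and r11) instead of iretq, and the
trampoline puts everything back.  Roughly, as a sketch of the resulting
user stack rather than code from the patch:

  /*
   * newrsp = old RSP - 128 (red zone) - 3*8, so the user stack holds:
   *
   *   old RSP - 128 - 1*8:  interrupted RIP
   *   old RSP - 128 - 2*8:  interrupted R11
   *   old RSP - 128 - 3*8:  interrupted RCX   <- sysretq lands with
   *                                              rsp pointing here
   *
   * The vsyscall-page trampoline (at 0xffffffffff600c00) then does:
   *
   *   popq %rcx    // real rcx back
   *   popq %r11    // real r11 back
   *   retq $128    // jump to the real RIP, popping rsp back past
   *                // the red zone to the original value
   */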

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
---
 arch/x86/include/asm/calling.h    | 10 ++++++++++
 arch/x86/kernel/entry_64.S        | 14 ++++++++++++++
 arch/x86/kernel/process_64.c      | 37 +++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/vsyscall_64.c     |  2 +-
 arch/x86/kernel/vsyscall_emu_64.S |  5 +++++
 5 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index cb4c73b..ead0345 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -46,7 +46,9 @@ For 32-bit we have the following conventions - kernel is built with
 
 */
 
+#ifdef __ASSEMBLY__
 #include <asm/dwarf2.h>
+#endif
 
 #ifdef CONFIG_X86_64
 
@@ -85,6 +87,8 @@ For 32-bit we have the following conventions - kernel is built with
 #define ARGOFFSET	R11
 #define SWFRAME		ORIG_RAX
 
+#ifdef __ASSEMBLY__
+
 	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1
 	subq  $9*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
@@ -195,8 +199,12 @@ For 32-bit we have the following conventions - kernel is built with
 	.byte 0xf1
 	.endm
 
+#endif /* __ASSEMBLY__ */
+
 #else /* CONFIG_X86_64 */
 
+#ifdef __ASSEMBLY__
+
 /*
  * For 32bit only simplified versions of SAVE_ALL/RESTORE_ALL. These
  * are different from the entry_32.S versions in not changing the segment
@@ -240,5 +248,7 @@ For 32-bit we have the following conventions - kernel is built with
 	CFI_RESTORE eax
 	.endm
 
+#endif /* __ASSEMBLY__ */
+
 #endif /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1e96c36..7e3eae1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1027,6 +1027,9 @@ retint_swapgs:		/* return to user-space */
 	 */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
+	call install_sysret_trampoline
+	test %rax,%rax
+	jnz iret_via_sysret
 	SWAPGS
 	jmp restore_args
 
@@ -1036,6 +1039,7 @@ retint_restore_args:	/* return to kernel space */
 	 * The iretq could re-enable interrupts:
 	 */
 	TRACE_IRQS_IRETQ
+
 restore_args:
 	RESTORE_ARGS 1,8,1
 
@@ -1043,6 +1047,16 @@ irq_return:
 	INTERRUPT_RETURN
 	_ASM_EXTABLE(irq_return, bad_iret)
 
+iret_via_sysret:
+	SWAPGS
+	RESTORE_ARGS 1,8,1
+	popq %rcx /* RIP */
+	popq %r11 /* CS */
+	popq %r11 /* RFLAGS */
+	popq %rsp /* RSP */
+	          /* ignore SS */
+	sysretq
+
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_iret)
 	iretq
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 9c0280f..e48aced 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -562,3 +562,40 @@ unsigned long KSTK_ESP(struct task_struct *task)
 	return (test_tsk_thread_flag(task, TIF_IA32)) ?
 			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
 }
+
+#include <asm/calling.h>
+
+unsigned long notrace install_sysret_trampoline(void)
+{
+	unsigned long *here = __builtin_frame_address(0);
+	unsigned long *asmframe = here + 2;
+	unsigned long __user * newrsp;
+
+#define FRAMEVAL(x) asmframe[((x)-ARGOFFSET) / 8]
+	newrsp =  (unsigned long __user * __force)(FRAMEVAL(RSP) - 128 - 3*8);
+
+	if (FRAMEVAL(CS) != __USER_CS)
+		return 0;
+
+	/*
+	 * A real implementation would do:
+	 * if (!access_ok(VERIFY_WRITE, newrsp, 3*8))
+	 *		return 0;
+	 */
+
+	if (__put_user(FRAMEVAL(RIP), newrsp + 2))
+		return 0;
+
+	if (__put_user(FRAMEVAL(R11), newrsp + 1))
+		return 0;
+
+	if (__put_user(FRAMEVAL(RCX), newrsp))
+		return 0;
+
+	/* Hi there, optimizer. */
+	ACCESS_ONCE(FRAMEVAL(RIP)) = 0xffffffffff600c00;
+	ACCESS_ONCE(FRAMEVAL(RSP)) = (unsigned long)newrsp;
+	return 1;
+
+#undef FRAMEVAL
+}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8b3b3eb..77a5ef3 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -54,7 +54,7 @@
 
 DEFINE_VVAR(int, vgetcpu_mode);
 
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE;
 
 static int __init vsyscall_setup(char *str)
 {
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S
index c9596a9..a54a780 100644
--- a/arch/x86/kernel/vsyscall_emu_64.S
+++ b/arch/x86/kernel/vsyscall_emu_64.S
@@ -32,6 +32,11 @@ __vsyscall_page:
 	syscall
 	ret
 
+	.balign 1024, 0xcc
+	popq %rcx
+	popq %r11
+	retq $128
+
 	.balign 4096, 0xcc
 
 	.size __vsyscall_page, 4096
-- 
1.9.0


