[tip:x86/seccomp] x86: Split syscall_trace_enter into two phases

All of lore.kernel.org
 help / color / mirror / Atom feed

From: tip-bot for Andy Lutomirski <tipbot@zytor.com>
To: linux-tip-commits@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, luto@amacapital.net, hpa@zytor.com,
	mingo@kernel.org, tglx@linutronix.de, hpa@linux.intel.com
Subject: [tip:x86/seccomp] x86: Split syscall_trace_enter into two phases
Date: Mon, 8 Sep 2014 19:44:10 -0700	[thread overview]
Message-ID: <tip-e0ffbaabc46db508b8717f023c0ce03b980eefac@git.kernel.org> (raw)
In-Reply-To: <2df320a600020fda055fccf2b668145729dd0c04.1409954077.git.luto@amacapital.net>

Commit-ID:  e0ffbaabc46db508b8717f023c0ce03b980eefac
Gitweb:     http://git.kernel.org/tip/e0ffbaabc46db508b8717f023c0ce03b980eefac
Author:     Andy Lutomirski <luto@amacapital.net>
AuthorDate: Fri, 5 Sep 2014 15:13:54 -0700
Committer:  H. Peter Anvin <hpa@linux.intel.com>
CommitDate: Mon, 8 Sep 2014 14:14:03 -0700

x86: Split syscall_trace_enter into two phases

This splits syscall_trace_enter into syscall_trace_enter_phase1 and
syscall_trace_enter_phase2.  Only phase 2 has full pt_regs, and only
phase 2 is permitted to modify any of pt_regs except for orig_ax.

The intent is that phase 1 can be called from the syscall fast path.

In this implementation, phase1 can handle any combination of
TIF_NOHZ (RCU context tracking), TIF_SECCOMP, and TIF_SYSCALL_AUDIT,
unless seccomp requests a ptrace event, in which case phase2 is
forced.

In principle, this could yield a big speedup for TIF_NOHZ as well as
for TIF_SECCOMP if syscall exit work were similarly split up.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Link: http://lkml.kernel.org/r/2df320a600020fda055fccf2b668145729dd0c04.1409954077.git.luto@amacapital.net
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/ptrace.h |   5 ++
 arch/x86/kernel/ptrace.c      | 157 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 138 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6205f0c..86fc2bb 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -75,6 +75,11 @@ convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 			 int error_code, int si_code);
 
+
+extern unsigned long syscall_trace_enter_phase1(struct pt_regs *, u32 arch);
+extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
+				       unsigned long phase1_result);
+
 extern long syscall_trace_enter(struct pt_regs *);
 extern void syscall_trace_leave(struct pt_regs *);
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index bbf338a..29576c2 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1441,20 +1441,126 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	force_sig_info(SIGTRAP, &info, tsk);
 }
 
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{	/* Pass arch, regs->orig_ax and four argument registers to audit_syscall_entry(). */
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {	/* arguments read from di, si, dx, r10 */
+		audit_syscall_entry(arch, regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{	/* any other arch value: arguments read from bx, cx, dx, si */
+		audit_syscall_entry(arch, regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
+
 /*
- * We must return the syscall number to actually look up in the table.
- * This can be -1L to skip running any syscall at all.
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2.  If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
  */
-long syscall_trace_enter(struct pt_regs *regs)
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
 {
-	long ret = 0;
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
 
 	/*
 	 * If TIF_NOHZ is set, we are required to call user_exit() before
 	 * doing anything that could touch RCU.
 	 */
-	if (test_thread_flag(TIF_NOHZ))
+	if (work & _TIF_NOHZ) {
 		user_exit();
+		work &= ~_TIF_NOHZ;	/* clear the mask (_TIF_NOHZ), not the bit number (TIF_NOHZ) */
+	}
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
+{
+	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	BUG_ON(regs != task_pt_regs(current));
 
 	/*
 	 * If we stepped into a sysenter/syscall insn, it trapped in
@@ -1463,17 +1569,21 @@ long syscall_trace_enter(struct pt_regs *regs)
 	 * do_debug() and we need to set it again to restore the user
 	 * state.  If we entered on the slow path, TF was already set.
 	 */
-	if (test_thread_flag(TIF_SINGLESTEP))
+	if (work & _TIF_SINGLESTEP)
 		regs->flags |= X86_EFLAGS_TF;
 
-	/* do the secure computing check first */
-	if (secure_computing()) {
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
 		/* seccomp failures shouldn't expose any additional code. */
-		ret = -1L;
-		goto out;
+		return -1;
 	}
+#endif
 
-	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+	if (unlikely(work & _TIF_SYSCALL_EMU))
 		ret = -1L;
 
 	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
@@ -1483,23 +1593,22 @@ long syscall_trace_enter(struct pt_regs *regs)
 	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
 		trace_sys_enter(regs, regs->orig_ax);
 
-	if (is_ia32_task())
-		audit_syscall_entry(AUDIT_ARCH_I386,
-				    regs->orig_ax,
-				    regs->bx, regs->cx,
-				    regs->dx, regs->si);
-#ifdef CONFIG_X86_64
-	else
-		audit_syscall_entry(AUDIT_ARCH_X86_64,
-				    regs->orig_ax,
-				    regs->di, regs->si,
-				    regs->dx, regs->r10);
-#endif
+	do_audit_syscall_entry(regs, arch);
 
-out:
 	return ret ?: regs->orig_ax;
 }
 
+long syscall_trace_enter(struct pt_regs *regs)
+{	/* Combined entry point: run phase 1, then phase 2 only when phase 1 returns non-zero. */
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+	if (phase1_result == 0)	/* 0 means "resume the syscall": use the nr left in orig_ax */
+		return regs->orig_ax;
+	else
+		return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
 void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;

next prev parent reply	other threads:[~2014-09-09  2:44 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-09-05 22:13 [PATCH v5 0/5] x86: two-phase syscall tracing and seccomp fastpath Andy Lutomirski
2014-09-05 22:13 ` Andy Lutomirski
2014-09-05 22:13 ` [PATCH v5 1/5] x86,x32,audit: Fix x32's AUDIT_ARCH wrt audit Andy Lutomirski
2014-09-05 22:13   ` Andy Lutomirski
2014-09-09  2:43   ` [tip:x86/seccomp] x86, x32, audit: " tip-bot for Andy Lutomirski
2014-09-05 22:13 ` [PATCH v5 2/5] x86,entry: Only call user_exit if TIF_NOHZ Andy Lutomirski
2014-09-05 22:13   ` Andy Lutomirski
2014-09-09  2:43   ` [tip:x86/seccomp] x86, entry: " tip-bot for Andy Lutomirski
2014-09-05 22:13 ` [PATCH v5 3/5] x86: Split syscall_trace_enter into two phases Andy Lutomirski
2014-09-05 22:13   ` Andy Lutomirski
2014-09-09  2:44   ` tip-bot for Andy Lutomirski [this message]
2015-02-05 21:19   ` Dmitry V. Levin
2015-02-05 21:19     ` Dmitry V. Levin
2015-02-05 21:27     ` Kees Cook
2015-02-05 21:27       ` Kees Cook
2015-02-05 21:40       ` Dmitry V. Levin
2015-02-05 21:40         ` Dmitry V. Levin
2015-02-05 21:52         ` Andy Lutomirski
2015-02-05 21:52           ` Andy Lutomirski
2015-02-05 23:12           ` Kees Cook
2015-02-05 23:12             ` Kees Cook
2015-02-05 23:39             ` Dmitry V. Levin
2015-02-05 23:39               ` Dmitry V. Levin
2015-02-05 23:49               ` Kees Cook
2015-02-05 23:49                 ` Kees Cook
2015-02-06  0:09                 ` Andy Lutomirski
2015-02-06  0:09                   ` Andy Lutomirski
2015-02-06  2:32                   ` Dmitry V. Levin
2015-02-06  2:32                     ` Dmitry V. Levin
2015-02-06  2:38                     ` Andy Lutomirski
2015-02-06  2:38                       ` Andy Lutomirski
2015-02-06 19:23                       ` Kees Cook
2015-02-06 19:23                         ` Kees Cook
2015-02-06 19:32                         ` Andy Lutomirski
2015-02-06 19:32                           ` Andy Lutomirski
2015-02-06 20:07                           ` Kees Cook
2015-02-06 20:07                             ` Kees Cook
2015-02-06 20:12                             ` Andy Lutomirski
2015-02-06 20:12                               ` Andy Lutomirski
2015-02-06 20:16                               ` Kees Cook
2015-02-06 20:16                                 ` Kees Cook
2015-02-06 20:20                                 ` Andy Lutomirski
2015-02-06 20:20                                   ` Andy Lutomirski
2015-02-06 23:17                             ` a method to distinguish between syscall-enter/exit-stop Dmitry V. Levin
2015-02-06 23:17                               ` Dmitry V. Levin
2015-02-07  1:07                               ` Kees Cook
2015-02-07  1:07                                 ` Kees Cook
2015-02-07  3:04                                 ` Dmitry V. Levin
2015-02-07  3:04                                   ` Dmitry V. Levin
2015-02-06 20:11                         ` [PATCH v5 3/5] x86: Split syscall_trace_enter into two phases H. Peter Anvin
2015-02-06 20:11                           ` H. Peter Anvin
2014-09-05 22:13 ` [PATCH v5 4/5] x86_64,entry: Treat regs->ax the same in fastpath and slowpath syscalls Andy Lutomirski
2014-09-05 22:13   ` [PATCH v5 4/5] x86_64, entry: " Andy Lutomirski
2014-09-09  2:44   ` [tip:x86/seccomp] x86_64, entry: Treat regs-> ax " tip-bot for Andy Lutomirski
2014-09-05 22:13 ` [PATCH v5 5/5] x86_64,entry: Use split-phase syscall_trace_enter for 64-bit syscalls Andy Lutomirski
2014-09-05 22:13   ` [PATCH v5 5/5] x86_64, entry: " Andy Lutomirski
2014-09-09  2:44   ` [tip:x86/seccomp] " tip-bot for Andy Lutomirski
2014-09-08 19:29 ` [PATCH v5 0/5] x86: two-phase syscall tracing and seccomp fastpath Kees Cook
2014-09-08 19:29   ` Kees Cook
2014-09-08 19:49   ` H. Peter Anvin
2014-09-08 19:49     ` H. Peter Anvin

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:6205f0c dfblob:86fc2bb dfblob:bbf338a dfblob:29576c2 )
 OR (
bs:"[tip:x86/seccomp] x86: Split syscall_trace_enter into two phases" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=tip-e0ffbaabc46db508b8717f023c0ce03b980eefac@git.kernel.org \
    --to=tipbot@zytor.com \
    --cc=hpa@linux.intel.com \
    --cc=hpa@zytor.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-tip-commits@vger.kernel.org \
    --cc=luto@amacapital.net \
    --cc=mingo@kernel.org \
    --cc=tglx@linutronix.de \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.