From: Peter Zijlstra <peterz@infradead.org>
To: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>,
rjw@rjwysocki.net, mingo@kernel.org, vincent.guittot@linaro.org,
dietmar.eggemann@arm.com, rostedt@goodmis.org, mgorman@suse.de,
bigeasy@linutronix.de, Will Deacon <will@kernel.org>,
linux-kernel@vger.kernel.org, tj@kernel.org,
linux-pm@vger.kernel.org
Subject: Re: [RFC][PATCH] ptrace: Don't change __state
Date: Thu, 21 Apr 2022 12:26:31 +0200 [thread overview]
Message-ID: <20220421102631.GE2762@worktop.programming.kicks-ass.net> (raw)
In-Reply-To: <20220421072138.GI2731@worktop.programming.kicks-ass.net>
On Thu, Apr 21, 2022 at 09:21:38AM +0200, Peter Zijlstra wrote:
> > @@ -267,13 +266,13 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
> > read_unlock(&tasklist_lock);
> >
> > if (!ret && !ignore_state) {
> > - if (!wait_task_inactive(child, __TASK_TRACED)) {
> > + if (!wait_task_inactive(child, TASK_TRACED)) {
>
> This is still very dubious, there are spinlocks between
> set_current_state(TASK_TRACED) and schedule(), so wait_task_inactive()
> can fail where we don't want it to due to TASK_TRACED being temporarily
> held in ->saved_state.
As such, I've taken the liberty of munging yours and Oleg's approach
together. I've yet to actually test this but it looks feasible.
---
--- a/include/linux/sched/jobctl.h
+++ b/include/linux/sched/jobctl.h
@@ -19,9 +19,11 @@ struct task_struct;
#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
#define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */
+#define JOBCTL_DELAY_WAKEKILL_BIT 24 /* delay killable wakeups */
-#define JOBCTL_STOPPED_BIT 24
-#define JOBCTL_TRACED_BIT 25
+#define JOBCTL_STOPPED_BIT 25
+#define JOBCTL_TRACED_BIT 26
+#define JOBCTL_TRACED_QUIESCE_BIT 27
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
@@ -31,9 +33,11 @@ struct task_struct;
#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
#define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT)
+#define JOBCTL_DELAY_WAKEKILL (1UL << JOBCTL_DELAY_WAKEKILL_BIT)
#define JOBCTL_STOPPED (1UL << JOBCTL_STOPPED_BIT)
#define JOBCTL_TRACED (1UL << JOBCTL_TRACED_BIT)
+#define JOBCTL_TRACED_QUIESCE (1UL << JOBCTL_TRACED_QUIESCE_BIT)
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -193,26 +193,32 @@ static bool looks_like_a_spurious_pid(st
*/
static bool ptrace_freeze_traced(struct task_struct *task)
{
+ unsigned long flags;
bool ret = false;
/* Lockless, nobody but us can set this flag */
if (task->jobctl & JOBCTL_LISTENING)
return ret;
- spin_lock_irq(&task->sighand->siglock);
- if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
+ if (!lock_task_sighand(task, &flags))
+ return ret;
+
+ if (READ_ONCE(task->__state) == TASK_TRACED &&
+ !looks_like_a_spurious_pid(task) &&
!__fatal_signal_pending(task)) {
- WRITE_ONCE(task->__state, __TASK_TRACED);
+ WARN_ON_ONCE(!task_is_traced(task));
+ WARN_ON_ONCE(task->jobctl & JOBCTL_DELAY_WAKEKILL);
+ task->jobctl |= JOBCTL_DELAY_WAKEKILL;
ret = true;
}
- spin_unlock_irq(&task->sighand->siglock);
+ unlock_task_sighand(task, &flags);
return ret;
}
static void ptrace_unfreeze_traced(struct task_struct *task)
{
- if (READ_ONCE(task->__state) != __TASK_TRACED)
+ if (!task_is_traced(task))
return;
WARN_ON(!task->ptrace || task->parent != current);
@@ -222,12 +228,11 @@ static void ptrace_unfreeze_traced(struc
* Recheck state under the lock to close this race.
*/
spin_lock_irq(&task->sighand->siglock);
- if (READ_ONCE(task->__state) == __TASK_TRACED) {
- if (__fatal_signal_pending(task)) {
- task->jobctl &= ~JOBCTL_TRACED;
- wake_up_state(task, __TASK_TRACED);
- } else
- WRITE_ONCE(task->__state, TASK_TRACED);
+// WARN_ON_ONCE(!(task->jobctl & JOBCTL_DELAY_WAKEKILL));
+ task->jobctl &= ~JOBCTL_DELAY_WAKEKILL;
+ if (__fatal_signal_pending(task)) {
+ task->jobctl &= ~JOBCTL_TRACED;
+ wake_up_state(task, TASK_WAKEKILL);
}
spin_unlock_irq(&task->sighand->siglock);
}
@@ -251,40 +256,46 @@ static void ptrace_unfreeze_traced(struc
*/
static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
- int ret = -ESRCH;
+ int traced;
/*
* We take the read lock around doing both checks to close a
- * possible race where someone else was tracing our child and
- * detached between these two checks. After this locked check,
- * we are sure that this is our traced child and that can only
- * be changed by us so it's not changing right after this.
+ * possible race where someone else attaches or detaches our
+ * natural child.
*/
read_lock(&tasklist_lock);
- if (child->ptrace && child->parent == current) {
- WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
- /*
- * child->sighand can't be NULL, release_task()
- * does ptrace_unlink() before __exit_signal().
- */
- if (ignore_state || ptrace_freeze_traced(child))
- ret = 0;
- }
+ traced = child->ptrace && child->parent == current;
read_unlock(&tasklist_lock);
+ if (!traced)
+ return -ESRCH;
- if (!ret && !ignore_state) {
- if (!wait_task_inactive(child, __TASK_TRACED)) {
- /*
- * This can only happen if may_ptrace_stop() fails and
- * ptrace_stop() changes ->state back to TASK_RUNNING,
- * so we should not worry about leaking __TASK_TRACED.
- */
- WARN_ON(READ_ONCE(child->__state) == __TASK_TRACED);
- ret = -ESRCH;
- }
+ WARN_ON_ONCE(READ_ONCE(child->__state) == __TASK_TRACED);
+ WARN_ON_ONCE(READ_ONCE(child->jobctl) & JOBCTL_DELAY_WAKEKILL);
+
+ if (ignore_state)
+ return 0;
+
+ if (!task_is_traced(child))
+ return -ESRCH;
+
+ /* wait for JOBCTL_TRACED_QUIESCE to go away, see ptrace_stop() */
+ for (;;) {
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ set_current_state(TASK_KILLABLE);
+ if (!(READ_ONCE(child->jobctl) & JOBCTL_TRACED_QUIESCE))
+ break;
+
+ schedule();
}
+ __set_current_state(TASK_RUNNING);
- return ret;
+ if (!wait_task_inactive(child, TASK_TRACED) ||
+ !ptrace_freeze_traced(child))
+ return -ESRCH;
+
+ return 0;
}
static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
@@ -1329,8 +1340,7 @@ SYSCALL_DEFINE4(ptrace, long, request, l
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
- if (ret || request != PTRACE_DETACH)
- ptrace_unfreeze_traced(child);
+ ptrace_unfreeze_traced(child);
out_put_task_struct:
put_task_struct(child);
@@ -1472,8 +1482,7 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_lo
request == PTRACE_INTERRUPT);
if (!ret) {
ret = compat_arch_ptrace(child, request, addr, data);
- if (ret || request != PTRACE_DETACH)
- ptrace_unfreeze_traced(child);
+ ptrace_unfreeze_traced(child);
}
out_put_task_struct:
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -764,6 +764,10 @@ void signal_wake_up_state(struct task_st
{
lockdep_assert_held(&t->sighand->siglock);
+ /* Suppress wakekill? */
+ if (t->jobctl & JOBCTL_DELAY_WAKEKILL)
+ state &= ~TASK_WAKEKILL;
+
set_tsk_thread_flag(t, TIF_SIGPENDING);
/*
@@ -774,7 +778,7 @@ void signal_wake_up_state(struct task_st
* handle its death signal.
*/
if (wake_up_state(t, state | TASK_INTERRUPTIBLE))
- t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
+ t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED | JOBCTL_TRACED_QUIESCE);
else
kick_process(t);
}
@@ -2187,6 +2191,15 @@ static void do_notify_parent_cldstop(str
spin_unlock_irqrestore(&sighand->siglock, flags);
}
+static void clear_traced_quiesce(void)
+{
+ spin_lock_irq(¤t->sighand->siglock);
+ WARN_ON_ONCE(!(current->jobctl & JOBCTL_TRACED_QUIESCE));
+ current->jobctl &= ~JOBCTL_TRACED_QUIESCE;
+ wake_up_state(current->parent, TASK_KILLABLE);
+ spin_unlock_irq(¤t->sighand->siglock);
+}
+
/*
* This must be called with current->sighand->siglock held.
*
@@ -2225,7 +2238,7 @@ static int ptrace_stop(int exit_code, in
* schedule() will not sleep if there is a pending signal that
* can awaken the task.
*/
- current->jobctl |= JOBCTL_TRACED;
+ current->jobctl |= JOBCTL_TRACED | JOBCTL_TRACED_QUIESCE;
set_special_state(TASK_TRACED);
/*
@@ -2290,14 +2303,26 @@ static int ptrace_stop(int exit_code, in
/*
* Don't want to allow preemption here, because
* sys_ptrace() needs this task to be inactive.
- *
- * XXX: implement read_unlock_no_resched().
*/
preempt_disable();
read_unlock(&tasklist_lock);
- cgroup_enter_frozen();
+ cgroup_enter_frozen(); // XXX broken on PREEMPT_RT !!!
+
+ /*
+ * JOBCTL_TRACE_QUIESCE bridges the gap between
+ * set_current_state(TASK_TRACED) above and schedule() below.
+ * There must not be any blocking (specifically anything that
+ * touched ->saved_state on PREEMPT_RT) between here and
+ * schedule().
+ *
+ * ptrace_check_attach() relies on this with its
+ * wait_task_inactive() usage.
+ */
+ clear_traced_quiesce();
+
preempt_enable_no_resched();
freezable_schedule();
+
cgroup_leave_frozen(true);
} else {
/*
@@ -2335,6 +2360,7 @@ static int ptrace_stop(int exit_code, in
/* LISTENING can be set only during STOP traps, clear it */
current->jobctl &= ~JOBCTL_LISTENING;
+ current->jobctl &= ~JOBCTL_DELAY_WAKEKILL;
/*
* Queued signals ignored us while we were stopped for tracing.
next prev parent reply other threads:[~2022-04-21 10:27 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-04-12 11:44 [PATCH 0/5] ptrace-vs-PREEMPT_RT and freezer rewrite Peter Zijlstra
2022-04-12 11:44 ` [PATCH 1/5] sched,signal,ptrace: Rework TASK_TRACED, TASK_STOPPED state Peter Zijlstra
2022-04-13 13:29 ` Oleg Nesterov
2022-04-13 16:47 ` Peter Zijlstra
2022-04-12 11:44 ` [PATCH 2/5] sched,ptrace: Fix ptrace_check_attach() vs PREEMPT_RT Peter Zijlstra
2022-04-13 13:24 ` Oleg Nesterov
2022-04-13 16:58 ` Peter Zijlstra
2022-04-13 18:57 ` Oleg Nesterov
2022-04-13 18:59 ` Oleg Nesterov
2022-04-13 19:20 ` Peter Zijlstra
2022-04-13 19:56 ` Peter Zijlstra
2022-04-14 11:54 ` Oleg Nesterov
2022-04-14 12:08 ` Oleg Nesterov
2022-04-14 18:34 ` Oleg Nesterov
2022-04-14 22:45 ` Peter Zijlstra
2022-04-15 10:16 ` Oleg Nesterov
2022-04-15 10:57 ` Oleg Nesterov
2022-04-15 12:01 ` Peter Zijlstra
2022-04-18 17:01 ` Oleg Nesterov
2022-04-18 17:19 ` Oleg Nesterov
2022-04-20 13:17 ` Peter Zijlstra
2022-04-20 18:03 ` Oleg Nesterov
2022-04-20 20:54 ` [RFC][PATCH] ptrace: Don't change __state Eric W. Biederman
2022-04-21 7:21 ` Peter Zijlstra
2022-04-21 10:26 ` Peter Zijlstra [this message]
2022-04-21 10:49 ` Oleg Nesterov
2022-04-21 11:50 ` Peter Zijlstra
2022-04-21 14:45 ` Eric W. Biederman
2022-04-21 9:46 ` Oleg Nesterov
2022-04-21 15:01 ` Eric W. Biederman
2022-04-21 11:46 ` kernel test robot
2022-04-27 0:51 ` [ptrace] [confidence: ] 7d3fafb751: BUG:sleeping_function_called_from_invalid_context_at_arch/x86/entry/common.c kernel test robot
2022-04-27 0:51 ` kernel test robot
2022-04-20 10:20 ` [PATCH 2/5] sched,ptrace: Fix ptrace_check_attach() vs PREEMPT_RT Peter Zijlstra
2022-04-20 11:35 ` Oleg Nesterov
2022-04-15 12:00 ` Peter Zijlstra
2022-04-15 12:56 ` Oleg Nesterov
2022-04-12 11:44 ` [PATCH 3/5] freezer: Have {,un}lock_system_sleep() save/restore flags Peter Zijlstra
2022-04-12 11:44 ` [PATCH 4/5] freezer,umh: Clean up freezer/initrd interaction Peter Zijlstra
2022-04-12 11:44 ` [PATCH 5/5] freezer,sched: Rewrite core freezer logic Peter Zijlstra
-- strict thread matches above, loose matches on Subject: below --
2022-04-21 13:01 [RFC][PATCH] ptrace: Don't change __state kernel test robot
2022-04-23 11:43 kernel test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220421102631.GE2762@worktop.programming.kicks-ass.net \
--to=peterz@infradead.org \
--cc=bigeasy@linutronix.de \
--cc=dietmar.eggemann@arm.com \
--cc=ebiederm@xmission.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-pm@vger.kernel.org \
--cc=mgorman@suse.de \
--cc=mingo@kernel.org \
--cc=oleg@redhat.com \
--cc=rjw@rjwysocki.net \
--cc=rostedt@goodmis.org \
--cc=tj@kernel.org \
--cc=vincent.guittot@linaro.org \
--cc=will@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.