All of lore.kernel.org
 help / color / mirror / Atom feed
From: Oleg Nesterov <oleg@redhat.com>
To: Bernd Edlinger <bernd.edlinger@hotmail.de>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Dmitry Levin <ldv@strace.io>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>,
	Alexey Dobriyan <adobriyan@gmail.com>,
	Kees Cook <kees@kernel.org>,
	Andy Lutomirski <luto@amacapital.net>,
	Will Drewry <wad@chromium.org>,
	Christian Brauner <brauner@kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Michal Hocko <mhocko@suse.com>, Serge Hallyn <serge@hallyn.com>,
	James Morris <jamorris@linux.microsoft.com>,
	Randy Dunlap <rdunlap@infradead.org>,
	Suren Baghdasaryan <surenb@google.com>,
	Yafang Shao <laoar.shao@gmail.com>, Helge Deller <deller@gmx.de>,
	"Eric W. Biederman" <ebiederm@xmission.com>,
	Adrian Reber <areber@redhat.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Jens Axboe <axboe@kernel.dk>, Alexei Starovoitov <ast@kernel.org>,
	"linux-fsdevel@vger.kernel.org" <linux-fsdevel@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	linux-kselftest@vger.kernel.org, linux-mm@kvack.org,
	linux-security-module@vger.kernel.org,
	tiozhang <tiozhang@didiglobal.com>,
	Luis Chamberlain <mcgrof@kernel.org>,
	"Paulo Alcantara (SUSE)" <pc@manguebit.com>,
	Sergey Senozhatsky <senozhatsky@chromium.org>,
	Frederic Weisbecker <frederic@kernel.org>,
	YueHaibing <yuehaibing@huawei.com>,
	Paul Moore <paul@paul-moore.com>,
	Aleksa Sarai <cyphar@cyphar.com>,
	Stefan Roesch <shr@devkernel.io>, Chao Yu <chao@kernel.org>,
	xu xin <xu.xin16@zte.com.cn>, Jeff Layton <jlayton@kernel.org>,
	Jan Kara <jack@suse.cz>, David Hildenbrand <david@redhat.com>,
	Dave Chinner <dchinner@redhat.com>, Shuah Khan <shuah@kernel.org>,
	Elena Reshetova <elena.reshetova@intel.com>,
	David Windsor <dwindsor@gmail.com>,
	Mateusz Guzik <mjguzik@gmail.com>,
	Ard Biesheuvel <ardb@kernel.org>,
	"Joel Fernandes (Google)" <joel@joelfernandes.org>,
	"Matthew Wilcox (Oracle)" <willy@infradead.org>,
	Hans Liljestrand <ishkamiel@gmail.com>,
	Penglei Jiang <superman.xpt@gmail.com>,
	Lorenzo Stoakes <lorenzo.stoakes@oracle.com>,
	Adrian Ratiu <adrian.ratiu@collabora.com>,
	Ingo Molnar <mingo@kernel.org>,
	"Peter Zijlstra (Intel)" <peterz@infradead.org>,
	Cyrill Gorcunov <gorcunov@gmail.com>,
	Eric Dumazet <edumazet@google.com>
Subject: [RFC PATCH 2/3] exec: don't wait for zombie threads with cred_guard_mutex held
Date: Sun, 9 Nov 2025 18:15:33 +0100	[thread overview]
Message-ID: <aRDMNWx-69fL_gf-@redhat.com> (raw)
In-Reply-To: <aRDL3HOB21pMVMWC@redhat.com>

This simple program

    #include <unistd.h>
    #include <signal.h>
    #include <sys/ptrace.h>
    #include <pthread.h>

    /*
     * Start routine of the reproducer's only sub-thread: it issues
     * PTRACE_TRACEME and then returns at once. The arg parameter is unused.
     * NOTE(review): presumably this makes the forking parent the tracer of
     * this sub-thread, so that the parent has to release it after it exits;
     * confirm against ptrace(2).
     */
    void *thread(void *arg)
    {
	    ptrace(PTRACE_TRACEME, 0,0,0);
	    return NULL;
    }

    /*
     * Reproducer driver. The forked child runs one sub-thread (which calls
     * PTRACE_TRACEME), joins it and then execs "echo passed". The parent
     * sleeps for one second, issues PTRACE_ATTACH on the child, sends it
     * SIGCONT and returns 0.
     *
     * NOTE(review): the return values of fork() and execlp() are not
     * checked; if execlp() returned, the child would continue into the
     * parent-side code below with pid == 0.
     */
    int main(void)
    {
	    int pid = fork();

	    if (!pid) {
		    /* child: run the sub-thread to completion, then exec */
		    pthread_t pt;
		    pthread_create(&pt, NULL, thread, NULL);
		    pthread_join(pt, NULL);
		    execlp("echo", "echo", "passed", NULL);
	    }

	    /* presumably gives the child time to reach the exec before the attach */
	    sleep(1);
	    ptrace(PTRACE_ATTACH, pid, 0,0);
	    kill(pid, SIGCONT);

	    return 0;
    }

hangs because de_thread() waits, with cred_guard_mutex held, for the debugger
to release the killed thread, while the debugger sleeps waiting for the same
mutex. This is not really that bad, since the tracer can be killed, but it is
still a bug and people hit it in practice.

With this patch:

	- de_thread() waits until all the sub-threads pass exit_notify() and
	  become zombies.

	- setup_new_exec() waits, without cred_guard_mutex held, until all
	  the sub-threads are reaped.

	- unshare_sighand() and flush_signal_handlers() are moved from
	  begin_new_exec() to setup_new_exec(), because we can't call them
	  until all sub-threads go away.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
---
 fs/exec.c       | 140 +++++++++++++++++++++++-------------------------
 kernel/exit.c   |   9 ++--
 kernel/signal.c |   2 +-
 3 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 136a7ab5d91c..2bac7deb9a98 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -905,42 +905,56 @@ static int exec_mmap(struct mm_struct *mm)
 	return 0;
 }
 
-static int de_thread(struct task_struct *tsk)
+static int kill_sub_threads(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
-	struct sighand_struct *oldsighand = tsk->sighand;
-	spinlock_t *lock = &oldsighand->siglock;
-
-	if (thread_group_empty(tsk))
-		goto no_thread_group;
+	int err = -EINTR;
 
-	/*
-	 * Kill all other threads in the thread group.
-	 */
-	spin_lock_irq(lock);
-	if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
-		/*
-		 * Another group action in progress, just
-		 * return so that the signal is processed.
-		 */
-		spin_unlock_irq(lock);
-		return -EAGAIN;
+	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task)) {
+		sig->group_exec_task = tsk;
+		sig->notify_count = -zap_other_threads(tsk);
+		err = 0;
 	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+	read_unlock(&tasklist_lock);
 
-	sig->group_exec_task = tsk;
-	sig->notify_count = zap_other_threads(tsk);
-	if (!thread_group_leader(tsk))
-		sig->notify_count--;
+	return err;
+}
 
-	while (sig->notify_count) {
-		__set_current_state(TASK_KILLABLE);
-		spin_unlock_irq(lock);
-		schedule();
+static int wait_for_notify_count(struct task_struct *tsk)
+{
+	for (;;) {
 		if (__fatal_signal_pending(tsk))
-			goto killed;
-		spin_lock_irq(lock);
+			return -EINTR;
+		set_current_state(TASK_KILLABLE);
+		if (!tsk->signal->notify_count)
+			break;
+		schedule();
 	}
-	spin_unlock_irq(lock);
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static void clear_group_exec_task(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+
+	/* protects against exit_notify() and __exit_signal() */
+	read_lock(&tasklist_lock);
+	sig->group_exec_task = NULL;
+	sig->notify_count = 0;
+	read_unlock(&tasklist_lock);
+}
+
+static int de_thread(struct task_struct *tsk)
+{
+	if (thread_group_empty(tsk))
+		goto no_thread_group;
+
+	if (kill_sub_threads(tsk) || wait_for_notify_count(tsk))
+		return -EINTR;
 
 	/*
 	 * At this point all other threads have exited, all we have to
@@ -948,26 +962,10 @@ static int de_thread(struct task_struct *tsk)
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(tsk)) {
-		struct task_struct *leader = tsk->group_leader;
-
-		for (;;) {
-			cgroup_threadgroup_change_begin(tsk);
-			write_lock_irq(&tasklist_lock);
-			/*
-			 * Do this under tasklist_lock to ensure that
-			 * exit_notify() can't miss ->group_exec_task
-			 */
-			sig->notify_count = -1;
-			if (likely(leader->exit_state))
-				break;
-			__set_current_state(TASK_KILLABLE);
-			write_unlock_irq(&tasklist_lock);
-			cgroup_threadgroup_change_end(tsk);
-			schedule();
-			if (__fatal_signal_pending(tsk))
-				goto killed;
-		}
+		struct task_struct *leader = tsk->group_leader, *t;
 
+		cgroup_threadgroup_change_begin(tsk);
+		write_lock_irq(&tasklist_lock);
 		/*
 		 * The only record we have of the real-time age of a
 		 * process, regardless of execs it's done, is start_time.
@@ -1000,8 +998,8 @@ static int de_thread(struct task_struct *tsk)
 		list_replace_rcu(&leader->tasks, &tsk->tasks);
 		list_replace_init(&leader->sibling, &tsk->sibling);
 
-		tsk->group_leader = tsk;
-		leader->group_leader = tsk;
+		for_each_thread(tsk, t)
+			t->group_leader = tsk;
 
 		tsk->exit_signal = SIGCHLD;
 		leader->exit_signal = -1;
@@ -1021,23 +1019,11 @@ static int de_thread(struct task_struct *tsk)
 		release_task(leader);
 	}
 
-	sig->group_exec_task = NULL;
-	sig->notify_count = 0;
-
 no_thread_group:
 	/* we have changed execution domain */
 	tsk->exit_signal = SIGCHLD;
-
 	BUG_ON(!thread_group_leader(tsk));
 	return 0;
-
-killed:
-	/* protects against exit_notify() and __exit_signal() */
-	read_lock(&tasklist_lock);
-	sig->group_exec_task = NULL;
-	sig->notify_count = 0;
-	read_unlock(&tasklist_lock);
-	return -EAGAIN;
 }
 
 
@@ -1171,13 +1157,6 @@ int begin_new_exec(struct linux_binprm * bprm)
 	flush_itimer_signals();
 #endif
 
-	/*
-	 * Make the signal table private.
-	 */
-	retval = unshare_sighand(me);
-	if (retval)
-		goto out_unlock;
-
 	me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
 					PF_NOFREEZE | PF_NO_SETAFFINITY);
 	flush_thread();
@@ -1249,7 +1228,6 @@ int begin_new_exec(struct linux_binprm * bprm)
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 	WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
-	flush_signal_handlers(me, 0);
 
 	retval = set_cred_ucounts(bprm->cred);
 	if (retval < 0)
@@ -1293,8 +1271,9 @@ int begin_new_exec(struct linux_binprm * bprm)
 	up_write(&me->signal->exec_update_lock);
 	if (!bprm->cred)
 		mutex_unlock(&me->signal->cred_guard_mutex);
-
 out:
+	if (me->signal->group_exec_task == me)
+		clear_group_exec_task(me);
 	return retval;
 }
 EXPORT_SYMBOL(begin_new_exec);
@@ -1325,6 +1304,8 @@ int setup_new_exec(struct linux_binprm * bprm)
 {
 	/* Setup things that can depend upon the personality */
 	struct task_struct *me = current;
+	struct signal_struct *sig = me->signal;
+	int err = 0;
 
 	arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
 
@@ -1335,10 +1316,23 @@ int setup_new_exec(struct linux_binprm * bprm)
 	 * some architectures like powerpc
 	 */
 	me->mm->task_size = TASK_SIZE;
-	up_write(&me->signal->exec_update_lock);
-	mutex_unlock(&me->signal->cred_guard_mutex);
+	up_write(&sig->exec_update_lock);
+	mutex_unlock(&sig->cred_guard_mutex);
 
-	return 0;
+	if (sig->group_exec_task) {
+		spin_lock_irq(&me->sighand->siglock);
+		sig->notify_count = sig->nr_threads - 1;
+		spin_unlock_irq(&me->sighand->siglock);
+
+		err = wait_for_notify_count(me);
+		clear_group_exec_task(me);
+	}
+
+	if (!err)
+		err = unshare_sighand(me);
+	if (!err)
+		flush_signal_handlers(me, 0);
+	return err;
 }
 EXPORT_SYMBOL(setup_new_exec);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index f041f0c05ebb..bcde78c97253 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,10 +178,7 @@ static void __exit_signal(struct release_task_post *post, struct task_struct *ts
 		tty = sig->tty;
 		sig->tty = NULL;
 	} else {
-		/*
-		 * If there is any task waiting for the group exit
-		 * then notify it:
-		 */
+		/* mt-exec, setup_new_exec() -> wait_for_notify_count() */
 		if (sig->notify_count > 0 && !--sig->notify_count)
 			wake_up_process(sig->group_exec_task);
 
@@ -766,8 +763,8 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 		list_add(&tsk->ptrace_entry, &dead);
 	}
 
-	/* mt-exec, de_thread() is waiting for group leader */
-	if (unlikely(tsk->signal->notify_count < 0))
+	/* mt-exec, de_thread() -> wait_for_notify_count() */
+	if (tsk->signal->notify_count < 0 && !++tsk->signal->notify_count)
 		wake_up_process(tsk->signal->group_exec_task);
 	write_unlock_irq(&tasklist_lock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index fe9190d84f28..334212044940 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1343,13 +1343,13 @@ int zap_other_threads(struct task_struct *p)
 
 	for_other_threads(p, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-		count++;
 
 		/* Don't bother with already dead threads */
 		if (t->exit_state)
 			continue;
 		sigaddset(&t->pending.signal, SIGKILL);
 		signal_wake_up(t, 1);
+		count++;
 	}
 
 	return count;
-- 
2.25.1.362.g51ebf55



  parent reply	other threads:[~2025-11-09 17:16 UTC|newest]

Thread overview: 74+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-17 12:23 [PATCH v10] exec: Fix dead-lock in de_thread with ptrace_attach Bernd Edlinger
2021-07-11 17:43 ` [PATCH v11] " Bernd Edlinger
2023-10-30  5:20   ` [PATCH v12] " Bernd Edlinger
2023-10-30  9:00     ` kernel test robot
2023-11-06  6:41     ` [PATCH v13] " Bernd Edlinger
2024-01-15 19:22       ` [PATCH v14] " Bernd Edlinger
2024-01-15 19:37         ` Matthew Wilcox
2024-01-17  9:51           ` Bernd Edlinger
2024-01-16 15:22         ` Oleg Nesterov
2024-01-17 15:07           ` Bernd Edlinger
2024-01-17 16:38             ` Oleg Nesterov
2024-01-22 13:24               ` Bernd Edlinger
2024-01-22 13:44                 ` Oleg Nesterov
2024-01-22 21:30                 ` Kees Cook
2024-01-23 18:30                   ` Bernd Edlinger
2024-01-24  0:09                     ` Kees Cook
2024-01-22 18:31         ` [PATCH v15] " Bernd Edlinger
2025-08-18  6:04           ` Jain, Ayush
2025-08-18 20:53           ` [PATCH v16] " Bernd Edlinger
2025-08-19  4:36             ` Kees Cook
2025-08-19 18:53               ` Bernd Edlinger
2025-08-21 17:34             ` [PATCH v17] " Bernd Edlinger
2025-10-27  6:26               ` Bernd Edlinger
2025-10-27 12:06               ` Peter Zijlstra
2025-11-02 16:17               ` Oleg Nesterov
2025-11-05 14:32               ` Oleg Nesterov
2025-11-11  9:21                 ` Christian Brauner
2025-11-11 11:07                   ` Bernd Edlinger
2025-11-11 13:12                     ` Oleg Nesterov
2025-11-11 13:45                       ` Bernd Edlinger
2025-11-12  9:52                         ` Oleg Nesterov
2025-11-17  6:31                   ` Bernd Edlinger
2025-11-17 15:01                     ` Oleg Nesterov
2025-11-17 20:08                       ` Bernd Edlinger
2025-11-23 18:32                         ` Oleg Nesterov
2025-11-29 15:06                           ` Bernd Edlinger
2025-12-01 15:13                             ` Oleg Nesterov
2025-11-09 17:14               ` [RFC PATCH 0/3] mt-exec: fix deadlock with ptrace_attach() Oleg Nesterov
2025-11-09 17:14                 ` [RFC PATCH 1/3] exec: make setup_new_exec() return int Oleg Nesterov
2025-11-09 17:15                 ` Oleg Nesterov [this message]
2025-11-10 10:58                   ` [RFC PATCH 2/3] exec: don't wait for zombie threads with cred_guard_mutex held Cyrill Gorcunov
2025-11-10 15:09                     ` Oleg Nesterov
2025-11-10 21:49                       ` Cyrill Gorcunov
2025-11-11 14:09                         ` Oleg Nesterov
2025-11-09 17:16                 ` [RFC PATCH 3/3] ptrace: ensure PTRACE_EVENT_EXIT won't stop if the tracee is killed by exec Oleg Nesterov
2025-11-10  5:28                 ` [RFC PATCH 0/3] mt-exec: fix deadlock with ptrace_attach() Bernd Edlinger
2025-11-10 14:47                   ` Oleg Nesterov
2025-11-18 18:13               ` [PATCH v18] exec: Fix dead-lock in de_thread with ptrace_attach Bernd Edlinger
2025-11-20 15:15                 ` Eric W. Biederman
2025-11-20 17:29                   ` Eric W. Biederman
2025-11-20 20:57                     ` [RFC][PATCH] exec: Move cred computation under exec_update_lock Eric W. Biederman
2025-11-20 23:50                       ` Eric W. Biederman
2025-11-21  2:59                         ` Bernd Edlinger
2025-11-21  7:18                           ` Eric W. Biederman
2025-11-21  9:35                             ` Bernd Edlinger
2025-11-21 11:26                               ` Bernd Edlinger
2025-11-21 19:19                                 ` Eric W. Biederman
2025-11-21 23:06                                   ` Ryan Lee
2025-11-23 18:52                       ` Oleg Nesterov
2025-11-23 23:22                         ` Eric W. Biederman
2025-11-25 16:19                           ` Bernd Edlinger
2025-11-25 11:55                       ` Roberto Sassu
2025-12-01 16:06                         ` Are setuid shell scripts safe? (Implied by security_bprm_creds_for_exec) Eric W. Biederman
2025-12-01 16:49                           ` Roberto Sassu
2025-12-01 18:53                             ` Eric W. Biederman
2025-12-01 21:39                               ` David Laight
2025-12-03 13:16                               ` Bernd Edlinger
2025-12-04  5:49                                 ` Al Viro
2025-12-04  9:32                                   ` David Laight
2025-12-04 13:03                                   ` Bernd Edlinger
2025-12-09 12:28                                     ` Jan Kara
2025-12-04 15:43                           ` Stephen Smalley
2025-11-22 17:10                     ` [PATCH v18] exec: Fix dead-lock in de_thread with ptrace_attach Bernd Edlinger
2025-12-19  8:15                 ` [PATCH v19] " Bernd Edlinger

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aRDMNWx-69fL_gf-@redhat.com \
    --to=oleg@redhat.com \
    --cc=adobriyan@gmail.com \
    --cc=adrian.ratiu@collabora.com \
    --cc=akpm@linux-foundation.org \
    --cc=ardb@kernel.org \
    --cc=areber@redhat.com \
    --cc=ast@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=bernd.edlinger@hotmail.de \
    --cc=brauner@kernel.org \
    --cc=chao@kernel.org \
    --cc=cyphar@cyphar.com \
    --cc=david@redhat.com \
    --cc=dchinner@redhat.com \
    --cc=deller@gmx.de \
    --cc=dwindsor@gmail.com \
    --cc=ebiederm@xmission.com \
    --cc=edumazet@google.com \
    --cc=elena.reshetova@intel.com \
    --cc=frederic@kernel.org \
    --cc=gorcunov@gmail.com \
    --cc=ishkamiel@gmail.com \
    --cc=jack@suse.cz \
    --cc=jamorris@linux.microsoft.com \
    --cc=jlayton@kernel.org \
    --cc=joel@joelfernandes.org \
    --cc=kees@kernel.org \
    --cc=laoar.shao@gmail.com \
    --cc=ldv@strace.io \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-kselftest@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-security-module@vger.kernel.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=luto@amacapital.net \
    --cc=mcgrof@kernel.org \
    --cc=mhocko@suse.com \
    --cc=mingo@kernel.org \
    --cc=mjguzik@gmail.com \
    --cc=paul@paul-moore.com \
    --cc=pc@manguebit.com \
    --cc=peterz@infradead.org \
    --cc=rdunlap@infradead.org \
    --cc=senozhatsky@chromium.org \
    --cc=serge@hallyn.com \
    --cc=shr@devkernel.io \
    --cc=shuah@kernel.org \
    --cc=superman.xpt@gmail.com \
    --cc=surenb@google.com \
    --cc=tglx@linutronix.de \
    --cc=tiozhang@didiglobal.com \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=wad@chromium.org \
    --cc=willy@infradead.org \
    --cc=xu.xin16@zte.com.cn \
    --cc=yuehaibing@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.