* [PATCH v4 1/4] clone: add CLONE_AUTOREAP
2026-02-23 10:44 [PATCH v4 0/4] pidfd: add CLONE_AUTOREAP and CLONE_PIDFD_AUTOKILL Christian Brauner
@ 2026-02-23 10:44 ` Christian Brauner
2026-02-23 10:44 ` [PATCH v4 2/4] pidfd: add CLONE_PIDFD_AUTOKILL Christian Brauner
` (2 subsequent siblings)
3 siblings, 0 replies; 14+ messages in thread
From: Christian Brauner @ 2026-02-23 10:44 UTC (permalink / raw)
To: Oleg Nesterov, Jann Horn
Cc: Linus Torvalds, Ingo Molnar, Peter Zijlstra, linux-kernel,
linux-fsdevel, Christian Brauner
Add a new clone3() flag CLONE_AUTOREAP that makes a child process
auto-reap on exit without ever becoming a zombie. This is a per-process
property in contrast to the existing auto-reap mechanism via
SA_NOCLDWAIT or SIG_IGN for SIGCHLD which applies to all children of a
given parent.
Currently the only way to automatically reap children is to set
SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property
affecting all children which makes it unsuitable for libraries or
applications that need selective auto-reaping of specific children while
still being able to wait() on others.
CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct.
When the child exits do_notify_parent() checks this flag and causes
exit_notify() to transition the task directly to EXIT_DEAD. Since the
flag lives on the child it survives reparenting: if the original parent
exits and the child is reparented to a subreaper or init the child still
auto-reaps when it eventually exits.
CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to
monitor the child's exit via poll() and retrieve exit status via
PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget
pattern where the parent simply doesn't care about the child's exit
status. No exit signal is delivered so exit_signal must be zero.
CLONE_AUTOREAP is rejected in combination with CLONE_PARENT. If a
CLONE_AUTOREAP child were to clone(CLONE_PARENT) the new grandchild
would inherit exit_signal == 0 from the autoreap parent's group leader
but without signal->autoreap. This grandchild would become a zombie that
never sends a signal and is never autoreaped - confusing and arguably
broken behavior.
The flag is not inherited by the autoreap process's own children. Each
child that should be autoreaped must be explicitly created with
CLONE_AUTOREAP.
Link: https://github.com/uapi-group/kernel-features/issues/45
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
include/linux/sched/signal.h | 1 +
include/uapi/linux/sched.h | 1 +
kernel/fork.c | 14 +++++++++++++-
kernel/ptrace.c | 3 ++-
kernel/signal.c | 4 ++++
5 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index a22248aebcf9..f842c86b806f 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -132,6 +132,7 @@ struct signal_struct {
*/
unsigned int is_child_subreaper:1;
unsigned int has_child_subreaper:1;
+ unsigned int autoreap:1;
#ifdef CONFIG_POSIX_TIMERS
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 359a14cc76a4..8a22ea640817 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,6 +36,7 @@
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#define CLONE_AUTOREAP 0x400000000ULL /* Auto-reap child on exit. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
diff --git a/kernel/fork.c b/kernel/fork.c
index e832da9d15a4..0dedf2999f0c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2028,6 +2028,15 @@ __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_AUTOREAP) {
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_PARENT)
+ return ERR_PTR(-EINVAL);
+ if (args->exit_signal)
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -2435,6 +2444,8 @@ __latent_entropy struct task_struct *copy_process(
*/
p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
p->real_parent->signal->is_child_subreaper;
+ if (clone_flags & CLONE_AUTOREAP)
+ p->signal->autoreap = 1;
list_add_tail(&p->sibling, &p->real_parent->children);
list_add_tail_rcu(&p->tasks, &init_task.tasks);
attach_pid(p, PIDTYPE_TGID);
@@ -2897,7 +2908,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
- ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
+ CLONE_AUTOREAP))
return false;
/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 392ec2f75f01..68c17daef8d4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
if (!dead && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, tracer))
dead = do_notify_parent(p, p->exit_signal);
- else if (ignoring_children(tracer->sighand)) {
+ else if (ignoring_children(tracer->sighand) ||
+ p->signal->autoreap) {
__wake_up_parent(p, tracer);
dead = true;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index d65d0fe24bfb..e61f39fa8c8a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2251,6 +2251,10 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
sig = 0;
}
+ if (!tsk->ptrace && tsk->signal->autoreap) {
+ autoreap = true;
+ sig = 0;
+ }
/*
* Send with __send_signal as si_pid and si_uid are in the
* parent's namespaces.
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* [PATCH v4 2/4] pidfd: add CLONE_PIDFD_AUTOKILL
2026-02-23 10:44 [PATCH v4 0/4] pidfd: add CLONE_AUTOREAP and CLONE_PIDFD_AUTOKILL Christian Brauner
2026-02-23 10:44 ` [PATCH v4 1/4] clone: add CLONE_AUTOREAP Christian Brauner
@ 2026-02-23 10:44 ` Christian Brauner
2026-02-23 15:47 ` Oleg Nesterov
2026-02-23 10:45 ` [PATCH v4 3/4] selftests/pidfd: add CLONE_AUTOREAP tests Christian Brauner
2026-02-23 10:45 ` [PATCH v4 4/4] selftests/pidfd: add CLONE_PIDFD_AUTOKILL tests Christian Brauner
3 siblings, 1 reply; 14+ messages in thread
From: Christian Brauner @ 2026-02-23 10:44 UTC (permalink / raw)
To: Oleg Nesterov, Jann Horn
Cc: Linus Torvalds, Ingo Molnar, Peter Zijlstra, linux-kernel,
linux-fsdevel, Christian Brauner
Add a new clone3() flag CLONE_PIDFD_AUTOKILL that ties a child's
lifetime to the pidfd returned from clone3(). When the last reference to
the struct file created by clone3() is closed the kernel sends SIGKILL
to the child. A pidfd obtained via pidfd_open() for the same process
does not keep the child alive and does not trigger autokill - only the
specific struct file from clone3() has this property.
This is useful for container runtimes, service managers, and sandboxed
subprocess execution - any scenario where the child must die if the
parent crashes or abandons the pidfd.
CLONE_PIDFD_AUTOKILL requires both CLONE_PIDFD (the whole point is tying
lifetime to the pidfd file) and CLONE_AUTOREAP (a killed child with no
one to reap it would become a zombie). CLONE_THREAD is rejected because
autokill targets a process not a thread.
The clone3 pidfd is identified by the PIDFD_AUTOKILL file flag set on
the struct file at clone3() time. The pidfs .release handler checks this
flag and sends SIGKILL via do_send_sig_info(SIGKILL, SEND_SIG_PRIV, ...)
only when it is set. Files from pidfd_open() or open_by_handle_at() are
distinct struct files that do not carry this flag. dup()/fork() share the
same struct file so they extend the child's lifetime until the last
reference drops.
CLONE_PIDFD_AUTOKILL automatically sets no_new_privs on the child
process. This ensures the child cannot escalate privileges beyond the
parent's credential level via setuid/setgid exec. Because the child can
never outprivilege the parent the autokill SIGKILL is always within the
parent's natural authority.
This is a deliberate departure from the pdeath_signal model, which is
reset during secureexec and commit_creds(), rendering it useless for
container runtimes that need to deprivilege themselves. Setting
no_new_privs on the child avoids the need for any such magical resets:
the kill-on-close contract is absolute.
The no_new_privs restriction only affects the child. The parent retains
its full privileges and can continue to execute setuid binaries.
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
fs/pidfs.c | 38 ++++++++++++++++++++++++++++++++------
include/uapi/linux/pidfd.h | 1 +
include/uapi/linux/sched.h | 1 +
kernel/fork.c | 22 +++++++++++++++++++---
4 files changed, 53 insertions(+), 9 deletions(-)
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 318253344b5c..a8d1bca0395d 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -8,6 +8,8 @@
#include <linux/mount.h>
#include <linux/pid.h>
#include <linux/pidfs.h>
+#include <linux/sched/signal.h>
+#include <linux/signal.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
@@ -637,7 +639,28 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
return open_namespace(ns_common);
}
+static int pidfs_file_release(struct inode *inode, struct file *file)
+{
+ struct pid *pid = inode->i_private;
+ struct task_struct *task;
+
+ if (!(file->f_flags & PIDFD_AUTOKILL))
+ return 0;
+
+ guard(rcu)();
+ task = pid_task(pid, PIDTYPE_TGID);
+ if (!task)
+ return 0;
+
+ /* Not available for kthreads or user workers for now. */
+ if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER)))
+ return 0;
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID);
+ return 0;
+}
+
static const struct file_operations pidfs_file_operations = {
+ .release = pidfs_file_release,
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo,
@@ -1093,11 +1116,11 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
int ret;
/*
- * Ensure that PIDFD_STALE can be passed as a flag without
- * overloading other uapi pidfd flags.
+ * Ensure that internal pidfd flags don't overlap with each
+ * other or with uapi pidfd flags.
*/
- BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
- BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
+ BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK |
+ PIDFD_STALE | PIDFD_AUTOKILL) != 4);
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
if (ret < 0)
@@ -1108,9 +1131,12 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
flags &= ~PIDFD_STALE;
flags |= O_RDWR;
pidfd_file = dentry_open(&path, flags, current_cred());
- /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
+ /*
+ * Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as
+ * do_dentry_open() strips O_EXCL and O_TRUNC.
+ */
if (!IS_ERR(pidfd_file))
- pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+ pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL));
return pidfd_file;
}
diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h
index ea9a6811fc76..9281956a9f32 100644
--- a/include/uapi/linux/pidfd.h
+++ b/include/uapi/linux/pidfd.h
@@ -13,6 +13,7 @@
#ifdef __KERNEL__
#include <linux/sched.h>
#define PIDFD_STALE CLONE_PIDFD
+#define PIDFD_AUTOKILL O_TRUNC
#endif
/* Flags for pidfd_send_signal(). */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 8a22ea640817..b1aea8a86e2f 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -37,6 +37,7 @@
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
#define CLONE_AUTOREAP 0x400000000ULL /* Auto-reap child on exit. */
+#define CLONE_PIDFD_AUTOKILL 0x800000000ULL /* Kill child when clone pidfd closes. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
diff --git a/kernel/fork.c b/kernel/fork.c
index 0dedf2999f0c..778aed24e01d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2037,6 +2037,15 @@ __latent_entropy struct task_struct *copy_process(
return ERR_PTR(-EINVAL);
}
+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ if (!(clone_flags & CLONE_PIDFD))
+ return ERR_PTR(-EINVAL);
+ if (!(clone_flags & CLONE_AUTOREAP))
+ return ERR_PTR(-EINVAL);
+ if (clone_flags & CLONE_THREAD)
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Force any signals received before this point to be delivered
* before the fork happens. Collect up signals sent to multiple
@@ -2259,13 +2268,20 @@ __latent_entropy struct task_struct *copy_process(
* if the fd table isn't shared).
*/
if (clone_flags & CLONE_PIDFD) {
- int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
+ unsigned flags = PIDFD_STALE;
+
+ if (clone_flags & CLONE_THREAD)
+ flags |= PIDFD_THREAD;
+ if (clone_flags & CLONE_PIDFD_AUTOKILL) {
+ task_set_no_new_privs(p);
+ flags |= PIDFD_AUTOKILL;
+ }
/*
* Note that no task has been attached to @pid yet indicate
* that via CLONE_PIDFD.
*/
- retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
+ retval = pidfd_prepare(pid, flags, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
@@ -2909,7 +2925,7 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP |
- CLONE_AUTOREAP))
+ CLONE_AUTOREAP | CLONE_PIDFD_AUTOKILL))
return false;
/*
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH v4 2/4] pidfd: add CLONE_PIDFD_AUTOKILL
2026-02-23 10:44 ` [PATCH v4 2/4] pidfd: add CLONE_PIDFD_AUTOKILL Christian Brauner
@ 2026-02-23 15:47 ` Oleg Nesterov
2026-02-23 15:51 ` Oleg Nesterov
0 siblings, 1 reply; 14+ messages in thread
From: Oleg Nesterov @ 2026-02-23 15:47 UTC (permalink / raw)
To: Christian Brauner
Cc: Jann Horn, Linus Torvalds, Ingo Molnar, Peter Zijlstra,
linux-kernel, linux-fsdevel
On 02/23, Christian Brauner wrote:
>
> @@ -2259,13 +2268,20 @@ __latent_entropy struct task_struct *copy_process(
> * if the fd table isn't shared).
> */
> if (clone_flags & CLONE_PIDFD) {
> - int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
> + unsigned flags = PIDFD_STALE;
> +
> + if (clone_flags & CLONE_THREAD)
> + flags |= PIDFD_THREAD;
> + if (clone_flags & CLONE_PIDFD_AUTOKILL) {
> + task_set_no_new_privs(p);
> + flags |= PIDFD_AUTOKILL;
> + }
>
> /*
> * Note that no task has been attached to @pid yet indicate
> * that via CLONE_PIDFD.
> */
> - retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
> + retval = pidfd_prepare(pid, flags, &pidfile);
Confused... I think you also need to change pidfs_alloc_file() to restore
O_TRUNC after do_dentry_open() clears this flag? Just like it currently does
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
?
Oleg.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH v4 2/4] pidfd: add CLONE_PIDFD_AUTOKILL
2026-02-23 15:47 ` Oleg Nesterov
@ 2026-02-23 15:51 ` Oleg Nesterov
2026-02-23 17:05 ` pidfd && O_RDWR Oleg Nesterov
0 siblings, 1 reply; 14+ messages in thread
From: Oleg Nesterov @ 2026-02-23 15:51 UTC (permalink / raw)
To: Christian Brauner
Cc: Jann Horn, Linus Torvalds, Ingo Molnar, Peter Zijlstra,
linux-kernel, linux-fsdevel
On 02/23, Oleg Nesterov wrote:
>
> On 02/23, Christian Brauner wrote:
> >
> > @@ -2259,13 +2268,20 @@ __latent_entropy struct task_struct *copy_process(
> > * if the fd table isn't shared).
> > */
> > if (clone_flags & CLONE_PIDFD) {
> > - int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
> > + unsigned flags = PIDFD_STALE;
> > +
> > + if (clone_flags & CLONE_THREAD)
> > + flags |= PIDFD_THREAD;
> > + if (clone_flags & CLONE_PIDFD_AUTOKILL) {
> > + task_set_no_new_privs(p);
> > + flags |= PIDFD_AUTOKILL;
> > + }
> >
> > /*
> > * Note that no task has been attached to @pid yet indicate
> > * that via CLONE_PIDFD.
> > */
> > - retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile);
> > + retval = pidfd_prepare(pid, flags, &pidfile);
>
> Confused... I think you also need to change pidfs_alloc_file() to restore
> O_TRUNC after do_dentry_open() clears this flag? Just like it currently does
>
> pidfd_file->f_flags |= (flags & PIDFD_THREAD);
Aah! please ignore me. Somehow I missed exactly this change in your patch.
Sorry for noise!
Oleg.
^ permalink raw reply [flat|nested] 14+ messages in thread
* pidfd && O_RDWR
2026-02-23 15:51 ` Oleg Nesterov
@ 2026-02-23 17:05 ` Oleg Nesterov
2026-02-23 18:14 ` David Laight
2026-02-23 19:21 ` Oleg Nesterov
0 siblings, 2 replies; 14+ messages in thread
From: Oleg Nesterov @ 2026-02-23 17:05 UTC (permalink / raw)
To: Christian Brauner
Cc: Jann Horn, Linus Torvalds, Ingo Molnar, Peter Zijlstra,
linux-kernel, linux-fsdevel
On 02/23, Oleg Nesterov wrote:
>
> Sorry for noise!
Yes, but let me add more (off-topic) noise to this thread...
pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
makes no sense because pidfs_alloc_file() itself does
flags |= O_RDWR;
I was going to send the trivial cleanup, but why a pidfs file needs
O_RDWR/FMODE_WRITE ?
Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
users, for example signalfd.c.
Can you explain just for my education?
Oleg.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: pidfd && O_RDWR
2026-02-23 17:05 ` pidfd && O_RDWR Oleg Nesterov
@ 2026-02-23 18:14 ` David Laight
2026-02-23 19:21 ` Oleg Nesterov
1 sibling, 0 replies; 14+ messages in thread
From: David Laight @ 2026-02-23 18:14 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Christian Brauner, Jann Horn, Linus Torvalds, Ingo Molnar,
Peter Zijlstra, linux-kernel, linux-fsdevel
On Mon, 23 Feb 2026 18:05:44 +0100
Oleg Nesterov <oleg@redhat.com> wrote:
> On 02/23, Oleg Nesterov wrote:
> >
> > Sorry for noise!
>
> Yes, but let me add more (off-topic) noise to this thread...
>
> pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> makes no sense because pidfs_alloc_file() itself does
>
> flags |= O_RDWR;
>
> I was going to send the trivial cleanup, but why a pidfs file needs
> O_RDWR/FMODE_WRITE ?
Or why any program that gets that far through the code 'wins'
write access.
David
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: pidfd && O_RDWR
2026-02-23 17:05 ` pidfd && O_RDWR Oleg Nesterov
2026-02-23 18:14 ` David Laight
@ 2026-02-23 19:21 ` Oleg Nesterov
2026-02-23 21:39 ` Christian Brauner
1 sibling, 1 reply; 14+ messages in thread
From: Oleg Nesterov @ 2026-02-23 19:21 UTC (permalink / raw)
To: Christian Brauner
Cc: Jann Horn, Linus Torvalds, Ingo Molnar, Peter Zijlstra,
linux-kernel, linux-fsdevel
On 02/23, Oleg Nesterov wrote:
>
> pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> makes no sense because pidfs_alloc_file() itself does
>
> flags |= O_RDWR;
>
> I was going to send the trivial cleanup, but why a pidfs file needs
> O_RDWR/FMODE_WRITE ?
>
> Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> users, for example signalfd.c.
perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
files read-only") ?
Oleg.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: pidfd && O_RDWR
2026-02-23 19:21 ` Oleg Nesterov
@ 2026-02-23 21:39 ` Christian Brauner
2026-02-24 9:43 ` David Laight
2026-02-24 10:17 ` Oleg Nesterov
0 siblings, 2 replies; 14+ messages in thread
From: Christian Brauner @ 2026-02-23 21:39 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Jann Horn, Linus Torvalds, Ingo Molnar, Peter Zijlstra,
linux-kernel, linux-fsdevel
On Mon, Feb 23, 2026 at 08:21:02PM +0100, Oleg Nesterov wrote:
> On 02/23, Oleg Nesterov wrote:
> >
> > pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> > makes no sense because pidfs_alloc_file() itself does
> >
> > flags |= O_RDWR;
> >
> > I was going to send the trivial cleanup, but why a pidfs file needs
> > O_RDWR/FMODE_WRITE ?
> >
> > Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> > users, for example signalfd.c.
>
> perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
> files read-only") ?
It was always a possibility that we would support some form of
write-like operation eventually. And we have support for setting trusted
extended attributes on pidfds for some time now (trusted xattrs require
global cap_sys_admin).
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: pidfd && O_RDWR
2026-02-23 21:39 ` Christian Brauner
@ 2026-02-24 9:43 ` David Laight
2026-02-24 10:17 ` Oleg Nesterov
1 sibling, 0 replies; 14+ messages in thread
From: David Laight @ 2026-02-24 9:43 UTC (permalink / raw)
To: Christian Brauner
Cc: Oleg Nesterov, Jann Horn, Linus Torvalds, Ingo Molnar,
Peter Zijlstra, linux-kernel, linux-fsdevel
On Mon, 23 Feb 2026 22:39:22 +0100
Christian Brauner <brauner@kernel.org> wrote:
> On Mon, Feb 23, 2026 at 08:21:02PM +0100, Oleg Nesterov wrote:
> > On 02/23, Oleg Nesterov wrote:
> > >
> > > pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> > > makes no sense because pidfs_alloc_file() itself does
> > >
> > > flags |= O_RDWR;
> > >
> > > I was going to send the trivial cleanup, but why a pidfs file needs
> > > O_RDWR/FMODE_WRITE ?
> > >
> > > Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> > > users, for example signalfd.c.
> >
> > perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
> > files read-only") ?
>
> It was always a possibility that we would support some form of
> write-like operation eventually. And we have support for setting trusted
> extended attributes on pidfds for some time now (trusted xattrs require
> global cap_sys_admin).
>
Isn't 'sending a signal' a write-like operation?
David
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: pidfd && O_RDWR
2026-02-23 21:39 ` Christian Brauner
2026-02-24 9:43 ` David Laight
@ 2026-02-24 10:17 ` Oleg Nesterov
2026-02-24 16:47 ` Christian Brauner
1 sibling, 1 reply; 14+ messages in thread
From: Oleg Nesterov @ 2026-02-24 10:17 UTC (permalink / raw)
To: Christian Brauner
Cc: Jann Horn, Linus Torvalds, Ingo Molnar, Peter Zijlstra,
linux-kernel, linux-fsdevel
On 02/23, Christian Brauner wrote:
>
> On Mon, Feb 23, 2026 at 08:21:02PM +0100, Oleg Nesterov wrote:
> > On 02/23, Oleg Nesterov wrote:
> > >
> > > pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> > > makes no sense because pidfs_alloc_file() itself does
> > >
> > > flags |= O_RDWR;
> > >
> > > I was going to send the trivial cleanup, but why a pidfs file needs
> > > O_RDWR/FMODE_WRITE ?
> > >
> > > Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> > > users, for example signalfd.c.
> >
> > perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
> > files read-only") ?
>
> It was always a possibility that we would support some form of
> write-like operation eventually. And we have support for setting trusted
> extended attributes on pidfds for some time now (trusted xattrs require
> global cap_sys_admin).
But why do we need O_RDWR right now? That was my question.
I can be easily wrong, but I think that pidfs_xattr_handlers logic doesn't
need it...
OK, I won't pretend I understand fs, I'll send the trivial cleanup which just
removes the unnecessary "flags | O_RDWR" in pidfd_prepare().
Oleg.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: pidfd && O_RDWR
2026-02-24 10:17 ` Oleg Nesterov
@ 2026-02-24 16:47 ` Christian Brauner
0 siblings, 0 replies; 14+ messages in thread
From: Christian Brauner @ 2026-02-24 16:47 UTC (permalink / raw)
To: Oleg Nesterov
Cc: Jann Horn, Linus Torvalds, Ingo Molnar, Peter Zijlstra,
linux-kernel, linux-fsdevel
On Tue, Feb 24, 2026 at 11:17:43AM +0100, Oleg Nesterov wrote:
> On 02/23, Christian Brauner wrote:
> >
> > On Mon, Feb 23, 2026 at 08:21:02PM +0100, Oleg Nesterov wrote:
> > > On 02/23, Oleg Nesterov wrote:
> > > >
> > > > pidfd_prepare() does pidfs_alloc_file(pid, flags | O_RDWR) and "| O_RDWR"
> > > > makes no sense because pidfs_alloc_file() itself does
> > > >
> > > > flags |= O_RDWR;
> > > >
> > > > I was going to send the trivial cleanup, but why a pidfs file needs
> > > > O_RDWR/FMODE_WRITE ?
> > > >
> > > > Actually the same question about some anon_inode_getfile_fmode(O_RDWR)
> > > > users, for example signalfd.c.
> > >
> > > perhaps an accidental legacy from 628ff7c1d8d8 ("anonfd: Allow making anon
> > > files read-only") ?
> >
> > It was always a possibility that we would support some form of
> > write-like operation eventually. And we have support for setting trusted
> > extended attributes on pidfds for some time now (trusted xattrs require
> > global cap_sys_admin).
>
> But why do we need O_RDWR right now? That was my question.
>
> I can be easily wrong, but I think that pidfs_xattr_handlers logic doesn't
> need it...
>
> OK, I won't pretend I understand fs, I'll send the trivial cleanup which just
> removes the unnecessary "flags | O_RDWR" in pidfd_prepare().
xattrs don't need FMODE_WRITE. You can use O_RDONLY fds with the
justification that it's metadata (most likely). Although I always found
that rather weird. Sending signals is technically also equivalent to
writing and I think that was the original reason this was done. If you
want to remove it then be my guest.
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH v4 3/4] selftests/pidfd: add CLONE_AUTOREAP tests
2026-02-23 10:44 [PATCH v4 0/4] pidfd: add CLONE_AUTOREAP and CLONE_PIDFD_AUTOKILL Christian Brauner
2026-02-23 10:44 ` [PATCH v4 1/4] clone: add CLONE_AUTOREAP Christian Brauner
2026-02-23 10:44 ` [PATCH v4 2/4] pidfd: add CLONE_PIDFD_AUTOKILL Christian Brauner
@ 2026-02-23 10:45 ` Christian Brauner
2026-02-23 10:45 ` [PATCH v4 4/4] selftests/pidfd: add CLONE_PIDFD_AUTOKILL tests Christian Brauner
3 siblings, 0 replies; 14+ messages in thread
From: Christian Brauner @ 2026-02-23 10:45 UTC (permalink / raw)
To: Oleg Nesterov, Jann Horn
Cc: Linus Torvalds, Ingo Molnar, Peter Zijlstra, linux-kernel,
linux-fsdevel, Christian Brauner
Add tests for the new CLONE_AUTOREAP clone3() flag:
- autoreap_without_pidfd: CLONE_AUTOREAP without CLONE_PIDFD works
(fire-and-forget)
- autoreap_rejects_exit_signal: CLONE_AUTOREAP with non-zero
exit_signal fails
- autoreap_rejects_parent: CLONE_AUTOREAP with CLONE_PARENT fails
- autoreap_rejects_thread: CLONE_AUTOREAP with CLONE_THREAD fails
- autoreap_basic: child exits, pidfd poll works, PIDFD_GET_INFO returns
correct exit code, waitpid() fails with ECHILD
- autoreap_signaled: child killed by signal, exit info correct via pidfd
- autoreap_reparent: autoreap grandchild reparented to subreaper still
auto-reaps
- autoreap_multithreaded: autoreap process with sub-threads auto-reaps
after last thread exits
- autoreap_no_inherit: grandchild forked without CLONE_AUTOREAP becomes
a regular zombie
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
tools/testing/selftests/pidfd/.gitignore | 1 +
tools/testing/selftests/pidfd/Makefile | 2 +-
.../testing/selftests/pidfd/pidfd_autoreap_test.c | 507 +++++++++++++++++++++
3 files changed, 509 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore
index 144e7ff65d6a..4cd8ec7fd349 100644
--- a/tools/testing/selftests/pidfd/.gitignore
+++ b/tools/testing/selftests/pidfd/.gitignore
@@ -12,3 +12,4 @@ pidfd_info_test
pidfd_exec_helper
pidfd_xattr_test
pidfd_setattr_test
+pidfd_autoreap_test
diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile
index 764a8f9ecefa..4211f91e9af8 100644
--- a/tools/testing/selftests/pidfd/Makefile
+++ b/tools/testing/selftests/pidfd/Makefile
@@ -4,7 +4,7 @@ CFLAGS += -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) -pthread -Wall
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \
pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test \
pidfd_file_handle_test pidfd_bind_mount pidfd_info_test \
- pidfd_xattr_test pidfd_setattr_test
+ pidfd_xattr_test pidfd_setattr_test pidfd_autoreap_test
TEST_GEN_PROGS_EXTENDED := pidfd_exec_helper
diff --git a/tools/testing/selftests/pidfd/pidfd_autoreap_test.c b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c
new file mode 100644
index 000000000000..9e52a16239ea
--- /dev/null
+++ b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c
@@ -0,0 +1,507 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2026 Christian Brauner <brauner@kernel.org>
+
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <poll.h>
+#include <pthread.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "pidfd.h"
+#include "kselftest_harness.h"
+
+#ifndef CLONE_AUTOREAP
+#define CLONE_AUTOREAP 0x400000000ULL
+#endif
+
+/*
+ * Helper: clone3() a child with CLONE_PIDFD | CLONE_AUTOREAP and no exit
+ * signal. The pidfd for the new child is requested into *pidfd.
+ * Returns the result of sys_clone3() unchanged; callers in this file
+ * treat 0 as "running in the child" and a negative value as failure.
+ */
+static pid_t create_autoreap_child(int *pidfd)
+{
+	/* Start from an all-zero argument block, then fill in what we use. */
+	struct __clone_args args = {};
+
+	args.flags = CLONE_PIDFD | CLONE_AUTOREAP;
+	args.exit_signal = 0;
+	args.pidfd = ptr_to_u64(pidfd);
+
+	return sys_clone3(&args, sizeof(args));
+}
+
+/*
+ * Test that CLONE_AUTOREAP works without CLONE_PIDFD (fire-and-forget).
+ *
+ * There is no pidfd to poll on, so the parent polls waitpid() instead.
+ * The child is created with exit_signal == 0, which makes it a "clone"
+ * child that waitpid() only considers when __WALL (or __WCLONE) is
+ * passed. With __WALL | WNOHANG the call returns 0 while the child is
+ * still around, the child's PID if it was left behind as a zombie, and
+ * -1/ECHILD once it has been autoreaped.
+ */
+TEST(autoreap_without_pidfd)
+{
+	struct __clone_args args = {
+		.flags = CLONE_AUTOREAP,
+		.exit_signal = 0,
+	};
+	pid_t pid;
+	int ret = 0;
+	int i;
+
+	pid = sys_clone3(&args, sizeof(args));
+	if (pid < 0 && errno == EINVAL)
+		SKIP(return, "CLONE_AUTOREAP not supported");
+	ASSERT_GE(pid, 0);
+
+	/* Child: nothing to do, exit right away. */
+	if (pid == 0)
+		_exit(0);
+
+	/*
+	 * Wait up to ~5 seconds (500 * 10ms) for the child to go away
+	 * instead of sleeping a fixed amount and hoping it has exited.
+	 * Nothing is called between the final waitpid() and the errno
+	 * check below, so errno is still the one waitpid() set.
+	 */
+	for (i = 0; i < 500; i++) {
+		ret = waitpid(pid, NULL, __WALL | WNOHANG);
+		if (ret != 0)
+			break;
+		usleep(10000);
+	}
+
+	/* Autoreaped: not a zombie (ret == pid), not still alive (ret == 0). */
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ECHILD);
+}
+
+/*
+ * CLONE_AUTOREAP combined with a non-zero exit_signal (SIGCHLD here)
+ * must be refused with EINVAL.
+ */
+TEST(autoreap_rejects_exit_signal)
+{
+	int pidfd = -1;
+	struct __clone_args args = {
+		.flags = CLONE_PIDFD | CLONE_AUTOREAP,
+		.exit_signal = SIGCHLD,
+		.pidfd = ptr_to_u64(&pidfd),
+	};
+	pid_t pid = sys_clone3(&args, sizeof(args));
+
+	/* The call must fail outright; no child may have been created. */
+	ASSERT_EQ(pid, -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * CLONE_AUTOREAP combined with CLONE_PARENT must be refused with EINVAL.
+ */
+TEST(autoreap_rejects_parent)
+{
+	int pidfd = -1;
+	struct __clone_args args = {
+		.flags = CLONE_PIDFD | CLONE_AUTOREAP | CLONE_PARENT,
+		.exit_signal = 0,
+		.pidfd = ptr_to_u64(&pidfd),
+	};
+	pid_t pid = sys_clone3(&args, sizeof(args));
+
+	/* The call must fail outright; no child may have been created. */
+	ASSERT_EQ(pid, -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * CLONE_AUTOREAP combined with CLONE_THREAD must be refused with EINVAL.
+ * CLONE_SIGHAND and CLONE_VM are passed along with CLONE_THREAD.
+ */
+TEST(autoreap_rejects_thread)
+{
+	int pidfd = -1;
+	struct __clone_args args = {};
+	pid_t pid;
+
+	args.flags = CLONE_PIDFD | CLONE_AUTOREAP |
+		     CLONE_THREAD | CLONE_SIGHAND | CLONE_VM;
+	args.exit_signal = 0;
+	args.pidfd = ptr_to_u64(&pidfd);
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_EQ(pid, -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * Basic test: create an autoreap child, let it exit, verify:
+ * - pidfd becomes readable (poll returns POLLIN)
+ * - PIDFD_GET_INFO returns the correct exit code
+ * - waitpid() returns -1/ECHILD (no zombie)
+ */
+TEST(autoreap_basic)
+{
+	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
+	int pidfd = -1, ret;
+	struct pollfd pfd;
+	pid_t pid;
+
+	pid = create_autoreap_child(&pidfd);
+	if (pid < 0 && errno == EINVAL)
+		SKIP(return, "CLONE_AUTOREAP not supported");
+	ASSERT_GE(pid, 0);
+
+	/* Child: exit with a recognisable status for the parent to check. */
+	if (pid == 0)
+		_exit(42);
+
+	ASSERT_GE(pidfd, 0);
+
+	/* Wait for the child to exit via pidfd poll. */
+	pfd.fd = pidfd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/* Verify exit info via PIDFD_GET_INFO. */
+	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
+	ASSERT_EQ(ret, 0);
+	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
+	/*
+	 * exit_code is in waitpid format: for _exit(42),
+	 * WIFEXITED is true and WEXITSTATUS is 42.
+	 */
+	ASSERT_TRUE(WIFEXITED(info.exit_code));
+	ASSERT_EQ(WEXITSTATUS(info.exit_code), 42);
+
+	/*
+	 * Verify no zombie: waitpid should fail with ECHILD.
+	 *
+	 * The child was created with exit_signal == 0, which makes it a
+	 * "clone" child. Plain waitpid() skips such children and reports
+	 * ECHILD even if a zombie is sitting there, so __WALL is needed
+	 * for this check to mean anything.
+	 */
+	ret = waitpid(pid, NULL, __WALL | WNOHANG);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ECHILD);
+
+	close(pidfd);
+}
+
+/*
+ * Test that an autoreap child killed by a signal reports
+ * the correct exit info: PIDFD_GET_INFO must show termination by
+ * SIGKILL, and no zombie may be left behind.
+ */
+TEST(autoreap_signaled)
+{
+	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
+	int pidfd = -1, ret;
+	struct pollfd pfd;
+	pid_t pid;
+
+	pid = create_autoreap_child(&pidfd);
+	if (pid < 0 && errno == EINVAL)
+		SKIP(return, "CLONE_AUTOREAP not supported");
+	ASSERT_GE(pid, 0);
+
+	/* Child: block until killed; the _exit(1) is only a fallback. */
+	if (pid == 0) {
+		pause();
+		_exit(1);
+	}
+
+	ASSERT_GE(pidfd, 0);
+
+	/* Kill the child. */
+	ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Wait for exit via pidfd. */
+	pfd.fd = pidfd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/* Verify signal info. */
+	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
+	ASSERT_EQ(ret, 0);
+	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
+	ASSERT_TRUE(WIFSIGNALED(info.exit_code));
+	ASSERT_EQ(WTERMSIG(info.exit_code), SIGKILL);
+
+	/*
+	 * No zombie. The child has exit_signal == 0 and is therefore a
+	 * "clone" child that plain waitpid() never looks at; pass __WALL
+	 * so that a leftover zombie would actually be found.
+	 */
+	ret = waitpid(pid, NULL, __WALL | WNOHANG);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ECHILD);
+
+	close(pidfd);
+}
+
+/*
+ * Test autoreap survives reparenting: middle process creates an
+ * autoreap grandchild, then exits. The grandchild gets reparented
+ * to us (the grandparent, which is a subreaper). When the grandchild
+ * exits, it should still be autoreaped - no zombie under us.
+ *
+ * The middle child reports back over a socketpair: the grandchild's
+ * PID as decimal text on success, "E" if clone3() failed with EINVAL
+ * (flag not supported, the test is skipped), or "F" if clone3() failed
+ * for any other reason (the test fails).
+ */
+TEST(autoreap_reparent)
+{
+	int ipc_sockets[2], ret;
+	int pidfd = -1;
+	struct pollfd pfd;
+	pid_t mid_pid, grandchild_pid;
+	char buf[32] = {};
+
+	/* Make ourselves a subreaper so reparented children come to us. */
+	ret = prctl(PR_SET_CHILD_SUBREAPER, 1);
+	ASSERT_EQ(ret, 0);
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	mid_pid = fork();
+	ASSERT_GE(mid_pid, 0);
+
+	if (mid_pid == 0) {
+		/* Middle child: create an autoreap grandchild. */
+		int gc_pidfd = -1;
+
+		close(ipc_sockets[0]);
+
+		grandchild_pid = create_autoreap_child(&gc_pidfd);
+		if (grandchild_pid < 0) {
+			/*
+			 * Only EINVAL means the flag is unknown. Any other
+			 * error is a real failure and must not be turned
+			 * into a skip. errno is inspected before any other
+			 * call can overwrite it.
+			 */
+			write_nointr(ipc_sockets[1],
+				     errno == EINVAL ? "E" : "F", 1);
+			close(ipc_sockets[1]);
+			_exit(1);
+		}
+
+		if (grandchild_pid == 0) {
+			/* Grandchild: wait for signal to exit. */
+			close(ipc_sockets[1]);
+			if (gc_pidfd >= 0)
+				close(gc_pidfd);
+			pause();
+			_exit(0);
+		}
+
+		/* Send grandchild PID to grandparent. */
+		snprintf(buf, sizeof(buf), "%d", grandchild_pid);
+		write_nointr(ipc_sockets[1], buf, strlen(buf));
+		close(ipc_sockets[1]);
+		if (gc_pidfd >= 0)
+			close(gc_pidfd);
+
+		/* Middle child exits, grandchild gets reparented. */
+		_exit(0);
+	}
+
+	close(ipc_sockets[1]);
+
+	/*
+	 * Read the middle child's report. buf was zero-initialised and at
+	 * most sizeof(buf) - 1 bytes are read, so it stays NUL-terminated.
+	 */
+	ret = read_nointr(ipc_sockets[0], buf, sizeof(buf) - 1);
+	close(ipc_sockets[0]);
+	ASSERT_GT(ret, 0);
+
+	if (buf[0] == 'E') {
+		waitpid(mid_pid, NULL, 0);
+		prctl(PR_SET_CHILD_SUBREAPER, 0);
+		SKIP(return, "CLONE_AUTOREAP not supported");
+	}
+
+	ASSERT_NE(buf[0], 'F') {
+		TH_LOG("clone3() with CLONE_AUTOREAP failed in the middle child");
+	}
+
+	grandchild_pid = atoi(buf);
+	ASSERT_GT(grandchild_pid, 0);
+
+	/* Wait for the middle child to exit. */
+	ret = waitpid(mid_pid, NULL, 0);
+	ASSERT_EQ(ret, mid_pid);
+
+	/*
+	 * Now the grandchild is reparented to us (subreaper).
+	 * Open a pidfd for the grandchild and kill it.
+	 */
+	pidfd = sys_pidfd_open(grandchild_pid, 0);
+	ASSERT_GE(pidfd, 0);
+
+	ret = sys_pidfd_send_signal(pidfd, SIGKILL, NULL, 0);
+	ASSERT_EQ(ret, 0);
+
+	/* Wait for it to exit via pidfd poll. */
+	pfd.fd = pidfd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/*
+	 * The grandchild should have been autoreaped even though
+	 * we (the new parent) haven't set SA_NOCLDWAIT.
+	 * waitpid should return -1/ECHILD.
+	 *
+	 * __WALL makes waitpid() consider the grandchild whatever its
+	 * exit_signal is after reparenting (it was created with
+	 * exit_signal == 0), so a leftover zombie cannot be missed.
+	 */
+	ret = waitpid(grandchild_pid, NULL, __WALL | WNOHANG);
+	EXPECT_EQ(ret, -1);
+	EXPECT_EQ(errno, ECHILD);
+
+	close(pidfd);
+
+	/* Clean up subreaper status. */
+	prctl(PR_SET_CHILD_SUBREAPER, 0);
+}
+
+/* Socket the sub-thread uses to tell the test process it is running. */
+static int thread_sock_fd;
+
+/* Rendezvous between the autoreap child's main thread and its sub-thread. */
+static pthread_barrier_t thread_barrier;
+
+/*
+ * Sub-thread of the autoreap child in autoreap_multithreaded.
+ * Reports readiness to the test process, releases the main thread and
+ * then simply stays alive until the main thread's _exit() takes the
+ * whole process down.
+ */
+static void *thread_func(void *arg)
+{
+	/* Signal parent we're running. */
+	write_nointr(thread_sock_fd, "1", 1);
+
+	/* The readiness byte is out; the main thread may call _exit() now. */
+	pthread_barrier_wait(&thread_barrier);
+
+	/* Still be around when the main thread exits. */
+	for (;;)
+		pause();
+}
+
+/*
+ * Test that an autoreap child with multiple threads is properly
+ * autoreaped only after all threads have exited.
+ *
+ * The child's main thread calls _exit(99) while a sub-thread is still
+ * alive; the test process then checks the exit status through the
+ * pidfd and verifies that no zombie is left.
+ */
+TEST(autoreap_multithreaded)
+{
+	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
+	int ipc_sockets[2], ret;
+	int pidfd = -1;
+	struct pollfd pfd;
+	pid_t pid;
+	char c;
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid = create_autoreap_child(&pidfd);
+	if (pid < 0 && errno == EINVAL) {
+		close(ipc_sockets[0]);
+		close(ipc_sockets[1]);
+		SKIP(return, "CLONE_AUTOREAP not supported");
+	}
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pthread_t thread;
+
+		close(ipc_sockets[0]);
+
+		/*
+		 * Create a sub-thread that outlives the main thread.
+		 * A barrier is used instead of sleeps so that the main
+		 * thread cannot call _exit() before the sub-thread has
+		 * sent its readiness byte. On any setup failure exit
+		 * without sending the byte, which makes the parent's
+		 * read below fail.
+		 */
+		if (pthread_barrier_init(&thread_barrier, NULL, 2))
+			_exit(1);
+
+		thread_sock_fd = ipc_sockets[1];
+		if (pthread_create(&thread, NULL, thread_func, NULL))
+			_exit(1);
+		pthread_detach(thread);
+
+		/* Wait for thread to be running. */
+		pthread_barrier_wait(&thread_barrier);
+
+		/* Main thread exits; sub-thread is still alive. */
+		_exit(99);
+	}
+
+	close(ipc_sockets[1]);
+
+	/* Wait for the sub-thread to signal readiness. */
+	ret = read_nointr(ipc_sockets[0], &c, 1);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(ret, 1);
+
+	/* Wait for the process to fully exit via pidfd poll. */
+	pfd.fd = pidfd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/* Verify exit info. */
+	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
+	ASSERT_EQ(ret, 0);
+	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
+	ASSERT_TRUE(WIFEXITED(info.exit_code));
+	ASSERT_EQ(WEXITSTATUS(info.exit_code), 99);
+
+	/*
+	 * No zombie. The child has exit_signal == 0, so plain waitpid()
+	 * would report ECHILD no matter what; __WALL makes it actually
+	 * look at the child.
+	 */
+	ret = waitpid(pid, NULL, __WALL | WNOHANG);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ECHILD);
+
+	close(pidfd);
+}
+
+/*
+ * Test that autoreap is NOT inherited by grandchildren.
+ *
+ * An autoreap child fork()s a grandchild and waits for it. It reports
+ * one byte to the test process: 'P' if waitpid() collected the
+ * grandchild with exit status 77, 'F' if it did not, 'E' if fork()
+ * failed. Afterwards the autoreap child itself must disappear without
+ * leaving a zombie.
+ */
+TEST(autoreap_no_inherit)
+{
+	int ipc_sockets[2], ret;
+	int pidfd = -1;
+	pid_t pid;
+	char buf[2] = {};
+	struct pollfd pfd;
+
+	ret = socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets);
+	ASSERT_EQ(ret, 0);
+
+	pid = create_autoreap_child(&pidfd);
+	if (pid < 0 && errno == EINVAL) {
+		close(ipc_sockets[0]);
+		close(ipc_sockets[1]);
+		SKIP(return, "CLONE_AUTOREAP not supported");
+	}
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		pid_t gc;
+		int status;
+
+		close(ipc_sockets[0]);
+
+		/* Autoreap child forks a grandchild (without autoreap). */
+		gc = fork();
+		if (gc < 0) {
+			write_nointr(ipc_sockets[1], "E", 1);
+			_exit(1);
+		}
+		if (gc == 0) {
+			/* Grandchild: exit immediately. */
+			close(ipc_sockets[1]);
+			_exit(77);
+		}
+
+		/*
+		 * The grandchild should become a regular zombie
+		 * since it was NOT created with CLONE_AUTOREAP.
+		 * Wait for it to verify.
+		 */
+		ret = waitpid(gc, &status, 0);
+		if (ret == gc && WIFEXITED(status) &&
+		    WEXITSTATUS(status) == 77) {
+			write_nointr(ipc_sockets[1], "P", 1);
+		} else {
+			write_nointr(ipc_sockets[1], "F", 1);
+		}
+		close(ipc_sockets[1]);
+		_exit(0);
+	}
+
+	close(ipc_sockets[1]);
+
+	ret = read_nointr(ipc_sockets[0], buf, 1);
+	close(ipc_sockets[0]);
+	ASSERT_EQ(ret, 1);
+
+	/*
+	 * 'P' means the autoreap child was able to waitpid() its
+	 * grandchild (correct - grandchild should be a normal zombie,
+	 * not autoreaped).
+	 */
+	ASSERT_EQ(buf[0], 'P');
+
+	/* Wait for the autoreap child to exit. */
+	pfd.fd = pidfd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/*
+	 * Autoreap child itself should be autoreaped. It has
+	 * exit_signal == 0, so __WALL is required for waitpid() to
+	 * consider it at all; without it ECHILD would be reported even
+	 * if a zombie were present.
+	 */
+	ret = waitpid(pid, NULL, __WALL | WNOHANG);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ECHILD);
+
+	close(pidfd);
+}
+
+TEST_HARNESS_MAIN
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread

* [PATCH v4 4/4] selftests/pidfd: add CLONE_PIDFD_AUTOKILL tests
2026-02-23 10:44 [PATCH v4 0/4] pidfd: add CLONE_AUTOREAP and CLONE_PIDFD_AUTOKILL Christian Brauner
` (2 preceding siblings ...)
2026-02-23 10:45 ` [PATCH v4 3/4] selftests/pidfd: add CLONE_AUTOREAP tests Christian Brauner
@ 2026-02-23 10:45 ` Christian Brauner
3 siblings, 0 replies; 14+ messages in thread
From: Christian Brauner @ 2026-02-23 10:45 UTC (permalink / raw)
To: Oleg Nesterov, Jann Horn
Cc: Linus Torvalds, Ingo Molnar, Peter Zijlstra, linux-kernel,
linux-fsdevel, Christian Brauner
Add tests for the new CLONE_PIDFD_AUTOKILL clone3() flag:
- autokill_sets_no_new_privs: child created with CLONE_PIDFD_AUTOKILL
has no_new_privs set, parent does not
- autoreap_no_new_privs_unset: plain CLONE_AUTOREAP child does not get
no_new_privs (only CLONE_PIDFD_AUTOKILL sets it)
- autokill_basic: child blocks in pause(), parent closes clone3 pidfd,
child is killed and autoreaped
- autokill_requires_pidfd: CLONE_PIDFD_AUTOKILL without CLONE_PIDFD
fails with EINVAL
- autokill_requires_autoreap: CLONE_PIDFD_AUTOKILL without
CLONE_AUTOREAP fails with EINVAL
- autokill_rejects_thread: CLONE_PIDFD_AUTOKILL with CLONE_THREAD fails
with EINVAL
- autokill_pidfd_open_no_effect: closing a pidfd_open() fd does not kill
the child, closing the clone3 pidfd does
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
.../testing/selftests/pidfd/pidfd_autoreap_test.c | 286 +++++++++++++++++++++
1 file changed, 286 insertions(+)
diff --git a/tools/testing/selftests/pidfd/pidfd_autoreap_test.c b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c
index 9e52a16239ea..9037542eef2a 100644
--- a/tools/testing/selftests/pidfd/pidfd_autoreap_test.c
+++ b/tools/testing/selftests/pidfd/pidfd_autoreap_test.c
@@ -28,6 +28,10 @@
#define CLONE_AUTOREAP 0x400000000ULL
#endif
+#ifndef CLONE_PIDFD_AUTOKILL
+#define CLONE_PIDFD_AUTOKILL 0x800000000ULL
+#endif
+
static pid_t create_autoreap_child(int *pidfd)
{
struct __clone_args args = {
@@ -504,4 +508,286 @@ TEST(autoreap_no_inherit)
close(pidfd);
}
+/*
+ * Helper: clone3() a child with all three of CLONE_PIDFD,
+ * CLONE_PIDFD_AUTOKILL and CLONE_AUTOREAP set and no exit signal.
+ * The pidfd for the new child is requested into *pidfd.
+ * Returns the result of sys_clone3() unchanged.
+ */
+static pid_t create_autokill_child(int *pidfd)
+{
+	/* Start from an all-zero argument block, then fill in what we use. */
+	struct __clone_args args = {};
+
+	args.flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP;
+	args.exit_signal = 0;
+	args.pidfd = ptr_to_u64(pidfd);
+
+	return sys_clone3(&args, sizeof(args));
+}
+
+/*
+ * Test that CLONE_PIDFD_AUTOKILL sets no_new_privs on the child.
+ * The child checks via prctl(PR_GET_NO_NEW_PRIVS) and reports back
+ * through its exit status (0 if set, 1 if not).
+ * The parent must NOT have no_new_privs set afterwards.
+ *
+ * If the test process itself already runs with no_new_privs the child
+ * would have it regardless of the flag, so the test is skipped.
+ */
+TEST(autokill_sets_no_new_privs)
+{
+	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
+	int pidfd = -1, ret;
+	struct pollfd pfd;
+	pid_t pid;
+
+	/*
+	 * Ensure parent does not already have no_new_privs. That is an
+	 * unmet precondition of the environment, not a failure of the
+	 * feature under test, so skip rather than fail.
+	 */
+	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+	if (ret == 1)
+		SKIP(return, "Parent already has no_new_privs set, cannot run test");
+	ASSERT_EQ(ret, 0);
+
+	pid = create_autokill_child(&pidfd);
+	if (pid < 0 && errno == EINVAL)
+		SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/*
+		 * Child: check no_new_privs. Exit 0 if set, 1 if not.
+		 */
+		ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+		_exit(ret == 1 ? 0 : 1);
+	}
+
+	ASSERT_GE(pidfd, 0);
+
+	/* Parent must still NOT have no_new_privs. */
+	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+	ASSERT_EQ(ret, 0) {
+		TH_LOG("Parent got no_new_privs after creating autokill child");
+	}
+
+	/* Wait for child to exit. */
+	pfd.fd = pidfd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/* Verify child exited with 0 (no_new_privs was set). */
+	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
+	ASSERT_EQ(ret, 0);
+	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
+	ASSERT_TRUE(WIFEXITED(info.exit_code));
+	ASSERT_EQ(WEXITSTATUS(info.exit_code), 0) {
+		TH_LOG("Child did not have no_new_privs set");
+	}
+
+	close(pidfd);
+}
+
+/*
+ * Test that a plain CLONE_AUTOREAP child does NOT get no_new_privs.
+ * Only CLONE_PIDFD_AUTOKILL should set it.
+ *
+ * The child reports through its exit status: 0 if no_new_privs is
+ * not set, 1 if it is. no_new_privs is inherited by children, so the
+ * test is skipped when the test process itself already has it set.
+ */
+TEST(autoreap_no_new_privs_unset)
+{
+	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };
+	int pidfd = -1, ret;
+	struct pollfd pfd;
+	pid_t pid;
+
+	/*
+	 * A parent with no_new_privs would hand it down to the child and
+	 * make the check below fail for reasons unrelated to
+	 * CLONE_AUTOREAP.
+	 */
+	ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+	if (ret == 1)
+		SKIP(return, "Parent already has no_new_privs set, cannot run test");
+	ASSERT_EQ(ret, 0);
+
+	pid = create_autoreap_child(&pidfd);
+	if (pid < 0 && errno == EINVAL)
+		SKIP(return, "CLONE_AUTOREAP not supported");
+	ASSERT_GE(pid, 0);
+
+	if (pid == 0) {
+		/*
+		 * Child: check no_new_privs. Exit 0 if NOT set, 1 if set.
+		 */
+		ret = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
+		_exit(ret == 0 ? 0 : 1);
+	}
+
+	ASSERT_GE(pidfd, 0);
+
+	/* Wait for child to exit. */
+	pfd.fd = pidfd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/* Verify child exited with 0 (no_new_privs was not set). */
+	ret = ioctl(pidfd, PIDFD_GET_INFO, &info);
+	ASSERT_EQ(ret, 0);
+	ASSERT_TRUE(info.mask & PIDFD_INFO_EXIT);
+	ASSERT_TRUE(WIFEXITED(info.exit_code));
+	ASSERT_EQ(WEXITSTATUS(info.exit_code), 0) {
+		TH_LOG("Plain autoreap child unexpectedly has no_new_privs");
+	}
+
+	close(pidfd);
+}
+
+/*
+ * Basic autokill test: child blocks in pause(), parent closes the
+ * clone3 pidfd, child should be killed and autoreaped.
+ */
+TEST(autokill_basic)
+{
+	int pidfd = -1, pollfd_fd = -1, ret;
+	struct pollfd pfd;
+	pid_t pid;
+
+	pid = create_autokill_child(&pidfd);
+	if (pid < 0 && errno == EINVAL)
+		SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
+	ASSERT_GE(pid, 0);
+
+	/* Child: block until killed; the _exit(1) is only a fallback. */
+	if (pid == 0) {
+		pause();
+		_exit(1);
+	}
+
+	ASSERT_GE(pidfd, 0);
+
+	/*
+	 * Open a second pidfd via pidfd_open() so we can observe the
+	 * child's death after closing the clone3 pidfd.
+	 */
+	pollfd_fd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(pollfd_fd, 0);
+
+	/* Close the clone3 pidfd — this should trigger autokill. */
+	close(pidfd);
+
+	/* Wait for the child to die via the pidfd_open'd fd. */
+	pfd.fd = pollfd_fd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/*
+	 * Child should be autoreaped — no zombie.
+	 * The child has exit_signal == 0, so plain waitpid() would skip
+	 * it and report ECHILD unconditionally; __WALL makes the check
+	 * able to see a leftover zombie.
+	 */
+	usleep(100000);
+	ret = waitpid(pid, NULL, __WALL | WNOHANG);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ECHILD);
+
+	close(pollfd_fd);
+}
+
+/*
+ * Without CLONE_PIDFD, CLONE_PIDFD_AUTOKILL must be refused with EINVAL
+ * (CLONE_AUTOREAP is still passed so only CLONE_PIDFD is missing).
+ */
+TEST(autokill_requires_pidfd)
+{
+	struct __clone_args args = {};
+	pid_t pid;
+
+	args.flags = CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP;
+	args.exit_signal = 0;
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_EQ(pid, -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * Without CLONE_AUTOREAP, CLONE_PIDFD_AUTOKILL must be refused with
+ * EINVAL. exit_signal is SIGCHLD here, as for an ordinary child.
+ */
+TEST(autokill_requires_autoreap)
+{
+	int pidfd = -1;
+	struct __clone_args args = {
+		.flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL,
+		.exit_signal = SIGCHLD,
+		.pidfd = ptr_to_u64(&pidfd),
+	};
+	pid_t pid = sys_clone3(&args, sizeof(args));
+
+	/* The call must fail outright; no child may have been created. */
+	ASSERT_EQ(pid, -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * CLONE_PIDFD_AUTOKILL combined with CLONE_THREAD must be refused with
+ * EINVAL. CLONE_SIGHAND and CLONE_VM are passed along with CLONE_THREAD.
+ */
+TEST(autokill_rejects_thread)
+{
+	int pidfd = -1;
+	struct __clone_args args = {};
+	pid_t pid;
+
+	args.flags = CLONE_PIDFD | CLONE_PIDFD_AUTOKILL | CLONE_AUTOREAP |
+		     CLONE_THREAD | CLONE_SIGHAND | CLONE_VM;
+	args.exit_signal = 0;
+	args.pidfd = ptr_to_u64(&pidfd);
+
+	pid = sys_clone3(&args, sizeof(args));
+	ASSERT_EQ(pid, -1);
+	ASSERT_EQ(errno, EINVAL);
+}
+
+/*
+ * Test that only the clone3 pidfd triggers autokill, not pidfd_open().
+ * Close the pidfd_open'd fd first — child should survive.
+ * Then close the clone3 pidfd — child should be killed and autoreaped.
+ */
+TEST(autokill_pidfd_open_no_effect)
+{
+	int pidfd = -1, open_fd = -1, ret;
+	struct pollfd pfd;
+	pid_t pid;
+
+	pid = create_autokill_child(&pidfd);
+	if (pid < 0 && errno == EINVAL)
+		SKIP(return, "CLONE_PIDFD_AUTOKILL not supported");
+	ASSERT_GE(pid, 0);
+
+	/* Child: block until killed; the _exit(1) is only a fallback. */
+	if (pid == 0) {
+		pause();
+		_exit(1);
+	}
+
+	ASSERT_GE(pidfd, 0);
+
+	/* Open a second pidfd via pidfd_open(). */
+	open_fd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(open_fd, 0);
+
+	/*
+	 * Close the pidfd_open'd fd — child should survive because
+	 * only the clone3 pidfd has autokill.
+	 */
+	close(open_fd);
+	usleep(200000);
+
+	/*
+	 * Verify child is still alive by polling the clone3 pidfd.
+	 * A zero timeout makes this a pure status check: 0 means the
+	 * pidfd has not become readable.
+	 */
+	pfd.fd = pidfd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 0);
+	ASSERT_EQ(ret, 0) {
+		TH_LOG("Child died after closing pidfd_open fd — should still be alive");
+	}
+
+	/* Open another observation fd before triggering autokill. */
+	open_fd = sys_pidfd_open(pid, 0);
+	ASSERT_GE(open_fd, 0);
+
+	/* Now close the clone3 pidfd — this triggers autokill. */
+	close(pidfd);
+
+	pfd.fd = open_fd;
+	pfd.events = POLLIN;
+	ret = poll(&pfd, 1, 5000);
+	ASSERT_EQ(ret, 1);
+	ASSERT_TRUE(pfd.revents & POLLIN);
+
+	/*
+	 * Child should be autoreaped — no zombie.
+	 * The child has exit_signal == 0, so plain waitpid() would skip
+	 * it and report ECHILD unconditionally; __WALL makes the check
+	 * able to see a leftover zombie.
+	 */
+	usleep(100000);
+	ret = waitpid(pid, NULL, __WALL | WNOHANG);
+	ASSERT_EQ(ret, -1);
+	ASSERT_EQ(errno, ECHILD);
+
+	close(open_fd);
+}
+
TEST_HARNESS_MAIN
--
2.47.3
^ permalink raw reply related [flat|nested] 14+ messages in thread