public inbox for linux-fsdevel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
@ 2026-01-20 14:52 Christian Brauner
  2026-01-20 15:43 ` Mateusz Guzik
                   ` (2 more replies)
  0 siblings, 3 replies; 19+ messages in thread
From: Christian Brauner @ 2026-01-20 14:52 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: Alexander Viro, Jan Kara, linux-fsdevel, Christian Brauner

Mateusz reported performance penalties [1] during task creation because
pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
rhashtable to have separate fine-grained locking and to decouple from
pidmap_lock moving all heavy manipulations outside of it.

Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
protection to an rhashtable. This removes the global pidmap_lock
contention from pidfs_ino_get_pid() lookups and allows the hashtable
insert to happen outside the pidmap_lock.

pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
inserts pid into rhashtable and is called outside pidmap_lock. Insertion
into the rhashtable can fail and memory allocation may happen so we need
to drop the spinlock.

To guard against accidentally opening an already reaped task
pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
it already went through pidfs_exit(), i.e., the process has already been reaped.
If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
the task has exited.

This slightly changes visibility semantics: pidfd creation is denied
after pidfs_exit() runs, which is just before the pid number is removed
from the idr via free_pid(). That should not be an issue though.

Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
Changes in v2:
- Ensure that pid is removed before call_rcu() from pidfs.
- Don't drop and reacquire spinlock.
- Link to v1: https://patch.msgid.link/20260119-work-pidfs-rhashtable-v1-1-159c7700300a@kernel.org
---
 fs/pidfs.c            | 81 +++++++++++++++++++++------------------------------
 include/linux/pid.h   |  4 +--
 include/linux/pidfs.h |  3 +-
 kernel/pid.c          | 13 ++++++---
 4 files changed, 46 insertions(+), 55 deletions(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index dba703d4ce4a..ee0e36dd29d2 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -21,6 +21,7 @@
 #include <linux/utsname.h>
 #include <net/net_namespace.h>
 #include <linux/coredump.h>
+#include <linux/rhashtable.h>
 #include <linux/xattr.h>
 
 #include "internal.h"
@@ -55,7 +56,14 @@ struct pidfs_attr {
 	__u32 coredump_signal;
 };
 
-static struct rb_root pidfs_ino_tree = RB_ROOT;
+static struct rhashtable pidfs_ino_ht;
+
+static const struct rhashtable_params pidfs_ino_ht_params = {
+	.key_offset		= offsetof(struct pid, ino),
+	.key_len		= sizeof(u64),
+	.head_offset		= offsetof(struct pid, pidfs_hash),
+	.automatic_shrinking	= true,
+};
 
 #if BITS_PER_LONG == 32
 static inline unsigned long pidfs_ino(u64 ino)
@@ -84,21 +92,11 @@ static inline u32 pidfs_gen(u64 ino)
 }
 #endif
 
-static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
-{
-	struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
-	struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
-	u64 pid_ino_a = pid_a->ino;
-	u64 pid_ino_b = pid_b->ino;
-
-	if (pid_ino_a < pid_ino_b)
-		return -1;
-	if (pid_ino_a > pid_ino_b)
-		return 1;
-	return 0;
-}
-
-void pidfs_add_pid(struct pid *pid)
+/*
+ * Allocate inode number and initialize pidfs fields.
+ * Called with pidmap_lock held.
+ */
+void pidfs_prepare_pid(struct pid *pid)
 {
 	static u64 pidfs_ino_nr = 2;
 
@@ -131,20 +129,22 @@ void pidfs_add_pid(struct pid *pid)
 		pidfs_ino_nr += 2;
 
 	pid->ino = pidfs_ino_nr;
+	pid->pidfs_hash.next = NULL;
 	pid->stashed = NULL;
 	pid->attr = NULL;
 	pidfs_ino_nr++;
+}
 
-	write_seqcount_begin(&pidmap_lock_seq);
-	rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
-	write_seqcount_end(&pidmap_lock_seq);
+int pidfs_add_pid(struct pid *pid)
+{
+	return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+				      pidfs_ino_ht_params);
 }
 
 void pidfs_remove_pid(struct pid *pid)
 {
-	write_seqcount_begin(&pidmap_lock_seq);
-	rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
-	write_seqcount_end(&pidmap_lock_seq);
+	rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+			       pidfs_ino_ht_params);
 }
 
 void pidfs_free_pid(struct pid *pid)
@@ -773,42 +773,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 	return FILEID_KERNFS;
 }
 
-static int pidfs_ino_find(const void *key, const struct rb_node *node)
-{
-	const u64 pid_ino = *(u64 *)key;
-	const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
-
-	if (pid_ino < pid->ino)
-		return -1;
-	if (pid_ino > pid->ino)
-		return 1;
-	return 0;
-}
-
 /* Find a struct pid based on the inode number. */
 static struct pid *pidfs_ino_get_pid(u64 ino)
 {
 	struct pid *pid;
-	struct rb_node *node;
-	unsigned int seq;
+	struct pidfs_attr *attr;
 
 	guard(rcu)();
-	do {
-		seq = read_seqcount_begin(&pidmap_lock_seq);
-		node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
-		if (node)
-			break;
-	} while (read_seqcount_retry(&pidmap_lock_seq, seq));
-
-	if (!node)
+	pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params);
+	if (!pid)
+		return NULL;
+	attr = READ_ONCE(pid->attr);
+	if (IS_ERR_OR_NULL(attr))
+		return NULL;
+	if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
 		return NULL;
-
-	pid = rb_entry(node, struct pid, pidfs_node);
-
 	/* Within our pid namespace hierarchy? */
 	if (pid_vnr(pid) == 0)
 		return NULL;
-
 	return get_pid(pid);
 }
 
@@ -1086,6 +1068,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 
 void __init pidfs_init(void)
 {
+	if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
+		panic("Failed to initialize pidfs hashtable");
+
 	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
 					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
 					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 003a1027d219..ce9b5cb7560b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -6,6 +6,7 @@
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/refcount.h>
+#include <linux/rhashtable-types.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
 
@@ -60,7 +61,7 @@ struct pid {
 	spinlock_t lock;
 	struct {
 		u64 ino;
-		struct rb_node pidfs_node;
+		struct rhash_head pidfs_hash;
 		struct dentry *stashed;
 		struct pidfs_attr *attr;
 	};
@@ -73,7 +74,6 @@ struct pid {
 	struct upid numbers[];
 };
 
-extern seqcount_spinlock_t pidmap_lock_seq;
 extern struct pid init_struct_pid;
 
 struct file;
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 3e08c33da2df..416bdff4d6ce 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -6,7 +6,8 @@ struct coredump_params;
 
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
 void __init pidfs_init(void);
-void pidfs_add_pid(struct pid *pid);
+void pidfs_prepare_pid(struct pid *pid);
+int pidfs_add_pid(struct pid *pid);
 void pidfs_remove_pid(struct pid *pid);
 void pidfs_exit(struct task_struct *tsk);
 #ifdef CONFIG_COREDUMP
diff --git a/kernel/pid.c b/kernel/pid.c
index ad4400a9f15f..6077da774652 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,7 +43,6 @@
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 #include <linux/pidfs.h>
-#include <linux/seqlock.h>
 #include <net/sock.h>
 #include <uapi/linux/pidfd.h>
 
@@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = {
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
 
 void put_pid(struct pid *pid)
 {
@@ -141,9 +139,9 @@ void free_pid(struct pid *pid)
 
 		idr_remove(&ns->idr, upid->nr);
 	}
-	pidfs_remove_pid(pid);
 	spin_unlock(&pidmap_lock);
 
+	pidfs_remove_pid(pid);
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
@@ -315,7 +313,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 	retval = -ENOMEM;
 	if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
 		goto out_free;
-	pidfs_add_pid(pid);
+	pidfs_prepare_pid(pid);
+
 	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
@@ -325,6 +324,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 	idr_preload_end();
 	ns_ref_active_get(ns);
 
+	retval = pidfs_add_pid(pid);
+	if (unlikely(retval)) {
+		free_pid(pid);
+		pid = ERR_PTR(-ENOMEM);
+	}
+
 	return pid;
 
 out_free:

---
base-commit: f54c7e54d2de2d7b58aa54604218a6fc00bb2e77
change-id: 20260119-work-pidfs-rhashtable-9d14071bd77a


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
  2026-01-20 14:52 [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Christian Brauner
@ 2026-01-20 15:43 ` Mateusz Guzik
  2026-01-21 10:59 ` Jan Kara
  2026-02-20 15:11 ` Guillaume Tucker
  2 siblings, 0 replies; 19+ messages in thread
From: Mateusz Guzik @ 2026-01-20 15:43 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Alexander Viro, Jan Kara, linux-fsdevel

On Tue, Jan 20, 2026 at 3:52 PM Christian Brauner <brauner@kernel.org> wrote:
>
> Mateusz reported performance penalties [1] during task creation because
> pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
> rhashtable to have separate fine-grained locking and to decouple from
> pidmap_lock moving all heavy manipulations outside of it.
>

FYI I have a WIP patch to address the cgroup problem. With that thing
in place the pidmap lock is back at the top of the profile, finally
followed by tasklist_lock.

After I sort that out, with pidmap lock back on top I'll implement the
lockless allocation scheme.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
  2026-01-20 14:52 [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Christian Brauner
  2026-01-20 15:43 ` Mateusz Guzik
@ 2026-01-21 10:59 ` Jan Kara
  2026-01-22 10:18   ` Christian Brauner
  2026-02-20 15:11 ` Guillaume Tucker
  2 siblings, 1 reply; 19+ messages in thread
From: Jan Kara @ 2026-01-21 10:59 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Mateusz Guzik, Alexander Viro, Jan Kara, linux-fsdevel

On Tue 20-01-26 15:52:35, Christian Brauner wrote:
> Mateusz reported performance penalties [1] during task creation because
> pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
> rhashtable to have separate fine-grained locking and to decouple from
> pidmap_lock moving all heavy manipulations outside of it.
> 
> Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
> protection to an rhashtable. This removes the global pidmap_lock
> contention from pidfs_ino_get_pid() lookups and allows the hashtable
> insert to happen outside the pidmap_lock.
> 
> pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
> initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
> inserts pid into rhashtable and is called outside pidmap_lock. Insertion
> into the rhashtable can fail and memory allocation may happen so we need
> to drop the spinlock.
> 
> To guard against accidently opening an already reaped task
> pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
> pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
> it already went through pidfs_exit() aka the process as already reaped.
> If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
> the task has exited.
> 
> This slightly changes visibility semantics: pidfd creation is denied
> after pidfs_exit() runs, which is just before the pid number is removed
> from the via free_pid(). That should not be an issue though.
> 
> Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
> Signed-off-by: Christian Brauner <brauner@kernel.org>

Looks very nice! Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

I have just one question about the new PIDFS_ATTR_BIT_EXIT check.  AFAIU it
protects from grabbing struct pid references for pids that are dying. But
we can also call free_pid() from places like ksys_setsid() where
PIDFS_ATTR_BIT_EXIT is not set. So this check only seems as a convenience
rather than some hard guarantee, am I right?

								Honza

> ---
> Changes in v2:
> - Ensure that pid is removed before call_rcu() from pidfs.
> - Don't drop and reacquire spinlock.
> - Link to v1: https://patch.msgid.link/20260119-work-pidfs-rhashtable-v1-1-159c7700300a@kernel.org
> ---
>  fs/pidfs.c            | 81 +++++++++++++++++++++------------------------------
>  include/linux/pid.h   |  4 +--
>  include/linux/pidfs.h |  3 +-
>  kernel/pid.c          | 13 ++++++---
>  4 files changed, 46 insertions(+), 55 deletions(-)
> 
> diff --git a/fs/pidfs.c b/fs/pidfs.c
> index dba703d4ce4a..ee0e36dd29d2 100644
> --- a/fs/pidfs.c
> +++ b/fs/pidfs.c
> @@ -21,6 +21,7 @@
>  #include <linux/utsname.h>
>  #include <net/net_namespace.h>
>  #include <linux/coredump.h>
> +#include <linux/rhashtable.h>
>  #include <linux/xattr.h>
>  
>  #include "internal.h"
> @@ -55,7 +56,14 @@ struct pidfs_attr {
>  	__u32 coredump_signal;
>  };
>  
> -static struct rb_root pidfs_ino_tree = RB_ROOT;
> +static struct rhashtable pidfs_ino_ht;
> +
> +static const struct rhashtable_params pidfs_ino_ht_params = {
> +	.key_offset		= offsetof(struct pid, ino),
> +	.key_len		= sizeof(u64),
> +	.head_offset		= offsetof(struct pid, pidfs_hash),
> +	.automatic_shrinking	= true,
> +};
>  
>  #if BITS_PER_LONG == 32
>  static inline unsigned long pidfs_ino(u64 ino)
> @@ -84,21 +92,11 @@ static inline u32 pidfs_gen(u64 ino)
>  }
>  #endif
>  
> -static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
> -{
> -	struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
> -	struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
> -	u64 pid_ino_a = pid_a->ino;
> -	u64 pid_ino_b = pid_b->ino;
> -
> -	if (pid_ino_a < pid_ino_b)
> -		return -1;
> -	if (pid_ino_a > pid_ino_b)
> -		return 1;
> -	return 0;
> -}
> -
> -void pidfs_add_pid(struct pid *pid)
> +/*
> + * Allocate inode number and initialize pidfs fields.
> + * Called with pidmap_lock held.
> + */
> +void pidfs_prepare_pid(struct pid *pid)
>  {
>  	static u64 pidfs_ino_nr = 2;
>  
> @@ -131,20 +129,22 @@ void pidfs_add_pid(struct pid *pid)
>  		pidfs_ino_nr += 2;
>  
>  	pid->ino = pidfs_ino_nr;
> +	pid->pidfs_hash.next = NULL;
>  	pid->stashed = NULL;
>  	pid->attr = NULL;
>  	pidfs_ino_nr++;
> +}
>  
> -	write_seqcount_begin(&pidmap_lock_seq);
> -	rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
> -	write_seqcount_end(&pidmap_lock_seq);
> +int pidfs_add_pid(struct pid *pid)
> +{
> +	return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> +				      pidfs_ino_ht_params);
>  }
>  
>  void pidfs_remove_pid(struct pid *pid)
>  {
> -	write_seqcount_begin(&pidmap_lock_seq);
> -	rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
> -	write_seqcount_end(&pidmap_lock_seq);
> +	rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
> +			       pidfs_ino_ht_params);
>  }
>  
>  void pidfs_free_pid(struct pid *pid)
> @@ -773,42 +773,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
>  	return FILEID_KERNFS;
>  }
>  
> -static int pidfs_ino_find(const void *key, const struct rb_node *node)
> -{
> -	const u64 pid_ino = *(u64 *)key;
> -	const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
> -
> -	if (pid_ino < pid->ino)
> -		return -1;
> -	if (pid_ino > pid->ino)
> -		return 1;
> -	return 0;
> -}
> -
>  /* Find a struct pid based on the inode number. */
>  static struct pid *pidfs_ino_get_pid(u64 ino)
>  {
>  	struct pid *pid;
> -	struct rb_node *node;
> -	unsigned int seq;
> +	struct pidfs_attr *attr;
>  
>  	guard(rcu)();
> -	do {
> -		seq = read_seqcount_begin(&pidmap_lock_seq);
> -		node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
> -		if (node)
> -			break;
> -	} while (read_seqcount_retry(&pidmap_lock_seq, seq));
> -
> -	if (!node)
> +	pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params);
> +	if (!pid)
> +		return NULL;
> +	attr = READ_ONCE(pid->attr);
> +	if (IS_ERR_OR_NULL(attr))
> +		return NULL;
> +	if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
>  		return NULL;
> -
> -	pid = rb_entry(node, struct pid, pidfs_node);
> -
>  	/* Within our pid namespace hierarchy? */
>  	if (pid_vnr(pid) == 0)
>  		return NULL;
> -
>  	return get_pid(pid);
>  }
>  
> @@ -1086,6 +1068,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
>  
>  void __init pidfs_init(void)
>  {
> +	if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
> +		panic("Failed to initialize pidfs hashtable");
> +
>  	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
>  					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
>  					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 003a1027d219..ce9b5cb7560b 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -6,6 +6,7 @@
>  #include <linux/rculist.h>
>  #include <linux/rcupdate.h>
>  #include <linux/refcount.h>
> +#include <linux/rhashtable-types.h>
>  #include <linux/sched.h>
>  #include <linux/wait.h>
>  
> @@ -60,7 +61,7 @@ struct pid {
>  	spinlock_t lock;
>  	struct {
>  		u64 ino;
> -		struct rb_node pidfs_node;
> +		struct rhash_head pidfs_hash;
>  		struct dentry *stashed;
>  		struct pidfs_attr *attr;
>  	};
> @@ -73,7 +74,6 @@ struct pid {
>  	struct upid numbers[];
>  };
>  
> -extern seqcount_spinlock_t pidmap_lock_seq;
>  extern struct pid init_struct_pid;
>  
>  struct file;
> diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
> index 3e08c33da2df..416bdff4d6ce 100644
> --- a/include/linux/pidfs.h
> +++ b/include/linux/pidfs.h
> @@ -6,7 +6,8 @@ struct coredump_params;
>  
>  struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
>  void __init pidfs_init(void);
> -void pidfs_add_pid(struct pid *pid);
> +void pidfs_prepare_pid(struct pid *pid);
> +int pidfs_add_pid(struct pid *pid);
>  void pidfs_remove_pid(struct pid *pid);
>  void pidfs_exit(struct task_struct *tsk);
>  #ifdef CONFIG_COREDUMP
> diff --git a/kernel/pid.c b/kernel/pid.c
> index ad4400a9f15f..6077da774652 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -43,7 +43,6 @@
>  #include <linux/sched/task.h>
>  #include <linux/idr.h>
>  #include <linux/pidfs.h>
> -#include <linux/seqlock.h>
>  #include <net/sock.h>
>  #include <uapi/linux/pidfd.h>
>  
> @@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = {
>  EXPORT_SYMBOL_GPL(init_pid_ns);
>  
>  static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
> -seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
>  
>  void put_pid(struct pid *pid)
>  {
> @@ -141,9 +139,9 @@ void free_pid(struct pid *pid)
>  
>  		idr_remove(&ns->idr, upid->nr);
>  	}
> -	pidfs_remove_pid(pid);
>  	spin_unlock(&pidmap_lock);
>  
> +	pidfs_remove_pid(pid);
>  	call_rcu(&pid->rcu, delayed_put_pid);
>  }
>  
> @@ -315,7 +313,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
>  	retval = -ENOMEM;
>  	if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
>  		goto out_free;
> -	pidfs_add_pid(pid);
> +	pidfs_prepare_pid(pid);
> +
>  	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
>  		/* Make the PID visible to find_pid_ns. */
>  		idr_replace(&upid->ns->idr, pid, upid->nr);
> @@ -325,6 +324,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
>  	idr_preload_end();
>  	ns_ref_active_get(ns);
>  
> +	retval = pidfs_add_pid(pid);
> +	if (unlikely(retval)) {
> +		free_pid(pid);
> +		pid = ERR_PTR(-ENOMEM);
> +	}
> +
>  	return pid;
>  
>  out_free:
> 
> ---
> base-commit: f54c7e54d2de2d7b58aa54604218a6fc00bb2e77
> change-id: 20260119-work-pidfs-rhashtable-9d14071bd77a
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
  2026-01-21 10:59 ` Jan Kara
@ 2026-01-22 10:18   ` Christian Brauner
  2026-01-22 13:21     ` Jan Kara
  0 siblings, 1 reply; 19+ messages in thread
From: Christian Brauner @ 2026-01-22 10:18 UTC (permalink / raw)
  To: Jan Kara; +Cc: Mateusz Guzik, Alexander Viro, linux-fsdevel

On Wed, Jan 21, 2026 at 11:59:11AM +0100, Jan Kara wrote:
> On Tue 20-01-26 15:52:35, Christian Brauner wrote:
> > Mateusz reported performance penalties [1] during task creation because
> > pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
> > rhashtable to have separate fine-grained locking and to decouple from
> > pidmap_lock moving all heavy manipulations outside of it.
> > 
> > Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
> > protection to an rhashtable. This removes the global pidmap_lock
> > contention from pidfs_ino_get_pid() lookups and allows the hashtable
> > insert to happen outside the pidmap_lock.
> > 
> > pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
> > initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
> > inserts pid into rhashtable and is called outside pidmap_lock. Insertion
> > into the rhashtable can fail and memory allocation may happen so we need
> > to drop the spinlock.
> > 
> > To guard against accidently opening an already reaped task
> > pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
> > pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
> > it already went through pidfs_exit() aka the process as already reaped.
> > If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
> > the task has exited.
> > 
> > This slightly changes visibility semantics: pidfd creation is denied
> > after pidfs_exit() runs, which is just before the pid number is removed
> > from the via free_pid(). That should not be an issue though.
> > 
> > Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> 
> Looks very nice! Feel free to add:
> 
> Reviewed-by: Jan Kara <jack@suse.cz>
> 
> I have just one question about the new PIDFS_ATTR_BIT_EXIT check.  AFAIU it
> protects from grabbing struct pid references for pids that are dying. But
> we can also call free_pid() from places like ksys_setsid() where
> PIDFS_ATTR_BIT_EXIT is not set. So this check only seems as a convenience
> rather than some hard guarantee, am I right?

Excellent question!

So the way this whole freeing works is weird. So when a task becomes a
session leader or process group leader __change_pid() detaches the task
from its old session/process group leader.

The old session leader/process group leader pid only ends up with
free_pid() called on it if it has already been reaped, i.e.,
pid_has_task() returns NULL.

But if the task has already been reaped then it will have already passed
pidfs_exit() and so the exit bit will be set.

So this only handles already dead pids.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
  2026-01-22 10:18   ` Christian Brauner
@ 2026-01-22 13:21     ` Jan Kara
  0 siblings, 0 replies; 19+ messages in thread
From: Jan Kara @ 2026-01-22 13:21 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Jan Kara, Mateusz Guzik, Alexander Viro, linux-fsdevel

On Thu 22-01-26 11:18:10, Christian Brauner wrote:
> On Wed, Jan 21, 2026 at 11:59:11AM +0100, Jan Kara wrote:
> > On Tue 20-01-26 15:52:35, Christian Brauner wrote:
> > > Mateusz reported performance penalties [1] during task creation because
> > > pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
> > > rhashtable to have separate fine-grained locking and to decouple from
> > > pidmap_lock moving all heavy manipulations outside of it.
> > > 
> > > Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
> > > protection to an rhashtable. This removes the global pidmap_lock
> > > contention from pidfs_ino_get_pid() lookups and allows the hashtable
> > > insert to happen outside the pidmap_lock.
> > > 
> > > pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
> > > initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
> > > inserts pid into rhashtable and is called outside pidmap_lock. Insertion
> > > into the rhashtable can fail and memory allocation may happen so we need
> > > to drop the spinlock.
> > > 
> > > To guard against accidently opening an already reaped task
> > > pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
> > > pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
> > > it already went through pidfs_exit() aka the process as already reaped.
> > > If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
> > > the task has exited.
> > > 
> > > This slightly changes visibility semantics: pidfd creation is denied
> > > after pidfs_exit() runs, which is just before the pid number is removed
> > > from the via free_pid(). That should not be an issue though.
> > > 
> > > Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > 
> > Looks very nice! Feel free to add:
> > 
> > Reviewed-by: Jan Kara <jack@suse.cz>
> > 
> > I have just one question about the new PIDFS_ATTR_BIT_EXIT check.  AFAIU it
> > protects from grabbing struct pid references for pids that are dying. But
> > we can also call free_pid() from places like ksys_setsid() where
> > PIDFS_ATTR_BIT_EXIT is not set. So this check only seems as a convenience
> > rather than some hard guarantee, am I right?
> 
> Excellent question!
> 
> So the way this whole freeing works is weird. So when a task becomes a
> session leader or process group leader __change_pid() detaches the task
> from its old session/process group leader.
> 
> The old session leader/process group leader pid only ends up with
> free_pid() called on it if it has already been reaped, i.e.,
> pid_has_task() returns NULL.

Aha, I have missed this subtlety in __change_pid(). Thanks for
clarification!

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
  2026-01-20 14:52 [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Christian Brauner
  2026-01-20 15:43 ` Mateusz Guzik
  2026-01-21 10:59 ` Jan Kara
@ 2026-02-20 15:11 ` Guillaume Tucker
  2026-02-24 13:22   ` Christian Brauner
  2 siblings, 1 reply; 19+ messages in thread
From: Guillaume Tucker @ 2026-02-20 15:11 UTC (permalink / raw)
  To: Christian Brauner, Mateusz Guzik
  Cc: Alexander Viro, Jan Kara, linux-fsdevel, Mark Brown, kunit-dev,
	David Gow

Hi Christian et al,

On 20/01/2026 15:52, Christian Brauner wrote:
> Mateusz reported performance penalties [1] during task creation because
> pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
> rhashtable to have separate fine-grained locking and to decouple from
> pidmap_lock moving all heavy manipulations outside of it.
> 
> Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
> protection to an rhashtable. This removes the global pidmap_lock
> contention from pidfs_ino_get_pid() lookups and allows the hashtable
> insert to happen outside the pidmap_lock.
> 
> pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
> initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
> inserts pid into rhashtable and is called outside pidmap_lock. Insertion
> into the rhashtable can fail and memory allocation may happen so we need
> to drop the spinlock.
> 
> To guard against accidently opening an already reaped task
> pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
> pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
> it already went through pidfs_exit() aka the process as already reaped.
> If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
> the task has exited.
> 
> This slightly changes visibility semantics: pidfd creation is denied
> after pidfs_exit() runs, which is just before the pid number is removed
> from the via free_pid(). That should not be an issue though.
> 
> Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
> Signed-off-by: Christian Brauner <brauner@kernel.org>
> ---
> Changes in v2:
> - Ensure that pid is removed before call_rcu() from pidfs.
> - Don't drop and reacquire spinlock.
> - Link to v1: https://patch.msgid.link/20260119-work-pidfs-rhashtable-v1-1-159c7700300a@kernel.org
> ---
>  fs/pidfs.c            | 81 +++++++++++++++++++++------------------------------
>  include/linux/pid.h   |  4 +--
>  include/linux/pidfs.h |  3 +-
>  kernel/pid.c          | 13 ++++++---
>  4 files changed, 46 insertions(+), 55 deletions(-)

[...]

> diff --git a/kernel/pid.c b/kernel/pid.c
> index ad4400a9f15f..6077da774652 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -43,7 +43,6 @@
>  #include <linux/sched/task.h>
>  #include <linux/idr.h>
>  #include <linux/pidfs.h>
> -#include <linux/seqlock.h>
>  #include <net/sock.h>
>  #include <uapi/linux/pidfd.h>
>  
> @@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = {
>  EXPORT_SYMBOL_GPL(init_pid_ns);
>  
>  static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
> -seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
>  
>  void put_pid(struct pid *pid)
>  {
> @@ -141,9 +139,9 @@ void free_pid(struct pid *pid)
>  
>  		idr_remove(&ns->idr, upid->nr);
>  	}
> -	pidfs_remove_pid(pid);
>  	spin_unlock(&pidmap_lock);
>  
> +	pidfs_remove_pid(pid);
>  	call_rcu(&pid->rcu, delayed_put_pid);
>  }

There appears to be a reproducible panic in rcu since next-20260216
at least while running KUnit.  After running a bisection I found that
it was visible at a merge commit adding this patch 44e59e62b2a2
("Merge branch 'kernel-7.0.misc' into vfs.all").  I then narrowed it
down further on a test branch by rebasing the pidfs series on top of
the last known working commit:

    https://gitlab.com/gtucker/linux/-/commits/kunit-rcu-debug-rebased

I also did some initial investigation with basic printk debugging and
haven't found anything obviously wrong in this patch itself, although
I'm no expert in pidfs...  One symptom is that the kernel panic
always happens because the function pointer to delayed_put_pid()
becomes corrupt.  As a quick hack, if I just call put_pid() in
free_pid() rather than go through rcu then there's no panic - see the
last commit on the test branch from the link above.  The issue is
still in next-20260219 as far as I can tell.

Here's how to reproduce this, using the new container script and a
plain container image to run KUnit vith QEMU on x86:

scripts/container -s -i docker.io/gtucker/korg-gcc:kunit -- \
    tools/testing/kunit/kunit.py \
    run \
    --arch=x86_64 \
    --cross_compile=x86_64-linux-

The panic can be seen in .kunit/test.log:

    [gtucker] rcu_do_batch:2609 count=7 func=ffffffff99026d40
    Oops: invalid opcode: 0000 [#2] SMP NOPTI
    CPU: 0 UID: 0 PID: 197 Comm: kunit_try_catch Tainted: G      D          N  6.19.0-09950-gc33cbc7ffae4 #77 PREEMPT(lazy)
    Tainted: [D]=DIE, [N]=TEST
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.1 11/11/2019
    RIP: 0010:0xffffffff99026d42

Looking at the last rcu callbacks that were enqueued with my extra
printk messages:

$ grep call_rcu .kunit/test.log | tail -n16
[gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
[gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
[gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
[gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
[gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
[gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
[gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
[gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
[gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
[gtucker] call_rcu kernel/pid.c:148 free_pid ffffffff988adaf0
[gtucker] call_rcu kernel/exit.c:237 put_task_struct_rcu_user ffffffff9888e440
[gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
[gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
[gtucker] call_rcu kernel/pid.c:148 free_pid ffffffff988adaf0
[gtucker] call_rcu kernel/exit.c:237 put_task_struct_rcu_user ffffffff9888e440
[gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0

and then the ones that were called:

$ grep rcu_do_batch .kunit/test.log | tail
[gtucker] rcu_do_batch:2609 count=7 func=ffffffff98887ae0
[gtucker] rcu_do_batch:2609 count=8 func=ffffffff98887ae0
[gtucker] rcu_do_batch:2609 count=9 func=ffffffff98887ae0
[gtucker] rcu_do_batch:2609 count=1 func=ffffffff98ccc1a0
[gtucker] rcu_do_batch:2609 count=2 func=ffffffff98887ae0
[gtucker] rcu_do_batch:2609 count=3 func=ffffffff988b7cd0
[gtucker] rcu_do_batch:2609 count=4 func=ffffffff988b7cd0
[gtucker] rcu_do_batch:2609 count=5 func=ffffffff988b7cd0
[gtucker] rcu_do_batch:2609 count=6 func=ffffffff98ccc1a0
[gtucker] rcu_do_batch:2609 count=7 func=ffffffff99026d40

we can see that the last pointer ffffffff99026d40 was never enqueued,
and the one from free_pid() ffffffff988adaf0 was never dequeued.
This is where I stopped investigating as it looked legit and someone
else might have more clues as to what's going on here.  I've only
seen the problem with this callback but again, KUnit is a very narrow
kind of workload so the root cause may well be lying elsewhere.

Please let me know if you need any more debugging details or if I can
help test a fix.  Hope this helps!

Cheers,
Guillaume


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
  2026-02-20 15:11 ` Guillaume Tucker
@ 2026-02-24 13:22   ` Christian Brauner
  2026-02-24 16:25     ` make_task_dead() & kthread_exit() Christian Brauner
  2026-02-24 16:37     ` [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Mark Brown
  0 siblings, 2 replies; 19+ messages in thread
From: Christian Brauner @ 2026-02-24 13:22 UTC (permalink / raw)
  To: Guillaume Tucker
  Cc: Mateusz Guzik, Alexander Viro, Jan Kara, linux-fsdevel,
	Mark Brown, kunit-dev, David Gow

On Fri, Feb 20, 2026 at 04:11:43PM +0100, Guillaume Tucker wrote:
> Hi Christian et al,
> 
> On 20/01/2026 15:52, Christian Brauner wrote:
> > Mateusz reported performance penalties [1] during task creation because
> > pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
> > rhashtable to have separate fine-grained locking and to decouple from
> > pidmap_lock moving all heavy manipulations outside of it.
> > 
> > Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
> > protection to an rhashtable. This removes the global pidmap_lock
> > contention from pidfs_ino_get_pid() lookups and allows the hashtable
> > insert to happen outside the pidmap_lock.
> > 
> > pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
> > initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
> > inserts pid into rhashtable and is called outside pidmap_lock. Insertion
> > into the rhashtable can fail and memory allocation may happen so we need
> > to drop the spinlock.
> > 
> > To guard against accidentally opening an already reaped task
> > pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
> > pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
> > it already went through pidfs_exit(), i.e. the process has already been reaped.
> > If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
> > the task has exited.
> > 
> > This slightly changes visibility semantics: pidfd creation is denied
> > after pidfs_exit() runs, which is just before the pid number is removed
> > from the idr via free_pid(). That should not be an issue though.
> > 
> > Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
> > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > ---
> > Changes in v2:
> > - Ensure that pid is removed before call_rcu() from pidfs.
> > - Don't drop and reacquire spinlock.
> > - Link to v1: https://patch.msgid.link/20260119-work-pidfs-rhashtable-v1-1-159c7700300a@kernel.org
> > ---
> >  fs/pidfs.c            | 81 +++++++++++++++++++++------------------------------
> >  include/linux/pid.h   |  4 +--
> >  include/linux/pidfs.h |  3 +-
> >  kernel/pid.c          | 13 ++++++---
> >  4 files changed, 46 insertions(+), 55 deletions(-)
> 
> [...]
> 
> > diff --git a/kernel/pid.c b/kernel/pid.c
> > index ad4400a9f15f..6077da774652 100644
> > --- a/kernel/pid.c
> > +++ b/kernel/pid.c
> > @@ -43,7 +43,6 @@
> >  #include <linux/sched/task.h>
> >  #include <linux/idr.h>
> >  #include <linux/pidfs.h>
> > -#include <linux/seqlock.h>
> >  #include <net/sock.h>
> >  #include <uapi/linux/pidfd.h>
> >  
> > @@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = {
> >  EXPORT_SYMBOL_GPL(init_pid_ns);
> >  
> >  static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
> > -seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
> >  
> >  void put_pid(struct pid *pid)
> >  {
> > @@ -141,9 +139,9 @@ void free_pid(struct pid *pid)
> >  
> >  		idr_remove(&ns->idr, upid->nr);
> >  	}
> > -	pidfs_remove_pid(pid);
> >  	spin_unlock(&pidmap_lock);
> >  
> > +	pidfs_remove_pid(pid);
> >  	call_rcu(&pid->rcu, delayed_put_pid);
> >  }
> 
> There appears to be a reproducible panic in rcu since next-20260216
> at least while running KUnit.  After running a bisection I found that
> it was visible at a merge commit adding this patch 44e59e62b2a2
> ("Merge branch 'kernel-7.0.misc' into vfs.all").  I then narrowed it
> down further on a test branch by rebasing the pidfs series on top of
> the last known working commit:
> 
>     https://gitlab.com/gtucker/linux/-/commits/kunit-rcu-debug-rebased
> 
> I also did some initial investigation with basic printk debugging and
> haven't found anything obviously wrong in this patch itself, although
> I'm no expert in pidfs...  One symptom is that the kernel panic
> always happens because the function pointer to delayed_put_pid()
> becomes corrupt.  As a quick hack, if I just call put_pid() in
> free_pid() rather than go through rcu then there's no panic - see the
> last commit on the test branch from the link above.  The issue is
> still in next-20260219 as far as I can tell.
> 
> Here's how to reproduce this, using the new container script and a
> > plain container image to run KUnit with QEMU on x86:
> 
> scripts/container -s -i docker.io/gtucker/korg-gcc:kunit -- \
>     tools/testing/kunit/kunit.py \
>     run \
>     --arch=x86_64 \
>     --cross_compile=x86_64-linux-
> 
> The panic can be seen in .kunit/test.log:
> 
>     [gtucker] rcu_do_batch:2609 count=7 func=ffffffff99026d40
>     Oops: invalid opcode: 0000 [#2] SMP NOPTI
>     CPU: 0 UID: 0 PID: 197 Comm: kunit_try_catch Tainted: G      D          N  6.19.0-09950-gc33cbc7ffae4 #77 PREEMPT(lazy)
>     Tainted: [D]=DIE, [N]=TEST
>     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.1 11/11/2019
>     RIP: 0010:0xffffffff99026d42
> 
> Looking at the last rcu callbacks that were enqueued with my extra
> printk messages:
> 
> $ grep call_rcu .kunit/test.log | tail -n16
> [gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
> [gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
> [gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
> [gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
> [gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
> [gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
> [gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
> [gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
> [gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
> [gtucker] call_rcu kernel/pid.c:148 free_pid ffffffff988adaf0
> [gtucker] call_rcu kernel/exit.c:237 put_task_struct_rcu_user ffffffff9888e440
> [gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
> [gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
> [gtucker] call_rcu kernel/pid.c:148 free_pid ffffffff988adaf0
> [gtucker] call_rcu kernel/exit.c:237 put_task_struct_rcu_user ffffffff9888e440
> [gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
> 
> and then the ones that were called:
> 
> $ grep rcu_do_batch .kunit/test.log | tail
> [gtucker] rcu_do_batch:2609 count=7 func=ffffffff98887ae0
> [gtucker] rcu_do_batch:2609 count=8 func=ffffffff98887ae0
> [gtucker] rcu_do_batch:2609 count=9 func=ffffffff98887ae0
> [gtucker] rcu_do_batch:2609 count=1 func=ffffffff98ccc1a0
> [gtucker] rcu_do_batch:2609 count=2 func=ffffffff98887ae0
> [gtucker] rcu_do_batch:2609 count=3 func=ffffffff988b7cd0
> [gtucker] rcu_do_batch:2609 count=4 func=ffffffff988b7cd0
> [gtucker] rcu_do_batch:2609 count=5 func=ffffffff988b7cd0
> [gtucker] rcu_do_batch:2609 count=6 func=ffffffff98ccc1a0
> [gtucker] rcu_do_batch:2609 count=7 func=ffffffff99026d40
> 
> we can see that the last pointer ffffffff99026d40 was never enqueued,
> and the one from free_pid() ffffffff988adaf0 was never dequeued.
> This is where I stopped investigating as it looked legit and someone
> else might have more clues as to what's going on here.  I've only
> seen the problem with this callback but again, KUnit is a very narrow
> kind of workload so the root cause may well be lying elsewhere.
> 
> Please let me know if you need any more debugging details or if I can
> help test a fix.  Hope this helps!

Thanks for the report. I have so far no idea how that can happen:

* Is this reproducible with multiple compilers?
* Is this reproducible on v7.0-rc1?

Fwiw, we're missing one check currently:

diff --git a/kernel/pid.c b/kernel/pid.c
index 3b96571d0fe6..dc11acd445ae 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -326,7 +326,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,

        retval = pidfs_add_pid(pid);
        if (unlikely(retval)) {
-               free_pid(pid);
+               if (pid != &init_struct_pid)
+                       free_pid(pid);
                pid = ERR_PTR(-ENOMEM);
        }

But it seems unlikely that pidfs_add_pid() fails here.

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* make_task_dead() & kthread_exit()
  2026-02-24 13:22   ` Christian Brauner
@ 2026-02-24 16:25     ` Christian Brauner
  2026-02-24 17:04       ` Mark Brown
                         ` (2 more replies)
  2026-02-24 16:37     ` [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Mark Brown
  1 sibling, 3 replies; 19+ messages in thread
From: Christian Brauner @ 2026-02-24 16:25 UTC (permalink / raw)
  To: Guillaume Tucker, Tejun Heo
  Cc: Mateusz Guzik, Alexander Viro, Jan Kara, linux-fsdevel,
	Mark Brown, kunit-dev, David Gow, Linus Torvalds

On Tue, Feb 24, 2026 at 02:22:31PM +0100, Christian Brauner wrote:
> On Fri, Feb 20, 2026 at 04:11:43PM +0100, Guillaume Tucker wrote:
> > Hi Christian et al,
> > 
> > On 20/01/2026 15:52, Christian Brauner wrote:
> > > Mateusz reported performance penalties [1] during task creation because
> > > pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
> > > rhashtable to have separate fine-grained locking and to decouple from
> > > pidmap_lock moving all heavy manipulations outside of it.
> > > 
> > > Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
> > > protection to an rhashtable. This removes the global pidmap_lock
> > > contention from pidfs_ino_get_pid() lookups and allows the hashtable
> > > insert to happen outside the pidmap_lock.
> > > 
> > > pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
> > > initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
> > > inserts pid into rhashtable and is called outside pidmap_lock. Insertion
> > > into the rhashtable can fail and memory allocation may happen so we need
> > > to drop the spinlock.
> > > 
> > > To guard against accidentally opening an already reaped task
> > > pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
> > > pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
> > > it already went through pidfs_exit(), i.e. the process has already been reaped.
> > > If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
> > > the task has exited.
> > > 
> > > This slightly changes visibility semantics: pidfd creation is denied
> > > after pidfs_exit() runs, which is just before the pid number is removed
> > > from the idr via free_pid(). That should not be an issue though.
> > > 
> > > Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
> > > Signed-off-by: Christian Brauner <brauner@kernel.org>
> > > ---
> > > Changes in v2:
> > > - Ensure that pid is removed before call_rcu() from pidfs.
> > > - Don't drop and reacquire spinlock.
> > > - Link to v1: https://patch.msgid.link/20260119-work-pidfs-rhashtable-v1-1-159c7700300a@kernel.org
> > > ---
> > >  fs/pidfs.c            | 81 +++++++++++++++++++++------------------------------
> > >  include/linux/pid.h   |  4 +--
> > >  include/linux/pidfs.h |  3 +-
> > >  kernel/pid.c          | 13 ++++++---
> > >  4 files changed, 46 insertions(+), 55 deletions(-)
> > 
> > [...]
> > 
> > > diff --git a/kernel/pid.c b/kernel/pid.c
> > > index ad4400a9f15f..6077da774652 100644
> > > --- a/kernel/pid.c
> > > +++ b/kernel/pid.c
> > > @@ -43,7 +43,6 @@
> > >  #include <linux/sched/task.h>
> > >  #include <linux/idr.h>
> > >  #include <linux/pidfs.h>
> > > -#include <linux/seqlock.h>
> > >  #include <net/sock.h>
> > >  #include <uapi/linux/pidfd.h>
> > >  
> > > @@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = {
> > >  EXPORT_SYMBOL_GPL(init_pid_ns);
> > >  
> > >  static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
> > > -seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
> > >  
> > >  void put_pid(struct pid *pid)
> > >  {
> > > @@ -141,9 +139,9 @@ void free_pid(struct pid *pid)
> > >  
> > >  		idr_remove(&ns->idr, upid->nr);
> > >  	}
> > > -	pidfs_remove_pid(pid);
> > >  	spin_unlock(&pidmap_lock);
> > >  
> > > +	pidfs_remove_pid(pid);
> > >  	call_rcu(&pid->rcu, delayed_put_pid);
> > >  }
> > 
> > There appears to be a reproducible panic in rcu since next-20260216
> > at least while running KUnit.  After running a bisection I found that
> > it was visible at a merge commit adding this patch 44e59e62b2a2
> > ("Merge branch 'kernel-7.0.misc' into vfs.all").  I then narrowed it
> > down further on a test branch by rebasing the pidfs series on top of
> > the last known working commit:
> > 
> >     https://gitlab.com/gtucker/linux/-/commits/kunit-rcu-debug-rebased
> > 
> > I also did some initial investigation with basic printk debugging and
> > haven't found anything obviously wrong in this patch itself, although
> > I'm no expert in pidfs...  One symptom is that the kernel panic
> > always happens because the function pointer to delayed_put_pid()
> > becomes corrupt.  As a quick hack, if I just call put_pid() in
> > free_pid() rather than go through rcu then there's no panic - see the
> > last commit on the test branch from the link above.  The issue is
> > still in next-20260219 as far as I can tell.
> > 
> > Here's how to reproduce this, using the new container script and a
> > > plain container image to run KUnit with QEMU on x86:
> > 
> > scripts/container -s -i docker.io/gtucker/korg-gcc:kunit -- \
> >     tools/testing/kunit/kunit.py \
> >     run \
> >     --arch=x86_64 \
> >     --cross_compile=x86_64-linux-
> > 
> > The panic can be seen in .kunit/test.log:
> > 
> >     [gtucker] rcu_do_batch:2609 count=7 func=ffffffff99026d40
> >     Oops: invalid opcode: 0000 [#2] SMP NOPTI
> >     CPU: 0 UID: 0 PID: 197 Comm: kunit_try_catch Tainted: G      D          N  6.19.0-09950-gc33cbc7ffae4 #77 PREEMPT(lazy)
> >     Tainted: [D]=DIE, [N]=TEST
> >     Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.1 11/11/2019
> >     RIP: 0010:0xffffffff99026d42
> > 
> > Looking at the last rcu callbacks that were enqueued with my extra
> > printk messages:
> > 
> > $ grep call_rcu .kunit/test.log | tail -n16
> > [gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
> > [gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
> > [gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
> > [gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
> > [gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
> > [gtucker] call_rcu include/linux/sched/task.h:159 put_task_struct ffffffff98887ae0
> > [gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
> > [gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
> > [gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
> > [gtucker] call_rcu kernel/pid.c:148 free_pid ffffffff988adaf0
> > [gtucker] call_rcu kernel/exit.c:237 put_task_struct_rcu_user ffffffff9888e440
> > [gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
> > [gtucker] call_rcu lib/radix-tree.c:310 radix_tree_node_free ffffffff98ccc1a0
> > [gtucker] call_rcu kernel/pid.c:148 free_pid ffffffff988adaf0
> > [gtucker] call_rcu kernel/exit.c:237 put_task_struct_rcu_user ffffffff9888e440
> > [gtucker] call_rcu kernel/cred.c:83 __put_cred ffffffff988b7cd0
> > 
> > and then the ones that were called:
> > 
> > $ grep rcu_do_batch .kunit/test.log | tail
> > [gtucker] rcu_do_batch:2609 count=7 func=ffffffff98887ae0
> > [gtucker] rcu_do_batch:2609 count=8 func=ffffffff98887ae0
> > [gtucker] rcu_do_batch:2609 count=9 func=ffffffff98887ae0
> > [gtucker] rcu_do_batch:2609 count=1 func=ffffffff98ccc1a0
> > [gtucker] rcu_do_batch:2609 count=2 func=ffffffff98887ae0
> > [gtucker] rcu_do_batch:2609 count=3 func=ffffffff988b7cd0
> > [gtucker] rcu_do_batch:2609 count=4 func=ffffffff988b7cd0
> > [gtucker] rcu_do_batch:2609 count=5 func=ffffffff988b7cd0
> > [gtucker] rcu_do_batch:2609 count=6 func=ffffffff98ccc1a0
> > [gtucker] rcu_do_batch:2609 count=7 func=ffffffff99026d40
> > 
> > we can see that the last pointer ffffffff99026d40 was never enqueued,
> > and the one from free_pid() ffffffff988adaf0 was never dequeued.
> > This is where I stopped investigating as it looked legit and someone
> > else might have more clues as to what's going on here.  I've only
> > seen the problem with this callback but again, KUnit is a very narrow
> > kind of workload so the root cause may well be lying elsewhere.
> > 
> > Please let me know if you need any more debugging details or if I can
> > help test a fix.  Hope this helps!
> 
> Thanks for the report. I have so far no idea how that can happen:
> 
> * Is this reproducible with multiple compilers?
> * Is this reproducible on v7.0-rc1?
> 
> Fwiw, we're missing one check currently:
> 
> diff --git a/kernel/pid.c b/kernel/pid.c
> index 3b96571d0fe6..dc11acd445ae 100644
> --- a/kernel/pid.c
> +++ b/kernel/pid.c
> @@ -326,7 +326,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
> 
>         retval = pidfs_add_pid(pid);
>         if (unlikely(retval)) {
> -               free_pid(pid);
> +               if (pid != &init_struct_pid)
> +                       free_pid(pid);
>                 pid = ERR_PTR(-ENOMEM);
>         }
> 
> But it seems unlikely that pidfs_add_pid() fails here.

Ugh, yuck squared.

IIUC, the bug is a UAF in free_kthread_struct(). It wasn't easy
detectable until the pidfs rhashtable conversion changed struct pid's
size and field layout.

The rhashtable conversion replaced struct pid's struct rb_node of 24
bytes with struct rhash_head of 8 bytes, shrinking struct pid by 16
bytes bringing it to 144 bytes. This means it's now the same size as
struct kthread (without CONFIG_BLK_CGROUP). Both structs use
SLAB_HWCACHE_ALIGN bringing it to 192. KUnit sets
CONFIG_SLAB_MERGE_DEFAULT=y. So now struct pid and struct kthread share
slab caches. First part of the puzzle.

struct pid.rcu.func is at offset 0x78 and struct kthread.affinity_node is
at offset 0x78. If I'm right then we can already see where this is
going.

So free_kthread_struct() calls kfree(kthread) without checking whether
the kthread's affinity_node is still linked in kthread_affinity_list.

If a kthread exits via a path that bypasses kthread_exit() (e.g.,
make_task_dead() after an oops -- which calls do_exit() directly),
the affinity_node remains in the global kthread_affinity_list. When
free_kthread_struct() later frees the kthread struct, the linked list
still references the freed memory. Any subsequent list_del() by another
kthread in kthread_exit() writes to the freed memory:

    list_del(&kthread->affinity_node):
    entry->prev->next = entry->next   // writes to freed predecessor's offset 0x78

With cache merging, this freed kthread memory may have been reused for a
struct pid. The write corrupts pid.rcu.func at the same offset 0x78,
replacing delayed_put_pid with &kthread_affinity_list. The next RCU
callback invocation is fscked.

* Raising SLAB_NO_MERGE makes the bug go away.
* Turn on
  CONFIG_KASAN=y
  CONFIG_KASAN_GENERIC=y
  CONFIG_KASAN_INLINE=y

  BUG: KASAN: slab-use-after-free in kthread_exit+0x255/0x280
  Write of size 8 at addr ffff888003238f78 by task kunit_try_catch/183

  Allocated by task 169:
    set_kthread_struct+0x31/0x150
    copy_process+0x203d/0x4c00

  <snip>

  Freed by task 0:
    free_task+0x147/0x3a0
    rcu_core+0x58e/0x1c50

  <snip>

Object at ffff888003238f00 belongs to kmalloc-192 cache.
Allocated by
set_kthread_struct()
-> kzalloc(sizeof(*kthread)).
Freed by
free_task()
-> free_kthread_struct()
   -> kfree().

Written to at offset 0x78 (affinity_node.next) by another kthread's
list_del in kthread_exit().

Fix should be something like

    void free_kthread_struct(struct task_struct *k)
    {
        struct kthread *kthread;

        kthread = to_kthread(k);
        if (!kthread)
            return;

    +   if (!list_empty(&kthread->affinity_node)) {
    +       mutex_lock(&kthread_affinity_lock);
    +       list_del(&kthread->affinity_node);
    +       mutex_unlock(&kthread_affinity_lock);
    +   }
    +   if (kthread->preferred_affinity)
    +       kfree(kthread->preferred_affinity);

    #ifdef CONFIG_BLK_CGROUP
        WARN_ON_ONCE(kthread->blkcg_css);
    #endif
        k->worker_private = NULL;
        kfree(kthread->full_name);
        kfree(kthread);
    }

The normal kthread_exit() path already unlinks the node. After
list_del(), the node's pointers are set to LIST_POISON1/LIST_POISON2, so
list_empty() returns false. To avoid a double-unlink, kthread_exit()
should use list_del_init() instead of list_del(), so that
free_kthread_struct()'s list_empty() check correctly detects the
already-unlinked state.

I had claude reproduce this under various conditions because I'm a lazy fsck.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
  2026-02-24 13:22   ` Christian Brauner
  2026-02-24 16:25     ` make_task_dead() & kthread_exit() Christian Brauner
@ 2026-02-24 16:37     ` Mark Brown
  1 sibling, 0 replies; 19+ messages in thread
From: Mark Brown @ 2026-02-24 16:37 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Guillaume Tucker, Mateusz Guzik, Alexander Viro, Jan Kara,
	linux-fsdevel, kunit-dev, David Gow

[-- Attachment #1: Type: text/plain, Size: 1737 bytes --]

On Tue, Feb 24, 2026 at 02:22:31PM +0100, Christian Brauner wrote:
> On Fri, Feb 20, 2026 at 04:11:43PM +0100, Guillaume Tucker wrote:

> > we can see that the last pointer ffffffff99026d40 was never enqueued,
> > and the one from free_pid() ffffffff988adaf0 was never dequeued.
> > This is where I stopped investigating as it looked legit and someone
> > else might have more clues as to what's going on here.  I've only
> > seen the problem with this callback but again, KUnit is a very narrow
> > kind of workload so the root cause may well be lying elsewhere.

> > Please let me know if you need any more debugging details or if I can
> > help test a fix.  Hope this helps!

> Thanks for the report. I have so far no idea how that can happen:

> * Is this reproducible with multiple compilers?

FWIW I've been seeing RCU stalls and crashes in KUnit while building
-next which I believe are the same as Guillaume is reporting, these have
reproduced with multiple arches using the Debian GCC 14 and with LLVM
for me.  Example of my command line:

   ./tools/testing/kunit/kunit.py run --arch arm64 --cross_compile=aarch64-linux-gnu-

> * Is this reproducible on v7.0-rc1?

The issues I'm seeing certainly are.  I just tried testing on this
specific commit (802182490445f6bcf5de0e0518fb967c2afb6da1) and managed
to have KUnit complete though...

> Fwiw, we're missing one check currently:

...

> But it seems unlikely that pidfs_add_pid() fails here.

That doesn't seem to have any impact for me.  I suspect that there may
be multiple interacting changes triggering the issue (or making it more
likely to occur) which isn't helping anything :/  See also:

   https://lore.kernel.org/r/59e8c352-7bc8-4358-af74-fee8c29280ba@davidgow.net

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-24 16:25     ` make_task_dead() & kthread_exit() Christian Brauner
@ 2026-02-24 17:04       ` Mark Brown
  2026-02-24 18:27         ` Guillaume Tucker
  2026-02-24 19:30       ` Linus Torvalds
  2026-02-26  4:08       ` David Gow
  2 siblings, 1 reply; 19+ messages in thread
From: Mark Brown @ 2026-02-24 17:04 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Guillaume Tucker, Tejun Heo, Mateusz Guzik, Alexander Viro,
	Jan Kara, linux-fsdevel, kunit-dev, David Gow, Linus Torvalds

[-- Attachment #1: Type: text/plain, Size: 2574 bytes --]

On Tue, Feb 24, 2026 at 05:25:21PM +0100, Christian Brauner wrote:

> Ugh, yuck squared.

> IIUC, the bug is a UAF in free_kthread_struct(). It wasn't easy
> detectable until the pidfs rhashtable conversion changed struct pid's
> size and field layout.

Eeew, indeed.  Thanks for figuring this out!

> Fix should be something like

>     void free_kthread_struct(struct task_struct *k)
>     {
>         struct kthread *kthread;
> 
>         kthread = to_kthread(k);
>         if (!kthread)
>             return;
> 
>     +   if (!list_empty(&kthread->affinity_node)) {
>     +       mutex_lock(&kthread_affinity_lock);
>     +       list_del(&kthread->affinity_node);
>     +       mutex_unlock(&kthread_affinity_lock);
>     +   }
>     +   if (kthread->preferred_affinity)
>     +       kfree(kthread->preferred_affinity);
> 
>     #ifdef CONFIG_BLK_CGROUP
>         WARN_ON_ONCE(kthread->blkcg_css);
>     #endif
>         k->worker_private = NULL;
>         kfree(kthread->full_name);
>         kfree(kthread);
>     }

> The normal kthread_exit() path already unlinks the node. After
> list_del(), the node's pointers are set to LIST_POISON1/LIST_POISON2, so
> list_empty() returns false. To avoid a double-unlink, kthread_exit()
> should use list_del_init() instead of list_del(), so that
> free_kthread_struct()'s list_empty() check correctly detects the
> already-unlinked state.

Confirmed that the above patch plus the list_del_init() change seems to
fix the issue for me, the full patch I tested with is below to confirm.
Feel free to add:

Tested-by: Mark Brown <broonie@kernel.org>

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 20451b624b67..3778fcbc56db 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -147,6 +147,13 @@ void free_kthread_struct(struct task_struct *k)
 	kthread = to_kthread(k);
 	if (!kthread)
 		return;
+	if (!list_empty(&kthread->affinity_node)) {
+		mutex_lock(&kthread_affinity_lock);
+		list_del(&kthread->affinity_node);
+		mutex_unlock(&kthread_affinity_lock);
+	}
+	if (kthread->preferred_affinity)
+		kfree(kthread->preferred_affinity);
 
 #ifdef CONFIG_BLK_CGROUP
 	WARN_ON_ONCE(kthread->blkcg_css);
@@ -325,7 +332,7 @@ void __noreturn kthread_exit(long result)
 	kthread->result = result;
 	if (!list_empty(&kthread->affinity_node)) {
 		mutex_lock(&kthread_affinity_lock);
-		list_del(&kthread->affinity_node);
+		list_del_init(&kthread->affinity_node);
 		mutex_unlock(&kthread_affinity_lock);
 
 		if (kthread->preferred_affinity) {

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-24 17:04       ` Mark Brown
@ 2026-02-24 18:27         ` Guillaume Tucker
  0 siblings, 0 replies; 19+ messages in thread
From: Guillaume Tucker @ 2026-02-24 18:27 UTC (permalink / raw)
  To: Mark Brown, Christian Brauner
  Cc: Tejun Heo, Mateusz Guzik, Alexander Viro, Jan Kara, linux-fsdevel,
	kunit-dev, David Gow, Linus Torvalds

On 24/02/2026 18:04, 'Mark Brown' via KUnit Development wrote:
> On Tue, Feb 24, 2026 at 05:25:21PM +0100, Christian Brauner wrote:
> 
>> Ugh, yuck squared.
> 
>> IIUC, the bug is a UAF in free_kthread_struct(). It wasn't easy
>> detectable until the pidfs rhashtable conversion changed struct pid's
>> size and field layout.
> 
> Eeew, indeed.  Thanks for figuring this out!
> 
>> Fix should be something like
> 
>>     void free_kthread_struct(struct task_struct *k)
>>     {
>>         struct kthread *kthread;
>>
>>         kthread = to_kthread(k);
>>         if (!kthread)
>>             return;
>>
>>     +   if (!list_empty(&kthread->affinity_node)) {
>>     +       mutex_lock(&kthread_affinity_lock);
>>     +       list_del(&kthread->affinity_node);
>>     +       mutex_unlock(&kthread_affinity_lock);
>>     +   }
>>     +   if (kthread->preferred_affinity)
>>     +       kfree(kthread->preferred_affinity);
>>
>>     #ifdef CONFIG_BLK_CGROUP
>>         WARN_ON_ONCE(kthread->blkcg_css);
>>     #endif
>>         k->worker_private = NULL;
>>         kfree(kthread->full_name);
>>         kfree(kthread);
>>     }
> 
>> The normal kthread_exit() path already unlinks the node. After
>> list_del(), the node's pointers are set to LIST_POISON1/LIST_POISON2, so
>> list_empty() returns false. To avoid a double-unlink, kthread_exit()
>> should use list_del_init() instead of list_del(), so that
>> free_kthread_struct()'s list_empty() check correctly detects the
>> already-unlinked state.
> 
> Confirmed that the above patch plus the list_del_init() change seems to
> fix the issue for me, the full patch I tested with is below to confirm.
> Feel free to add:
> 
> Tested-by: Mark Brown <broonie@kernel.org>


I second Mark's feedback.  I only used GCC 14.2 on x86_64 (see my
original email with a container image).  I confirm it is reproducible
on v7.0-rc1 and that the patch below fixes the issue.  So for what
it's worth, feel free to add:

Tested-by: Guillaume Tucker <gtucker@gtucker.io>

Cheers,
Guillaume


PS: It also happens to be a great first result for my new automated
bisection tool :)  I'll add some post-processing with debug configs
turned on to automate typical sanity checks e.g. KASAN and builds
with alternative compilers and architectures.


> diff --git a/kernel/kthread.c b/kernel/kthread.c
> index 20451b624b67..3778fcbc56db 100644
> --- a/kernel/kthread.c
> +++ b/kernel/kthread.c
> @@ -147,6 +147,13 @@ void free_kthread_struct(struct task_struct *k)
>  	kthread = to_kthread(k);
>  	if (!kthread)
>  		return;
> +	if (!list_empty(&kthread->affinity_node)) {
> +		mutex_lock(&kthread_affinity_lock);
> +		list_del(&kthread->affinity_node);
> +		mutex_unlock(&kthread_affinity_lock);
> +	}
> +	if (kthread->preferred_affinity)
> +		kfree(kthread->preferred_affinity);
>  
>  #ifdef CONFIG_BLK_CGROUP
>  	WARN_ON_ONCE(kthread->blkcg_css);
> @@ -325,7 +332,7 @@ void __noreturn kthread_exit(long result)
>  	kthread->result = result;
>  	if (!list_empty(&kthread->affinity_node)) {
>  		mutex_lock(&kthread_affinity_lock);
> -		list_del(&kthread->affinity_node);
> +		list_del_init(&kthread->affinity_node);
>  		mutex_unlock(&kthread_affinity_lock);
>  
>  		if (kthread->preferred_affinity) {
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-24 16:25     ` make_task_dead() & kthread_exit() Christian Brauner
  2026-02-24 17:04       ` Mark Brown
@ 2026-02-24 19:30       ` Linus Torvalds
  2026-02-26  4:09         ` David Gow
  2026-02-26  9:47         ` Christian Brauner
  2026-02-26  4:08       ` David Gow
  2 siblings, 2 replies; 19+ messages in thread
From: Linus Torvalds @ 2026-02-24 19:30 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Guillaume Tucker, Tejun Heo, Mateusz Guzik, Alexander Viro,
	Jan Kara, linux-fsdevel, Mark Brown, kunit-dev, David Gow

[-- Attachment #1: Type: text/plain, Size: 2244 bytes --]

On Tue, 24 Feb 2026 at 08:25, Christian Brauner <brauner@kernel.org> wrote:
>
> If a kthread exits via a path that bypasses kthread_exit() (e.g.,
> make_task_dead() after an oops -- which calls do_exit() directly),
> the affinity_node remains in the global kthread_affinity_list. When
> free_kthread_struct() later frees the kthread struct, the linked list
> still references the freed memory. Any subsequent list_del() by another
> kthread in kthread_exit() writes to the freed memory:

Ugh.

So this is nasty, but I really detest the suggested fix. It just
smells wrong to have that affinity_node cleanup done in two different
places depending on how the exit is done.

IOW, I think the proper fix would be to just make sure that
kthread_exit() isn't actually ever bypassed.

Because looking at this, there are other issues with do_exit() killing
a kthread - it currently also means that kthread->result randomly
doesn't get set, for example, so kthread_stop() would appear to
basically return garbage.

No, nobody likely cares about the kthread_stop() return value for that
case, but it's an example of the same kind of "two different exit
paths, inconsistent data structures" issue.

How about something like the attached, in other words?

NOTE NOTE NOTE! This is *entirely* untested. It might do unspeakable
things to your pets, so please check it. I'm sending this patch out as
a "I really would prefer this kind of approach" example, not as
anything more than that.

Because I really think the core fundamental problem was that there
were two different exit paths that did different things, and we
shouldn't try to fix the symptoms of that problem, but instead really
fix the core issue.

Hmm?

Side note: while writing this suggested patch, I do note that this
comment is wrong:

 * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
 * always remain a kthread.  For kthreads p->worker_private always
 * points to a struct kthread.  For tasks that are not kthreads
 * p->worker_private is used to point to other things.

because 'init_task' is marked as PF_KTHREAD, but does *not* have a
p->worker_private.

Anyway, that doesn't affect this particular code, but it might be
worth thinking about.

               Linus

[-- Attachment #2: patch.diff --]
[-- Type: text/x-patch, Size: 5238 bytes --]

 include/linux/kthread.h | 21 ++++++++++++++++++++-
 kernel/exit.c           |  6 ++++++
 kernel/kthread.c        | 41 +++++------------------------------------
 3 files changed, 31 insertions(+), 37 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index c92c1149ee6e..a279b626d854 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -7,6 +7,24 @@
 
 struct mm_struct;
 
+/* Opaque kthread data, internal to kernel/thread.c */
+struct kthread;
+
+/*
+ * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
+ * always remain a kthread.  For kthreads p->worker_private always
+ * points to a struct kthread.  For tasks that are not kthreads
+ * p->worker_private is used to point to other things.
+ *
+ * Return NULL for any task that is not a kthread.
+ */
+static inline struct kthread *tsk_is_kthread(struct task_struct *p)
+{
+	if (p->flags & PF_KTHREAD)
+		return p->worker_private;
+	return NULL;
+}
+
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 					   void *data,
@@ -98,9 +116,10 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
-void kthread_exit(long result) __noreturn;
+#define kthread_exit(result) do_exit(result)
 void kthread_complete_and_exit(struct completion *, long) __noreturn;
 int kthreads_update_housekeeping(void);
+void kthread_do_exit(struct kthread *, long);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
diff --git a/kernel/exit.c b/kernel/exit.c
index 8a87021211ae..ede3117fa7d4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -896,11 +896,16 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 void __noreturn do_exit(long code)
 {
 	struct task_struct *tsk = current;
+	struct kthread *kthread;
 	int group_dead;
 
 	WARN_ON(irqs_disabled());
 	WARN_ON(tsk->plug);
 
+	kthread = tsk_is_kthread(tsk);
+	if (unlikely(kthread))
+		kthread_do_exit(kthread, code);
+
 	kcov_task_exit(tsk);
 	kmsan_task_exit(tsk);
 
@@ -1013,6 +1018,7 @@ void __noreturn do_exit(long code)
 	lockdep_free_task(tsk);
 	do_task_dead();
 }
+EXPORT_SYMBOL(do_exit);
 
 void __noreturn make_task_dead(int signr)
 {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 20451b624b67..791210daf8b4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -85,24 +85,6 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 	return k->worker_private;
 }
 
-/*
- * Variant of to_kthread() that doesn't assume @p is a kthread.
- *
- * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
- * always remain a kthread.  For kthreads p->worker_private always
- * points to a struct kthread.  For tasks that are not kthreads
- * p->worker_private is used to point to other things.
- *
- * Return NULL for any task that is not a kthread.
- */
-static inline struct kthread *__to_kthread(struct task_struct *p)
-{
-	void *kthread = p->worker_private;
-	if (kthread && !(p->flags & PF_KTHREAD))
-		kthread = NULL;
-	return kthread;
-}
-
 void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
 {
 	struct kthread *kthread = to_kthread(tsk);
@@ -193,7 +175,7 @@ EXPORT_SYMBOL_GPL(kthread_should_park);
 
 bool kthread_should_stop_or_park(void)
 {
-	struct kthread *kthread = __to_kthread(current);
+	struct kthread *kthread = tsk_is_kthread(current);
 
 	if (!kthread)
 		return false;
@@ -234,7 +216,7 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
  */
 void *kthread_func(struct task_struct *task)
 {
-	struct kthread *kthread = __to_kthread(task);
+	struct kthread *kthread = tsk_is_kthread(task);
 	if (kthread)
 		return kthread->threadfn;
 	return NULL;
@@ -266,7 +248,7 @@ EXPORT_SYMBOL_GPL(kthread_data);
  */
 void *kthread_probe_data(struct task_struct *task)
 {
-	struct kthread *kthread = __to_kthread(task);
+	struct kthread *kthread = tsk_is_kthread(task);
 	void *data = NULL;
 
 	if (kthread)
@@ -309,19 +291,8 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
-/**
- * kthread_exit - Cause the current kthread return @result to kthread_stop().
- * @result: The integer value to return to kthread_stop().
- *
- * While kthread_exit can be called directly, it exists so that
- * functions which do some additional work in non-modular code such as
- * module_put_and_kthread_exit can be implemented.
- *
- * Does not return.
- */
-void __noreturn kthread_exit(long result)
+void kthread_do_exit(struct kthread *kthread, long result)
 {
-	struct kthread *kthread = to_kthread(current);
 	kthread->result = result;
 	if (!list_empty(&kthread->affinity_node)) {
 		mutex_lock(&kthread_affinity_lock);
@@ -333,9 +304,7 @@ void __noreturn kthread_exit(long result)
 			kthread->preferred_affinity = NULL;
 		}
 	}
-	do_exit(0);
 }
-EXPORT_SYMBOL(kthread_exit);
 
 /**
  * kthread_complete_and_exit - Exit the current kthread.
@@ -683,7 +652,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
 
 bool kthread_is_per_cpu(struct task_struct *p)
 {
-	struct kthread *kthread = __to_kthread(p);
+	struct kthread *kthread = tsk_is_kthread(p);
 	if (!kthread)
 		return false;
 

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-24 16:25     ` make_task_dead() & kthread_exit() Christian Brauner
  2026-02-24 17:04       ` Mark Brown
  2026-02-24 19:30       ` Linus Torvalds
@ 2026-02-26  4:08       ` David Gow
  2 siblings, 0 replies; 19+ messages in thread
From: David Gow @ 2026-02-26  4:08 UTC (permalink / raw)
  To: Christian Brauner, Guillaume Tucker, Tejun Heo
  Cc: Mateusz Guzik, Alexander Viro, Jan Kara, linux-fsdevel,
	Mark Brown, kunit-dev, Linus Torvalds

Le 25/02/2026 à 12:25 AM, Christian Brauner a écrit :
> On Tue, Feb 24, 2026 at 02:22:31PM +0100, Christian Brauner wrote:

(...snip...)

> 
> Ugh, yuck squared.
> 
> IIUC, the bug is a UAF in free_kthread_struct(). It wasn't easily
> detectable until the pidfs rhashtable conversion changed struct pid's
> size and field layout.
> 
> The rhashtable conversion replaced struct pid's struct rb_node of 24
> bytes with struct rhash_head of 8 bytes, shrinking struct pid by 16
> bytes bringing it to 144 bytes. This means it's now the same size as
> struct kthread (without CONFIG_BLK_CGROUP). Both structs use
> SLAB_HWCACHE_ALIGN bringing it to 192. KUnit sets
> CONFIG_SLAB_MERGE_DEFAULT=y. So now struct pid and struct kthread share
> slab caches. First part of the puzzle.
> 
> struct pid.rcu.func is at offset 0x78 and struct kthread.affinity_node
> at offset 0x78. If I'm right then we can already see where this is
> going.
> 
> So free_kthread_struct() calls kfree(kthread) without checking whether
> the kthread's affinity_node is still linked in kthread_affinity_list.
> 
> If a kthread exits via a path that bypasses kthread_exit() (e.g.,
> make_task_dead() after an oops -- which calls do_exit() directly),
> the affinity_node remains in the global kthread_affinity_list. When
> free_kthread_struct() later frees the kthread struct, the linked list
> still references the freed memory. Any subsequent list_del() by another
> kthread in kthread_exit() writes to the freed memory:
> 
>      list_del(&kthread->affinity_node):
>      entry->prev->next = entry->next   // writes to freed predecessor's offset 0x78
> 
> With cache merging, this freed kthread memory may have been reused for a
> struct pid. The write corrupts pid.rcu.func at the same offset 0x78,
> replacing delayed_put_pid with &kthread_affinity_list. The next RCU
> callback invocation is fscked.
> 

Aha... that makes sense. What a mess!

> 
> Fix should be something like
> 
>      void free_kthread_struct(struct task_struct *k)
>      {
>          struct kthread *kthread;
> 
>          kthread = to_kthread(k);
>          if (!kthread)
>              return;
> 
>      +   if (!list_empty(&kthread->affinity_node)) {
>      +       mutex_lock(&kthread_affinity_lock);
>      +       list_del(&kthread->affinity_node);
>      +       mutex_unlock(&kthread_affinity_lock);
>      +   }
>      +   if (kthread->preferred_affinity)
>      +       kfree(kthread->preferred_affinity);
> 
>      #ifdef CONFIG_BLK_CGROUP
>          WARN_ON_ONCE(kthread->blkcg_css);
>      #endif
>          k->worker_private = NULL;
>          kfree(kthread->full_name);
>          kfree(kthread);
>      }
> 

This fixes the KUnit issues for me, too, so:

Tested-by: David Gow <david@davidgow.net>

(But so does Linus' patch, which is probably the nicer long-term solution.)

Cheers,
-- David

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-24 19:30       ` Linus Torvalds
@ 2026-02-26  4:09         ` David Gow
  2026-02-26  9:47         ` Christian Brauner
  1 sibling, 0 replies; 19+ messages in thread
From: David Gow @ 2026-02-26  4:09 UTC (permalink / raw)
  To: Linus Torvalds, Christian Brauner
  Cc: Guillaume Tucker, Tejun Heo, Mateusz Guzik, Alexander Viro,
	Jan Kara, linux-fsdevel, Mark Brown, kunit-dev

Le 25/02/2026 à 3:30 AM, Linus Torvalds a écrit :
> On Tue, 24 Feb 2026 at 08:25, Christian Brauner <brauner@kernel.org> wrote:
>>
>> If a kthread exits via a path that bypasses kthread_exit() (e.g.,
>> make_task_dead() after an oops -- which calls do_exit() directly),
>> the affinity_node remains in the global kthread_affinity_list. When
>> free_kthread_struct() later frees the kthread struct, the linked list
>> still references the freed memory. Any subsequent list_del() by another
>> kthread in kthread_exit() writes to the freed memory:
> 
> Ugh.
> 
> So this is nasty, but I really detest the suggested fix. It just
> smells wrong to have that affinity_node cleanup done in two different
> places depending on how the exit is done.
> 
> IOW, I think the proper fix would be to just make sure that
> kthread_exit() isn't actually ever bypassed.
> 
> Because looking at this, there are other issues with do_exit() killing
> a kthread - it currently also means that kthread->result randomly
> doesn't get set, for example, so kthread_stop() would appear to
> basically return garbage.
> 
> No, nobody likely cares about the kthread_stop() return value for that
> case, but it's an example of the same kind of "two different exit
> paths, inconsistent data structures" issue.
> 
> How about something like the attached, in other words?
> 
> NOTE NOTE NOTE! This is *entirely* untested. It might do unspeakable
> things to your pets, so please check it. I'm sending this patch out as
> a "I really would prefer this kind of approach" example, not as
> anything more than that.
> 
> Because I really think the core fundamental problem was that there
> were two different exit paths that did different things, and we
> shouldn't try to fix the symptoms of that problem, but instead really
> fix the core issue.
> 
> Hmm?

FWIW, the attached patch fixes the issue for me, and my machines have 
been stable running it this morning (+reverting the firewire merge 
due to [1]), so the idea seems pretty sound.

Tested-by: David Gow <david@davidgow.net>

Cheers,
-- David

[1]: 
https://lore.kernel.org/all/CAHk-=wgUmxkjwsWzX1rTo=DTnaouwY-VT8BjrTqfH7RmTwO72w@mail.gmail.com/

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-24 19:30       ` Linus Torvalds
  2026-02-26  4:09         ` David Gow
@ 2026-02-26  9:47         ` Christian Brauner
  2026-02-27 16:54           ` Linus Torvalds
                             ` (2 more replies)
  1 sibling, 3 replies; 19+ messages in thread
From: Christian Brauner @ 2026-02-26  9:47 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Guillaume Tucker, Tejun Heo, Mateusz Guzik, Alexander Viro,
	Jan Kara, linux-fsdevel, Mark Brown, kunit-dev, David Gow

[-- Attachment #1: Type: text/plain, Size: 3299 bytes --]

On Tue, Feb 24, 2026 at 11:30:57AM -0800, Linus Torvalds wrote:
> On Tue, 24 Feb 2026 at 08:25, Christian Brauner <brauner@kernel.org> wrote:
> >
> > If a kthread exits via a path that bypasses kthread_exit() (e.g.,
> > make_task_dead() after an oops -- which calls do_exit() directly),
> > the affinity_node remains in the global kthread_affinity_list. When
> > free_kthread_struct() later frees the kthread struct, the linked list
> > still references the freed memory. Any subsequent list_del() by another
> > kthread in kthread_exit() writes to the freed memory:
> 
> Ugh.
> 
> So this is nasty, but I really detest the suggested fix. It just
> smells wrong to have that affinity_node cleanup done in two different
> places depending on how the exit is done.
> 
> IOW, I think the proper fix would be to just make sure that
> kthread_exit() isn't actually ever bypassed.
> 
> Because looking at this, there are other issues with do_exit() killing
> a kthread - it currently also means that kthread->result randomly
> doesn't get set, for example, so kthread_stop() would appear to
> basically return garbage.
> 
> No, nobody likely cares about the kthread_stop() return value for that
> case, but it's an example of the same kind of "two different exit
> paths, inconsistent data structures" issue.
> 
> How about something like the attached, in other words?
> 
> NOTE NOTE NOTE! This is *entirely* untested. It might do unspeakable
> things to your pets, so please check it. I'm sending this patch out as
> a "I really would prefer this kind of approach" example, not as
> anything more than that.
> 
> Because I really think the core fundamental problem was that there
> were two different exit paths that did different things, and we
> shouldn't try to fix the symptoms of that problem, but instead really
> fix the core issue.
> 
> Hmm?
> 
> Side note: while writing this suggested patch, I do note that this
> comment is wrong:
> 
>  * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
>  * always remain a kthread.  For kthreads p->worker_private always
>  * points to a struct kthread.  For tasks that are not kthreads
>  * p->worker_private is used to point to other things.
> 
> because 'init_task' is marked as PF_KTHREAD, but does *not* have a
> p->worker_private.
> 
> Anyway, that doesn't affect this particular code, but it might be
> worth thinking about.

Oh nice.
I was kinda hoping Tejun would jump on this one and so just pointed to
one potential way to fix it but didn't really spend time on it.

Anyway, let's just take what you proposed and slap a commit message on
it. Fwiw, init_task does have ->worker_private it just gets set later
during sched_init():

          /*
           * The idle task doesn't need the kthread struct to function, but it
           * is dressed up as a per-CPU kthread and thus needs to play the part
           * if we want to avoid special-casing it in code that deals with per-CPU
           * kthreads.
           */
          WARN_ON(!set_kthread_struct(current));

I think that @current here is misleading. When sched_init() runs it
should be single-threaded still and current == &init_task. So that
set_kthread_struct(current) call sets @init_task's worker_private iiuc.

Patch appended. I'll stuff it into vfs.fixes.

[-- Attachment #2: 0001-kthread-consolidate-kthread-exit-paths-to-prevent-us.patch --]
[-- Type: text/x-diff, Size: 7471 bytes --]

From 28aaa9c39945b7925a1cc1d513c8f21ed38f5e4f Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Thu, 26 Feb 2026 10:43:55 +0100
Subject: [PATCH] kthread: consolidate kthread exit paths to prevent
 use-after-free

Guillaume reported crashes via corrupted RCU callback function pointers
during KUnit testing. The crash was traced back to the pidfs rhashtable
conversion which replaced the 24-byte rb_node with an 8-byte rhash_head
in struct pid, shrinking it from 160 to 144 bytes.

struct kthread (without CONFIG_BLK_CGROUP) is also 144 bytes. With
CONFIG_SLAB_MERGE_DEFAULT and SLAB_HWCACHE_ALIGN both round up to
192 bytes and share the same slab cache. struct pid.rcu.func and
struct kthread.affinity_node both sit at offset 0x78.

When a kthread exits via make_task_dead() it bypasses kthread_exit() and
misses the affinity_node cleanup. free_kthread_struct() frees the memory
while the node is still linked into the global kthread_affinity_list. A
subsequent list_del() by another kthread writes through dangling list
pointers into the freed and reused memory, corrupting the pid's
rcu.func pointer.

Instead of patching free_kthread_struct() to handle the missed cleanup,
consolidate all kthread exit paths. Turn kthread_exit() into a macro
that calls do_exit() and add kthread_do_exit() which is called from
do_exit() for any task with PF_KTHREAD set. This guarantees that
kthread-specific cleanup always happens regardless of the exit path -
make_task_dead(), direct do_exit(), or kthread_exit().

Replace __to_kthread() with a new tsk_is_kthread() accessor in the
public header. Export do_exit() since module code using the
kthread_exit() macro now needs it directly.

Reported-by: Guillaume Tucker <gtucker@gtucker.io>
Tested-by: Guillaume Tucker <gtucker@gtucker.io>
Tested-by: Mark Brown <broonie@kernel.org>
Tested-by: David Gow <davidgow@google.com>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/all/20260224-mittlerweile-besessen-2738831ae7f6@brauner
Co-developed-by: Linus Torvalds <torvalds@linux-foundation.org>
Fixes: 4d13f4304fa4 ("kthread: Implement preferred affinity")
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
 include/linux/kthread.h | 21 ++++++++++++++++++++-
 kernel/exit.c           |  6 ++++++
 kernel/kthread.c        | 41 +++++------------------------------------
 3 files changed, 31 insertions(+), 37 deletions(-)

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index c92c1149ee6e..a01a474719a7 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -7,6 +7,24 @@
 
 struct mm_struct;
 
+/* opaque kthread data */
+struct kthread;
+
+/*
+ * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
+ * always remain a kthread.  For kthreads p->worker_private always
+ * points to a struct kthread.  For tasks that are not kthreads
+ * p->worker_private is used to point to other things.
+ *
+ * Return NULL for any task that is not a kthread.
+ */
+static inline struct kthread *tsk_is_kthread(struct task_struct *p)
+{
+	if (p->flags & PF_KTHREAD)
+		return p->worker_private;
+	return NULL;
+}
+
 __printf(4, 5)
 struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 					   void *data,
@@ -98,9 +116,10 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
-void kthread_exit(long result) __noreturn;
+#define kthread_exit(result) do_exit(result)
 void kthread_complete_and_exit(struct completion *, long) __noreturn;
 int kthreads_update_housekeeping(void);
+void kthread_do_exit(struct kthread *, long);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
diff --git a/kernel/exit.c b/kernel/exit.c
index 8a87021211ae..ede3117fa7d4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -896,11 +896,16 @@ static void synchronize_group_exit(struct task_struct *tsk, long code)
 void __noreturn do_exit(long code)
 {
 	struct task_struct *tsk = current;
+	struct kthread *kthread;
 	int group_dead;
 
 	WARN_ON(irqs_disabled());
 	WARN_ON(tsk->plug);
 
+	kthread = tsk_is_kthread(tsk);
+	if (unlikely(kthread))
+		kthread_do_exit(kthread, code);
+
 	kcov_task_exit(tsk);
 	kmsan_task_exit(tsk);
 
@@ -1013,6 +1018,7 @@ void __noreturn do_exit(long code)
 	lockdep_free_task(tsk);
 	do_task_dead();
 }
+EXPORT_SYMBOL(do_exit);
 
 void __noreturn make_task_dead(int signr)
 {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 20451b624b67..791210daf8b4 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -85,24 +85,6 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 	return k->worker_private;
 }
 
-/*
- * Variant of to_kthread() that doesn't assume @p is a kthread.
- *
- * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
- * always remain a kthread.  For kthreads p->worker_private always
- * points to a struct kthread.  For tasks that are not kthreads
- * p->worker_private is used to point to other things.
- *
- * Return NULL for any task that is not a kthread.
- */
-static inline struct kthread *__to_kthread(struct task_struct *p)
-{
-	void *kthread = p->worker_private;
-	if (kthread && !(p->flags & PF_KTHREAD))
-		kthread = NULL;
-	return kthread;
-}
-
 void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
 {
 	struct kthread *kthread = to_kthread(tsk);
@@ -193,7 +175,7 @@ EXPORT_SYMBOL_GPL(kthread_should_park);
 
 bool kthread_should_stop_or_park(void)
 {
-	struct kthread *kthread = __to_kthread(current);
+	struct kthread *kthread = tsk_is_kthread(current);
 
 	if (!kthread)
 		return false;
@@ -234,7 +216,7 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
  */
 void *kthread_func(struct task_struct *task)
 {
-	struct kthread *kthread = __to_kthread(task);
+	struct kthread *kthread = tsk_is_kthread(task);
 	if (kthread)
 		return kthread->threadfn;
 	return NULL;
@@ -266,7 +248,7 @@ EXPORT_SYMBOL_GPL(kthread_data);
  */
 void *kthread_probe_data(struct task_struct *task)
 {
-	struct kthread *kthread = __to_kthread(task);
+	struct kthread *kthread = tsk_is_kthread(task);
 	void *data = NULL;
 
 	if (kthread)
@@ -309,19 +291,8 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
-/**
- * kthread_exit - Cause the current kthread return @result to kthread_stop().
- * @result: The integer value to return to kthread_stop().
- *
- * While kthread_exit can be called directly, it exists so that
- * functions which do some additional work in non-modular code such as
- * module_put_and_kthread_exit can be implemented.
- *
- * Does not return.
- */
-void __noreturn kthread_exit(long result)
+void kthread_do_exit(struct kthread *kthread, long result)
 {
-	struct kthread *kthread = to_kthread(current);
 	kthread->result = result;
 	if (!list_empty(&kthread->affinity_node)) {
 		mutex_lock(&kthread_affinity_lock);
@@ -333,9 +304,7 @@ void __noreturn kthread_exit(long result)
 			kthread->preferred_affinity = NULL;
 		}
 	}
-	do_exit(0);
 }
-EXPORT_SYMBOL(kthread_exit);
 
 /**
  * kthread_complete_and_exit - Exit the current kthread.
@@ -683,7 +652,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
 
 bool kthread_is_per_cpu(struct task_struct *p)
 {
-	struct kthread *kthread = __to_kthread(p);
+	struct kthread *kthread = tsk_is_kthread(p);
 	if (!kthread)
 		return false;
 
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-26  9:47         ` Christian Brauner
@ 2026-02-27 16:54           ` Linus Torvalds
  2026-03-03 10:03           ` Alice Ryhl
  2026-03-06 11:05           ` Christian Loehle
  2 siblings, 0 replies; 19+ messages in thread
From: Linus Torvalds @ 2026-02-27 16:54 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Guillaume Tucker, Tejun Heo, Mateusz Guzik, Alexander Viro,
	Jan Kara, linux-fsdevel, Mark Brown, kunit-dev, David Gow

On Thu, 26 Feb 2026 at 01:48, Christian Brauner <brauner@kernel.org> wrote:
>
> Anyway, let's just take what you proposed and slap a commit message on
> it.

I will look forward to getting this patch through the normal channels
and I have removed it from my "misc tree of random patches".

> Fwiw, init_task does have ->worker_private it just gets set later
> during sched_init():

Ahh. That was not very obvious indeed.

                 Linus

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-26  9:47         ` Christian Brauner
  2026-02-27 16:54           ` Linus Torvalds
@ 2026-03-03 10:03           ` Alice Ryhl
  2026-03-06 11:05           ` Christian Loehle
  2 siblings, 0 replies; 19+ messages in thread
From: Alice Ryhl @ 2026-03-03 10:03 UTC (permalink / raw)
  To: brauner
  Cc: broonie, davidgow, gtucker, jack, kunit-dev, linux-fsdevel,
	mjguzik, tj, torvalds, viro, Alice Ryhl

Christian Brauner <brauner@kernel.org> writes:
> Oh nice.
> I was kinda hoping Tejun would jump on this one and so just pointed to
> one potential way to fix it but didn't really spend time on it.
> 
> Anyway, let's just take what you proposed and slap a commit message on
> it. Fwiw, init_task does have ->worker_private it just gets set later
> during sched_init():
> 
>           /*
>            * The idle task doesn't need the kthread struct to function, but it
>            * is dressed up as a per-CPU kthread and thus needs to play the part
>            * if we want to avoid special-casing it in code that deals with per-CPU
>            * kthreads.
>            */
>           WARN_ON(!set_kthread_struct(current));
> 
> I think that @current here is misleading. When sched_init() runs it
> should be single-threaded still and current == &init_task. So that
> set_kthread_struct(current) call sets @init_task's worker_private iiuc.
> 
> Patch appended. I'll stuff it into vfs.fixes.

welp, I guess you get another one here:

Tested-by: Alice Ryhl <aliceryhl@google.com>

from:
https://lore.kernel.org/all/aaaxK5gFWN70WmMn@google.com/

Alice

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-02-26  9:47         ` Christian Brauner
  2026-02-27 16:54           ` Linus Torvalds
  2026-03-03 10:03           ` Alice Ryhl
@ 2026-03-06 11:05           ` Christian Loehle
  2026-03-06 16:28             ` Linus Torvalds
  2 siblings, 1 reply; 19+ messages in thread
From: Christian Loehle @ 2026-03-06 11:05 UTC (permalink / raw)
  To: Christian Brauner, Linus Torvalds
  Cc: Guillaume Tucker, Tejun Heo, Mateusz Guzik, Alexander Viro,
	Jan Kara, linux-fsdevel, Mark Brown, kunit-dev, David Gow

[-- Attachment #1: Type: text/plain, Size: 3876 bytes --]

On 2/26/26 09:47, Christian Brauner wrote:
> On Tue, Feb 24, 2026 at 11:30:57AM -0800, Linus Torvalds wrote:
>> On Tue, 24 Feb 2026 at 08:25, Christian Brauner <brauner@kernel.org> wrote:
>>>
>>> If a kthread exits via a path that bypasses kthread_exit() (e.g.,
>>> make_task_dead() after an oops -- which calls do_exit() directly),
>>> the affinity_node remains in the global kthread_affinity_list. When
>>> free_kthread_struct() later frees the kthread struct, the linked list
>>> still references the freed memory. Any subsequent list_del() by another
>>> kthread in kthread_exit() writes to the freed memory:
>>
>> Ugh.
>>
>> So this is nasty, but I really detest the suggested fix. It just
>> smells wrong to have that affinity_node cleanup done in two different
>> places depending on how the exit is done.
>>
>> IOW, I think the proper fix would be to just make sure that
>> kthread_exit() isn't actually ever bypassed.
>>
>> Because looking at this, there are other issues with do_exit() killing
>> a kthread - it currently also means that kthread->result randomly
>> doesn't get set, for example, so kthread_stop() would appear to
>> basically return garbage.
>>
>> No, nobody likely cares about the kthread_stop() return value for that
>> case, but it's an example of the same kind of "two different exit
>> paths, inconsistent data structures" issue.
>>
>> How about something like the attached, in other words?
>>
>> NOTE NOTE NOTE! This is *entirely* untested. It might do unspeakable
>> things to your pets, so please check it. I'm sending this patch out as
>> a "I really would prefer this kind of approach" example, not as
>> anything more than that.
>>
>> Because I really think the core fundamental problem was that there
>> were two different exit paths that did different things, and we
>> shouldn't try to fix the symptoms of that problem, but instead really
>> fix the core issue.
>>
>> Hmm?
>>
>> Side note: while writing this suggested patch, I do note that this
>> comment is wrong:
>>
>>  * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will
>>  * always remain a kthread.  For kthreads p->worker_private always
>>  * points to a struct kthread.  For tasks that are not kthreads
>>  * p->worker_private is used to point to other things.
>>
>> because 'init_task' is marked as PF_KTHREAD, but does *not* have a
>> p->worker_private.
>>
>> Anyway, that doesn't affect this particular code, but it might be
>> worth thinking about.
> 
> Oh nice.
> I was kinda hoping Tejun would jump on this one and so just pointed to
> one potential way to fix it but didn't really spend time on it.
> 
> Anyway, let's just take what you proposed and slap a commit message on
> it. Fwiw, init_task does have ->worker_private it just gets set later
> during sched_init():
> 
>           /*
>            * The idle task doesn't need the kthread struct to function, but it
>            * is dressed up as a per-CPU kthread and thus needs to play the part
>            * if we want to avoid special-casing it in code that deals with per-CPU
>            * kthreads.
>            */
>           WARN_ON(!set_kthread_struct(current));
> 
> I think that @current here is misleading. When sched_init() runs it
> should be single-threaded still and current == &init_task. So that
> set_kthread_struct(current) call sets @init_task's worker_private iiuc.
> 
> Patch appended. I'll stuff it into vfs.fixes.

FWIW this leaves the stale BTF reference:

------8<------
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 401d6c4960ec..8db79e593156 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -25261,7 +25261,6 @@ BTF_ID(func, __x64_sys_exit_group)
 BTF_ID(func, do_exit)
 BTF_ID(func, do_group_exit)
 BTF_ID(func, kthread_complete_and_exit)
-BTF_ID(func, kthread_exit)
 BTF_ID(func, make_task_dead)
 BTF_SET_END(noreturn_deny)
 

[-- Attachment #2: 0001-bpf-drop-kthread_exit-from-noreturn_deny.patch --]
[-- Type: text/x-patch, Size: 1020 bytes --]

From 6b2426541b296e88654a90a71395cbbf71faa823 Mon Sep 17 00:00:00 2001
From: Christian Loehle <christian.loehle@arm.com>
Date: Fri, 6 Mar 2026 10:49:18 +0000
Subject: [PATCH] bpf: drop kthread_exit from noreturn_deny

kthread_exit became a macro to do_exit in commit 28aaa9c39945
("kthread: consolidate kthread exit paths to prevent use-after-free"),
so there is no kthread_exit function BTF ID to resolve. Remove it from
noreturn_deny to avoid resolve_btfids unresolved symbol warnings.

Signed-off-by: Christian Loehle <christian.loehle@arm.com>
---
 kernel/bpf/verifier.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 401d6c4960ec..8db79e593156 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -25261,7 +25261,6 @@ BTF_ID(func, __x64_sys_exit_group)
 BTF_ID(func, do_exit)
 BTF_ID(func, do_group_exit)
 BTF_ID(func, kthread_complete_and_exit)
-BTF_ID(func, kthread_exit)
 BTF_ID(func, make_task_dead)
 BTF_SET_END(noreturn_deny)
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: make_task_dead() & kthread_exit()
  2026-03-06 11:05           ` Christian Loehle
@ 2026-03-06 16:28             ` Linus Torvalds
  0 siblings, 0 replies; 19+ messages in thread
From: Linus Torvalds @ 2026-03-06 16:28 UTC (permalink / raw)
  To: Christian Loehle
  Cc: Christian Brauner, Guillaume Tucker, Tejun Heo, Mateusz Guzik,
	Alexander Viro, Jan Kara, linux-fsdevel, Mark Brown, kunit-dev,
	David Gow

On Fri, 6 Mar 2026 at 03:06, Christian Loehle <christian.loehle@arm.com> wrote:
>
> FWIW this leaves the stale BTF reference:

Thanks. Applied.

              Linus

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2026-03-06 16:28 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-20 14:52 [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Christian Brauner
2026-01-20 15:43 ` Mateusz Guzik
2026-01-21 10:59 ` Jan Kara
2026-01-22 10:18   ` Christian Brauner
2026-01-22 13:21     ` Jan Kara
2026-02-20 15:11 ` Guillaume Tucker
2026-02-24 13:22   ` Christian Brauner
2026-02-24 16:25     ` make_task_dead() & kthread_exit() Christian Brauner
2026-02-24 17:04       ` Mark Brown
2026-02-24 18:27         ` Guillaume Tucker
2026-02-24 19:30       ` Linus Torvalds
2026-02-26  4:09         ` David Gow
2026-02-26  9:47         ` Christian Brauner
2026-02-27 16:54           ` Linus Torvalds
2026-03-03 10:03           ` Alice Ryhl
2026-03-06 11:05           ` Christian Loehle
2026-03-06 16:28             ` Linus Torvalds
2026-02-26  4:08       ` David Gow
2026-02-24 16:37     ` [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Mark Brown

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox