All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC v2] pidfs: convert rb-tree to rhashtable
@ 2026-01-20 14:52 Christian Brauner
  2026-01-20 15:43 ` Mateusz Guzik
                   ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Christian Brauner @ 2026-01-20 14:52 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: Alexander Viro, Jan Kara, linux-fsdevel, Christian Brauner

Mateusz reported performance penalties [1] during task creation because
pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
rhashtable to have separate fine-grained locking and to decouple from
pidmap_lock moving all heavy manipulations outside of it.

Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
protection to an rhashtable. This removes the global pidmap_lock
contention from pidfs_ino_get_pid() lookups and allows the hashtable
insert to happen outside the pidmap_lock.

pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
inserts pid into rhashtable and is called outside pidmap_lock. Insertion
into the rhashtable can fail and memory allocation may happen so we need
to drop the spinlock.

To guard against accidently opening an already reaped task
pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
it already went through pidfs_exit() aka the process as already reaped.
If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
the task has exited.

This slightly changes visibility semantics: pidfd creation is denied
after pidfs_exit() runs, which is just before the pid number is removed
from the via free_pid(). That should not be an issue though.

Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
Signed-off-by: Christian Brauner <brauner@kernel.org>
---
Changes in v2:
- Ensure that pid is removed before call_rcu() from pidfs.
- Don't drop and reacquire spinlock.
- Link to v1: https://patch.msgid.link/20260119-work-pidfs-rhashtable-v1-1-159c7700300a@kernel.org
---
 fs/pidfs.c            | 81 +++++++++++++++++++++------------------------------
 include/linux/pid.h   |  4 +--
 include/linux/pidfs.h |  3 +-
 kernel/pid.c          | 13 ++++++---
 4 files changed, 46 insertions(+), 55 deletions(-)

diff --git a/fs/pidfs.c b/fs/pidfs.c
index dba703d4ce4a..ee0e36dd29d2 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -21,6 +21,7 @@
 #include <linux/utsname.h>
 #include <net/net_namespace.h>
 #include <linux/coredump.h>
+#include <linux/rhashtable.h>
 #include <linux/xattr.h>
 
 #include "internal.h"
@@ -55,7 +56,14 @@ struct pidfs_attr {
 	__u32 coredump_signal;
 };
 
-static struct rb_root pidfs_ino_tree = RB_ROOT;
+static struct rhashtable pidfs_ino_ht;
+
+static const struct rhashtable_params pidfs_ino_ht_params = {
+	.key_offset		= offsetof(struct pid, ino),
+	.key_len		= sizeof(u64),
+	.head_offset		= offsetof(struct pid, pidfs_hash),
+	.automatic_shrinking	= true,
+};
 
 #if BITS_PER_LONG == 32
 static inline unsigned long pidfs_ino(u64 ino)
@@ -84,21 +92,11 @@ static inline u32 pidfs_gen(u64 ino)
 }
 #endif
 
-static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b)
-{
-	struct pid *pid_a = rb_entry(a, struct pid, pidfs_node);
-	struct pid *pid_b = rb_entry(b, struct pid, pidfs_node);
-	u64 pid_ino_a = pid_a->ino;
-	u64 pid_ino_b = pid_b->ino;
-
-	if (pid_ino_a < pid_ino_b)
-		return -1;
-	if (pid_ino_a > pid_ino_b)
-		return 1;
-	return 0;
-}
-
-void pidfs_add_pid(struct pid *pid)
+/*
+ * Allocate inode number and initialize pidfs fields.
+ * Called with pidmap_lock held.
+ */
+void pidfs_prepare_pid(struct pid *pid)
 {
 	static u64 pidfs_ino_nr = 2;
 
@@ -131,20 +129,22 @@ void pidfs_add_pid(struct pid *pid)
 		pidfs_ino_nr += 2;
 
 	pid->ino = pidfs_ino_nr;
+	pid->pidfs_hash.next = NULL;
 	pid->stashed = NULL;
 	pid->attr = NULL;
 	pidfs_ino_nr++;
+}
 
-	write_seqcount_begin(&pidmap_lock_seq);
-	rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp);
-	write_seqcount_end(&pidmap_lock_seq);
+int pidfs_add_pid(struct pid *pid)
+{
+	return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+				      pidfs_ino_ht_params);
 }
 
 void pidfs_remove_pid(struct pid *pid)
 {
-	write_seqcount_begin(&pidmap_lock_seq);
-	rb_erase(&pid->pidfs_node, &pidfs_ino_tree);
-	write_seqcount_end(&pidmap_lock_seq);
+	rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
+			       pidfs_ino_ht_params);
 }
 
 void pidfs_free_pid(struct pid *pid)
@@ -773,42 +773,24 @@ static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
 	return FILEID_KERNFS;
 }
 
-static int pidfs_ino_find(const void *key, const struct rb_node *node)
-{
-	const u64 pid_ino = *(u64 *)key;
-	const struct pid *pid = rb_entry(node, struct pid, pidfs_node);
-
-	if (pid_ino < pid->ino)
-		return -1;
-	if (pid_ino > pid->ino)
-		return 1;
-	return 0;
-}
-
 /* Find a struct pid based on the inode number. */
 static struct pid *pidfs_ino_get_pid(u64 ino)
 {
 	struct pid *pid;
-	struct rb_node *node;
-	unsigned int seq;
+	struct pidfs_attr *attr;
 
 	guard(rcu)();
-	do {
-		seq = read_seqcount_begin(&pidmap_lock_seq);
-		node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find);
-		if (node)
-			break;
-	} while (read_seqcount_retry(&pidmap_lock_seq, seq));
-
-	if (!node)
+	pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params);
+	if (!pid)
+		return NULL;
+	attr = READ_ONCE(pid->attr);
+	if (IS_ERR_OR_NULL(attr))
+		return NULL;
+	if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask))
 		return NULL;
-
-	pid = rb_entry(node, struct pid, pidfs_node);
-
 	/* Within our pid namespace hierarchy? */
 	if (pid_vnr(pid) == 0)
 		return NULL;
-
 	return get_pid(pid);
 }
 
@@ -1086,6 +1068,9 @@ struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
 
 void __init pidfs_init(void)
 {
+	if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
+		panic("Failed to initialize pidfs hashtable");
+
 	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
 					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
 					  SLAB_ACCOUNT | SLAB_PANIC), NULL);
diff --git a/include/linux/pid.h b/include/linux/pid.h
index 003a1027d219..ce9b5cb7560b 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -6,6 +6,7 @@
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/refcount.h>
+#include <linux/rhashtable-types.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
 
@@ -60,7 +61,7 @@ struct pid {
 	spinlock_t lock;
 	struct {
 		u64 ino;
-		struct rb_node pidfs_node;
+		struct rhash_head pidfs_hash;
 		struct dentry *stashed;
 		struct pidfs_attr *attr;
 	};
@@ -73,7 +74,6 @@ struct pid {
 	struct upid numbers[];
 };
 
-extern seqcount_spinlock_t pidmap_lock_seq;
 extern struct pid init_struct_pid;
 
 struct file;
diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h
index 3e08c33da2df..416bdff4d6ce 100644
--- a/include/linux/pidfs.h
+++ b/include/linux/pidfs.h
@@ -6,7 +6,8 @@ struct coredump_params;
 
 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags);
 void __init pidfs_init(void);
-void pidfs_add_pid(struct pid *pid);
+void pidfs_prepare_pid(struct pid *pid);
+int pidfs_add_pid(struct pid *pid);
 void pidfs_remove_pid(struct pid *pid);
 void pidfs_exit(struct task_struct *tsk);
 #ifdef CONFIG_COREDUMP
diff --git a/kernel/pid.c b/kernel/pid.c
index ad4400a9f15f..6077da774652 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,7 +43,6 @@
 #include <linux/sched/task.h>
 #include <linux/idr.h>
 #include <linux/pidfs.h>
-#include <linux/seqlock.h>
 #include <net/sock.h>
 #include <uapi/linux/pidfd.h>
 
@@ -85,7 +84,6 @@ struct pid_namespace init_pid_ns = {
 EXPORT_SYMBOL_GPL(init_pid_ns);
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
-seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
 
 void put_pid(struct pid *pid)
 {
@@ -141,9 +139,9 @@ void free_pid(struct pid *pid)
 
 		idr_remove(&ns->idr, upid->nr);
 	}
-	pidfs_remove_pid(pid);
 	spin_unlock(&pidmap_lock);
 
+	pidfs_remove_pid(pid);
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
@@ -315,7 +313,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 	retval = -ENOMEM;
 	if (unlikely(!(ns->pid_allocated & PIDNS_ADDING)))
 		goto out_free;
-	pidfs_add_pid(pid);
+	pidfs_prepare_pid(pid);
+
 	for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) {
 		/* Make the PID visible to find_pid_ns. */
 		idr_replace(&upid->ns->idr, pid, upid->nr);
@@ -325,6 +324,12 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid,
 	idr_preload_end();
 	ns_ref_active_get(ns);
 
+	retval = pidfs_add_pid(pid);
+	if (unlikely(retval)) {
+		free_pid(pid);
+		pid = ERR_PTR(-ENOMEM);
+	}
+
 	return pid;
 
 out_free:

---
base-commit: f54c7e54d2de2d7b58aa54604218a6fc00bb2e77
change-id: 20260119-work-pidfs-rhashtable-9d14071bd77a


^ permalink raw reply related	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2026-03-10 13:11 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-01-20 14:52 [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Christian Brauner
2026-01-20 15:43 ` Mateusz Guzik
2026-01-21 10:59 ` Jan Kara
2026-01-22 10:18   ` Christian Brauner
2026-01-22 13:21     ` Jan Kara
2026-02-20 15:11 ` Guillaume Tucker
2026-02-24 13:22   ` Christian Brauner
2026-02-24 16:25     ` make_task_dead() & kthread_exit() Christian Brauner
2026-02-24 17:04       ` Mark Brown
2026-02-24 18:27         ` Guillaume Tucker
2026-02-24 19:30       ` Linus Torvalds
2026-02-26  4:09         ` David Gow
2026-02-26  9:47         ` Christian Brauner
2026-02-27 16:54           ` Linus Torvalds
2026-03-03 10:03           ` Alice Ryhl
2026-03-06 11:05           ` Christian Loehle
2026-03-06 14:07             ` [PATCH] kthread: remove kthread_exit() Christian Brauner
2026-03-06 14:44               ` Christoph Hellwig
2026-03-06 18:27                 ` Linus Torvalds
2026-03-09  9:36                   ` Christian Brauner
2026-03-09 15:43                   ` Christoph Hellwig
2026-03-09 16:37                     ` Linus Torvalds
2026-03-09 17:27                       ` Linus Torvalds
2026-03-10 13:11                       ` Christoph Hellwig
2026-03-08  8:41               ` David Gow
2026-03-06 16:28             ` make_task_dead() & kthread_exit() Linus Torvalds
2026-02-26  4:08       ` David Gow
2026-02-24 16:37     ` [PATCH RFC v2] pidfs: convert rb-tree to rhashtable Mark Brown

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.