public inbox for linux-fsdevel@vger.kernel.org
 help / color / mirror / Atom feed
From: Waiman Long <longman@redhat.com>
To: Paul Moore <paul@paul-moore.com>, Eric Paris <eparis@redhat.com>,
	Christian Brauner <brauner@kernel.org>,
	Al Viro <viro@zeniv.linux.org.uk>, Jan Kara <jack@suse.cz>
Cc: linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	audit@vger.kernel.org, Richard Guy Briggs <rgb@redhat.com>,
	Ricardo Robaina <rrobaina@redhat.com>,
	Waiman Long <longman@redhat.com>
Subject: [PATCH v4 1/2] fs: Add a pool of extra fs->pwd references to fs_struct
Date: Sat, 28 Feb 2026 13:27:56 -0500	[thread overview]
Message-ID: <20260228182757.90528-2-longman@redhat.com> (raw)
In-Reply-To: <20260228182757.90528-1-longman@redhat.com>

When the audit subsystem is enabled, it can do a lot of get_fs_pwd()
calls to get references to fs->pwd and then release those references
back with path_put() later. That may cause a lot of spinlock contention
on a single pwd's dentry lock because of the constant changes to the
reference count when there are many processes in the same working
directory actively doing open/close system calls. This can cause a
noticeable performance regression when compared with the case where
the audit subsystem is turned off, especially on systems with a lot of
CPUs, which are becoming more common these days.

A simple and elegant solution to avoid this kind of performance
regression is to add a common pool of extra fs->pwd references inside
the fs_struct. When a caller needs a pwd reference, it can borrow one
from the pool, if available, to avoid an explicit path_get(). When it is
time to release the reference, it can put it back into the common pool
without doing a path_put(), provided fs->pwd has not changed in the
meantime. We still need to acquire the fs's spinlock, but fs_struct is
more distributed and it is less common to have many tasks sharing a
single fs_struct.

A new set of get_fs_pwd_pool()/put_fs_pwd_pool() APIs is introduced
with this patch to enable other subsystems to acquire and release
a pwd reference from the common pool without doing unnecessary
path_get()/path_put() calls.

Besides fs/fs_struct.c, the copy_mnt_ns() function of fs/namespace.c is
also modified to properly handle the extra pwd references, if available.

Signed-off-by: Waiman Long <longman@redhat.com>
Reviewed-by: Richard Guy Briggs <rgb@redhat.com>
---
 fs/fs_struct.c            | 26 +++++++++++++++++++++-----
 fs/namespace.c            |  8 ++++++++
 include/linux/fs_struct.h | 28 +++++++++++++++++++++++++++-
 3 files changed, 56 insertions(+), 6 deletions(-)

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 394875d06fd6..43af98e0a10c 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -33,15 +33,19 @@ void set_fs_root(struct fs_struct *fs, const struct path *path)
 void set_fs_pwd(struct fs_struct *fs, const struct path *path)
 {
 	struct path old_pwd;
+	int count;
 
 	path_get(path);
 	write_seqlock(&fs->seq);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
+	count = fs->pwd_refs + 1;
+	fs->pwd_refs = 0;
 	write_sequnlock(&fs->seq);
 
 	if (old_pwd.dentry)
-		path_put(&old_pwd);
+		while (count--)
+			path_put(&old_pwd);
 }
 
 static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
@@ -63,10 +67,15 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
 		task_lock(p);
 		fs = p->fs;
 		if (fs) {
-			int hits = 0;
+			int hits;
+
 			write_seqlock(&fs->seq);
+			hits = replace_path(&fs->pwd, old_root, new_root);
+			if (hits && fs->pwd_refs) {
+				count += fs->pwd_refs;
+				fs->pwd_refs = 0;
+			}
 			hits += replace_path(&fs->root, old_root, new_root);
-			hits += replace_path(&fs->pwd, old_root, new_root);
 			while (hits--) {
 				count++;
 				path_get(new_root);
@@ -82,8 +91,11 @@ void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
 
 void free_fs_struct(struct fs_struct *fs)
 {
+	int count = fs->pwd_refs + 1;
+
 	path_put(&fs->root);
-	path_put(&fs->pwd);
+	while (count--)
+		path_put(&fs->pwd);
 	kmem_cache_free(fs_cachep, fs);
 }
 
@@ -111,6 +123,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
+		fs->pwd_refs = 0;
 		seqlock_init(&fs->seq);
 		fs->umask = old->umask;
 
@@ -118,7 +131,10 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 		fs->root = old->root;
 		path_get(&fs->root);
 		fs->pwd = old->pwd;
-		path_get(&fs->pwd);
+		if (old->pwd_refs)
+			old->pwd_refs--;
+		else
+			path_get(&fs->pwd);
 		read_sequnlock_excl(&old->seq);
 	}
 	return fs;
diff --git a/fs/namespace.c b/fs/namespace.c
index 854f4fc66469..96d41f00add6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4272,6 +4272,14 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
 	 * as belonging to new namespace.  We have already acquired a private
 	 * fs_struct, so tsk->fs->lock is not needed.
 	 */
+	if (new_fs)
+		WARN_ON_ONCE(new_fs->users != 1);
+
+	/* Release the extra pwd references of new_fs, if present. */
+	while (new_fs && new_fs->pwd_refs) {
+		path_put(&new_fs->pwd);
+		new_fs->pwd_refs--;
+	}
 	p = old;
 	q = new;
 	while (p) {
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0070764b790a..f8cf3b280398 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -8,10 +8,11 @@
 #include <linux/seqlock.h>
 
 struct fs_struct {
-	int users;
 	seqlock_t seq;
+	int users;
 	int umask;
 	int in_exec;
+	int pwd_refs;	/* A pool of extra pwd references */
 	struct path root, pwd;
 } __randomize_layout;
 
@@ -40,6 +41,31 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
 	read_sequnlock_excl(&fs->seq);
 }
 
+/* Acquire a pwd reference from the pwd_refs pool, if available */
+static inline void get_fs_pwd_pool(struct fs_struct *fs, struct path *pwd)
+{
+	read_seqlock_excl(&fs->seq);
+	*pwd = fs->pwd;
+	if (fs->pwd_refs)
+		fs->pwd_refs--;
+	else
+		path_get(pwd);
+	read_sequnlock_excl(&fs->seq);
+}
+
+/* Release a pwd reference back to the pwd_refs pool, if appropriate */
+static inline void put_fs_pwd_pool(struct fs_struct *fs, struct path *pwd)
+{
+	read_seqlock_excl(&fs->seq);
+	if ((fs->pwd.dentry == pwd->dentry) && (fs->pwd.mnt == pwd->mnt)) {
+		fs->pwd_refs++;
+		pwd = NULL;
+	}
+	read_sequnlock_excl(&fs->seq);
+	if (pwd)
+		path_put(pwd);
+}
+
 extern bool current_chrooted(void);
 
 static inline int current_umask(void)
-- 
2.53.0


  reply	other threads:[~2026-02-28 18:28 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-28 18:27 [PATCH v4 0/2] fs, audit: Avoid excessive dput/dget in audit_context setup and reset paths Waiman Long
2026-02-28 18:27 ` Waiman Long [this message]
2026-02-28 18:27 ` [PATCH v4 2/2] audit: Use the new {get,put}_fs_pwd_pool() APIs to get/put pwd references Waiman Long
2026-03-05 21:46 ` [PATCH v4 0/2] fs, audit: Avoid excessive dput/dget in audit_context setup and reset paths Christian Brauner
2026-03-06  0:53   ` Waiman Long

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260228182757.90528-2-longman@redhat.com \
    --to=longman@redhat.com \
    --cc=audit@vger.kernel.org \
    --cc=brauner@kernel.org \
    --cc=eparis@redhat.com \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=paul@paul-moore.com \
    --cc=rgb@redhat.com \
    --cc=rrobaina@redhat.com \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox