All of lore.kernel.org
 help / color / mirror / Atom feed
From: Mateusz Guzik <mjguzik@gmail.com>
To: brauner@kernel.org
Cc: viro@zeniv.linux.org.uk, jack@suse.cz,
	linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	Mateusz Guzik <mjguzik@gmail.com>
Subject: [PATCH v2] fs: cache the string generated by reading /proc/filesystems
Date: Wed, 22 Apr 2026 20:17:11 +0200	[thread overview]
Message-ID: <20260422181711.1340269-1-mjguzik@gmail.com> (raw)

It is being read surprisingly often (e.g., by mkdir, ls and even sed!).

This is lock-protected pointer chasing over a linked list to pay for
sprintf for every fs (32 on my boxen).

Instead cache the result.

open+read+close cycle single-threaded (ops/s):
before:	442732
after:	1063462 (+140%)

Here the main bottleneck is memcg.

Scalability-wise, the problems are an avoidable lockref trip on open and
ref management for the file on the procfs side.

The file looks like a stereotypical C from the 90s, right down to an
open-coded and slightly obfuscated linked list. I intentionally did not
clean up any of it -- I think the file will be best served by a Rust
rewrite when the time comes.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

v2:
- drop the procfs bits
- touch up some comments

I posted v1 last year https://lore.kernel.org/linux-fsdevel/20250329192821.822253-1-mjguzik@gmail.com/

but that ran into some procfs issues. The thing can be sped up
regardless of the procfs problem.

 fs/filesystems.c | 144 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 137 insertions(+), 7 deletions(-)

diff --git a/fs/filesystems.c b/fs/filesystems.c
index 0c7d2b7ac26c..704fc6d49f80 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -34,6 +34,23 @@
 static struct file_system_type *file_systems;
 static DEFINE_RWLOCK(file_systems_lock);
 
+#ifdef CONFIG_PROC_FS
+static unsigned long file_systems_gen;
+
+struct file_systems_string {
+	struct rcu_head rcufree;
+	unsigned long gen;
+	size_t len;
+	char string[];
+};
+static struct file_systems_string *file_systems_string;
+static void invalidate_filesystems_string(void);
+#else
+static void invalidate_filesystems_string(void)
+{
+}
+#endif
+
 /* WARNING: This can be used only if we _already_ own a reference */
 struct file_system_type *get_filesystem(struct file_system_type *fs)
 {
@@ -83,10 +100,12 @@ int register_filesystem(struct file_system_type * fs)
 		return -EBUSY;
 	write_lock(&file_systems_lock);
 	p = find_filesystem(fs->name, strlen(fs->name));
-	if (*p)
+	if (*p) {
 		res = -EBUSY;
-	else
+	} else {
 		*p = fs;
+		invalidate_filesystems_string();
+	}
 	write_unlock(&file_systems_lock);
 	return res;
 }
@@ -115,6 +134,7 @@ int unregister_filesystem(struct file_system_type * fs)
 		if (fs == *tmp) {
 			*tmp = fs->next;
 			fs->next = NULL;
+			invalidate_filesystems_string();
 			write_unlock(&file_systems_lock);
 			synchronize_rcu();
 			return 0;
@@ -235,22 +255,132 @@ int __init list_bdev_fs_names(char *buf, size_t size)
 }
 
 #ifdef CONFIG_PROC_FS
-static int filesystems_proc_show(struct seq_file *m, void *v)
+/*
+ * The fs list gets queried a lot by userspace because of libselinux, including
+ * rather surprising programs (would you guess *sed* is on the list?). In order
+ * to reduce the overhead we cache the resulting string, which normally hangs
+ * around below 512 bytes in size.
+ *
+ * As the list almost never changes, its creation is kept simple rather than
+ * being particularly optimized.
+ *
+ * We sort it out on read in order to not introduce a failure point for fs
+ * registration (in principle we may be unable to alloc memory for the list).
+ */
+static void invalidate_filesystems_string(void)
 {
-	struct file_system_type * tmp;
+	struct file_systems_string *fss;
 
-	read_lock(&file_systems_lock);
+	lockdep_assert_held_write(&file_systems_lock);
+	file_systems_gen++;
+	fss = file_systems_string;
+	WRITE_ONCE(file_systems_string, NULL);
+	kfree_rcu(fss, rcufree);
+}
+
+static noinline int regen_filesystems_string(void)
+{
+	struct file_system_type *tmp;
+	struct file_systems_string *old, *new;
+	size_t newlen, usedlen;
+	unsigned long gen;
+
+retry:
+	lockdep_assert_not_held(&file_systems_lock);
+
+	newlen = 0;
+	write_lock(&file_systems_lock);
+	gen = file_systems_gen;
+	tmp = file_systems;
+
+	/* pre-calc space for "%s\t%s\n" for each fs */
+	while (tmp) {
+		if (!(tmp->fs_flags & FS_REQUIRES_DEV))
+			newlen += strlen("nodev");
+		newlen += strlen("\t");
+		newlen += strlen(tmp->name);
+		newlen += strlen("\n");
+		tmp = tmp->next;
+	}
+	write_unlock(&file_systems_lock);
+
+	new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
+		      GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	new->gen = gen;
+	new->len = newlen;
+	new->string[newlen] = '\0';
+	write_lock(&file_systems_lock);
+	old = file_systems_string;
+
+	/*
+	 * Did someone beat us to it?
+	 */
+	if (old && old->gen == file_systems_gen) {
+		write_unlock(&file_systems_lock);
+		kfree(new);
+		return 0;
+	}
+
+	/*
+	 * Did the list change in the meantime?
+	 */
+	if (gen != file_systems_gen) {
+		write_unlock(&file_systems_lock);
+		kfree(new);
+		goto retry;
+	}
+
+	/*
+	 * Populate the string.
+	 *
+	 * We know we have just enough space because we calculated the right
+	 * size the previous time we had the lock and confirmed the list has
+	 * not changed after reacquiring it.
+	 */
+	usedlen = 0;
 	tmp = file_systems;
 	while (tmp) {
-		seq_printf(m, "%s\t%s\n",
+		usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
 			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
 			tmp->name);
 		tmp = tmp->next;
 	}
-	read_unlock(&file_systems_lock);
+	BUG_ON(new->len != strlen(new->string));
+
+	/*
+	 * Paired with consume fence in READ_ONCE() in filesystems_proc_show()
+	 */
+	smp_store_release(&file_systems_string, new);
+	write_unlock(&file_systems_lock);
+	kfree_rcu(old, rcufree);
 	return 0;
 }
 
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+	struct file_systems_string *fss;
+
+	for (;;) {
+		scoped_guard(rcu) {
+			/*
+			 * Paired with smp_store_release() in regen_filesystems_string()
+			 */
+			fss = READ_ONCE(file_systems_string);
+			if (likely(fss)) {
+				seq_write(m, fss->string, fss->len);
+				return 0;
+			}
+		}
+
+		int err = regen_filesystems_string();
+		if (unlikely(err))
+			return err;
+	}
+}
+
 static int __init proc_filesystems_init(void)
 {
 	proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
-- 
2.48.1


             reply	other threads:[~2026-04-22 18:17 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-22 18:17 Mateusz Guzik [this message]
2026-04-22 22:07 ` [PATCH v2] cache the string generated by reading /proc/filesystems Andreas Dilger
2026-04-22 22:42   ` Al Viro
2026-04-23 14:28   ` Mateusz Guzik
2026-04-23 13:36 ` [PATCH v2] fs: " Christian Brauner
2026-04-23 14:38   ` Mateusz Guzik
2026-04-23 21:04     ` Christian Brauner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260422181711.1340269-1-mjguzik@gmail.com \
    --to=mjguzik@gmail.com \
    --cc=brauner@kernel.org \
    --cc=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=viro@zeniv.linux.org.uk \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.