From: Mateusz Guzik <mjguzik@gmail.com>
To: brauner@kernel.org
Cc: viro@zeniv.linux.org.uk, jack@suse.cz,
linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
Mateusz Guzik <mjguzik@gmail.com>
Subject: [PATCH v2] fs: cache the string generated by reading /proc/filesystems
Date: Wed, 22 Apr 2026 20:17:11 +0200 [thread overview]
Message-ID: <20260422181711.1340269-1-mjguzik@gmail.com> (raw)
It is being read surprisingly often (e.g., by mkdir, ls and even sed!).
This is lock-protected pointer chasing over a linked list to pay for
sprintf for every fs (32 on my boxen).
Instead cache the result.
open+read+close cycle single-threaded (ops/s):
before: 442732
after: 1063462 (+140%)
Here the main bottleneck is memcg.
Scalability-wise problems are avoidable lockref trip on open and ref
management for the file on procfs side.
The file looks like a stereotypical C from the 90s, right down to an
open-coded and slightly obfuscated linked list. I intentionally did not
clean up any of it -- I think the file will be best served by a Rust
rewrite when the time comes.
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
v2:
- drop the procfs bits
- touch up some comments
I posted v1 last year https://lore.kernel.org/linux-fsdevel/20250329192821.822253-1-mjguzik@gmail.com/
but that ran into some procfs issues. The thing can be sped up
regardless of the procfs problem.
fs/filesystems.c | 144 ++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 137 insertions(+), 7 deletions(-)
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 0c7d2b7ac26c..704fc6d49f80 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -34,6 +34,23 @@
static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock);
+#ifdef CONFIG_PROC_FS
+static unsigned long file_systems_gen;
+
+struct file_systems_string {
+ struct rcu_head rcufree;
+ unsigned long gen;
+ size_t len;
+ char string[];
+};
+static struct file_systems_string *file_systems_string;
+static void invalidate_filesystems_string(void);
+#else
+static void invalidate_filesystems_string(void)
+{
+}
+#endif
+
/* WARNING: This can be used only if we _already_ own a reference */
struct file_system_type *get_filesystem(struct file_system_type *fs)
{
@@ -83,10 +100,12 @@ int register_filesystem(struct file_system_type * fs)
return -EBUSY;
write_lock(&file_systems_lock);
p = find_filesystem(fs->name, strlen(fs->name));
- if (*p)
+ if (*p) {
res = -EBUSY;
- else
+ } else {
*p = fs;
+ invalidate_filesystems_string();
+ }
write_unlock(&file_systems_lock);
return res;
}
@@ -115,6 +134,7 @@ int unregister_filesystem(struct file_system_type * fs)
if (fs == *tmp) {
*tmp = fs->next;
fs->next = NULL;
+ invalidate_filesystems_string();
write_unlock(&file_systems_lock);
synchronize_rcu();
return 0;
@@ -235,22 +255,132 @@ int __init list_bdev_fs_names(char *buf, size_t size)
}
#ifdef CONFIG_PROC_FS
-static int filesystems_proc_show(struct seq_file *m, void *v)
+/*
+ * The fs list gets queried a lot by userspace because of libselinux, including
+ * rather surprising programs (would you guess *sed* is on the list?). In order
+ * to reduce the overhead we cache the resulting string, which normally hangs
+ * around below 512 bytes in size.
+ *
+ * As the list almost never changes, for simplicity its creation is not
+ * particularly optimized.
+ *
+ * We sort it out on read in order to not introduce a failure point for fs
+ * registration (in principle we may be unable to alloc memory for the list).
+ */
+static void invalidate_filesystems_string(void)
{
- struct file_system_type * tmp;
+ struct file_systems_string *fss;
- read_lock(&file_systems_lock);
+ lockdep_assert_held_write(&file_systems_lock);
+ file_systems_gen++;
+ fss = file_systems_string;
+ WRITE_ONCE(file_systems_string, NULL);
+ kfree_rcu(fss, rcufree);
+}
+
+static noinline int regen_filesystems_string(void)
+{
+ struct file_system_type *tmp;
+ struct file_systems_string *old, *new;
+ size_t newlen, usedlen;
+ unsigned long gen;
+
+retry:
+ lockdep_assert_not_held(&file_systems_lock);
+
+ newlen = 0;
+ write_lock(&file_systems_lock);
+ gen = file_systems_gen;
+ tmp = file_systems;
+
+ /* pre-calc space for "%s\t%s\n" for each fs */
+ while (tmp) {
+ if (!(tmp->fs_flags & FS_REQUIRES_DEV))
+ newlen += strlen("nodev");
+ newlen += strlen("\t");
+ newlen += strlen(tmp->name);
+ newlen += strlen("\n");
+ tmp = tmp->next;
+ }
+ write_unlock(&file_systems_lock);
+
+ new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ new->gen = gen;
+ new->len = newlen;
+ new->string[newlen] = '\0';
+ write_lock(&file_systems_lock);
+ old = file_systems_string;
+
+ /*
+ * Did someone beat us to it?
+ */
+ if (old && old->gen == file_systems_gen) {
+ write_unlock(&file_systems_lock);
+ kfree(new);
+ return 0;
+ }
+
+ /*
+ * Did the list change in the meantime?
+ */
+ if (gen != file_systems_gen) {
+ write_unlock(&file_systems_lock);
+ kfree(new);
+ goto retry;
+ }
+
+ /*
+ * Populate the string.
+ *
+ * We know we have just enough space because we calculated the right
+ * size the previous time we had the lock and confirmed the list has
+ * not changed after reacquiring it.
+ */
+ usedlen = 0;
tmp = file_systems;
while (tmp) {
- seq_printf(m, "%s\t%s\n",
+ usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
tmp->name);
tmp = tmp->next;
}
- read_unlock(&file_systems_lock);
+ BUG_ON(new->len != strlen(new->string));
+
+ /*
+ * Paired with consume fence in READ_ONCE() in filesystems_proc_show()
+ */
+ smp_store_release(&file_systems_string, new);
+ write_unlock(&file_systems_lock);
+ kfree_rcu(old, rcufree);
return 0;
}
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+ struct file_systems_string *fss;
+
+ for (;;) {
+ scoped_guard(rcu) {
+ /*
+ * Paired with smp_store_release() in regen_filesystems_string()
+ */
+ fss = READ_ONCE(file_systems_string);
+ if (likely(fss)) {
+ seq_write(m, fss->string, fss->len);
+ return 0;
+ }
+ }
+
+ int err = regen_filesystems_string();
+ if (unlikely(err))
+ return err;
+ }
+}
+
static int __init proc_filesystems_init(void)
{
proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
--
2.48.1
next reply other threads:[~2026-04-22 18:17 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-22 18:17 Mateusz Guzik [this message]
2026-04-22 22:07 ` [PATCH v2] cache the string generated by reading /proc/filesystems Andreas Dilger
2026-04-22 22:42 ` Al Viro
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260422181711.1340269-1-mjguzik@gmail.com \
--to=mjguzik@gmail.com \
--cc=brauner@kernel.org \
--cc=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox