public inbox for linux-fsdevel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] fs: cache the string generated by reading /proc/filesystems
@ 2026-04-22 18:17 Mateusz Guzik
  2026-04-22 22:07 ` [PATCH v2] " Andreas Dilger
  0 siblings, 1 reply; 3+ messages in thread
From: Mateusz Guzik @ 2026-04-22 18:17 UTC (permalink / raw)
  To: brauner; +Cc: viro, jack, linux-kernel, linux-fsdevel, Mateusz Guzik

It is being read surprisingly often (e.g., by mkdir, ls and even sed!).

This is lock-protected pointer chasing over a linked list to pay for
sprintf for every fs (32 on my boxen).

Instead cache the result.

open+read+close cycle single-threaded (ops/s):
before:	442732
after:	1063462 (+140%)

Here the main bottleneck is memcg.

Scalability-wise problems are avoidable lockref trip on open and ref
management for the file on procfs side.

The file looks like a sterotypical C from the 90s, right down to an
open-coded and slightly obfuscated linked list. I intentionally did not
clean up any of it -- I think the file will be best served by a Rust
rewrite when the time comes.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

v2:
- drop the procfs bits
- touch up some comments

I posted v1 last year https://lore.kernel.org/linux-fsdevel/20250329192821.822253-1-mjguzik@gmail.com/

but that ran into some procfs issues. the thing can be sped up
regardless of the procfs problem.

 fs/filesystems.c | 144 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 137 insertions(+), 7 deletions(-)

diff --git a/fs/filesystems.c b/fs/filesystems.c
index 0c7d2b7ac26c..704fc6d49f80 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -34,6 +34,23 @@
 static struct file_system_type *file_systems;
 static DEFINE_RWLOCK(file_systems_lock);
 
+#ifdef CONFIG_PROC_FS
+static unsigned long file_systems_gen;
+
+struct file_systems_string {
+	struct rcu_head rcufree;
+	unsigned long gen;
+	size_t len;
+	char string[];
+};
+static struct file_systems_string *file_systems_string;
+static void invalidate_filesystems_string(void);
+#else
+static void invalidate_filesystems_string(void)
+{
+}
+#endif
+
 /* WARNING: This can be used only if we _already_ own a reference */
 struct file_system_type *get_filesystem(struct file_system_type *fs)
 {
@@ -83,10 +100,12 @@ int register_filesystem(struct file_system_type * fs)
 		return -EBUSY;
 	write_lock(&file_systems_lock);
 	p = find_filesystem(fs->name, strlen(fs->name));
-	if (*p)
+	if (*p) {
 		res = -EBUSY;
-	else
+	} else {
 		*p = fs;
+		invalidate_filesystems_string();
+	}
 	write_unlock(&file_systems_lock);
 	return res;
 }
@@ -115,6 +134,7 @@ int unregister_filesystem(struct file_system_type * fs)
 		if (fs == *tmp) {
 			*tmp = fs->next;
 			fs->next = NULL;
+			invalidate_filesystems_string();
 			write_unlock(&file_systems_lock);
 			synchronize_rcu();
 			return 0;
@@ -235,22 +255,132 @@ int __init list_bdev_fs_names(char *buf, size_t size)
 }
 
 #ifdef CONFIG_PROC_FS
-static int filesystems_proc_show(struct seq_file *m, void *v)
+/*
+ * The fs list gets queried a lot by userspace because of libselinux, including
+ * rather surprising programs (would you guess *sed* is on the list?). In order
+ * to reduce the overhead we cache the resulting string, which normally hangs
+ * around below 512 bytes in size.
+ *
+ * As the list almost never changes, its creation is not particularly optimized
+ * for simplicity.
+ *
+ * We sort it out on read in order to not introduce a failure point for fs
+ * registration (in principle we may be unable to alloc memory for the list).
+ */
+static void invalidate_filesystems_string(void)
 {
-	struct file_system_type * tmp;
+	struct file_systems_string *fss;
 
-	read_lock(&file_systems_lock);
+	lockdep_assert_held_write(&file_systems_lock);
+	file_systems_gen++;
+	fss = file_systems_string;
+	WRITE_ONCE(file_systems_string, NULL);
+	kfree_rcu(fss, rcufree);
+}
+
+static noinline int regen_filesystems_string(void)
+{
+	struct file_system_type *tmp;
+	struct file_systems_string *old, *new;
+	size_t newlen, usedlen;
+	unsigned long gen;
+
+retry:
+	lockdep_assert_not_held(&file_systems_lock);
+
+	newlen = 0;
+	write_lock(&file_systems_lock);
+	gen = file_systems_gen;
+	tmp = file_systems;
+
+	/* pre-calc space for "%s\t%s\n" for each fs */
+	while (tmp) {
+		if (!(tmp->fs_flags & FS_REQUIRES_DEV))
+			newlen += strlen("nodev");
+		newlen += strlen("\t");
+		newlen += strlen(tmp->name);
+		newlen += strlen("\n");
+		tmp = tmp->next;
+	}
+	write_unlock(&file_systems_lock);
+
+	new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
+		      GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	new->gen = gen;
+	new->len = newlen;
+	new->string[newlen] = '\0';
+	write_lock(&file_systems_lock);
+	old = file_systems_string;
+
+	/*
+	 * Did someone beat us to it?
+	 */
+	if (old && old->gen == file_systems_gen) {
+		write_unlock(&file_systems_lock);
+		kfree(new);
+		return 0;
+	}
+
+	/*
+	 * Did the list change in the meantime?
+	 */
+	if (gen != file_systems_gen) {
+		write_unlock(&file_systems_lock);
+		kfree(new);
+		goto retry;
+	}
+
+	/*
+	 * Populate the string.
+	 *
+	 * We know we have just enough space because we calculated the right
+	 * size the previous time we had the lock and confirmed the list has
+	 * not changed after reacquiring it.
+	 */
+	usedlen = 0;
 	tmp = file_systems;
 	while (tmp) {
-		seq_printf(m, "%s\t%s\n",
+		usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
 			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
 			tmp->name);
 		tmp = tmp->next;
 	}
-	read_unlock(&file_systems_lock);
+	BUG_ON(new->len != strlen(new->string));
+
+	/*
+	 * Paired with consume fence in READ_ONCE() in filesystems_proc_show()
+	 */
+	smp_store_release(&file_systems_string, new);
+	write_unlock(&file_systems_lock);
+	kfree_rcu(old, rcufree);
 	return 0;
 }
 
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+	struct file_systems_string *fss;
+
+	for (;;) {
+		scoped_guard(rcu) {
+			/*
+			 * Paired with smp_store_release() in regen_filesystems_string()
+			 */
+			fss = READ_ONCE(file_systems_string);
+			if (likely(fss)) {
+				seq_write(m, fss->string, fss->len);
+				return 0;
+			}
+		}
+
+		int err = regen_filesystems_string();
+		if (unlikely(err))
+			return err;
+	}
+}
+
 static int __init proc_filesystems_init(void)
 {
 	proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
-- 
2.48.1


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] cache the string generated by reading /proc/filesystems
  2026-04-22 18:17 [PATCH v2] fs: cache the string generated by reading /proc/filesystems Mateusz Guzik
@ 2026-04-22 22:07 ` Andreas Dilger
  2026-04-22 22:42   ` Al Viro
  0 siblings, 1 reply; 3+ messages in thread
From: Andreas Dilger @ 2026-04-22 22:07 UTC (permalink / raw)
  To: Mateusz Guzik; +Cc: brauner, viro, jack, linux-kernel, linux-fsdevel

On Apr 22, 2026, at 12:17, Mateusz Guzik <mjguzik@gmail.com> wrote:
> 
> It is being read surprisingly often (e.g., by mkdir, ls and even sed!).

Yeah, it is sad that bloat continually grows to continually make fast
systems slow.

> This is lock-protected pointer chasing over a linked list to pay for
> sprintf for every fs (32 on my boxen).
> 
> Instead cache the result.
> 
> open+read+close cycle single-threaded (ops/s):
> before: 442732
> after: 1063462 (+140%)
> 
> Here the main bottleneck is memcg.
> 
> Scalability-wise problems are avoidable lockref trip on open and ref
> management for the file on procfs side.
> 
> The file looks like a sterotypical C from the 90s, right down to an
> open-coded and slightly obfuscated linked list. I intentionally did not
> clean up any of it -- I think the file will be best served by a Rust
> rewrite when the time comes.
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
> 
> v2:
> - drop the procfs bits
> - touch up some comments
> 
> I posted v1 last year https://lore.kernel.org/linux-fsdevel/20250329192821.822253-1-mjguzik@gmail.com/
> 
> but that ran into some procfs issues. the thing can be sped up
> regardless of the procfs problem.
> 
> fs/filesystems.c | 144 ++++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 137 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/filesystems.c b/fs/filesystems.c
> 
> +static noinline int regen_filesystems_string(void)
> +{
> +	struct file_system_type *tmp;
> +	struct file_systems_string *old, *new;
> +	size_t newlen, usedlen;
> +	unsigned long gen;
> +
> +retry:
> +	lockdep_assert_not_held(&file_systems_lock);
> +
> +	newlen = 0;
> +	write_lock(&file_systems_lock);
> +	gen = file_systems_gen;
> +	tmp = file_systems;
> +
> +	/* pre-calc space for "%s\t%s\n" for each fs */
> +	while (tmp) {
> +		if (!(tmp->fs_flags & FS_REQUIRES_DEV))
> +			newlen += strlen("nodev");
> +		newlen += strlen("\t");
> +		newlen += strlen(tmp->name);
> +		newlen += strlen("\n");
> +		tmp = tmp->next;
> +	}
> +	write_unlock(&file_systems_lock);
> +
> +	new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
> +		      GFP_KERNEL);
> +	if (!new)
> +		return -ENOMEM;

This is introducing an error case where there was none before, in a
critical system file.  It would also be possible to fall back to
seq_printf() instead of sprinft() if no buffer is available.

> +	new->gen = gen;
> +	new->len = newlen;
> +	new->string[newlen] = '\0';
> +	write_lock(&file_systems_lock);
> +	old = file_systems_string;
> +
> +	/*
> +	 * Did someone beat us to it?
> +	 */
> +	if (old && old->gen == file_systems_gen) {
> +		write_unlock(&file_systems_lock);
> +		kfree(new);
> +		return 0;
> +	}
> +
> +	/*
> +	 * Did the list change in the meantime?
> +	 */
> +	if (gen != file_systems_gen) {
> +		write_unlock(&file_systems_lock);
> +		kfree(new);
> +		goto retry;
> +	}
> +
> +	/*
> +	 * Populate the string.
> +	 *
> +	 * We know we have just enough space because we calculated the right
> +	 * size the previous time we had the lock and confirmed the list has
> +	 * not changed after reacquiring it.
> +	 */

It doesn't seem prudent to use raw sprintf() in a loop and justify this in
a comment, when the actual string generation is not performance critical.
This could use scnprintf() and be "safer" even if it "should just work" (tm).

> +	 usedlen = 0;
>  	tmp = file_systems;
>  	while (tmp) {
> -		seq_printf(m, "%s\t%s\n",
> +		usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
>  			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
>  			tmp->name);
>  		tmp = tmp->next;
>  	}
> -	read_unlock(&file_systems_lock);
> +	BUG_ON(new->len != strlen(new->string));

Adding a BUG_ON() here seems gratuitous, when there are lots of other
options to handle this case, especially if sprintf() didn't just corrupt
memory for some reason.  It could allocate a larger buffer and retry,
or in the worst case return an error instead of crashing the system.

> +
> +	/*
> +	 * Paired with consume fence in READ_ONCE() in filesystems_proc_show()
> +	 */
> +	smp_store_release(&file_systems_string, new);
> +	write_unlock(&file_systems_lock);
> +	kfree_rcu(old, rcufree);
>  	return 0;
>  }
> 


Cheers, Andreas






^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v2] cache the string generated by reading /proc/filesystems
  2026-04-22 22:07 ` [PATCH v2] " Andreas Dilger
@ 2026-04-22 22:42   ` Al Viro
  0 siblings, 0 replies; 3+ messages in thread
From: Al Viro @ 2026-04-22 22:42 UTC (permalink / raw)
  To: Andreas Dilger; +Cc: Mateusz Guzik, brauner, jack, linux-kernel, linux-fsdevel

On Wed, Apr 22, 2026 at 04:07:15PM -0600, Andreas Dilger wrote:
> On Apr 22, 2026, at 12:17, Mateusz Guzik <mjguzik@gmail.com> wrote:
> > 
> > It is being read surprisingly often (e.g., by mkdir, ls and even sed!).
> 
> Yeah, it is sad that bloat continually grows to continually make fast
> systems slow.

libselinux looking for selinuxfs, at a guess, not that I had any idea
why sed might possibly care about that...
<RTFS>
-i support, apparently.

No printable comment...

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-04-22 22:38 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-22 18:17 [PATCH v2] fs: cache the string generated by reading /proc/filesystems Mateusz Guzik
2026-04-22 22:07 ` [PATCH v2] " Andreas Dilger
2026-04-22 22:42   ` Al Viro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox