Linux EXT4 FS development
 help / color / mirror / Atom feed
* [PATCH RFC v2 03/18] super: take lock after last reference count
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

__put_super() required the caller to hold sb_lock, so put_super()
wrapped it. The per-device superblock table introduced later drops its
passive references from contexts that do not hold sb_lock, so make
put_super() self-locking: drop the count first and take sb_lock only for
the final list_del.

With the count now dropped outside sb_lock a superblock can briefly sit
on @super_blocks with s_passive == 0 before it is unlinked, so the list
walkers (__iterate_supers(), iterate_supers_type(), user_get_super())
switch to refcount_inc_not_zero() and skip it.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 63 ++++++++++++++++++++++++++++----------------------------------
 1 file changed, 28 insertions(+), 35 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 25dd72b550e0..a771a0ad4c9a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -403,12 +403,17 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 /* Superblock refcounting  */
 
 /*
- * Drop a superblock's refcount.  The caller must hold sb_lock.
+ * Drop a superblock's passive reference.  Must be called WITHOUT sb_lock held;
+ * put_super() acquires sb_lock itself when the final reference is dropped.
  */
-static void __put_super(struct super_block *s)
+void put_super(struct super_block *s)
 {
 	if (refcount_dec_and_test(&s->s_passive)) {
+
+		spin_lock(&sb_lock);
 		list_del_init(&s->s_list);
+		spin_unlock(&sb_lock);
+
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(s->s_mounts);
@@ -416,20 +421,6 @@ static void __put_super(struct super_block *s)
 	}
 }
 
-/**
- *	put_super	-	drop a temporary reference to superblock
- *	@sb: superblock in question
- *
- *	Drops a temporary reference, frees superblock if there's no
- *	references left.
- */
-void put_super(struct super_block *sb)
-{
-	spin_lock(&sb_lock);
-	__put_super(sb);
-	spin_unlock(&sb_lock);
-}
-
 static void kill_super_notify(struct super_block *sb)
 {
 	lockdep_assert_not_held(&sb->s_umount);
@@ -478,11 +469,7 @@ void deactivate_locked_super(struct super_block *s)
 
 		kill_super_notify(s);
 
-		/*
-		 * Since list_lru_destroy() may sleep, we cannot call it from
-		 * put_super(), where we hold the sb_lock. Therefore we destroy
-		 * the lru lists right now.
-		 */
+		/* list_lru_destroy() may sleep; put_super() callers may not. */
 		list_lru_destroy(&s->s_dentry_lru);
 		list_lru_destroy(&s->s_inode_lru);
 
@@ -851,14 +838,17 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
 	struct super_block *sb, *p = NULL;
 	bool excl = flags & SUPER_ITER_EXCL;
 
-	guard(spinlock)(&sb_lock);
+	spin_lock(&sb_lock);
 
 	for (sb = first_super(flags);
 	     !list_entry_is_head(sb, &super_blocks, s_list);
 	     sb = next_super(sb, flags)) {
 		if (super_flags(sb, SB_DYING))
 			continue;
-		refcount_inc(&sb->s_passive);
+
+		if (!refcount_inc_not_zero(&sb->s_passive))
+			continue;
+
 		spin_unlock(&sb_lock);
 
 		if (flags & SUPER_ITER_UNLOCKED) {
@@ -868,13 +858,14 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
 			super_unlock(sb, excl);
 		}
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		spin_lock(&sb_lock);
 	}
+	spin_unlock(&sb_lock);
 	if (p)
-		__put_super(p);
+		put_super(p);
 }
 
 void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
@@ -903,7 +894,9 @@ void iterate_supers_type(struct file_system_type *type,
 		if (super_flags(sb, SB_DYING))
 			continue;
 
-		refcount_inc(&sb->s_passive);
+		if (!refcount_inc_not_zero(&sb->s_passive))
+			continue;
+
 		spin_unlock(&sb_lock);
 
 		locked = super_lock_shared(sb);
@@ -912,14 +905,14 @@ void iterate_supers_type(struct file_system_type *type,
 			super_unlock_shared(sb);
 		}
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		spin_lock(&sb_lock);
 	}
-	if (p)
-		__put_super(p);
 	spin_unlock(&sb_lock);
+	if (p)
+		put_super(p);
 }
 
 EXPORT_SYMBOL(iterate_supers_type);
@@ -935,15 +928,17 @@ struct super_block *user_get_super(dev_t dev, bool excl)
 		if (sb->s_dev != dev)
 			continue;
 
-		refcount_inc(&sb->s_passive);
+		if (!refcount_inc_not_zero(&sb->s_passive))
+			continue;
+
 		spin_unlock(&sb_lock);
 
 		locked = super_lock(sb, excl);
 		if (locked)
 			return sb;
 
+		put_super(sb);
 		spin_lock(&sb_lock);
-		__put_super(sb);
 		break;
 	}
 	spin_unlock(&sb_lock);
@@ -1368,9 +1363,7 @@ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
 	lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
 
 	/* Make sure sb doesn't go away from under us */
-	spin_lock(&sb_lock);
 	refcount_inc(&sb->s_passive);
-	spin_unlock(&sb_lock);
 
 	mutex_unlock(&bdev->bd_holder_lock);
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 04/18] fs, block: move blk_mode_t and fop_flags_t into <linux/types.h>
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

blk_mode_t and fop_flags_t are both plain 'unsigned int __bitwise' flag
typedefs, exactly like the gfp_t, slab_flags_t and fmode_t that already
live in <linux/types.h>. Move them there so they are available
everywhere without having to drag in a subsystem header.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 include/linux/blkdev.h | 2 --
 include/linux/fs.h     | 2 --
 include/linux/types.h  | 2 ++
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9e95bdb8b323..cee548184a7b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -126,8 +126,6 @@ struct blk_integrity {
 	unsigned char				pi_tuple_size;
 };
 
-typedef unsigned int __bitwise blk_mode_t;
-
 /* open for reading */
 #define BLK_OPEN_READ		((__force blk_mode_t)(1 << 0))
 /* open for writing */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6da44573ce45..1c8fe40ad9a4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1921,8 +1921,6 @@ struct dir_context {
 struct io_uring_cmd;
 struct offset_ctx;
 
-typedef unsigned int __bitwise fop_flags_t;
-
 struct file_operations {
 	struct module *owner;
 	fop_flags_t fop_flags;
diff --git a/include/linux/types.h b/include/linux/types.h
index 608050dbca6a..ef026585420b 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -163,6 +163,8 @@ typedef u32 dma_addr_t;
 typedef unsigned int __bitwise gfp_t;
 typedef unsigned int __bitwise slab_flags_t;
 typedef unsigned int __bitwise fmode_t;
+typedef unsigned int __bitwise blk_mode_t;
+typedef unsigned int __bitwise fop_flags_t;
 
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
 typedef u64 phys_addr_t;

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 05/18] ext4: use anonymous devices for KUnit test superblocks
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

The mballoc and extents KUnit tests create superblocks through
sget_fc() with a set callback that never assigns s_dev and a kill_sb
that only calls generic_shutdown_super().

The upcoming global device-to-superblock table registers every
superblock under its s_dev, so each superblock needs a unique device
number. Allocate a proper anonymous device via set_anon_super_fc() and
release it through kill_anon_super().

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/extents-test.c | 9 ++-------
 fs/ext4/mballoc-test.c | 9 ++-------
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
index bd7795a82607..c3836ecb89f9 100644
--- a/fs/ext4/extents-test.c
+++ b/fs/ext4/extents-test.c
@@ -126,11 +126,6 @@ struct kunit_ext_test_param {
 	struct kunit_ext_data_state exp_data_state[3];
 };
 
-static void ext_kill_sb(struct super_block *sb)
-{
-	generic_shutdown_super(sb);
-}
-
 static int ext_init_fs_context(struct fs_context *fc)
 {
 	return 0;
@@ -138,13 +133,13 @@ static int ext_init_fs_context(struct fs_context *fc)
 
 static int ext_set(struct super_block *sb, struct fs_context *fc)
 {
-	return 0;
+	return set_anon_super_fc(sb, fc);
 }
 
 static struct file_system_type ext_fs_type = {
 	.name		 = "extents test",
 	.init_fs_context = ext_init_fs_context,
-	.kill_sb	 = ext_kill_sb,
+	.kill_sb	 = kill_anon_super,
 };
 
 static void extents_kunit_exit(struct kunit *test)
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index d90da44aadbd..a3b33ed2c172 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -59,11 +59,6 @@ static const struct super_operations mbt_sops = {
 	.free_inode	= mbt_free_inode,
 };
 
-static void mbt_kill_sb(struct super_block *sb)
-{
-	generic_shutdown_super(sb);
-}
-
 static int mbt_init_fs_context(struct fs_context *fc)
 {
 	return 0;
@@ -72,7 +67,7 @@ static int mbt_init_fs_context(struct fs_context *fc)
 static struct file_system_type mbt_fs_type = {
 	.name			= "mballoc test",
 	.init_fs_context	= mbt_init_fs_context,
-	.kill_sb		= mbt_kill_sb,
+	.kill_sb		= kill_anon_super,
 };
 
 static int mbt_mb_init(struct super_block *sb)
@@ -136,7 +131,7 @@ static void mbt_mb_release(struct super_block *sb)
 
 static int mbt_set(struct super_block *sb, struct fs_context *fc)
 {
-	return 0;
+	return set_anon_super_fc(sb, fc);
 }
 
 static struct super_block *mbt_ext4_alloc_super_block(void)

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 06/18] ocfs2: don't reset s_dev on dismount
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

ocfs2_dismount_volume() has reset sb->s_dev to zero since the original
merge in ccd979bdbce9 ("[PATCH] OCFS2: The Second Oracle Cluster
Filesystem") as part of scrubbing the super_block. Nothing reads the
field afterwards: all ocfs2-internal uses are mount-time log and trace
prints, and dev_t-keyed superblock lookups skip a dying superblock
anyway - s_root is gone before ->put_super runs and super_lock()
refuses SB_DYING superblocks.

The upcoming device-to-superblock table registers every superblock
under its s_dev. Drop the reset instead of leaving a superblock around
whose s_dev contradicts its registration.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ocfs2/super.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4870e680c4e5..df9ebff25dab 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1882,7 +1882,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	ocfs2_delete_osb(osb);
 	kfree(osb);
-	sb->s_dev = 0;
 	sb->s_fs_info = NULL;
 }
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 07/18] fs: maintain a global device-to-superblock table
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
forces the holder to be exactly one superblock and prevents several
superblocks from sharing one block device. That's what erofs is doing.

As a first step introduce a global dev_t-keyed rhltable mapping each
device to the superblock(s) using it. The entry is preallocated in
alloc_super() and registered under sb->s_dev by the set callback through
set_anon_super() and set_bdev_super(), the two helpers every set
callback assigns s_dev through. Registration is the final fallible act
of a set callback, so an insert failure unwinds through sget_fc()'s
existing set-failure path: the fs_context keeps ownership of s_fs_info
and the callers' error paths stay correct. set_anon_super() releases
the anonymous dev it allocated when registration fails. Unwinding
through deactivate_locked_super() instead would run kill_sb() and free
s_fs_info behind the caller's back: nfs and ceph free that object
through a local pointer when sget_fc() fails and would double-free.

The superblock stashes the entry in sb->s_super_dev and
kill_super_notify() drops the claim through it, so teardown doesn't
depend on s_dev staying stable; an entry that was never registered is
freed together with the superblock in destroy_super_work().

Each table entry holds a passive reference (s_passive) on its
superblock, so the struct stays valid for as long as the entry is
reachable. Entries are claim-counted through sd_ref: additional claims
on the same (device, superblock) pair share the entry, and the unlink
is deferred to the last put, so a later iteration cursor never resumes
from a removed node.

The table is initialized from mnt_init(): the first superblocks (the
tmpfs shm mount and rootfs) are created from start_kernel() long before
any initcall runs, so an initcall would be too late.

The table has no readers yet; the fs_holder_ops callbacks are switched
over once all devices a filesystem claims are registered.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/internal.h                  |   1 +
 fs/namespace.c                 |   2 +
 fs/super.c                     | 102 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs/super_types.h |   2 +
 4 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index d77578d66d42..83eb3e2a0f85 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -137,6 +137,7 @@ extern int reconfigure_super(struct fs_context *);
 extern bool super_trylock_shared(struct super_block *sb);
 struct super_block *user_get_super(dev_t, bool excl);
 void put_super(struct super_block *sb);
+void __init super_dev_init(void);
 extern bool mount_capable(struct fs_context *);
 int sb_init_dio_done_wq(struct super_block *sb);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 3d5cd5bf3b05..7cef6dae0854 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -6262,6 +6262,8 @@ void __init mnt_init(void)
 	if (!mount_hashtable || !mountpoint_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
+	super_dev_init();
+
 	kernfs_init();
 
 	err = sysfs_init();
diff --git a/fs/super.c b/fs/super.c
index a771a0ad4c9a..ff5e305d0ab4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/rhashtable.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
@@ -272,6 +273,8 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	return total_objects;
 }
 
+static struct super_dev *super_dev_alloc(dev_t dev, struct super_block *sb);
+
 static void destroy_super_work(struct work_struct *work)
 {
 	struct super_block *s = container_of(work, struct super_block,
@@ -279,6 +282,8 @@ static void destroy_super_work(struct work_struct *work)
 	fsnotify_sb_free(s);
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
+	/* Only an unregistered entry is still owned by the superblock. */
+	kfree(s->s_super_dev);
 	kfree(s->s_subtype);
 	for (int i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
@@ -392,6 +397,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 		goto fail;
 	if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
 		goto fail;
+	s->s_super_dev = super_dev_alloc(0, s);
+	if (!s->s_super_dev)
+		goto fail;
+
 	s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
 	return s;
 
@@ -421,6 +430,77 @@ void put_super(struct super_block *s)
 	}
 }
 
+struct super_dev {
+	dev_t			sd_dev;
+	struct super_block	*sd_sb;
+	refcount_t		sd_ref;
+	struct rhlist_head	sd_node;
+	struct rcu_head		sd_rcu;
+};
+
+static struct rhltable super_dev_table;
+static const struct rhashtable_params super_dev_params = {
+	.key_len	= sizeof(dev_t),
+	.key_offset	= offsetof(struct super_dev, sd_dev),
+	.head_offset	= offsetof(struct super_dev, sd_node),
+};
+
+static struct super_dev *super_dev_alloc(dev_t dev, struct super_block *sb)
+{
+	struct super_dev *fsd;
+
+	fsd = kzalloc_obj(*fsd);
+	if (!fsd)
+		return NULL;
+	fsd->sd_dev = dev;
+	fsd->sd_sb = sb;
+	refcount_set(&fsd->sd_ref, 1);
+	return fsd;
+}
+
+static void super_dev_put(struct super_dev *fsd)
+{
+	/* Unlink only once unpinned, so a cursor never resumes from a removed node. */
+	if (fsd && refcount_dec_and_test(&fsd->sd_ref)) {
+		rhltable_remove(&super_dev_table, &fsd->sd_node, super_dev_params);
+		put_super(fsd->sd_sb);
+		kfree_rcu(fsd, sd_rcu);
+	}
+}
+
+void __init super_dev_init(void)
+{
+	if (rhltable_init(&super_dev_table, &super_dev_params))
+		panic("VFS: Cannot initialise super_dev_table\n");
+}
+
+static int super_dev_insert(struct super_dev *fsd)
+{
+	int err;
+
+	err = rhltable_insert(&super_dev_table, &fsd->sd_node, super_dev_params);
+	if (!err)
+		refcount_inc(&fsd->sd_sb->s_passive);
+	return err;
+}
+
+/* Register @sb under @sb->s_dev as the final fallible act of a set callback. */
+static int super_dev_register(struct super_block *sb)
+{
+	struct super_dev *fsd = sb->s_super_dev;
+	int err;
+
+	lockdep_assert_held(&sb_lock);
+	VFS_WARN_ON_ONCE(!sb->s_dev);
+	VFS_WARN_ON_ONCE(!fsd || fsd->sd_dev);
+
+	fsd->sd_dev = sb->s_dev;
+	err = super_dev_insert(fsd);
+	if (err)
+		fsd->sd_dev = 0;
+	return err;
+}
+
 static void kill_super_notify(struct super_block *sb)
 {
 	lockdep_assert_not_held(&sb->s_umount);
@@ -440,6 +520,12 @@ static void kill_super_notify(struct super_block *sb)
 	hlist_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
 
+	/* Drop sget_fc()'s claim; a never-registered entry stays with the sb. */
+	if (sb->s_super_dev->sd_dev) {
+		super_dev_put(sb->s_super_dev);
+		sb->s_super_dev = NULL;
+	}
+
 	/*
 	 * Let concurrent mounts know that this thing is really dead.
 	 * We don't need @sb->s_umount here as every concurrent caller
@@ -750,6 +836,7 @@ struct super_block *sget_fc(struct fs_context *fc,
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
+
 		s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
@@ -759,11 +846,13 @@ struct super_block *sget_fc(struct fs_context *fc,
 	s->s_fs_info = fc->s_fs_info;
 	err = set(s, fc);
 	if (err) {
+		VFS_WARN_ON_ONCE(s->s_super_dev->sd_dev);
 		s->s_fs_info = NULL;
 		spin_unlock(&sb_lock);
 		destroy_unused_super(s);
 		return ERR_PTR(err);
 	}
+	VFS_WARN_ON_ONCE(!s->s_super_dev->sd_dev);
 	fc->s_fs_info = NULL;
 	s->s_type = fc->fs_type;
 	s->s_iflags |= fc->s_iflags;
@@ -1217,7 +1306,16 @@ EXPORT_SYMBOL(free_anon_bdev);
 
 int set_anon_super(struct super_block *s, void *data)
 {
-	return get_anon_bdev(&s->s_dev);
+	int error;
+
+	error = get_anon_bdev(&s->s_dev);
+	if (error)
+		return error;
+
+	error = super_dev_register(s);
+	if (error)
+		free_anon_bdev(s->s_dev);
+	return error;
 }
 EXPORT_SYMBOL(set_anon_super);
 
@@ -1303,7 +1401,7 @@ EXPORT_SYMBOL(get_tree_keyed);
 static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_dev = *(dev_t *)data;
-	return 0;
+	return super_dev_register(s);
 }
 
 static int super_s_dev_set(struct super_block *s, struct fs_context *fc)
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 68747182abf9..c8172558750f 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -30,6 +30,7 @@ struct mount;
 struct mtd_info;
 struct quotactl_ops;
 struct shrinker;
+struct super_dev;
 struct unicode_map;
 struct user_namespace;
 struct workqueue_struct;
@@ -132,6 +133,7 @@ struct super_operations {
 struct super_block {
 	struct list_head			s_list;		/* Keep this first */
 	dev_t					s_dev;		/* search index; _not_ kdev_t */
+	struct super_dev			*s_super_dev;	/* sget_fc()'s device table claim */
 	unsigned char				s_blocksize_bits;
 	unsigned long				s_blocksize;
 	loff_t					s_maxbytes;	/* Max file size */

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 08/18] fs: add dedicated block device open helpers for filesystems
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Add fs_bdev_file_open_by_{dev,path}() and fs_bdev_file_release(). They
open the device with fs_holder_ops and register a claim in the
device-to-superblock table. Claims on the same (device, superblock)
pair share one entry, so when a filesystem claims a device it already
uses (xfs with its log on the data device), no second entry is added
and each superblock will be acted on once.

The holder argument remains purely the block layer's exclusivity token:
a superblock, or a file_system_type for a device shared by several
superblocks of that type. The shared case only becomes usable once the
fs_holder_ops callbacks resolve superblocks through the table instead
of bdev->bd_holder.

Convert the main device, setup_bdev_super() and kill_block_super(),
over: the open finds the entry registered by sget_fc() and claims it
again. cramfs and romfs bypass kill_block_super() so they can handle
MTD mounts and release the main device with a plain bdev_fput(), which
would leave the claim behind: the (dev, sb) entry would never be
unregistered and the passive reference it holds would keep the
superblock alive forever. Convert their release paths in the same
step.

The frozen-device check stays in setup_bdev_super() for the primary
device and is added to fs_bdev_register() for new claims, i.e. every
additional device a filesystem opens through the helpers. Only a
(device, superblock) pair the superblock claimed earlier may be
reopened while frozen (xfs with its log on the data device): the freeze
already covers that superblock through the existing claim, so nothing
escapes it. Without the setup_bdev_super() check a device frozen before
the mount even started (dm lock_fs, loop) could be mounted and written
to (journal replay) under an active freeze, because the primary open
reuses the entry registered by sget_fc() and never takes the new-claim
path.

Both checks read bd_fsfreeze_count only after the entry is published
(by sget_fc() for the primary, by fs_bdev_register() for new claims)
and pair with bdev_freeze() incrementing the count before walking the
table: either the mount sees the elevated freeze count and fails with
EBUSY, or the freeze finds the published entry and converges once
SB_BORN is set.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/cramfs/inode.c        |   2 +-
 fs/romfs/super.c         |   2 +-
 fs/super.c               | 154 ++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/fs/super.h |   7 +++
 4 files changed, 155 insertions(+), 10 deletions(-)

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 4edbfccd0bbe..d4cd03f4f60d 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -504,7 +504,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 		sb->s_mtd = NULL;
 	} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		bdev_fput(sb->s_bdev_file);
+		fs_bdev_file_release(sb->s_bdev_file, sb);
 	}
 	kfree(sbi);
 }
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ac55193bf398..43eb897197c0 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -587,7 +587,7 @@ static void romfs_kill_sb(struct super_block *sb)
 #ifdef CONFIG_ROMFS_ON_BLOCK
 	if (sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		bdev_fput(sb->s_bdev_file);
+		fs_bdev_file_release(sb->s_bdev_file, sb);
 	}
 #endif
 }
diff --git a/fs/super.c b/fs/super.c
index ff5e305d0ab4..3d166c7f578a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1633,6 +1633,145 @@ const struct blk_holder_ops fs_holder_ops = {
 };
 EXPORT_SYMBOL_GPL(fs_holder_ops);
 
+static struct super_dev *super_dev_lookup(dev_t dev, struct super_block *sb)
+{
+	struct super_dev *it;
+	struct rhlist_head *list, *pos;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious super_dev_lookup() usage");
+	VFS_WARN_ON_ONCE(!dev);
+	VFS_WARN_ON_ONCE(!sb);
+
+	list = rhltable_lookup(&super_dev_table, &dev, super_dev_params);
+	rhl_for_each_entry_rcu(it, pos, list, sd_node) {
+		if (it->sd_sb == sb)
+			return it;
+	}
+
+	return NULL;
+}
+
+static int fs_bdev_register(struct file *bdev_file, struct super_block *sb)
+{
+	struct super_dev *sb_dev __free(kfree) = NULL;
+	dev_t dev = file_bdev(bdev_file)->bd_dev;
+	int err;
+
+	scoped_guard(rcu) {
+		sb_dev = super_dev_lookup(dev, sb);
+		if (sb_dev && refcount_inc_not_zero(&sb_dev->sd_ref)) {
+			retain_and_null_ptr(sb_dev);
+			return 0;
+		}
+	}
+
+	sb_dev = super_dev_alloc(dev, sb);
+	if (!sb_dev)
+		return -ENOMEM;
+
+	err = super_dev_insert(sb_dev);
+	if (err)
+		return err;
+
+	/* Publish the entry before reading the count; pairs with bdev_freeze(). */
+	smp_mb();
+	if (atomic_read(&file_bdev(bdev_file)->bd_fsfreeze_count) > 0) {
+		err = -EBUSY;
+		super_dev_put(sb_dev);
+	}
+
+	retain_and_null_ptr(sb_dev);
+	return err;
+}
+
+/**
+ * fs_bdev_file_open_by_dev - claim a block device on behalf of a superblock
+ * @dev: block device number
+ * @mode: open mode
+ * @holder: block-layer exclusivity token (a superblock, or the file_system_type
+ *          when the device may be shared by several superblocks of that type)
+ * @sb: superblock to drive fs_holder_ops events for
+ *
+ * Open @dev with &fs_holder_ops and register that @sb uses it, so device
+ * removal/sync/freeze/thaw are propagated to @sb (and any other superblock
+ * sharing @dev).  Must be paired with fs_bdev_file_release().
+ *
+ * Return: an opened block-device file or an ERR_PTR().
+ */
+struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+				      struct super_block *sb)
+{
+	struct file *bdev_file;
+	int err;
+
+	bdev_file = bdev_file_open_by_dev(dev, mode, holder, &fs_holder_ops);
+	if (IS_ERR(bdev_file))
+		return bdev_file;
+
+	err = fs_bdev_register(bdev_file, sb);
+	if (err) {
+		bdev_fput(bdev_file);
+		return ERR_PTR(err);
+	}
+	return bdev_file;
+}
+EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_dev);
+
+/**
+ * fs_bdev_file_open_by_path - claim a block device on behalf of a superblock
+ * @path: path to the block device
+ * @mode: open mode
+ * @holder: block-layer exclusivity token (a superblock, or the file_system_type
+ *          when the device may be shared by several superblocks of that type)
+ * @sb: superblock to drive fs_holder_ops events for
+ *
+ * Open the block device at @path with &fs_holder_ops and register that @sb
+ * uses it, so device removal/sync/freeze/thaw are propagated to @sb (and any
+ * other superblock sharing the device).  Must be paired with
+ * fs_bdev_file_release().
+ *
+ * Return: an opened block-device file or an ERR_PTR().
+ */
+struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
+				       void *holder, struct super_block *sb)
+{
+	struct file *bdev_file;
+	int err;
+
+	bdev_file = bdev_file_open_by_path(path, mode, holder, &fs_holder_ops);
+	if (IS_ERR(bdev_file))
+		return bdev_file;
+
+	err = fs_bdev_register(bdev_file, sb);
+	if (err) {
+		bdev_fput(bdev_file);
+		return ERR_PTR(err);
+	}
+	return bdev_file;
+}
+EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_path);
+
+/**
+ * fs_bdev_file_release - release a block device claimed for a superblock
+ * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
+ * @sb: superblock the device was claimed for
+ *
+ * Drop one claim on the {dev, @sb} entry; the last claim unregisters it (a
+ * pinning cursor defers the actual unlink).  Then close the block device.
+ */
+void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
+{
+	dev_t dev = file_bdev(bdev_file)->bd_dev;
+	struct super_dev *sb_dev;
+
+	rcu_read_lock();
+	sb_dev = super_dev_lookup(dev, sb);
+	rcu_read_unlock();
+	super_dev_put(sb_dev);
+	bdev_fput(bdev_file);
+}
+EXPORT_SYMBOL_GPL(fs_bdev_file_release);
+
 int setup_bdev_super(struct super_block *sb, int sb_flags,
 		struct fs_context *fc)
 {
@@ -1640,7 +1779,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	struct file *bdev_file;
 	struct block_device *bdev;
 
-	bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+	bdev_file = fs_bdev_file_open_by_dev(sb->s_dev, mode, sb, sb);
 	if (IS_ERR(bdev_file)) {
 		if (fc)
 			errorf(fc, "%s: Can't open blockdev", fc->source);
@@ -1654,20 +1793,19 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	 * writable from userspace even for a read-only block device.
 	 */
 	if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
-		bdev_fput(bdev_file);
+		fs_bdev_file_release(bdev_file, sb);
 		return -EACCES;
 	}
 
-	/*
-	 * It is enough to check bdev was not frozen before we set
-	 * s_bdev as freezing will wait until SB_BORN is set.
-	 */
+	/* The sget_fc() entry is already published; pairs with bdev_freeze(). */
+	smp_mb();
 	if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
 		if (fc)
 			warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
-		bdev_fput(bdev_file);
+		fs_bdev_file_release(bdev_file, sb);
 		return -EBUSY;
 	}
+
 	spin_lock(&sb_lock);
 	sb->s_bdev_file = bdev_file;
 	sb->s_bdev = bdev;
@@ -1756,7 +1894,7 @@ void kill_block_super(struct super_block *sb)
 	generic_shutdown_super(sb);
 	if (bdev) {
 		sync_blockdev(bdev);
-		bdev_fput(sb->s_bdev_file);
+		fs_bdev_file_release(sb->s_bdev_file, sb);
 	}
 }
 
diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
index f21ffbb6dea5..721d842e3b24 100644
--- a/include/linux/fs/super.h
+++ b/include/linux/fs/super.h
@@ -235,4 +235,11 @@ int freeze_super(struct super_block *super, enum freeze_holder who,
 int thaw_super(struct super_block *super, enum freeze_holder who,
 	       const void *freeze_owner);
 
+struct file;
+struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+				      struct super_block *sb);
+struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
+				       void *holder, struct super_block *sb);
+void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb);
+
 #endif /* _LINUX_FS_SUPER_H */

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 09/18] xfs: port to fs_bdev_file_open_by_path()
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route the log and rt device opens through fs_bdev_file_open_by_path() so
each external device is registered against mp->m_super, and convert the
matching releases to fs_bdev_file_release(). The data device is still
opened and released by setup_bdev_super()/kill_block_super(); when the
log lives on the data device the open resolves to the existing (dev, sb)
entry so the superblock is acted on once.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/xfs/xfs_buf.c   |  2 +-
 fs/xfs/xfs_super.c | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 3ce12fe1c307..2eddd60aaa67 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1615,7 +1615,7 @@ xfs_free_buftarg(
 	fs_put_dax(btp->bt_daxdev, btp->bt_mount);
 	/* the main block device is closed by kill_block_super */
 	if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
-		bdev_fput(btp->bt_file);
+		fs_bdev_file_release(btp->bt_file, btp->bt_mount->m_super);
 	kfree(btp);
 }
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8531d526fc44..d1c622f0a957 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -400,8 +400,8 @@ xfs_blkdev_get(
 	blk_mode_t		mode;
 
 	mode = sb_open_mode(mp->m_super->s_flags);
-	*bdev_filep = bdev_file_open_by_path(name, mode,
-			mp->m_super, &fs_holder_ops);
+	*bdev_filep = fs_bdev_file_open_by_path(name, mode,
+			mp->m_super, mp->m_super);
 	if (IS_ERR(*bdev_filep)) {
 		error = PTR_ERR(*bdev_filep);
 		*bdev_filep = NULL;
@@ -526,7 +526,7 @@ xfs_open_devices(
 		mp->m_logdev_targp = mp->m_ddev_targp;
 		/* Handle won't be used, drop it */
 		if (logdev_file)
-			bdev_fput(logdev_file);
+			fs_bdev_file_release(logdev_file, mp->m_super);
 	}
 
 	return 0;
@@ -541,10 +541,10 @@ xfs_open_devices(
 	mp->m_ddev_targp = NULL;
  out_close_rtdev:
 	 if (rtdev_file)
-		bdev_fput(rtdev_file);
+		fs_bdev_file_release(rtdev_file, mp->m_super);
  out_close_logdev:
 	if (logdev_file)
-		bdev_fput(logdev_file);
+		fs_bdev_file_release(logdev_file, mp->m_super);
 	return error;
 }
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 10/18] btrfs: open via dedicated fs bdev helpers
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable),
	syzbot
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route the device opens through fs_bdev_file_open_by_path() so each device
is registered against the superblock, and convert the matching releases
to fs_bdev_file_release().

The temporary identification opens that only read the superblock and
close again pass a NULL holder and keep using bdev_fput().

On the close path the superblock is taken from bdev_file->private_data
(the holder set at open) rather than device->fs_info->sb: a mount that
fails before btrfs_init_devices_late() runs leaves device->fs_info NULL,
which close_fs_devices() would otherwise dereference.

Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/volumes.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2d9e2ca09c5f..02abbfce5ea3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -480,7 +480,12 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
 	struct block_device *bdev;
 	int ret;
 
-	*bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops);
+	if (holder)
+		*bdev_file = fs_bdev_file_open_by_path(device_path, flags,
+						       holder, holder);
+	else
+		*bdev_file = bdev_file_open_by_path(device_path, flags, NULL,
+						    NULL);
 
 	if (IS_ERR(*bdev_file)) {
 		ret = PTR_ERR(*bdev_file);
@@ -495,7 +500,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
 	if (holder) {
 		ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
 		if (ret) {
-			bdev_fput(*bdev_file);
+			fs_bdev_file_release(*bdev_file, holder);
 			goto error;
 		}
 	}
@@ -503,7 +508,10 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
 	*disk_super = btrfs_read_disk_super(bdev, 0, false);
 	if (IS_ERR(*disk_super)) {
 		ret = PTR_ERR(*disk_super);
-		bdev_fput(*bdev_file);
+		if (holder)
+			fs_bdev_file_release(*bdev_file, holder);
+		else
+			bdev_fput(*bdev_file);
 		goto error;
 	}
 
@@ -727,7 +735,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 
 error_free_page:
 	btrfs_release_disk_super(disk_super);
-	bdev_fput(bdev_file);
+	fs_bdev_file_release(bdev_file, holder);
 
 	return -EINVAL;
 }
@@ -1087,7 +1095,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 			continue;
 
 		if (device->bdev_file) {
-			bdev_fput(device->bdev_file);
+			fs_bdev_file_release(device->bdev_file, device->bdev_file->private_data);
 			device->bdev = NULL;
 			device->bdev_file = NULL;
 			fs_devices->open_devices--;
@@ -1127,10 +1135,12 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
 /* Release a device that was made unfreezable for a membership change. */
 void btrfs_release_device_allow_freeze(struct file *bdev_file)
 {
+	struct super_block *sb = bdev_file->private_data;
+
 	/* Yield before allow (strand-safe); file still open for the allow (UAF-safe). */
 	bdev_yield_claim(bdev_file);
 	bdev_allow_freeze(file_bdev(bdev_file));
-	bdev_fput(bdev_file);
+	fs_bdev_file_release(bdev_file, sb);
 }
 
 static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze)
@@ -1147,7 +1157,8 @@ static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze)
 	if (allow_freeze)
 		btrfs_release_device_allow_freeze(device->bdev_file);
 	else
-		bdev_fput(device->bdev_file);
+		fs_bdev_file_release(device->bdev_file,
+				     device->bdev_file->private_data);
 }
 
 static void btrfs_close_one_device(struct btrfs_device *device)
@@ -2894,8 +2905,8 @@ struct file *btrfs_open_device_deny_freeze(const char *path,
 		return ERR_PTR(ret);
 	}
 
-	bdev_file = bdev_file_open_by_dev(file_bdev(probe_file)->bd_dev,
-					  BLK_OPEN_WRITE, sb, &fs_holder_ops);
+	bdev_file = fs_bdev_file_open_by_dev(file_bdev(probe_file)->bd_dev,
+					     BLK_OPEN_WRITE, sb, sb);
 	if (IS_ERR(bdev_file))
 		bdev_allow_freeze(file_bdev(probe_file));
 	bdev_fput(probe_file);

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 11/18] ext4: open via dedicated fs bdev helpers
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route the external journal device open through fs_bdev_file_open_by_dev()
so it is registered against the superblock, and convert the matching
releases to fs_bdev_file_release().

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/super.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7283108d7609..2b5301a3bcfb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5793,7 +5793,7 @@ failed_mount8: __maybe_unused
 	brelse(sbi->s_sbh);
 	if (sbi->s_journal_bdev_file) {
 		invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
-		bdev_fput(sbi->s_journal_bdev_file);
+		fs_bdev_file_release(sbi->s_journal_bdev_file, sb);
 	}
 out_fail:
 	invalidate_bdev(sb->s_bdev);
@@ -5972,9 +5972,9 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
 	struct ext4_super_block *es;
 	int errno;
 
-	bdev_file = bdev_file_open_by_dev(j_dev,
+	bdev_file = fs_bdev_file_open_by_dev(j_dev,
 		BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
-		sb, &fs_holder_ops);
+		sb, sb);
 	if (IS_ERR(bdev_file)) {
 		ext4_msg(sb, KERN_ERR,
 			 "failed to open journal device unknown-block(%u,%u) %ld",
@@ -6034,7 +6034,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
 out_bh:
 	brelse(bh);
 out_bdev:
-	bdev_fput(bdev_file);
+	fs_bdev_file_release(bdev_file, sb);
 	return ERR_PTR(errno);
 }
 
@@ -6073,7 +6073,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
 out_journal:
 	ext4_journal_destroy(EXT4_SB(sb), journal);
 out_bdev:
-	bdev_fput(bdev_file);
+	fs_bdev_file_release(bdev_file, sb);
 	return ERR_PTR(errno);
 }
 
@@ -7490,7 +7490,7 @@ static void ext4_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 
 	if (bdev_file)
-		bdev_fput(bdev_file);
+		fs_bdev_file_release(bdev_file, sb);
 }
 
 static struct file_system_type ext4_fs_type = {

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 12/18] fs: look up superblocks via the device table in fs_holder_ops
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Switch the fs_holder_ops callbacks from recovering the single owning
superblock out of bdev->bd_holder to walking the device-to-superblock
table and acting on every superblock registered for the device. The
holder argument becomes purely the block layer's exclusivity token and
is no longer needed by the fs specific callbacks.

All devices opened with fs_holder_ops are registered by now: the main
device since setup_bdev_super() switched to fs_bdev_file_open_by_dev()
and the extra devices (xfs log and realtime devices, btrfs member
devices, the ext4 external journal) since the preceding per-filesystem
conversions. So no event is lost in the switchover.

The walk uses a refcount-pinning cursor: each step takes a reference on
the entry via sd_ref and resumes from its sd_node. Unlinking an entry
is deferred to the last unpin, so a cursor never resumes from a removed
node.

mark_dead and sync only need the passive reference the entry holds plus
s_umount, which they take with super_lock_shared(). freeze and thaw
additionally need an active reference and acquire it with
get_active_super(), which waits for the superblock to be born before
taking s_active. Taking s_active before the superblock is born would
pin a still-mounting superblock so a racing mount that aborts could
never drop s_active to zero and reach SB_DYING, deadlocking the wait
for SB_BORN. This is how filesystems_freeze() and filesystems_thaw()
acquire it too.

One semantic change: when no live superblock uses the device anymore
(the holder is dying or was never registered), fs_bdev_freeze() and
fs_bdev_thaw() now return 0 - freeze after syncing the block device -
where they used to return -EINVAL.

The freeze-deny release path moves to the table in the same switchover.
A device made unfreezable for a btrfs membership change must drop its
table entry before re-allowing freezing; otherwise a freeze racing the
release reaches the superblock through the still-registered entry and is
stranded once the release unlinks it. Split fs_bdev_unregister() out of
fs_bdev_file_release() - the inverse of fs_bdev_register() - so
btrfs_release_device_allow_freeze() can drop the {dev, sb} entry, re-allow
freezing on the still-open device, then close it. Re-allowing only after
the entry is gone keeps a racing freeze from reaching the superblock, and
doing it while the file is still open avoids touching the block device
after the close. btrfs previously yielded bd_holder before re-allowing,
which this commit makes irrelevant to freeze resolution.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/volumes.c       |   6 +-
 fs/super.c               | 269 +++++++++++++++++++++++------------------------
 include/linux/fs/super.h |   1 +
 3 files changed, 138 insertions(+), 138 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 02abbfce5ea3..d827d83722c1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1137,10 +1137,10 @@ void btrfs_release_device_allow_freeze(struct file *bdev_file)
 {
 	struct super_block *sb = bdev_file->private_data;
 
-	/* Yield before allow (strand-safe); file still open for the allow (UAF-safe). */
-	bdev_yield_claim(bdev_file);
+	/* Unregister before re-allowing (strand-safe); file still open (UAF-safe). */
+	fs_bdev_unregister(bdev_file, sb);
 	bdev_allow_freeze(file_bdev(bdev_file));
-	fs_bdev_file_release(bdev_file, sb);
+	bdev_fput(bdev_file);
 }
 
 static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze)
diff --git a/fs/super.c b/fs/super.c
index 3d166c7f578a..236e868209a4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -501,6 +501,42 @@ static int super_dev_register(struct super_block *sb)
 	return err;
 }
 
+#ifdef CONFIG_BLOCK
+static struct super_dev *super_dev_get(struct rhlist_head *pos)
+{
+	struct super_dev *sb_dev;
+
+	for (; pos; pos = rcu_dereference_all(pos->next)) {
+		sb_dev = container_of(pos, struct super_dev, sd_node);
+		if (refcount_inc_not_zero(&sb_dev->sd_ref))
+			return sb_dev;
+	}
+	return NULL;
+}
+
+static struct super_dev *super_dev_first(dev_t dev)
+{
+	struct super_dev *sb_dev;
+
+	rcu_read_lock();
+	sb_dev = super_dev_get(rhltable_lookup(&super_dev_table, &dev, super_dev_params));
+	rcu_read_unlock();
+	return sb_dev;
+}
+
+static struct super_dev *super_dev_next(struct super_dev *prev)
+{
+	struct super_dev *sb_dev;
+
+	rcu_read_lock();
+	sb_dev = super_dev_get(rcu_dereference_all(prev->sd_node.next));
+	rcu_read_unlock();
+
+	super_dev_put(prev);
+	return sb_dev;
+}
+#endif
+
 static void kill_super_notify(struct super_block *sb)
 {
 	lockdep_assert_not_held(&sb->s_umount);
@@ -1443,185 +1479,131 @@ struct super_block *sget_dev(struct fs_context *fc, dev_t dev)
 EXPORT_SYMBOL(sget_dev);
 
 #ifdef CONFIG_BLOCK
-/*
- * Lock the superblock that is holder of the bdev. Returns the superblock
- * pointer if we successfully locked the superblock and it is alive. Otherwise
- * we return NULL and just unlock bdev->bd_holder_lock.
- *
- * The function must be called with bdev->bd_holder_lock and releases it.
- */
-static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
-	__releases(&bdev->bd_holder_lock)
+static int fs_super_freeze(struct super_block *sb)
 {
-	struct super_block *sb = bdev->bd_holder;
-	bool locked;
-
-	lockdep_assert_held(&bdev->bd_holder_lock);
-	lockdep_assert_not_held(&sb->s_umount);
-	lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
-
-	/* Make sure sb doesn't go away from under us */
-	refcount_inc(&sb->s_passive);
-
-	mutex_unlock(&bdev->bd_holder_lock);
-
-	locked = super_lock(sb, excl);
-
-	/*
-	 * If the superblock wasn't already SB_DYING then we hold
-	 * s_umount and can safely drop our temporary reference.
-         */
-	put_super(sb);
-
-	if (!locked)
-		return NULL;
-
-	if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
-		super_unlock(sb, excl);
-		return NULL;
-	}
+	if (sb->s_op->freeze_super)
+		return sb->s_op->freeze_super(sb,
+				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
+	return freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
+}
 
-	return sb;
+static int fs_super_thaw(struct super_block *sb)
+{
+	if (sb->s_op->thaw_super)
+		return sb->s_op->thaw_super(sb,
+				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
+	return thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
 }
 
 static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
 {
-	struct super_block *sb;
+	struct super_dev *sb_dev;
+	dev_t dev = bdev->bd_dev;
 
-	sb = bdev_super_lock(bdev, false);
-	if (!sb)
-		return;
+	mutex_unlock(&bdev->bd_holder_lock);
 
-	if (sb->s_op->remove_bdev) {
-		int ret;
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		struct super_block *sb = sb_dev->sd_sb;
 
-		ret = sb->s_op->remove_bdev(sb, bdev);
-		if (!ret) {
-			super_unlock_shared(sb);
-			return;
+		if (!super_lock_shared(sb))
+			continue;
+		if (sb->s_root && (sb->s_flags & SB_ACTIVE)) {
+			if (!sb->s_op->remove_bdev ||
+			    sb->s_op->remove_bdev(sb, bdev)) {
+				if (!surprise)
+					sync_filesystem(sb);
+				shrink_dcache_sb(sb);
+				evict_inodes(sb);
+				if (sb->s_op->shutdown)
+					sb->s_op->shutdown(sb);
+			}
 		}
-		/* Fallback to shutdown. */
+		super_unlock_shared(sb);
 	}
-
-	if (!surprise)
-		sync_filesystem(sb);
-	shrink_dcache_sb(sb);
-	evict_inodes(sb);
-	if (sb->s_op->shutdown)
-		sb->s_op->shutdown(sb);
-
-	super_unlock_shared(sb);
 }
 
 static void fs_bdev_sync(struct block_device *bdev)
 {
-	struct super_block *sb;
-
-	sb = bdev_super_lock(bdev, false);
-	if (!sb)
-		return;
+	struct super_dev *sb_dev;
+	dev_t dev = bdev->bd_dev;
 
-	sync_filesystem(sb);
-	super_unlock_shared(sb);
-}
+	mutex_unlock(&bdev->bd_holder_lock);
 
-static struct super_block *get_bdev_super(struct block_device *bdev)
-{
-	bool active = false;
-	struct super_block *sb;
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		struct super_block *sb = sb_dev->sd_sb;
 
-	sb = bdev_super_lock(bdev, true);
-	if (sb) {
-		active = atomic_inc_not_zero(&sb->s_active);
-		super_unlock_excl(sb);
+		if (!super_lock_shared(sb))
+			continue;
+		if (sb->s_root && (sb->s_flags & SB_ACTIVE))
+			sync_filesystem(sb);
+		super_unlock_shared(sb);
 	}
-	if (!active)
-		return NULL;
-	return sb;
 }
 
 /**
- * fs_bdev_freeze - freeze owning filesystem of block device
+ * fs_bdev_freeze - freeze every superblock using a block device
  * @bdev: block device
  *
- * Freeze the filesystem that owns this block device if it is still
- * active.
- *
- * A filesystem that owns multiple block devices may be frozen from each
- * block device and won't be unfrozen until all block devices are
- * unfrozen. Each block device can only freeze the filesystem once as we
- * nest freezes for block devices in the block layer.
+ * Freeze each live superblock using @bdev.  A superblock owning several block
+ * devices is frozen once per device and stays frozen until all are thawed; the
+ * block layer nests these freezes so the count stays balanced.
  *
- * Return: If the freeze was successful zero is returned. If the freeze
- *         failed a negative error code is returned.
+ * Return: 0, or the first error from freezing a superblock or syncing the
+ *         block device.
  */
 static int fs_bdev_freeze(struct block_device *bdev)
 {
-	struct super_block *sb;
-	int error = 0;
+	dev_t dev = bdev->bd_dev;
+	struct super_dev *sb_dev;
+	int error = 0, err;
 
 	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
 
-	sb = get_bdev_super(bdev);
-	if (!sb)
-		return -EINVAL;
+	mutex_unlock(&bdev->bd_holder_lock);
+
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		if (!get_active_super(sb_dev->sd_sb))
+			continue;
+		err = fs_super_freeze(sb_dev->sd_sb);
+		if (err && !error)
+			error = err;
+		deactivate_super(sb_dev->sd_sb);
+	}
 
-	if (sb->s_op->freeze_super)
-		error = sb->s_op->freeze_super(sb,
-				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
-	else
-		error = freeze_super(sb,
-				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
 	if (!error)
 		error = sync_blockdev(bdev);
-	deactivate_super(sb);
 	return error;
 }
 
 /**
- * fs_bdev_thaw - thaw owning filesystem of block device
+ * fs_bdev_thaw - thaw every superblock using a block device
  * @bdev: block device
  *
- * Thaw the filesystem that owns this block device.
+ * The counterpart to fs_bdev_freeze(): thaw each live superblock using @bdev.
+ * A zero return does not imply a superblock is fully unfrozen; it may have been
+ * frozen more than once (by the kernel or via another device).
  *
- * A filesystem that owns multiple block devices may be frozen from each
- * block device and won't be unfrozen until all block devices are
- * unfrozen. Each block device can only freeze the filesystem once as we
- * nest freezes for block devices in the block layer.
- *
- * Return: If the thaw was successful zero is returned. If the thaw
- *         failed a negative error code is returned. If this function
- *         returns zero it doesn't mean that the filesystem is unfrozen
- *         as it may have been frozen multiple times (kernel may hold a
- *         freeze or might be frozen from other block devices).
+ * Return: 0, or the first error from thawing a superblock.
  */
 static int fs_bdev_thaw(struct block_device *bdev)
 {
-	struct super_block *sb;
-	int error;
+	dev_t dev = bdev->bd_dev;
+	struct super_dev *sb_dev;
+	int error = 0, err;
 
 	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
 
-	/*
-	 * The block device may have been frozen before it was claimed by a
-	 * filesystem. Concurrently another process might try to mount that
-	 * frozen block device and has temporarily claimed the block device for
-	 * that purpose causing a concurrent fs_bdev_thaw() to end up here. The
-	 * mounter is already about to abort mounting because they still saw an
-	 * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return
-	 * NULL in that case.
-	 */
-	sb = get_bdev_super(bdev);
-	if (!sb)
-		return -EINVAL;
+	mutex_unlock(&bdev->bd_holder_lock);
+
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		if (!get_active_super(sb_dev->sd_sb))
+			continue;
+		err = fs_super_thaw(sb_dev->sd_sb);
+		if (err && !error)
+			error = err;
+		deactivate_super(sb_dev->sd_sb);
+	}
 
-	if (sb->s_op->thaw_super)
-		error = sb->s_op->thaw_super(sb,
-				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
-	else
-		error = thaw_super(sb,
-				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
-	deactivate_super(sb);
 	return error;
 }
 
@@ -1752,14 +1734,18 @@ struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
 EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_path);
 
 /**
- * fs_bdev_file_release - release a block device claimed for a superblock
+ * fs_bdev_unregister - drop a superblock's claim on a block device
  * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
  * @sb: superblock the device was claimed for
  *
- * Drop one claim on the {dev, @sb} entry; the last claim unregisters it (a
- * pinning cursor defers the actual unlink).  Then close the block device.
+ * The inverse of fs_bdev_register(): drop one claim on the {dev, @sb} entry
+ * (the last claim unregisters it; a pinning cursor defers the actual unlink)
+ * without closing the device.  A caller that must act on the still-open device
+ * between unregistering and closing - e.g. re-allow freezing one denied for a
+ * membership change - pairs this with bdev_fput().  fs_bdev_file_release() is
+ * the common unregister-and-close.
  */
-void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
+void fs_bdev_unregister(struct file *bdev_file, struct super_block *sb)
 {
 	dev_t dev = file_bdev(bdev_file)->bd_dev;
 	struct super_dev *sb_dev;
@@ -1768,6 +1754,19 @@ void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
 	sb_dev = super_dev_lookup(dev, sb);
 	rcu_read_unlock();
 	super_dev_put(sb_dev);
+}
+EXPORT_SYMBOL_GPL(fs_bdev_unregister);
+
+/**
+ * fs_bdev_file_release - release a block device claimed for a superblock
+ * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
+ * @sb: superblock the device was claimed for
+ *
+ * Unregister the {dev, @sb} entry, then close the block device.
+ */
+void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
+{
+	fs_bdev_unregister(bdev_file, sb);
 	bdev_fput(bdev_file);
 }
 EXPORT_SYMBOL_GPL(fs_bdev_file_release);
diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
index 721d842e3b24..8c3987040ed1 100644
--- a/include/linux/fs/super.h
+++ b/include/linux/fs/super.h
@@ -240,6 +240,7 @@ struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 				      struct super_block *sb);
 struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
 				       void *holder, struct super_block *sb);
+void fs_bdev_unregister(struct file *bdev_file, struct super_block *sb);
 void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb);
 
 #endif /* _LINUX_FS_SUPER_H */

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 13/18] fs: tolerate per-superblock freeze errors on shared devices
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

When several superblocks share a device, keep it frozen even if some of
them failed to freeze and swallow the error: rolling the others back
via thaw_super() can fail too, so neither is a clear win. A single
filesystem still reports its error, and a sync_blockdev() failure is
always reported. Thaw follows the same rule.

A device can only be shared once superblocks claim it with a common
exclusivity token, which erofs starts doing in the next patch; for
everyone else the loop visits exactly one superblock and the behavior
is unchanged.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 236e868209a4..a83f58755cf8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1548,13 +1548,15 @@ static void fs_bdev_sync(struct block_device *bdev)
  * devices is frozen once per device and stays frozen until all are thawed; the
  * block layer nests these freezes so the count stays balanced.
  *
- * Return: 0, or the first error from freezing a superblock or syncing the
- *         block device.
+ * Return: 0, or the error from the one superblock on a single-fs device.  When
+ *         several superblocks share @bdev a per-superblock failure is swallowed
+ *         (see below), but a sync_blockdev() failure is always reported.
  */
 static int fs_bdev_freeze(struct block_device *bdev)
 {
 	dev_t dev = bdev->bd_dev;
 	struct super_dev *sb_dev;
+	unsigned int count = 0;
 	int error = 0, err;
 
 	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
@@ -1568,8 +1570,17 @@ static int fs_bdev_freeze(struct block_device *bdev)
 		if (err && !error)
 			error = err;
 		deactivate_super(sb_dev->sd_sb);
+		count++;
 	}
 
+	/*
+	 * When several superblocks share the device, keep it frozen even if some
+	 * of them failed to freeze and swallow the error: rolling the rest back
+	 * via thaw_super() can fail too, so neither is a clear win. A single
+	 * filesystem (count == 1) still reports its error.
+	 */
+	if (error && count > 1)
+		error = 0;
 	if (!error)
 		error = sync_blockdev(bdev);
 	return error;
@@ -1583,12 +1594,14 @@ static int fs_bdev_freeze(struct block_device *bdev)
  * A zero return does not imply a superblock is fully unfrozen; it may have been
  * frozen more than once (by the kernel or via another device).
  *
- * Return: 0, or the first error from thawing a superblock.
+ * Return: 0, or the first error on a single-fs device; a shared device swallows
+ *         per-superblock errors, as fs_bdev_freeze() does.
  */
 static int fs_bdev_thaw(struct block_device *bdev)
 {
 	dev_t dev = bdev->bd_dev;
 	struct super_dev *sb_dev;
+	unsigned int count = 0;
 	int error = 0, err;
 
 	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
@@ -1602,8 +1615,12 @@ static int fs_bdev_thaw(struct block_device *bdev)
 		if (err && !error)
 			error = err;
 		deactivate_super(sb_dev->sd_sb);
+		count++;
 	}
 
+	/* Shared device: swallow per-superblock errors, like fs_bdev_freeze(). */
+	if (error && count > 1)
+		error = 0;
 	return error;
 }
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 14/18] erofs: open via dedicated fs bdev helpers
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable),
	Gao Xiang
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route opens through fs_bdev_file_open_by_path() so each external device
is registered against the correct superblock, and convert the matching
releases.

Gao Xiang: I think typical immutable filesystems don't need .shutdown()
and .remove_bdev() for the following reasons:

  - blk_mark_disk_dead() sets GD_DEAD in advance of fs_bdev_mark_dead()
    so that the following bios will fail immediately; block_device
    references are still valid so it seems overkill to handle dead
    blockdevs in the deep filesystem I/O submission path.

  - Immutable filesystems like EROFS don't have write paths and journals,
    so they don't need to block writes (i.e., new dirty pages), metadata
    changes, and abort journals.

  - The comment above loop_change_fd() documents a valid read-only use
    case we need to support anyway, but it calls disk_force_media_change()
    which will call fs_bdev_mark_dead() later: we don't want loop_change_fd()
    shutdowns the active filesystems and return -EIO unconditionally.

Currently I think the default behavior (shrink_dcache_sb + evict_inodes)
in fs_bdev_mark_dead() is enough for immutable filesystems, tried to
document in the commit here for later reference.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/erofs/super.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 802add6652fd..def9cbfbc9d8 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -153,8 +153,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 	} else if (!sbi->devs->flatdev) {
 		file = erofs_is_fileio_mode(sbi) ?
 				filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
-				bdev_file_open_by_path(dif->path,
-						BLK_OPEN_READ, sb->s_type, NULL);
+				fs_bdev_file_open_by_path(dif->path,
+						BLK_OPEN_READ, sb->s_type, sb);
 		if (IS_ERR(file)) {
 			if (file == ERR_PTR(-ENOTBLK))
 				return -EINVAL;
@@ -843,11 +843,16 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
 
 static int erofs_release_device_info(int id, void *ptr, void *data)
 {
+	struct super_block *sb = data;
 	struct erofs_device_info *dif = ptr;
 
 	fs_put_dax(dif->dax_dev, NULL);
-	if (dif->file)
-		fput(dif->file);
+	if (dif->file) {
+		if (S_ISBLK(file_inode(dif->file)->i_mode))
+			fs_bdev_file_release(dif->file, sb);
+		else
+			fput(dif->file);
+	}
 	erofs_fscache_unregister_cookie(dif->fscache);
 	dif->fscache = NULL;
 	kfree(dif->path);
@@ -855,18 +860,19 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
 	return 0;
 }
 
-static void erofs_free_dev_context(struct erofs_dev_context *devs)
+static void erofs_free_dev_context(struct erofs_dev_context *devs,
+				   struct super_block *sb)
 {
 	if (!devs)
 		return;
-	idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+	idr_for_each(&devs->tree, &erofs_release_device_info, sb);
 	idr_destroy(&devs->tree);
 	kfree(devs);
 }
 
-static void erofs_sb_free(struct erofs_sb_info *sbi)
+static void erofs_sb_free(struct erofs_sb_info *sbi, struct super_block *sb)
 {
-	erofs_free_dev_context(sbi->devs);
+	erofs_free_dev_context(sbi->devs, sb);
 	kfree(sbi->fsid);
 	kfree_sensitive(sbi->domain_id);
 	if (sbi->dif0.file)
@@ -879,8 +885,13 @@ static void erofs_fc_free(struct fs_context *fc)
 {
 	struct erofs_sb_info *sbi = fc->s_fs_info;
 
-	if (sbi) /* free here if an error occurs before transferring to sb */
-		erofs_sb_free(sbi);
+	/*
+	 * Freed here only if an error occurs before the sb is set up; at that
+	 * point no block-backed device has been claimed (that happens in
+	 * fill_super), so the NULL sb never reaches fs_bdev_file_release().
+	 */
+	if (sbi)
+		erofs_sb_free(sbi, NULL);
 }
 
 static const struct fs_context_operations erofs_context_ops = {
@@ -936,7 +947,7 @@ static void erofs_kill_sb(struct super_block *sb)
 	erofs_drop_internal_inodes(sbi);
 	fs_put_dax(sbi->dif0.dax_dev, NULL);
 	erofs_fscache_unregister_fs(sb);
-	erofs_sb_free(sbi);
+	erofs_sb_free(sbi, sb);
 	sb->s_fs_info = NULL;
 }
 
@@ -948,7 +959,7 @@ static void erofs_put_super(struct super_block *sb)
 	erofs_shrinker_unregister(sb);
 	erofs_xattr_prefixes_cleanup(sb);
 	erofs_drop_internal_inodes(sbi);
-	erofs_free_dev_context(sbi->devs);
+	erofs_free_dev_context(sbi->devs, sb);
 	sbi->devs = NULL;
 	erofs_fscache_unregister_fs(sb);
 }

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 15/18] f2fs: open via dedicated fs bdev helpers
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route the extra device opens of a multi-device f2fs through
fs_bdev_file_open_by_path() so each device is registered against the
superblock, and convert the matching release in destroy_device_list()
to fs_bdev_file_release(). The first device aliases the main bdev file
opened by setup_bdev_super() and is already registered through it.

f2fs opened its extra devices without holder ops, so a freeze, sync, or
removal of one of them was never propagated to the superblock.
Registering them wires those events up: every device now freezes,
thaws, syncs, and shuts down the filesystem like the main device does.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/f2fs/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ccf806b676f5..49349262564f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1970,7 +1970,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
 
 	for (i = 0; i < sbi->s_ndevs; i++) {
 		if (i > 0)
-			bdev_fput(FDEV(i).bdev_file);
+			fs_bdev_file_release(FDEV(i).bdev_file, sbi->sb);
 #ifdef CONFIG_BLK_DEV_ZONED
 		kvfree(FDEV(i).blkz_seq);
 #endif
@@ -4840,8 +4840,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 				FDEV(i).end_blk = FDEV(i).start_blk +
 						SEGS_TO_BLKS(sbi,
 						FDEV(i).total_segments) - 1;
-				FDEV(i).bdev_file = bdev_file_open_by_path(
-					FDEV(i).path, mode, sbi->sb, NULL);
+				FDEV(i).bdev_file = fs_bdev_file_open_by_path(
+					FDEV(i).path, mode, sbi->sb, sbi->sb);
 			}
 		}
 		if (IS_ERR(FDEV(i).bdev_file))

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 16/18] super: make fs_holder_ops private
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Now that filesystems open and claim their block devices through
fs_bdev_file_open_by_{dev,path}(), nothing outside fs/super.c references
fs_holder_ops. Make it static and drop its declaration from blkdev.h.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c             | 3 +--
 include/linux/blkdev.h | 7 -------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index a83f58755cf8..2d0a07861bfc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1624,13 +1624,12 @@ static int fs_bdev_thaw(struct block_device *bdev)
 	return error;
 }
 
-const struct blk_holder_ops fs_holder_ops = {
+static const struct blk_holder_ops fs_holder_ops = {
 	.mark_dead		= fs_bdev_mark_dead,
 	.sync			= fs_bdev_sync,
 	.freeze			= fs_bdev_freeze,
 	.thaw			= fs_bdev_thaw,
 };
-EXPORT_SYMBOL_GPL(fs_holder_ops);
 
 static struct super_dev *super_dev_lookup(dev_t dev, struct super_block *sb)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cee548184a7b..45225b4f7193 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1772,13 +1772,6 @@ struct blk_holder_ops {
 		__releases(&bdev->bd_holder_lock);
 };
 
-/*
- * For filesystems using @fs_holder_ops, the @holder argument passed to
- * helpers used to open and claim block devices via
- * bd_prepare_to_claim() must point to a superblock.
- */
-extern const struct blk_holder_ops fs_holder_ops;
-
 /*
  * Return the correct open flags for blkdev_get_by_* for super block flags
  * as stored in sb->s_flags.

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 17/18] fs: look up the superblock via the device table in user_get_super()
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

user_get_super() still finds the superblock for a device number by
walking the global super_blocks list under sb_lock. Every superblock is
registered in the device table under its s_dev since sget_fc() inserts
it there, including superblocks on anonymous devices, so use the table
instead.

The refcount-pinning cursor helpers super_dev_{get,first,next}() only
touch table state and do not depend on CONFIG_BLOCK, so drop the
CONFIG_BLOCK guard around them: their new caller serves anonymous
devices as well (ustat() on e.g. tmpfs) and is built without
CONFIG_BLOCK. The guard falls in this patch rather than separately
since without this caller the helpers would be unused without
CONFIG_BLOCK.

The pinned entry holds a passive reference on the superblock so
super_lock() can be called directly; once the superblock is locked grab
a passive reference for the caller before dropping the pin.

The device table contains more than the old walk could find: a
superblock is also registered for every additional device it claims
(the xfs log and realtime devices, btrfs member devices, the ext4
external journal, erofs blob devices). Don't filter those out:
specifying any device a filesystem uses now resolves to that
filesystem, so ustat() and quotactl() work on e.g. the xfs log device
or a btrfs member device (the latter used to fail outright as btrfs
superblocks carry an anonymous s_dev that never matches a member
device). When several superblocks share a device (erofs blob devices)
the first live superblock wins.

The cursor also keeps scanning past dying superblocks where the old
walk gave up after the first s_dev match, so a mount racing with the
unmount of the same device (or with the reuse of a recycled anonymous
dev_t) finds the live superblock where the old walk could spuriously
return NULL.

This removes the last s_dev-keyed walk of the super_blocks list and
takes ustat() and quotactl()'s block device lookup off sb_lock
entirely.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 2d0a07861bfc..93f24aea75c4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -501,7 +501,6 @@ static int super_dev_register(struct super_block *sb)
 	return err;
 }
 
-#ifdef CONFIG_BLOCK
 static struct super_dev *super_dev_get(struct rhlist_head *pos)
 {
 	struct super_dev *sb_dev;
@@ -535,7 +534,6 @@ static struct super_dev *super_dev_next(struct super_dev *prev)
 	super_dev_put(prev);
 	return sb_dev;
 }
-#endif
 
 static void kill_super_notify(struct super_block *sb)
 {
@@ -1044,29 +1042,19 @@ EXPORT_SYMBOL(iterate_supers_type);
 
 struct super_block *user_get_super(dev_t dev, bool excl)
 {
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		bool locked;
+	struct super_dev *sb_dev;
 
-		if (sb->s_dev != dev)
-			continue;
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		struct super_block *sb = sb_dev->sd_sb;
 
-		if (!refcount_inc_not_zero(&sb->s_passive))
+		if (!super_lock(sb, excl))
 			continue;
 
-		spin_unlock(&sb_lock);
-
-		locked = super_lock(sb, excl);
-		if (locked)
-			return sb;
-
-		put_super(sb);
-		spin_lock(&sb_lock);
-		break;
+		/* The pinned entry holds a passive reference, take our own. */
+		refcount_inc(&sb->s_passive);
+		super_dev_put(sb_dev);
+		return sb;
 	}
-	spin_unlock(&sb_lock);
 	return NULL;
 }
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 18/18] selftests/filesystems: add ustat() coverage
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

user_get_super() is now backed by the global device-to-superblock table
instead of a walk of the super_blocks list. ustat(2) is its most direct
user-visible consumer but nothing in the tree exercises it.

Add a small regression test: the device number of a mounted tmpfs (an
anonymous device, registered in the table by sget_fc()) must resolve,
it must stop resolving after the unmount (the entry is dropped again in
kill_super_notify()), and bogus device numbers keep reporting EINVAL.

The test passes on kernels before the conversion: it pins down the
semantics the table-backed lookup must preserve.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 tools/testing/selftests/filesystems/.gitignore   |   1 +
 tools/testing/selftests/filesystems/Makefile     |   2 +-
 tools/testing/selftests/filesystems/ustat_test.c | 135 +++++++++++++++++++++++
 3 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index 64ac0dfa46b7..1bd53d54553c 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -5,3 +5,4 @@ fclog
 file_stressor
 anon_inode_test
 kernfs_test
+ustat_test
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 85427d7f19b9..bbdd40b167fa 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog
+TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog ustat_test
 TEST_GEN_PROGS_EXTENDED := dnotify_test
 
 include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/ustat_test.c b/tools/testing/selftests/filesystems/ustat_test.c
new file mode 100644
index 000000000000..d429fd18d779
--- /dev/null
+++ b/tools/testing/selftests/filesystems/ustat_test.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test ustat(2): looking up superblocks by device number.
+ *
+ * ustat() resolves a device number to a mounted superblock via
+ * user_get_super(). Check that the device number of a mounted tmpfs (an
+ * anonymous device) resolves, that it stops resolving once the filesystem
+ * is unmounted and that bogus device numbers report EINVAL.
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+/* struct ustat is not exported through UAPI, mirror include/linux/types.h. */
+struct ustat_buf {
+	int		f_tfree;
+	unsigned long	f_tinode;
+	char		f_fname[6];
+	char		f_fpack[6];
+	/* slack in case an architecture lays the struct out differently */
+	char		pad[64];
+};
+
+#ifdef __NR_ustat
+
+/*
+ * The kernel decodes @dev with new_decode_dev(), which matches the low 32
+ * bits of the st_dev encoding stat(2) returns for any major below 4096.
+ */
+static int sys_ustat(unsigned int dev, struct ustat_buf *buf)
+{
+	return syscall(__NR_ustat, dev, buf);
+}
+
+static int write_string(const char *path, const char *string)
+{
+	ssize_t len = strlen(string);
+	int fd;
+
+	fd = open(path, O_WRONLY);
+	if (fd < 0)
+		return -1;
+	if (write(fd, string, len) != len) {
+		close(fd);
+		return -1;
+	}
+	return close(fd);
+}
+
+/* Enter namespaces in which mounting a tmpfs instance is allowed. */
+static int setup_namespaces(void)
+{
+	uid_t uid = getuid();
+	gid_t gid = getgid();
+	char map[64];
+
+	if (unshare(CLONE_NEWNS | (uid ? CLONE_NEWUSER : 0)))
+		return -1;
+
+	if (uid) {
+		if (write_string("/proc/self/setgroups", "deny"))
+			return -1;
+		snprintf(map, sizeof(map), "0 %d 1", uid);
+		if (write_string("/proc/self/uid_map", map))
+			return -1;
+		snprintf(map, sizeof(map), "0 %d 1", gid);
+		if (write_string("/proc/self/gid_map", map))
+			return -1;
+	}
+
+	return mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
+}
+
+TEST(resolves_mounted_superblock)
+{
+	char dir[] = "/tmp/ustat_test.XXXXXX";
+	struct ustat_buf ub;
+	struct stat st;
+
+	ASSERT_NE(NULL, mkdtemp(dir));
+
+	if (setup_namespaces()) {
+		rmdir(dir);
+		SKIP(return, "cannot set up namespaces: %s", strerror(errno));
+	}
+
+	ASSERT_EQ(0, mount("ustat_test", dir, "tmpfs", 0, NULL));
+	ASSERT_EQ(0, stat(dir, &st));
+
+	memset(&ub, 0xff, sizeof(ub));
+	ASSERT_EQ(0, sys_ustat(st.st_dev, &ub))
+		TH_LOG("ustat(%u): %s", (unsigned int)st.st_dev,
+		       strerror(errno));
+
+	ASSERT_EQ(0, umount(dir));
+
+	/* The unmount removed the superblock, the device is gone. */
+	ASSERT_EQ(-1, sys_ustat(st.st_dev, &ub));
+	ASSERT_EQ(EINVAL, errno);
+
+	rmdir(dir);
+}
+
+TEST(bogus_device_numbers)
+{
+	struct ustat_buf ub;
+
+	ASSERT_EQ(-1, sys_ustat(0, &ub));
+	ASSERT_EQ(EINVAL, errno);
+
+	/* major 4095, minor 1048575: nothing plausible lives there */
+	ASSERT_EQ(-1, sys_ustat((0xfffu << 8) | 0xffu | (0xfff00u << 12), &ub));
+	ASSERT_EQ(EINVAL, errno);
+}
+
+#else /* !__NR_ustat */
+
+TEST(unsupported)
+{
+	SKIP(return, "ustat(2) is not available on this architecture");
+}
+
+#endif /* __NR_ustat */
+
+TEST_HARNESS_MAIN

-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH v4 08/23] ext4: implement buffered write path using iomap
From: Zhang Yi @ 2026-06-16 14:42 UTC (permalink / raw)
  To: Jan Kara, Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, ojaswin, ritesh.list, djwong, hch, yi.zhang, yangerkun,
	yukuai
In-Reply-To: <xuxrr3ls4gttesznypeplbawvnxwa4mqcbt7fkpdtdn3cfcfrv@paiigv322v3n>

On 6/16/2026 6:45 PM, Jan Kara wrote:
> On Mon 11-05-26 15:23:28, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Introduce two new iomap_ops instances for ext4 buffered writes:
>>
>>   - ext4_iomap_buffered_da_write_ops: for delayed allocation mode, using
>>     ext4_da_map_blocks() to map delalloc extents.
>>   - ext4_iomap_buffered_write_ops: for non-delayed allocation mode, using
>>     ext4_iomap_get_blocks() to directly allocate blocks.
>>
>> Also add ext4_iomap_valid() for the iomap infrastructure to check extent
>> validity.
>>
>> Key changes and considerations:
>>
>>   - Unwritten extents for new blocks (dioread_nolock always on)
>>     Since data=ordered mode is not used to prevent stale data exposure in
>>     the non-delayed allocation path, new blocks are always allocated as
>>     unwritten extents.
>>
>>   - Short write and write failure handling
>>     a. Delalloc path: On short write or failure, the stale delalloc range
>>        must be dropped and its space reservation released. Otherwise, a
>>        clean folio may cover leftover delalloc extents, causing
>>        inaccurate space reservation accounting.
>>     b. Non-delalloc path: No cleanup of allocated blocks is needed on
>>        short write.
>>
>>   - Lock ordering reversal
>>     The folio lock and transaction start ordering is reversed compared to
>>     the buffer_head buffered write path. To handle this, the journal
>>     handle must be stopped in iomap_begin() callbacks. The lock ordering
>>     documentation in super.c has been updated accordingly.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> 
> Looks good to me - besides the IOMAP_F_NEW bugs Ojaswin found. One
> observation I have here is that since the old indirect block based on-disk
> format doesn't support unwritten extents we can never transition it to the
> iomap scheme used here. So we'll have to figure out some way to avoid
> maintaining two (actually three if we count data=journal) buffered write /
> writeback paths in the long term. But let's address that once things settle
> for the common paths.
> 
> 								Honza

Yes, I agree. In the future, we need to convert the buffered I/O path
to iomap as much as possible to reduce maintenance costs. For the ext3
filesystem, which does not support unwritten extents, I haven't given
much thought to a conversion plan at the moment. Perhaps we could
implement it through the "delay map" we discussed earlier in v2. After
this patch set is finished, we can take a closer look at the solution.
:-)

Best Regards,
Yi.

> 
>> ---
>>   fs/ext4/ext4.h  |   4 ++
>>   fs/ext4/file.c  |  20 +++++-
>>   fs/ext4/inode.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++-
>>   fs/ext4/super.c |  10 ++-
>>   4 files changed, 192 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1e27d73d7427..4832e7f7db82 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -3057,6 +3057,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>>   int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>>   				struct buffer_head *bh);
>>   void ext4_set_inode_mapping_order(struct inode *inode);
>> +int ext4_nonda_switch(struct super_block *sb);
>>   #define FALL_BACK_TO_NONDELALLOC 1
>>   #define CONVERT_INLINE_DATA	 2
>>   
>> @@ -3926,6 +3927,9 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
>>   
>>   extern const struct iomap_ops ext4_iomap_ops;
>>   extern const struct iomap_ops ext4_iomap_report_ops;
>> +extern const struct iomap_ops ext4_iomap_buffered_write_ops;
>> +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
>> +extern const struct iomap_write_ops ext4_iomap_write_ops;
>>   
>>   static inline int ext4_buffer_uptodate(struct buffer_head *bh)
>>   {
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index eb1a323962b1..7f9bfbbc4a4e 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -299,6 +299,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>>   	return count;
>>   }
>>   
>> +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
>> +					 struct iov_iter *from)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	const struct iomap_ops *iomap_ops;
>> +
>> +	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
>> +		iomap_ops = &ext4_iomap_buffered_da_write_ops;
>> +	else
>> +		iomap_ops = &ext4_iomap_buffered_write_ops;
>> +
>> +	return iomap_file_buffered_write(iocb, from, iomap_ops,
>> +					 &ext4_iomap_write_ops, NULL);
>> +}
>> +
>>   static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>>   					struct iov_iter *from)
>>   {
>> @@ -313,7 +328,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>>   	if (ret <= 0)
>>   		goto out;
>>   
>> -	ret = generic_perform_write(iocb, from);
>> +	if (ext4_inode_buffered_iomap(inode))
>> +		ret = ext4_iomap_buffered_write(iocb, from);
>> +	else
>> +		ret = generic_perform_write(iocb, from);
>>   
>>   out:
>>   	inode_unlock(inode);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 39577a6b65b9..1ae7d3f4a1c8 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -3097,7 +3097,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
>>   	return ret;
>>   }
>>   
>> -static int ext4_nonda_switch(struct super_block *sb)
>> +int ext4_nonda_switch(struct super_block *sb)
>>   {
>>   	s64 free_clusters, dirty_clusters;
>>   	struct ext4_sb_info *sbi = EXT4_SB(sb);
>> @@ -3467,6 +3467,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
>>   	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
>>   }
>>   
>> +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
>> +{
>> +	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
>> +}
>> +
>> +const struct iomap_write_ops ext4_iomap_write_ops = {
>> +	.iomap_valid = ext4_iomap_valid,
>> +};
>> +
>>   static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>>   			   struct ext4_map_blocks *map, loff_t offset,
>>   			   loff_t length, unsigned int flags)
>> @@ -3501,6 +3510,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>>   	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>>   		iomap->flags |= IOMAP_F_MERGED;
>>   
>> +	iomap->validity_cookie = map->m_seq;
>> +
>>   	/*
>>   	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
>>   	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
>> @@ -3908,8 +3919,12 @@ const struct iomap_ops ext4_iomap_report_ops = {
>>   	.iomap_begin = ext4_iomap_begin_report,
>>   };
>>   
>> +/* Map blocks */
>> +typedef int (ext4_get_blocks_t)(struct inode *, struct ext4_map_blocks *);
>> +
>>   static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
>> -		loff_t length, struct ext4_map_blocks *map)
>> +		loff_t length, ext4_get_blocks_t get_blocks,
>> +		struct ext4_map_blocks *map)
>>   {
>>   	u8 blkbits = inode->i_blkbits;
>>   
>> @@ -3921,6 +3936,9 @@ static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
>>   	map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>>   			   EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
>>   
>> +	if (get_blocks)
>> +		return get_blocks(inode, map);
>> +
>>   	return ext4_map_blocks(NULL, inode, map, 0);
>>   }
>>   
>> @@ -3938,7 +3956,7 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
>>   	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>>   		return -ERANGE;
>>   
>> -	ret = ext4_iomap_map_blocks(inode, offset, length, &map);
>> +	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
>>   	if (ret < 0)
>>   		return ret;
>>   
>> @@ -3946,6 +3964,147 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
>>   	return 0;
>>   }
>>   
>> +static int ext4_iomap_get_blocks(struct inode *inode,
>> +				 struct ext4_map_blocks *map)
>> +{
>> +	loff_t i_size = i_size_read(inode);
>> +	handle_t *handle;
>> +	int ret;
>> +
>> +	/*
>> +	 * Check if the blocks have already been allocated, this could
>> +	 * avoid initiating a new journal transaction and return the
>> +	 * mapping information directly.
>> +	 */
>> +	if ((map->m_lblk + map->m_len) <=
>> +	    round_up(i_size, i_blocksize(inode)) >> inode->i_blkbits) {
>> +		ret = ext4_map_blocks(NULL, inode, map, 0);
>> +		if (ret < 0)
>> +			return ret;
>> +		if (map->m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN |
>> +				    EXT4_MAP_DELAYED))
>> +			return 0;
>> +	}
>> +
>> +	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
>> +			ext4_chunk_trans_blocks(inode, map->m_len));
>> +	if (IS_ERR(handle))
>> +		return PTR_ERR(handle);
>> +
>> +	ret = ext4_map_blocks(handle, inode, map,
>> +			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
>> +	/*
>> +	 * Stop handle here following the lock ordering of the folio lock
>> +	 * and the transaction start.
>> +	 */
>> +	ext4_journal_stop(handle);
>> +
>> +	return ret;
>> +}
>> +
>> +static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap, bool delalloc)
>> +{
>> +	int ret, retries = 0;
>> +	struct ext4_map_blocks map;
>> +	ext4_get_blocks_t *get_blocks;
>> +
>> +	ret = ext4_emergency_state(inode->i_sb);
>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	/* Inline data and non-extent are not supported. */
>> +	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>> +		return -ERANGE;
>> +	if (WARN_ON_ONCE(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
>> +		return -EINVAL;
>> +	if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
>> +		return -EINVAL;
>> +
>> +	if (delalloc)
>> +		get_blocks = ext4_da_map_blocks;
>> +	else
>> +		get_blocks = ext4_iomap_get_blocks;
>> +retry:
>> +	ret = ext4_iomap_map_blocks(inode, offset, length, get_blocks, &map);
>> +	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
>> +		goto retry;
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
>> +	return 0;
>> +}
>> +
>> +static int ext4_iomap_buffered_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap)
>> +{
>> +	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
>> +						  iomap, srcmap, false);
>> +}
>> +
>> +static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap)
>> +{
>> +	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
>> +						  iomap, srcmap, true);
>> +}
>> +
>> +/*
>> + * On write failure, drop the stale delayed allocation range and release
>> + * its reserved space for both start and end blocks. Otherwise, we may
>> + * leave a range of delayed extents covered by a clean folio, which can
>> + * result in inaccurate space reservation accounting.
>> + */
>> +static void ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
>> +				     loff_t length, struct iomap *iomap)
>> +{
>> +	down_write(&EXT4_I(inode)->i_data_sem);
>> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
>> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
>> +	up_write(&EXT4_I(inode)->i_data_sem);
>> +}
>> +
>> +static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
>> +					    loff_t length, ssize_t written,
>> +					    unsigned int flags,
>> +					    struct iomap *iomap)
>> +{
>> +	loff_t start_byte, end_byte;
>> +
>> +	/* If we didn't reserve the blocks, we're not allowed to punch them. */
>> +	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
>> +		return 0;
>> +
>> +	/* Nothing to do if we've written the entire delalloc extent */
>> +	start_byte = iomap_last_written_block(inode, offset, written);
>> +	end_byte = round_up(offset + length, i_blocksize(inode));
>> +	if (start_byte >= end_byte)
>> +		return 0;
>> +
>> +	filemap_invalidate_lock(inode->i_mapping);
>> +	iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
>> +				     iomap, ext4_iomap_punch_delalloc);
>> +	filemap_invalidate_unlock(inode->i_mapping);
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Since we always allocate unwritten extents, there is no need for
>> + * iomap_end to clean up allocated blocks on a short write.
>> + */
>> +const struct iomap_ops ext4_iomap_buffered_write_ops = {
>> +	.iomap_begin = ext4_iomap_buffered_write_begin,
>> +};
>> +
>> +const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
>> +	.iomap_begin = ext4_iomap_buffered_da_write_begin,
>> +	.iomap_end = ext4_iomap_buffered_da_write_end,
>> +};
>> +
>>   const struct iomap_ops ext4_iomap_buffered_read_ops = {
>>   	.iomap_begin = ext4_iomap_buffered_read_begin,
>>   };
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 6a77db4d3124..9bc294b769db 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -104,9 +104,13 @@ static const struct fs_parameter_spec ext4_param_specs[];
>>    *   -> page lock -> i_data_sem (rw)
>>    *
>>    * buffered write path:
>> - * sb_start_write -> i_mutex -> mmap_lock
>> - * sb_start_write -> i_mutex -> transaction start -> page lock ->
>> - *   i_data_sem (rw)
>> + * sb_start_write -> i_rwsem (w) -> mmap_lock
>> + * - buffer_head path:
>> + *   sb_start_write -> i_rwsem (w) -> transaction start -> folio lock ->
>> + *     i_data_sem (rw)
>> + * - iomap path:
>> + *   sb_start_write -> i_rwsem (w) -> transaction start -> i_data_sem (rw)
>> + *   sb_start_write -> i_rwsem (w) -> folio lock (not under an active handle)
>>    *
>>    * truncate:
>>    * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
>> -- 
>> 2.52.0
>>


^ permalink raw reply

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christian Brauner @ 2026-06-16 14:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260616123443.GA21024@lst.de>

On Tue, Jun 16, 2026 at 02:34:43PM +0200, Christoph Hellwig wrote:
> On Tue, Jun 02, 2026 at 12:10:08PM +0200, Christian Brauner wrote:
> > fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> > forces the holder to be exactly one superblock and prevents several
> > superblocks from sharing one block device. That's what erofs is doing.
> > 
> > Introduce a global dev_t-keyed rhltable mapping each block device to the
> > superblock(s) using it. The holder argument becomes purely the block
> > layer's exclusivity token (a superblock, or a file_system_type for
> > shared devices) and is no longer needed by the fs specific callbacks.
> 
> Err, no.  block devices need to have a specific owner.  If erofs wants
> to share a device between superblock it needs to come up with an entity
> that owns the block devices which is not a superblock.

It already did.

> IMHO sharing devices between superblocks is a bad idea, but that ship
> has sailed, but please keep it contained inside of erofs.

We need a simple device number to superblock mapping anyway and that can
simply be centralized in the vfs. And it can work with anon device
numbers and block device numbers uniformly.

^ permalink raw reply

* [PATCH v7 3/4] ext4: introduce ext4_put_ea_inode() for safe deferred iput
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260616151558.1728881-1-yun.zhou@windriver.com>

Calling iput() on EA inodes while holding xattr_sem or a jbd2 handle
can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem,
creating a lock ordering issue during mount (!SB_ACTIVE).

Add ext4_put_ea_inode() which safely releases EA inode references:
when SB_ACTIVE, it calls iput() directly (write_inode_now cannot be
triggered); during mount (!SB_ACTIVE), it queues the inode on a per-sb
lock-free llist and schedules a worker to call iput() in a clean
context without holding any ext4 locks.

Convert the iput in ext4_xattr_block_set()'s "Drop the previous xattr
block" path to use ext4_xattr_inode_array_free_deferred(), which
releases EA inodes via ext4_put_ea_inode().  This path previously called
ext4_xattr_inode_array_free() (synchronous iput) while holding xattr_sem
and a jbd2 handle.

The worker is flushed in ext4_put_super() before journal destruction to
ensure all pending EA inode cleanup completes while the journal is still
available.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/ext4.h  |  5 ++++
 fs/ext4/super.c |  6 ++++
 fs/ext4/xattr.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/xattr.h |  2 ++
 4 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..690202303269 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1706,6 +1706,11 @@ struct ext4_sb_info {
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_ea_block_cache;
 	struct mb_cache *s_ea_inode_cache;
+
+	/* Deferred iput for EA inodes to avoid lock ordering issues */
+	struct llist_head s_ea_inode_to_free;
+	struct work_struct s_ea_inode_work;
+
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Journal triggers for checksum computation */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..b777bb0a81ea 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1308,6 +1308,9 @@ static void ext4_put_super(struct super_block *sb)
 	destroy_workqueue(sbi->rsv_conversion_wq);
 	ext4_release_orphan_info(sb);
 
+	/* Flush deferred EA inode iputs before destroying journal */
+	flush_work(&sbi->s_ea_inode_work);
+
 	if (sbi->s_journal) {
 		aborted = is_journal_aborted(sbi->s_journal);
 		err = ext4_journal_destroy(sbi, sbi->s_journal);
@@ -5535,6 +5538,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		needs_recovery = 0;
 	}
 
+	init_llist_head(&sbi->s_ea_inode_to_free);
+	INIT_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work);
+
 	if (!test_opt(sb, NO_MBCACHE)) {
 		sbi->s_ea_block_cache = ext4_xattr_create_cache();
 		if (!sbi->s_ea_block_cache) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 982a1f831e22..04e7f674340d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -117,6 +117,8 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+				struct ext4_xattr_inode_array *array);
 
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
@@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		ext4_xattr_release_block(handle, inode, bs->bh,
 					 &ea_inode_array,
 					 0 /* extra_credits */);
-		ext4_xattr_inode_array_free(ea_inode_array);
+		ext4_xattr_inode_array_free_deferred(inode->i_sb,
+						     ea_inode_array);
 	}
 	error = 0;
 
@@ -3025,6 +3028,74 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 	kfree(ea_inode_array);
 }
 
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+				struct ext4_xattr_inode_array *array)
+{
+	int idx;
+
+	if (array == NULL)
+		return;
+
+	for (idx = 0; idx < array->count; ++idx)
+		ext4_put_ea_inode(sb, array->inodes[idx]);
+	kfree(array);
+}
+
+struct ext4_ea_iput_entry {
+	struct llist_node node;
+	struct inode *inode;
+};
+
+/*
+ * Worker function for deferred EA inode iput.  Processes all inodes queued
+ * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
+ */
+void ext4_ea_inode_work(struct work_struct *work)
+{
+	struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
+						s_ea_inode_work);
+	struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free);
+	struct llist_node *next;
+
+	while (node) {
+		struct ext4_ea_iput_entry *entry = container_of(node,
+				struct ext4_ea_iput_entry, node);
+		next = node->next;
+		iput(entry->inode);
+		kfree(entry);
+		node = next;
+	}
+}
+
+/*
+ * Release a VFS reference on an EA inode after ext4_xattr_inode_dec_ref()
+ * may have set i_nlink=0.  Must be used instead of iput() in any context
+ * where xattr_sem or a jbd2 handle is held, because eviction of a nlink=0
+ * inode can acquire those same locks.
+ *
+ * When SB_ACTIVE, eviction does not call write_inode_now() so direct
+ * iput() is safe.  During mount (!SB_ACTIVE), defer to a workqueue.
+ *
+ * For EA inode references dropped without a preceding dec_ref (e.g.,
+ * lookup-only paths where nlink remains >= 1), plain iput() is safe
+ * and preferred.
+ */
+void ext4_put_ea_inode(struct super_block *sb, struct inode *inode)
+{
+	struct ext4_ea_iput_entry *entry;
+
+	if (!inode)
+		return;
+	if (sb->s_flags & SB_ACTIVE) {
+		iput(inode);
+		return;
+	}
+	entry = kmalloc(sizeof(*entry), GFP_NOFS | __GFP_NOFAIL);
+	entry->inode = inode;
+	llist_add(&entry->node, &EXT4_SB(sb)->s_ea_inode_to_free);
+	schedule_work(&EXT4_SB(sb)->s_ea_inode_work);
+}
+
 /*
  * ext4_xattr_block_cache_insert()
  *
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1fedf44d4fb6..52074537dce5 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -190,6 +190,8 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
 extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
+extern void ext4_ea_inode_work(struct work_struct *work);
+extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 1/4] ext4: skip extra isize expansion during mount to prevent deadlock
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260616151558.1728881-1-yun.zhou@windriver.com>

ext4_try_to_expand_extra_isize() is called from __ext4_mark_inode_dirty()
while holding an active jbd2 handle.  During mount (!SB_ACTIVE), the
expand path may move xattrs to external blocks and release ea_inodes via
iput().  When !SB_ACTIVE, iput() calls write_inode_now() which acquires
s_writepages_rwsem, creating a circular lock dependency:

  s_writepages_rwsem --> jbd2_handle --> xattr_sem --> s_writepages_rwsem

This can be triggered via:

  ext4_process_orphan() -> ext4_truncate() -> ext4_mark_inode_dirty()
    -> ext4_try_to_expand_extra_isize()

or:

  ext4_evict_inode() -> ext4_mark_inode_dirty()
    -> ext4_try_to_expand_extra_isize()

Skip expansion when !SB_ACTIVE.  This is a minor loss of functionality
(extra isize won't grow for these inodes during mount), which e2fsck
can resolve later if needed.

Reported-by: syzbot+5d19358d7eb30ffb0cc5@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=5d19358d7eb30ffb0cc5
Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages")
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/inode.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..09dcfb6bf48c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6458,6 +6458,16 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode,
 	if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
 		return -EOVERFLOW;
 
+	/*
+	 * Skip expansion during mount (!SB_ACTIVE).  Expanding extra isize
+	 * may move xattrs to external blocks and release ea_inodes via iput.
+	 * When !SB_ACTIVE, iput triggers write_inode_now() which acquires
+	 * s_writepages_rwsem, causing a deadlock with the caller's active
+	 * jbd2 handle (lock order: s_writepages_rwsem -> jbd2_handle).
+	 */
+	if (unlikely(!(inode->i_sb->s_flags & SB_ACTIVE)))
+		return -EBUSY;
+
 	/*
 	 * In nojournal mode, we can immediately attempt to expand
 	 * the inode.  When journaled, we first need to obtain extra
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 2/4] ext4: set EXT4_STATE_NO_EXPAND in ext4_evict_inode
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260616151558.1728881-1-yun.zhou@windriver.com>

An inode being evicted will never need its extra isize expanded.  Set
EXT4_STATE_NO_EXPAND before ext4_mark_inode_dirty() in ext4_evict_inode()
to make this explicit and prevent any unnecessary work in
ext4_try_to_expand_extra_isize().

This also provides defense-in-depth for the s_writepages_rwsem deadlock
during mount-time orphan cleanup, ensuring the expand path is blocked
for inodes under eviction regardless of how they are reached.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 09dcfb6bf48c..1de0aaa28e63 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -264,6 +264,7 @@ void ext4_evict_inode(struct inode *inode)
 	if (ext4_inode_is_fast_symlink(inode))
 		memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
 	inode->i_size = 0;
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_warning(inode->i_sb,
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 4/4] ext4: convert remaining EA inode iput() calls to ext4_put_ea_inode()
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260616151558.1728881-1-yun.zhou@windriver.com>

Convert all remaining iput() calls on EA inodes that execute under
xattr_sem or a jbd2 handle to use ext4_put_ea_inode().  With i_nlink>=1
and !SB_ACTIVE, a direct iput() would trigger write_inode_now() ->
s_writepages_rwsem, creating a lock ordering violation with the caller's
active jbd2 handle.

Converted sites and why defer is necessary:

- ext4_xattr_inode_inc_ref_all() cleanup: dec_ref undoes the failed
  inc_ref, but the EA inode may be shared so i_nlink remains 1.

- ext4_xattr_inode_lookup_create() out_err: may be a cache-found inode
  where inc_ref failed; i_nlink remains 1.

- ext4_xattr_set_entry() old_ea_inode: dec_ref was called but the EA
  inode may be shared by other xattr blocks, so i_nlink remains 1.

- ext4_xattr_block_set() new block path: dec_ref drops the "extra" ref
  but inc_ref_all added another, so i_nlink stays 1.

- ext4_xattr_block_set() cleanup: on success no dec_ref was called
  (i_nlink=1); on error dec_ref may leave i_nlink=1 if shared.

- ext4_xattr_ibody_set() error path: dec_ref on a cache-found EA inode
  may leave i_nlink=1 if shared.

- ext4_xattr_ibody_set() success path: newly stored EA inode with
  i_nlink=1, just releasing the lookup reference.

- ext4_xattr_delete_inode() quota loop: iget for quota accounting only,
  no dec_ref called, i_nlink=1, jbd2 handle is active.

Sites where direct iput() is provably safe are left unchanged with a
comment: ext4_xattr_inode_create() error path (dec_ref guarantees
i_nlink=0, so eviction skips write_inode_now).

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/xattr.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 04e7f674340d..b8a2ccd0a958 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1079,6 +1079,13 @@ static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
 	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
 }
 
+/*
+ * Decrement on-disk reference count of an EA inode.  If refcount reaches 0,
+ * i_nlink is cleared and the inode is added to the orphan list.  Callers
+ * must use ext4_put_ea_inode() (not iput) to release the VFS reference
+ * afterwards, since iput on a nlink=0 inode triggers eviction which may
+ * deadlock if called under xattr_sem or an active jbd2 handle.
+ */
 static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
 {
 	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
@@ -1135,7 +1142,8 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
 		if (err)
 			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
 					   err);
-		iput(ea_inode);
+		/* i_nlink may remain 1 if shared; defer for !SB_ACTIVE safety */
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 	}
 	return saved_err;
 }
@@ -1507,6 +1515,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 			if (ext4_xattr_inode_dec_ref(handle, ea_inode))
 				ext4_warning_inode(ea_inode,
 					"cleanup dec ref error %d", err);
+			/* dec_ref set i_nlink=0; iput won't trigger write_inode_now */
 			iput(ea_inode);
 			return ERR_PTR(err);
 		}
@@ -1617,7 +1626,8 @@ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
 				      ea_inode->i_ino, true /* reusable */);
 	return ea_inode;
 out_err:
-	iput(ea_inode);
+	/* May be cache-found inode with i_nlink=1 (inc_ref failed) */
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	ext4_xattr_inode_free_quota(inode, NULL, value_len);
 	return ERR_PTR(err);
 }
@@ -1850,7 +1860,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 
 	ret = 0;
 out:
-	iput(old_ea_inode);
+	/* old_ea_inode had dec_ref; may still have i_nlink=1 if shared */
+	ext4_put_ea_inode(inode->i_sb, old_ea_inode);
 	return ret;
 }
 
@@ -2152,7 +2163,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 					ext4_warning_inode(ea_inode,
 							   "dec ref error=%d",
 							   error);
-				iput(ea_inode);
+				/* i_nlink stays 1 (inc_ref_all added a ref) */
+				ext4_put_ea_inode(inode->i_sb, ea_inode);
 				ea_inode = NULL;
 			}
 
@@ -2206,7 +2218,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
 		}
-		iput(ea_inode);
+		/* success: i_nlink=1; error+dec_ref: may still be 1 if shared */
+		ext4_put_ea_inode(inode->i_sb, ea_inode);
 	}
 	if (ce)
 		mb_cache_entry_put(ea_block_cache, ce);
@@ -2288,7 +2301,8 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
-			iput(ea_inode);
+			/* cache-found ea_inode may retain i_nlink=1 */
+			ext4_put_ea_inode(inode->i_sb, ea_inode);
 		}
 		return error;
 	}
@@ -2300,7 +2314,8 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 		header->h_magic = cpu_to_le32(0);
 		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
 	}
-	iput(ea_inode);
+	/* ea_inode has i_nlink=1 (new ref just stored in xattr entry) */
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	return 0;
 }
 
@@ -2989,7 +3004,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 					continue;
 				ext4_xattr_inode_free_quota(inode, ea_inode,
 					      le32_to_cpu(entry->e_value_size));
-				iput(ea_inode);
+				/* no dec_ref yet but i_nlink=1; handle is active */
+				ext4_put_ea_inode(inode->i_sb, ea_inode);
 			}
 
 		}
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christian Brauner @ 2026-06-16 15:19 UTC (permalink / raw)
  To: Christoph Hellwig, Jan Kara
  Cc: Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260616-fragil-duktus-nachverfolgen-60f54584c206@brauner>

On Tue, Jun 16, 2026 at 04:59:53PM +0200, Christian Brauner wrote:
> On Tue, Jun 16, 2026 at 02:34:43PM +0200, Christoph Hellwig wrote:
> > On Tue, Jun 02, 2026 at 12:10:08PM +0200, Christian Brauner wrote:
> > > fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> > > forces the holder to be exactly one superblock and prevents several
> > > superblocks from sharing one block device. That's what erofs is doing.
> > > 
> > > Introduce a global dev_t-keyed rhltable mapping each block device to the
> > > superblock(s) using it. The holder argument becomes purely the block
> > > layer's exclusivity token (a superblock, or a file_system_type for
> > > shared devices) and is no longer needed by the fs specific callbacks.
> > 
> > Err, no.  block devices need to have a specific owner.  If erofs wants
> > to share a device between superblock it needs to come up with an entity
> > that owns the block devices which is not a superblock.
> 
> It already did.
> 
> > IMHO sharing devices between superblocks is a bad idea, but that ship
> > has sailed, but please keep it contained inside of erofs.
> 
> We need a simple device number to superblock mapping anyway and that can
> simply be centralized in the vfs. And it can work with anon device
> numbers and block device numbers uniformly.

Plus, after we're done we then also have a centry place where we can
intercept what devices can be mounted by a filesystem uniformly.

My first approach for this was of course to just add fs_file_open_by_*()
wrappers and move the relevant security hook into there. But while doing
this - ignoring the ton of bugs I found - I realized that having a
mapping so we can go from device number to superblock is very helpful.

We could of course keep the mapping just local to erofs but I see no
reason why the vfs cannot just provide this ability natively given that
it has all the required machinery. I'll let Jan chime in as well.

^ permalink raw reply

* [PATCH v7 0/4] ext4: fix xattr iput deadlock with s_writepages_rwsem
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou

This series fixes a circular lock dependency reported by syzbot:

  s_writepages_rwsem --> jbd2_handle --> xattr_sem --> s_writepages_rwsem

The deadlock occurs when iput() on an EA inode triggers write_inode_now()
while xattr_sem and a jbd2 handle are held.  The triggering path is
during mount-time orphan cleanup (!SB_ACTIVE) where iput_final() calls
write_inode_now() synchronously.

Patch 1 blocks the deadlock by skipping extra isize expansion when
!SB_ACTIVE -- this prevents the xattr manipulation path from being
entered during mount.

Patch 2 is a belt-and-suspenders semantic improvement: an inode under
eviction never needs extra isize expansion.

Patches 3-4 are a structural improvement using a per-sb workqueue:

  Patch 3 introduces ext4_put_ea_inode(), which does direct iput() when
  SB_ACTIVE (zero overhead) and defers to a workqueue when !SB_ACTIVE.
  It also converts the first call site (ext4_xattr_block_set release
  path) which previously called iput under xattr_sem + jbd2 handle.

  Patch 4 converts the remaining EA inode iput() calls that execute
  under locks.  Sites where direct iput() is provably safe (i_nlink=0
  after dec_ref, or lookup-only paths) are left unchanged with comments.

Link: https://syzkaller.appspot.com/bug?extid=5d19358d7eb30ffb0cc5

v7:
 - Replaced the deferred-iput array threading approach (v4-v6) with a
   simpler per-sb workqueue + lock-free llist design.  No function
   signature changes needed.  ext4_put_ea_inode() does direct iput when
   SB_ACTIVE (zero overhead in normal operation) and defers to the
   workqueue only during mount (!SB_ACTIVE).
 - Converted the iput in ext4_xattr_delete_inode()'s quota accounting
   loop to ext4_put_ea_inode() to eliminate a lockdep-reportable lock
   ordering violation (jbd2_handle -> iput -> s_writepages_rwsem).
 - Moved flush_work() before the if (sbi->s_journal) check in
   ext4_put_super() to cover nojournal mode.

v6:
 - ext4_inline_data_truncate(): use local ea_inode_array instead of
   passing NULL, freed after ext4_journal_stop().  Fixes a deadlock
   reachable via crafted filesystem where inline data xattr entry has
   e_value_inum set: orphan cleanup -> ext4_truncate ->
   ext4_inline_data_truncate -> iput under !SB_ACTIVE.

v5:
 - Split into 3 patches for easier review.
 - Add explicit !SB_ACTIVE early-return in ext4_try_to_expand_extra_isize()
   to block ALL mount-time paths (ext4_process_orphan -> ext4_truncate ->
   ext4_mark_inode_dirty), not just the eviction path. v4 only relied on
   EXT4_STATE_NO_EXPAND which doesn't cover orphan truncation.

v4:
 - Comprehensive rewrite of the deferred iput mechanism.
 - Thread ea_inode_array through ext4_expand_extra_isize_ea() and
   ext4_xattr_move_to_block() so ALL ea_inode iputs in the expand
   path are deferred, not just those in ext4_xattr_block_set().
 - Add NULL safety to ext4_expand_inode_array(): when ea_inode_array
   pointer is NULL, fall back to synchronous iput (for callers like
   ext4_initxattrs that only run with SB_ACTIVE).
 - Use __GFP_NOFAIL to guarantee deferred array growth, eliminating
   fallback to synchronous iput under locks.
 - Update ext4_xattr_ibody_set() and ext4_xattr_set_entry() signatures
   to accept ea_inode_array, converting ALL iput(ea_inode) calls.
 - Set EXT4_STATE_NO_EXPAND in ext4_evict_inode() before
   ext4_mark_inode_dirty().

v3:
 - Check ext4_expand_inode_array() return value; fallback to
   direct iput() on ENOMEM to prevent inode leak.
 - Make ext4_xattr_set_handle() take an optional ea_inode_array
   output parameter so callers can free after ext4_journal_stop(),
   avoiding the jbd2_handle vs s_writepages_rwsem AB-BA.
 - Pass ea_inode_array directly to ext4_xattr_release_block()
   instead of using a local array freed under xattr_sem.
 - Move ext4_xattr_inode_array_free() after ext4_journal_stop()

v2:
 - Defer iput() in ext4_xattr_block_set() via ea_inode_array,
   freed after xattr_sem is released. Fixes the root cause.

v1:
 - Set EXT4_STATE_NO_EXPAND in ext4_evict_inode() to skip expand
   on inodes being deleted. Only fixes the syzbot reproducer, not
   the underlying lock ordering violation.

Yun Zhou (4):
  ext4: skip extra isize expansion during mount to prevent deadlock
  ext4: set EXT4_STATE_NO_EXPAND in ext4_evict_inode
  ext4: introduce ext4_put_ea_inode() for safe deferred iput
  ext4: convert remaining EA inode iput() calls to ext4_put_ea_inode()

 fs/ext4/ext4.h  |   5 +++
 fs/ext4/inode.c |  11 +++++
 fs/ext4/super.c |   6 +++
 fs/ext4/xattr.c | 105 +++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/xattr.h |   2 +
 5 files changed, 120 insertions(+), 9 deletions(-)

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Gao Xiang @ 2026-06-16 16:35 UTC (permalink / raw)
  To: Christoph Hellwig, Christian Brauner
  Cc: Jan Kara, Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260616123443.GA21024@lst.de>



On 2026/6/16 20:34, Christoph Hellwig wrote:

> IMHO sharing devices between superblocks is a bad idea, but that ship
> has sailed, but please keep it contained inside of erofs.

I'm not sure why it's a bad idea, for example,
the immutable layer model is already applied to layered virtual
block formats (such as qcow2) and layered fs like overlayfs.

and I think device mappers may have some similar immutable
approaches as shared layers but works in a slight different
way.

The principle is that each instance uses shared blobs in a
read-only way, and that is almost a simple and safest way
to share data among filesystem instances.

Yet I don't want to argue with that since it's pretty common
for years and I've seen no practical risk using this model.

Thanks,
Gao Xiang

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox