Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* [PATCH RFC v2 07/18] fs: maintain a global device-to-superblock table
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
forces the holder to be exactly one superblock and prevents several
superblocks from sharing one block device. Sharing a device between
several superblocks is exactly what erofs does.

As a first step introduce a global dev_t-keyed rhltable mapping each
device to the superblock(s) using it. The entry is preallocated in
alloc_super() and registered under sb->s_dev by the set callback through
set_anon_super() and set_bdev_super(), the two helpers every set
callback assigns s_dev through. Registration is the final fallible act
of a set callback, so an insert failure unwinds through sget_fc()'s
existing set-failure path: the fs_context keeps ownership of s_fs_info
and the callers' error paths stay correct. set_anon_super() releases
the anonymous dev it allocated when registration fails. Unwinding
through deactivate_locked_super() instead would run kill_sb() and free
s_fs_info behind the caller's back: nfs and ceph free that object
through a local pointer when sget_fc() fails and would double-free.

The superblock stashes the entry in sb->s_super_dev and
kill_super_notify() drops the claim through it, so teardown doesn't
depend on s_dev staying stable; an entry that was never registered is
freed together with the superblock in destroy_super_work().

Each table entry holds a passive reference (s_passive) on its
superblock, so the struct stays valid for as long as the entry is
reachable. Entries are claim-counted through sd_ref: additional claims
on the same (device, superblock) pair share the entry, and the unlink
is deferred to the last put, so a later iteration cursor never resumes
from a removed node.

The table is initialized from mnt_init(): the first superblocks (the
tmpfs shm mount and rootfs) are created from start_kernel() long before
any initcall runs, so an initcall would be too late.

The table has no readers yet; the fs_holder_ops callbacks are switched
over once all devices a filesystem claims are registered.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/internal.h                  |   1 +
 fs/namespace.c                 |   2 +
 fs/super.c                     | 102 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs/super_types.h |   2 +
 4 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index d77578d66d42..83eb3e2a0f85 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -137,6 +137,7 @@ extern int reconfigure_super(struct fs_context *);
 extern bool super_trylock_shared(struct super_block *sb);
 struct super_block *user_get_super(dev_t, bool excl);
 void put_super(struct super_block *sb);
+void __init super_dev_init(void);
 extern bool mount_capable(struct fs_context *);
 int sb_init_dio_done_wq(struct super_block *sb);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 3d5cd5bf3b05..7cef6dae0854 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -6262,6 +6262,8 @@ void __init mnt_init(void)
 	if (!mount_hashtable || !mountpoint_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
+	super_dev_init();
+
 	kernfs_init();
 
 	err = sysfs_init();
diff --git a/fs/super.c b/fs/super.c
index a771a0ad4c9a..ff5e305d0ab4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/rhashtable.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
@@ -272,6 +273,8 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	return total_objects;
 }
 
+static struct super_dev *super_dev_alloc(dev_t dev, struct super_block *sb);
+
 static void destroy_super_work(struct work_struct *work)
 {
 	struct super_block *s = container_of(work, struct super_block,
@@ -279,6 +282,8 @@ static void destroy_super_work(struct work_struct *work)
 	fsnotify_sb_free(s);
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
+	/* Only an unregistered entry is still owned by the superblock. */
+	kfree(s->s_super_dev);
 	kfree(s->s_subtype);
 	for (int i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
@@ -392,6 +397,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 		goto fail;
 	if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
 		goto fail;
+	s->s_super_dev = super_dev_alloc(0, s);
+	if (!s->s_super_dev)
+		goto fail;
+
 	s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
 	return s;
 
@@ -421,6 +430,77 @@ void put_super(struct super_block *s)
 	}
 }
 
+struct super_dev {
+	dev_t			sd_dev;
+	struct super_block	*sd_sb;
+	refcount_t		sd_ref;
+	struct rhlist_head	sd_node;
+	struct rcu_head		sd_rcu;
+};
+
+static struct rhltable super_dev_table;
+static const struct rhashtable_params super_dev_params = {
+	.key_len	= sizeof(dev_t),
+	.key_offset	= offsetof(struct super_dev, sd_dev),
+	.head_offset	= offsetof(struct super_dev, sd_node),
+};
+
+static struct super_dev *super_dev_alloc(dev_t dev, struct super_block *sb)
+{
+	struct super_dev *fsd;
+
+	fsd = kzalloc_obj(*fsd);
+	if (!fsd)
+		return NULL;
+	fsd->sd_dev = dev;
+	fsd->sd_sb = sb;
+	refcount_set(&fsd->sd_ref, 1);
+	return fsd;
+}
+
+static void super_dev_put(struct super_dev *fsd)
+{
+	/* Unlink only once unpinned, so a cursor never resumes from a removed node. */
+	if (fsd && refcount_dec_and_test(&fsd->sd_ref)) {
+		rhltable_remove(&super_dev_table, &fsd->sd_node, super_dev_params);
+		put_super(fsd->sd_sb);
+		kfree_rcu(fsd, sd_rcu);
+	}
+}
+
+void __init super_dev_init(void)
+{
+	if (rhltable_init(&super_dev_table, &super_dev_params))
+		panic("VFS: Cannot initialise super_dev_table\n");
+}
+
+static int super_dev_insert(struct super_dev *fsd)
+{
+	int err;
+
+	err = rhltable_insert(&super_dev_table, &fsd->sd_node, super_dev_params);
+	if (!err)
+		refcount_inc(&fsd->sd_sb->s_passive);
+	return err;
+}
+
+/* Register @sb under @sb->s_dev as the final fallible act of a set callback. */
+static int super_dev_register(struct super_block *sb)
+{
+	struct super_dev *fsd = sb->s_super_dev;
+	int err;
+
+	lockdep_assert_held(&sb_lock);
+	VFS_WARN_ON_ONCE(!sb->s_dev);
+	VFS_WARN_ON_ONCE(!fsd || fsd->sd_dev);
+
+	fsd->sd_dev = sb->s_dev;
+	err = super_dev_insert(fsd);
+	if (err)
+		fsd->sd_dev = 0;
+	return err;
+}
+
 static void kill_super_notify(struct super_block *sb)
 {
 	lockdep_assert_not_held(&sb->s_umount);
@@ -440,6 +520,12 @@ static void kill_super_notify(struct super_block *sb)
 	hlist_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
 
+	/* Drop sget_fc()'s claim; a never-registered entry stays with the sb. */
+	if (sb->s_super_dev->sd_dev) {
+		super_dev_put(sb->s_super_dev);
+		sb->s_super_dev = NULL;
+	}
+
 	/*
 	 * Let concurrent mounts know that this thing is really dead.
 	 * We don't need @sb->s_umount here as every concurrent caller
@@ -750,6 +836,7 @@ struct super_block *sget_fc(struct fs_context *fc,
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
+
 		s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
@@ -759,11 +846,13 @@ struct super_block *sget_fc(struct fs_context *fc,
 	s->s_fs_info = fc->s_fs_info;
 	err = set(s, fc);
 	if (err) {
+		VFS_WARN_ON_ONCE(s->s_super_dev->sd_dev);
 		s->s_fs_info = NULL;
 		spin_unlock(&sb_lock);
 		destroy_unused_super(s);
 		return ERR_PTR(err);
 	}
+	VFS_WARN_ON_ONCE(!s->s_super_dev->sd_dev);
 	fc->s_fs_info = NULL;
 	s->s_type = fc->fs_type;
 	s->s_iflags |= fc->s_iflags;
@@ -1217,7 +1306,16 @@ EXPORT_SYMBOL(free_anon_bdev);
 
 int set_anon_super(struct super_block *s, void *data)
 {
-	return get_anon_bdev(&s->s_dev);
+	int error;
+
+	error = get_anon_bdev(&s->s_dev);
+	if (error)
+		return error;
+
+	error = super_dev_register(s);
+	if (error)
+		free_anon_bdev(s->s_dev);
+	return error;
 }
 EXPORT_SYMBOL(set_anon_super);
 
@@ -1303,7 +1401,7 @@ EXPORT_SYMBOL(get_tree_keyed);
 static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_dev = *(dev_t *)data;
-	return 0;
+	return super_dev_register(s);
 }
 
 static int super_s_dev_set(struct super_block *s, struct fs_context *fc)
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 68747182abf9..c8172558750f 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -30,6 +30,7 @@ struct mount;
 struct mtd_info;
 struct quotactl_ops;
 struct shrinker;
+struct super_dev;
 struct unicode_map;
 struct user_namespace;
 struct workqueue_struct;
@@ -132,6 +133,7 @@ struct super_operations {
 struct super_block {
 	struct list_head			s_list;		/* Keep this first */
 	dev_t					s_dev;		/* search index; _not_ kdev_t */
+	struct super_dev			*s_super_dev;	/* sget_fc()'s device table claim */
 	unsigned char				s_blocksize_bits;
 	unsigned long				s_blocksize;
 	loff_t					s_maxbytes;	/* Max file size */

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 06/18] ocfs2: don't reset s_dev on dismount
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

ocfs2_dismount_volume() has reset sb->s_dev to zero since the original
merge in ccd979bdbce9 ("[PATCH] OCFS2: The Second Oracle Cluster
Filesystem") as part of scrubbing the super_block. Nothing reads the
field afterwards: all ocfs2-internal uses are mount-time log and trace
prints, and dev_t-keyed superblock lookups skip a dying superblock
anyway - s_root is gone before ->put_super runs and super_lock()
refuses SB_DYING superblocks.

The upcoming device-to-superblock table registers every superblock
under its s_dev. Drop the reset instead of leaving a superblock around
whose s_dev contradicts its registration.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ocfs2/super.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4870e680c4e5..df9ebff25dab 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1882,7 +1882,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)

 	ocfs2_delete_osb(osb);
 	kfree(osb);
-	sb->s_dev = 0;
 	sb->s_fs_info = NULL;
 }

-- 
2.47.3

^ permalink raw reply related

* [PATCH RFC v2 05/18] ext4: use anonymous devices for KUnit test superblocks
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

The mballoc and extents KUnit tests create superblocks through
sget_fc() with a set callback that never assigns s_dev and a kill_sb
that only calls generic_shutdown_super().

The upcoming global device-to-superblock table registers every
superblock under its s_dev, so each superblock needs a unique device
number. Allocate a proper anonymous device via set_anon_super_fc() and
release it through kill_anon_super().

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/extents-test.c | 9 ++-------
 fs/ext4/mballoc-test.c | 9 ++-------
 2 files changed, 4 insertions(+), 14 deletions(-)

diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
index bd7795a82607..c3836ecb89f9 100644
--- a/fs/ext4/extents-test.c
+++ b/fs/ext4/extents-test.c
@@ -126,11 +126,6 @@ struct kunit_ext_test_param {
 	struct kunit_ext_data_state exp_data_state[3];
 };
 
-static void ext_kill_sb(struct super_block *sb)
-{
-	generic_shutdown_super(sb);
-}
-
 static int ext_init_fs_context(struct fs_context *fc)
 {
 	return 0;
@@ -138,13 +133,13 @@ static int ext_init_fs_context(struct fs_context *fc)
 
 static int ext_set(struct super_block *sb, struct fs_context *fc)
 {
-	return 0;
+	return set_anon_super_fc(sb, fc);
 }
 
 static struct file_system_type ext_fs_type = {
 	.name		 = "extents test",
 	.init_fs_context = ext_init_fs_context,
-	.kill_sb	 = ext_kill_sb,
+	.kill_sb	 = kill_anon_super,
 };
 
 static void extents_kunit_exit(struct kunit *test)
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index d90da44aadbd..a3b33ed2c172 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -59,11 +59,6 @@ static const struct super_operations mbt_sops = {
 	.free_inode	= mbt_free_inode,
 };
 
-static void mbt_kill_sb(struct super_block *sb)
-{
-	generic_shutdown_super(sb);
-}
-
 static int mbt_init_fs_context(struct fs_context *fc)
 {
 	return 0;
@@ -72,7 +67,7 @@ static int mbt_init_fs_context(struct fs_context *fc)
 static struct file_system_type mbt_fs_type = {
 	.name			= "mballoc test",
 	.init_fs_context	= mbt_init_fs_context,
-	.kill_sb		= mbt_kill_sb,
+	.kill_sb		= kill_anon_super,
 };
 
 static int mbt_mb_init(struct super_block *sb)
@@ -136,7 +131,7 @@ static void mbt_mb_release(struct super_block *sb)
 
 static int mbt_set(struct super_block *sb, struct fs_context *fc)
 {
-	return 0;
+	return set_anon_super_fc(sb, fc);
 }
 
 static struct super_block *mbt_ext4_alloc_super_block(void)

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 04/18] fs, block: move blk_mode_t and fop_flags_t into <linux/types.h>
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

blk_mode_t and fop_flags_t are both plain 'unsigned int __bitwise' flag
typedefs, exactly like the gfp_t, slab_flags_t and fmode_t that already
live in <linux/types.h>. Move them there so they are available
everywhere without having to drag in a subsystem header.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 include/linux/blkdev.h | 2 --
 include/linux/fs.h     | 2 --
 include/linux/types.h  | 2 ++
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9e95bdb8b323..cee548184a7b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -126,8 +126,6 @@ struct blk_integrity {
 	unsigned char				pi_tuple_size;
 };
 
-typedef unsigned int __bitwise blk_mode_t;
-
 /* open for reading */
 #define BLK_OPEN_READ		((__force blk_mode_t)(1 << 0))
 /* open for writing */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6da44573ce45..1c8fe40ad9a4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1921,8 +1921,6 @@ struct dir_context {
 struct io_uring_cmd;
 struct offset_ctx;
 
-typedef unsigned int __bitwise fop_flags_t;
-
 struct file_operations {
 	struct module *owner;
 	fop_flags_t fop_flags;
diff --git a/include/linux/types.h b/include/linux/types.h
index 608050dbca6a..ef026585420b 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -163,6 +163,8 @@ typedef u32 dma_addr_t;
 typedef unsigned int __bitwise gfp_t;
 typedef unsigned int __bitwise slab_flags_t;
 typedef unsigned int __bitwise fmode_t;
+typedef unsigned int __bitwise blk_mode_t;
+typedef unsigned int __bitwise fop_flags_t;
 
 #ifdef CONFIG_PHYS_ADDR_T_64BIT
 typedef u64 phys_addr_t;

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 03/18] super: take lock after last reference count
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

__put_super() required the caller to hold sb_lock, so put_super()
wrapped it. The per-device superblock table introduced later drops its
passive references from contexts that do not hold sb_lock, so make
put_super() self-locking: drop the count first and take sb_lock only for
the final list_del.

With the count now dropped outside sb_lock a superblock can briefly sit
on @super_blocks with s_passive == 0 before it is unlinked, so the list
walkers (__iterate_supers(), iterate_supers_type(), user_get_super())
switch to refcount_inc_not_zero() and skip it.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 63 ++++++++++++++++++++++++++++----------------------------------
 1 file changed, 28 insertions(+), 35 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 25dd72b550e0..a771a0ad4c9a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -403,12 +403,17 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 /* Superblock refcounting  */
 
 /*
- * Drop a superblock's refcount.  The caller must hold sb_lock.
+ * Drop a superblock's passive reference.  Must be called WITHOUT sb_lock held;
+ * put_super() acquires sb_lock itself when the final reference is dropped.
  */
-static void __put_super(struct super_block *s)
+void put_super(struct super_block *s)
 {
 	if (refcount_dec_and_test(&s->s_passive)) {
+
+		spin_lock(&sb_lock);
 		list_del_init(&s->s_list);
+		spin_unlock(&sb_lock);
+
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
 		WARN_ON(s->s_mounts);
@@ -416,20 +421,6 @@ static void __put_super(struct super_block *s)
 	}
 }
 
-/**
- *	put_super	-	drop a temporary reference to superblock
- *	@sb: superblock in question
- *
- *	Drops a temporary reference, frees superblock if there's no
- *	references left.
- */
-void put_super(struct super_block *sb)
-{
-	spin_lock(&sb_lock);
-	__put_super(sb);
-	spin_unlock(&sb_lock);
-}
-
 static void kill_super_notify(struct super_block *sb)
 {
 	lockdep_assert_not_held(&sb->s_umount);
@@ -478,11 +469,7 @@ void deactivate_locked_super(struct super_block *s)
 
 		kill_super_notify(s);
 
-		/*
-		 * Since list_lru_destroy() may sleep, we cannot call it from
-		 * put_super(), where we hold the sb_lock. Therefore we destroy
-		 * the lru lists right now.
-		 */
+		/* list_lru_destroy() may sleep; put_super() callers may not. */
 		list_lru_destroy(&s->s_dentry_lru);
 		list_lru_destroy(&s->s_inode_lru);
 
@@ -851,14 +838,17 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
 	struct super_block *sb, *p = NULL;
 	bool excl = flags & SUPER_ITER_EXCL;
 
-	guard(spinlock)(&sb_lock);
+	spin_lock(&sb_lock);
 
 	for (sb = first_super(flags);
 	     !list_entry_is_head(sb, &super_blocks, s_list);
 	     sb = next_super(sb, flags)) {
 		if (super_flags(sb, SB_DYING))
 			continue;
-		refcount_inc(&sb->s_passive);
+
+		if (!refcount_inc_not_zero(&sb->s_passive))
+			continue;
+
 		spin_unlock(&sb_lock);
 
 		if (flags & SUPER_ITER_UNLOCKED) {
@@ -868,13 +858,14 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
 			super_unlock(sb, excl);
 		}
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		spin_lock(&sb_lock);
 	}
+	spin_unlock(&sb_lock);
 	if (p)
-		__put_super(p);
+		put_super(p);
 }
 
 void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
@@ -903,7 +894,9 @@ void iterate_supers_type(struct file_system_type *type,
 		if (super_flags(sb, SB_DYING))
 			continue;
 
-		refcount_inc(&sb->s_passive);
+		if (!refcount_inc_not_zero(&sb->s_passive))
+			continue;
+
 		spin_unlock(&sb_lock);
 
 		locked = super_lock_shared(sb);
@@ -912,14 +905,14 @@ void iterate_supers_type(struct file_system_type *type,
 			super_unlock_shared(sb);
 		}
 
-		spin_lock(&sb_lock);
 		if (p)
-			__put_super(p);
+			put_super(p);
 		p = sb;
+		spin_lock(&sb_lock);
 	}
-	if (p)
-		__put_super(p);
 	spin_unlock(&sb_lock);
+	if (p)
+		put_super(p);
 }
 
 EXPORT_SYMBOL(iterate_supers_type);
@@ -935,15 +928,17 @@ struct super_block *user_get_super(dev_t dev, bool excl)
 		if (sb->s_dev != dev)
 			continue;
 
-		refcount_inc(&sb->s_passive);
+		if (!refcount_inc_not_zero(&sb->s_passive))
+			continue;
+
 		spin_unlock(&sb_lock);
 
 		locked = super_lock(sb, excl);
 		if (locked)
 			return sb;
 
+		put_super(sb);
 		spin_lock(&sb_lock);
-		__put_super(sb);
 		break;
 	}
 	spin_unlock(&sb_lock);
@@ -1368,9 +1363,7 @@ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
 	lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
 
 	/* Make sure sb doesn't go away from under us */
-	spin_lock(&sb_lock);
 	refcount_inc(&sb->s_passive);
-	spin_unlock(&sb_lock);
 
 	mutex_unlock(&bdev->bd_holder_lock);
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 02/18] super: convert s_count to refcount_t s_passive
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

The superblock carries two counters: s_active, the active reference
count that keeps the filesystem usable, and s_count, the passive
reference count that merely keeps the structure itself alive. Turn the
passive count into a refcount_t and rename it to s_passive to make the
pairing with s_active obvious.

Everything is still serialized by sb_lock, so there is no functional
change; the conversion buys the usual refcount_t saturation and
underflow checking. The following patches start dropping passive
references without holding sb_lock and make the device-to-superblock
table hold one passive reference per registered entry, which a plain
integer cannot support.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c                     | 18 +++++++++---------
 include/linux/fs/super_types.h |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index a8fd61136aaf..25dd72b550e0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -102,7 +102,7 @@ static bool super_flags(const struct super_block *sb, unsigned int flags)
  * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
  * woken and we'll see SB_DYING.
  *
- * The caller must have acquired a temporary reference on @sb->s_count.
+ * The caller must have acquired a temporary reference on @sb->s_passive.
  *
  * Return: The function returns true if SB_BORN was set and with
  *         s_umount held. The function returns false if SB_DYING was
@@ -367,7 +367,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	spin_lock_init(&s->s_inode_wblist_lock);
 	fserror_mount(s);
 
-	s->s_count = 1;
+	refcount_set(&s->s_passive, 1);
 	atomic_set(&s->s_active, 1);
 	mutex_init(&s->s_vfs_rename_mutex);
 	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
@@ -407,7 +407,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
  */
 static void __put_super(struct super_block *s)
 {
-	if (!--s->s_count) {
+	if (refcount_dec_and_test(&s->s_passive)) {
 		list_del_init(&s->s_list);
 		WARN_ON(s->s_dentry_lru.node);
 		WARN_ON(s->s_inode_lru.node);
@@ -529,7 +529,7 @@ static bool grab_super(struct super_block *sb)
 {
 	bool locked;
 
-	sb->s_count++;
+	refcount_inc(&sb->s_passive);
 	spin_unlock(&sb_lock);
 	locked = super_lock_excl(sb);
 	if (locked) {
@@ -556,7 +556,7 @@ static bool grab_super(struct super_block *sb)
  *	lock held in read mode in case of success. On successful return,
  *	the caller must drop the s_umount lock when done.
  *
- *	Note that unlike get_super() et.al. this one does *not* bump ->s_count.
+ *	Note that unlike get_super() et.al. this one does *not* bump ->s_passive.
  *	The reason why it's safe is that we are OK with doing trylock instead
  *	of down_read().  There's a couple of places that are OK with that, but
  *	it's very much not a general-purpose interface.
@@ -858,7 +858,7 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
 	     sb = next_super(sb, flags)) {
 		if (super_flags(sb, SB_DYING))
 			continue;
-		sb->s_count++;
+		refcount_inc(&sb->s_passive);
 		spin_unlock(&sb_lock);
 
 		if (flags & SUPER_ITER_UNLOCKED) {
@@ -903,7 +903,7 @@ void iterate_supers_type(struct file_system_type *type,
 		if (super_flags(sb, SB_DYING))
 			continue;
 
-		sb->s_count++;
+		refcount_inc(&sb->s_passive);
 		spin_unlock(&sb_lock);
 
 		locked = super_lock_shared(sb);
@@ -935,7 +935,7 @@ struct super_block *user_get_super(dev_t dev, bool excl)
 		if (sb->s_dev != dev)
 			continue;
 
-		sb->s_count++;
+		refcount_inc(&sb->s_passive);
 		spin_unlock(&sb_lock);
 
 		locked = super_lock(sb, excl);
@@ -1369,7 +1369,7 @@ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
 
 	/* Make sure sb doesn't go away from under us */
 	spin_lock(&sb_lock);
-	sb->s_count++;
+	refcount_inc(&sb->s_passive);
 	spin_unlock(&sb_lock);
 
 	mutex_unlock(&bdev->bd_holder_lock);
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index ef7941e9dc79..68747182abf9 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -145,7 +145,7 @@ struct super_block {
 	unsigned long				s_magic;
 	struct dentry				*s_root;
 	struct rw_semaphore			s_umount;
-	int					s_count;
+	refcount_t				s_passive;
 	atomic_t				s_active;
 #ifdef CONFIG_SECURITY
 	void					*s_security;

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 01/18] xfs: fix the error unwind in xfs_open_devices()
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Since the rt and log block devices are closed in xfs_free_buftarg() the
buftarg owns the device file. The error unwind does not respect that:
when the log buftarg allocation fails, out_free_rtdev_targ frees the rt
buftarg - releasing rtdev_file - and then falls through to
out_close_rtdev and releases it a second time.

The unwind also leaves mp->m_rtdev_targp and mp->m_ddev_targp pointing
to the freed buftargs. The failed mount continues into
deactivate_locked_super() -> xfs_kill_sb() -> xfs_mount_free(), which
frees them again.

Clear the buftarg pointers once the unwind has freed them and clear
rtdev_file once the rt buftarg owns it, so nothing is released twice.

Reachable when a buftarg allocation fails after the data buftarg was
set up: an I/O error in sync_blockdev() or an allocation failure in
xfs_init_buftarg() while mounting with external rt and log devices.

Fixes: 41233576e9a4 ("xfs: close the RT and log block devices in xfs_free_buftarg")
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/xfs/xfs_super.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index eac7f9503805..8531d526fc44 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -534,8 +534,11 @@ xfs_open_devices(
  out_free_rtdev_targ:
 	if (mp->m_rtdev_targp)
 		xfs_free_buftarg(mp->m_rtdev_targp);
+	mp->m_rtdev_targp = NULL;
+	rtdev_file = NULL;	/* released by xfs_free_buftarg() */
  out_free_ddev_targ:
 	xfs_free_buftarg(mp->m_ddev_targp);
+	mp->m_ddev_targp = NULL;
  out_close_rtdev:
 	 if (rtdev_file)
 		bdev_fput(rtdev_file);

-- 
2.47.3

^ permalink raw reply related

* [PATCH RFC v2 00/18] fs: support freeze/thaw/mark_dead/sync with shared devices
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable),
	syzbot, Gao Xiang

This generalizes the device-number-to-superblock mapping so it works
for actual block devices as well as anonymous (or even mtd) devices.

fs_holder_ops recovers the affected superblock from bdev->bd_holder. That
forces the holder of a block device to be exactly one superblock and makes
it impossible for several superblocks to share a single device.

erofs does exactly that. It can mount read-only "blob" devices that are
shared between many superblocks: a metadata-only erofs that indexes a set
of per-layer blobs (one filesystem instead of one per OCI layer), or an
incremental image whose base device is shared by several updates. Because
the block layer only tracks a single holder, a freeze, thaw, removal or
sync on such a device is never propagated to all the superblocks using it,
and the current infrastructure has no way to find them.

This series replaces the bd_holder-based lookup with a global, dev_t-keyed
table mapping each block device to the superblock(s) using it. The holder
argument becomes purely the block layer's exclusivity token -- a superblock,
or the file_system_type for a device shared within one filesystem type --
and the fs_holder_ops callbacks look the device up in the table and act on
every superblock registered for it: 1:1 for most filesystems, 1:many for
erofs.

Filesystems claim and release their devices through new
fs_bdev_file_open_by_{dev,path}() and fs_bdev_file_release() helpers; the
per-fs patches convert xfs, btrfs, ext4, f2fs and erofs over to them and
fix cramfs and romfs, which released the registered main device with a
raw bdev_fput().

Since every superblock is registered under its s_dev the table also
replaces the last s_dev-keyed walk of the super_blocks list:
user_get_super() resolves device numbers through it, so ustat() and
quotactl() now work on any device a filesystem claims and no longer
take sb_lock.

The longer-term motivation is to let userspace decide which devices may be
onlined from one central place, without having to teach every filesystem
about it individually.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
Changes in v2:
- super: rework the device-to-superblock table reference counting: each
  (device, superblock) entry carries a single claim count and holds one
  passive reference on its superblock for the entry's lifetime. New prep
  patches convert s_count to refcount_t s_passive and make put_super()
  self-locking.
- super: preallocate the entry in alloc_super() and register it from the
  set callbacks through set_anon_super()/set_bdev_super(); an insert
  failure unwinds exactly like a set callback failure. The superblock
  stashes the entry in sb->s_super_dev and kill_super_notify() drops the
  claim through it.
- super: initialize the table from mnt_init(); the rootfs and shm mounts
  are created long before any initcall runs.
- super: fold the v1 "refuse to claim a frozen block device" patch into
  the registration helper and restore the EBUSY check for the primary
  device in setup_bdev_super(): additional devices (the xfs log, the ext4
  journal, erofs blobs) are now refused while frozen as well, answering
  Jan's question on v1 3/8.
- Split the core patch into table/helpers/switch-over and move the
  xfs/btrfs/ext4 conversions before the fs_holder_ops switch so no
  freeze/mark_dead events are lost mid-series; erofs follows the switch.
- New prep patches: the ext4 KUnit tests allocate anonymous devices and
  ocfs2 stops resetting s_dev on dismount.
- New: convert user_get_super() to the device table, plus a ustat()
  selftest.
- New: fix a pre-existing double release of the realtime device file and
  dangling buftarg pointers in xfs_open_devices()'s error unwind.
- New: convert f2fs's additional devices to the helpers; fix cramfs and
  romfs releasing the registered main device with a raw bdev_fput().
- erofs: drop the .shutdown() and .remove_bdev() implementations and the
  per-device "dead" flag. Immutable filesystems don't need them: the block
  layer sets GD_DEAD before fs_bdev_mark_dead() so in-flight bios fail
  anyway, erofs has no write path or journal to stop, and the read-only
  loop_change_fd() case must not be forced to -EIO. Patch from Gao Xiang,
  applied verbatim - thanks!
- btrfs: fix a general protection fault in close_fs_devices() on a failed
  mount (reported by syzbot). The release path took the superblock from
  device->fs_info, which is still NULL if open_ctree() fails before
  btrfs_init_devices_late(); it now uses bdev_file->private_data.
- erofs: the v1 conversion was sent with a generic boilerplate changelog;
  superseded by Gao's patch above.
- Collect Reviewed-by from Jan Kara and Tested-by from syzbot.
- Rebase onto v7.1-rc1.
- Link to v1: https://patch.msgid.link/20260602-work-super-bdev_holder_global-v1-0-bb0fd82f3861@kernel.org

---
Christian Brauner (18):
      xfs: fix the error unwind in xfs_open_devices()
      super: convert s_count to refcount_t s_passive
      super: take lock after last reference count
      fs, block: move blk_mode_t and fop_flags_t into <linux/types.h>
      ext4: use anonymous devices for KUnit test superblocks
      ocfs2: don't reset s_dev on dismount
      fs: maintain a global device-to-superblock table
      fs: add dedicated block device open helpers for filesystems
      xfs: port to fs_bdev_file_open_by_path()
      btrfs: open via dedicated fs bdev helpers
      ext4: open via dedicated fs bdev helpers
      fs: look up superblocks via the device table in fs_holder_ops
      fs: tolerate per-superblock freeze errors on shared devices
      erofs: open via dedicated fs bdev helpers
      f2fs: open via dedicated fs bdev helpers
      super: make fs_holder_ops private
      fs: look up the superblock via the device table in user_get_super()
      selftests/filesystems: add ustat() coverage

 fs/btrfs/volumes.c                               |  31 +-
 fs/cramfs/inode.c                                |   2 +-
 fs/erofs/super.c                                 |  35 +-
 fs/ext4/extents-test.c                           |   9 +-
 fs/ext4/mballoc-test.c                           |   9 +-
 fs/ext4/super.c                                  |  12 +-
 fs/f2fs/super.c                                  |   6 +-
 fs/internal.h                                    |   1 +
 fs/namespace.c                                   |   2 +
 fs/ocfs2/super.c                                 |   1 -
 fs/romfs/super.c                                 |   2 +-
 fs/super.c                                       | 620 ++++++++++++++++-------
 fs/xfs/xfs_buf.c                                 |   2 +-
 fs/xfs/xfs_super.c                               |  13 +-
 include/linux/blkdev.h                           |   9 -
 include/linux/fs.h                               |   2 -
 include/linux/fs/super.h                         |   8 +
 include/linux/fs/super_types.h                   |   4 +-
 include/linux/types.h                            |   2 +
 tools/testing/selftests/filesystems/.gitignore   |   1 +
 tools/testing/selftests/filesystems/Makefile     |   2 +-
 tools/testing/selftests/filesystems/ustat_test.c | 135 +++++
 22 files changed, 647 insertions(+), 261 deletions(-)
---
base-commit: 0c0d974f62e6603d4514e1a8035658edb353c68f
change-id: 20260602-work-super-bdev_holder_global-8cba5e52bed5


^ permalink raw reply

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Dr. David Alan Gilbert @ 2026-06-16 14:04 UTC (permalink / raw)
  To: Keith Busch, zkabelac
  Cc: Vjaceslavs Klimovs, Thorsten Leemhuis, trnka, linux-block,
	dm-devel, Linux kernel regressions list
In-Reply-To: <ajFK5NXkxd6jU5zu@gallifrey>

* Dr. David Alan Gilbert (dave@treblig.org) wrote:
> * Dr. David Alan Gilbert (dave@treblig.org) wrote:
> > * Keith Busch (kbusch@kernel.org) wrote:
> > > On Mon, Jun 15, 2026 at 04:16:12PM -0700, Vjaceslavs Klimovs wrote:
> > > > Your trace looks like what the two earlier reports hit: a read reaching
> > > > a leaf device with sectors > 0 but phys_seg 0 (an empty bio). One aside
> > > > that may help read the trace: blk_io_trace.error is a __u16, so the
> > > > bracketed values on your C lines are errnos as u16 (65514 = -EINVAL,
> > > > 65531 = -EIO).
> > > > 
> > > > The WARN itself is new, the bad bio isn't. bio_add_page() only started
> > > > rejecting len == 0 in 643893647cac ("block: reject zero length in
> > > > bio_add_page()", v7.1-rc1); on 7.0.8 the same empty bio tripped
> > > > scsi_alloc_sgtables()'s !nr_segs instead, which matches what you saw.
> > > > That fits your "not a recent regression": the condition is older, v7.1
> > > > just made it loud.
> > > > 
> > > > For Tomas's and my reports (QEMU O_DIRECT to the LV block device) the
> > > > origin looks like 5ff3f74e145a ("block: simplify direct io validity
> > > > check", v6.18): blkdev_dio_invalid() now checks only aggregate
> > > > ki_pos | count alignment and dropped the per-segment
> > > > bdev_iter_is_aligned() walk, so a degenerate or misaligned O_DIRECT no
> > > > longer gets -EINVAL at the fops boundary. But your reproducer reads a
> > > > file, which goes through the filesystem O_DIRECT path and never calls
> > > > blkdev_dio_invalid(), and still makes the empty bio. So it isn't only
> > > > that one entry point.
> > > > 
> > > > dm-mirror then hangs because Keith's f7b24c7b41f2 only covers md
> > > > raid1/raid10; legacy dm-mirror (dm-raid1.c) has no equivalent and
> > > > rebuilds the empty read onto the other leg. Note the leg's status isn't
> > > > even consistent (your SATA path returns BLK_STS_IOERR, not
> > > > BLK_STS_INVAL), so copying that status check into dm-mirror probably
> > > > wouldn't catch every case.
> > > > 
> > > > For what it's worth, that points me toward rejecting the empty or
> > > > misaligned bio once, at submission, with -EINVAL, rather than teaching
> > > > each consumer to tolerate it. But you'll know the tradeoffs far better
> > > > than I do.
> > > > 
> > > > I have a small QEMU + LVM raid1/mirror setup that reproduces the
> > > > block-device variant and bisects to 5ff3f74e. Happy to run your file
> > > > reproducer with some instrumentation at the dm-mirror read entry
> > > > (bi_size vs bio_sectors vs bvec lengths) to see whether the bio is
> > > > already empty on arrival or built that way on the retry, and to test
> > > > any patch.
> > > 
> > > Thanks for following up here. I didn't initially see your follow-up
> > > until Thorsten linked it. I apologize for missing that, this feature is
> > > important so I don't want to see anything regress for it.
> > > 
> > > There is a known bug fix I think future tests should include:
> > > 
> > >   https://lore.kernel.org/linux-block/20260612223205.465913-1-kbusch@meta.com/
> > 
> > > This likely isn't the fix you're looking for, but including it rules out
> > > conditions that are not important here.
> > > 
> > > After that, can we try this suggestion and see if the hang goes away?
> > > 
> > >   https://lore.kernel.org/linux-block/ajBb8tK-0aJBpIgF@kbusch-mbp/
> > 
> > With just that one in, the machine survives - thanks!
> > 
> > It does give:
> > 
> > [  505.208354] device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
> > [  505.239376] device-mapper: raid1: All sides of mirror have failed.
> > [  505.239389] device-mapper: raid1: Read failure on mirror device 252:25.  Failing I/O.
> > [  505.239394] device-mapper: raid1: Mirror read failed.
> > 
> > Although as far as I can tell the RAID hasn't errored and is still in sync.
> > 
> > If I turn the test case into a write (just s/pread/pwrite/ ) - the machine
> > still survives but then it does lose raid sync, and the raid resync
> > seems to stick until I do a 'lvchange --refresh main/lvol0'
> > which recovers after having spat out a:
> > 
> > [  865.319527] Buffer I/O error on dev dm-26, logical block 262128, async page read
> > 
> > > I expect the original test case to still return an error (and I think it
> > > was designed to), but it shouldn't produce the warn or bug splats with a
> > > stuck uninterruptible task.
> > 
> > It's not clear to me if it was designed to fail or not; I've not had
> > a chance to rerun the original qemu block tests yet, and I don't know
> > if old kernels successfully used O_DIRECT in this case.
> > 
> > It still feels that my pwrite case above shouldn't cause a raid de-sync
> > (especially since a normal user can do it).
> 
> Just to follow up on that;  if I use the modern lvm mode 
> ( lvcreate  -m 1 -L 1G main /dev/sda2 /dev/sdb2 ) rather than
> the old mirror with the same patch, then:
> 
>   a) I get no log errors with either read or write
>   b) read still gives EIO
>   c) write apparently succeeds ?!

One more confirmation; running qemu's 'make check' during build passes
with no log errors (whether it skipped any tests due to its detection
code I don't know).

Dave

> Dave
> 
> > Dave
> > -- 
> >  -----Open up your eyes, open up your mind, open up your code -------   
> > / Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
> > \        dave @ treblig.org |                               | In Hex /
> >  \ _________________________|_____ http://www.treblig.org   |_______/
> -- 
>  -----Open up your eyes, open up your mind, open up your code -------   
> / Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
> \        dave @ treblig.org |                               | In Hex /
>  \ _________________________|_____ http://www.treblig.org   |_______/
-- 
 -----Open up your eyes, open up your mind, open up your code -------   
/ Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
\        dave @ treblig.org |                               | In Hex /
 \ _________________________|_____ http://www.treblig.org   |_______/

^ permalink raw reply

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Dr. David Alan Gilbert @ 2026-06-16 13:08 UTC (permalink / raw)
  To: Keith Busch, zkabelac
  Cc: Vjaceslavs Klimovs, Thorsten Leemhuis, trnka, linux-block,
	dm-devel, Linux kernel regressions list
In-Reply-To: <ajFISH9bvyWjLOM6@gallifrey>

* Dr. David Alan Gilbert (dave@treblig.org) wrote:
> * Keith Busch (kbusch@kernel.org) wrote:
> > On Mon, Jun 15, 2026 at 04:16:12PM -0700, Vjaceslavs Klimovs wrote:
> > > Your trace looks like what the two earlier reports hit: a read reaching
> > > a leaf device with sectors > 0 but phys_seg 0 (an empty bio). One aside
> > > that may help read the trace: blk_io_trace.error is a __u16, so the
> > > bracketed values on your C lines are errnos as u16 (65514 = -EINVAL,
> > > 65531 = -EIO).
> > > 
> > > The WARN itself is new, the bad bio isn't. bio_add_page() only started
> > > rejecting len == 0 in 643893647cac ("block: reject zero length in
> > > bio_add_page()", v7.1-rc1); on 7.0.8 the same empty bio tripped
> > > scsi_alloc_sgtables()'s !nr_segs instead, which matches what you saw.
> > > That fits your "not a recent regression": the condition is older, v7.1
> > > just made it loud.
> > > 
> > > For Tomas's and my reports (QEMU O_DIRECT to the LV block device) the
> > > origin looks like 5ff3f74e145a ("block: simplify direct io validity
> > > check", v6.18): blkdev_dio_invalid() now checks only aggregate
> > > ki_pos | count alignment and dropped the per-segment
> > > bdev_iter_is_aligned() walk, so a degenerate or misaligned O_DIRECT no
> > > longer gets -EINVAL at the fops boundary. But your reproducer reads a
> > > file, which goes through the filesystem O_DIRECT path and never calls
> > > blkdev_dio_invalid(), and still makes the empty bio. So it isn't only
> > > that one entry point.
> > > 
> > > dm-mirror then hangs because Keith's f7b24c7b41f2 only covers md
> > > raid1/raid10; legacy dm-mirror (dm-raid1.c) has no equivalent and
> > > rebuilds the empty read onto the other leg. Note the leg's status isn't
> > > even consistent (your SATA path returns BLK_STS_IOERR, not
> > > BLK_STS_INVAL), so copying that status check into dm-mirror probably
> > > wouldn't catch every case.
> > > 
> > > For what it's worth, that points me toward rejecting the empty or
> > > misaligned bio once, at submission, with -EINVAL, rather than teaching
> > > each consumer to tolerate it. But you'll know the tradeoffs far better
> > > than I do.
> > > 
> > > I have a small QEMU + LVM raid1/mirror setup that reproduces the
> > > block-device variant and bisects to 5ff3f74e. Happy to run your file
> > > reproducer with some instrumentation at the dm-mirror read entry
> > > (bi_size vs bio_sectors vs bvec lengths) to see whether the bio is
> > > already empty on arrival or built that way on the retry, and to test
> > > any patch.
> > 
> > Thanks for following up here. I didn't initially see your follow-up
> > until Thorsten linked it. I apologize for missing that, this feature is
> > important so I don't want to see anything regress for it.
> > 
> > There is a known bug fix I think future tests should include:
> > 
> >   https://lore.kernel.org/linux-block/20260612223205.465913-1-kbusch@meta.com/
> 
> > This likely isn't the fix you're looking for, but including it rules out
> > conditions that are not important here.
> > 
> > After that, can we try this suggestion and see if the hang goes away?
> > 
> >   https://lore.kernel.org/linux-block/ajBb8tK-0aJBpIgF@kbusch-mbp/
> 
> With just that one in, the machine survives - thanks!
> 
> It does give:
> 
> [  505.208354] device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
> [  505.239376] device-mapper: raid1: All sides of mirror have failed.
> [  505.239389] device-mapper: raid1: Read failure on mirror device 252:25.  Failing I/O.
> [  505.239394] device-mapper: raid1: Mirror read failed.
> 
> Although as far as I can tell the RAID hasn't errored and is still in sync.
> 
> If I turn the test case into a write (just s/pread/pwrite/ ) - the machine
> still survives but then it does lose raid sync, and the raid resync
> seems to stick until I do a 'lvchange --refresh main/lvol0'
> which recovers after having spat out a:
> 
> [  865.319527] Buffer I/O error on dev dm-26, logical block 262128, async page read
> 
> > I expect the original test case to still return an error (and I think it
> > was designed to), but it shouldn't produce the warn or bug splats with a
> > stuck uninterruptible task.
> 
> It's not clear to me if it was designed to fail or not; I've not had
> a chance to rerun the original qemu block tests yet, and I don't know
> if old kernels successfully used O_DIRECT in this case.
> 
> It still feels that my pwrite case above shouldn't cause a raid de-sync
> (especially since a normal user can do it).

Just to follow up on that;  if I use the modern lvm mode 
( lvcreate  -m 1 -L 1G main /dev/sda2 /dev/sdb2 ) rather than
the old mirror with the same patch, then:

  a) I get no log errors with either read or write
  b) read still gives EIO
  c) write apparently succeeds ?!

Dave

> Dave
> -- 
>  -----Open up your eyes, open up your mind, open up your code -------   
> / Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
> \        dave @ treblig.org |                               | In Hex /
>  \ _________________________|_____ http://www.treblig.org   |_______/
-- 
 -----Open up your eyes, open up your mind, open up your code -------   
/ Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
\        dave @ treblig.org |                               | In Hex /
 \ _________________________|_____ http://www.treblig.org   |_______/

^ permalink raw reply

* Re: [PATCH net v2 2/2] vsock/virtio: restore msg_iter on transmission failure
From: Stefano Garzarella @ 2026-06-16 12:59 UTC (permalink / raw)
  To: Octavian Purdila, g
  Cc: netdev, Alexander Viro, Andrew Morton, Arseniy Krasnov,
	David S. Miller, Eric Dumazet, Eugenio Pérez, Jakub Kicinski,
	Jason Wang, kvm, linux-block, linux-fsdevel, linux-kernel,
	Michael S. Tsirkin, Paolo Abeni, Simon Horman, Stefan Hajnoczi,
	virtualization, Xuan Zhuo, syzbot+28e5f3d207b14bae122a
In-Reply-To: <20260613000953.467473-3-tavip@google.com>

On Sat, Jun 13, 2026 at 12:09:53AM +0000, Octavian Purdila wrote:
>When transmission fails in virtio_transport_send_pkt_info, the msg_iter
>might have been partially advanced. If we don't restore it, the next
>attempt to send data will use an incorrect iterator state, leading to
>desync and warnings like "send_pkt() returns 0, but X expected".
>
>Specifically, this can happen in the following scenario, triggered by
>the syzkaller repro:
>
>1. A write-only VMA (PROT_WRITE only) is partially populated by a
>   prior TUN write that failed with -EIO but still faulted in some
>   pages.
>2. A vsock sendmmsg call with MSG_ZEROCOPY requests transmission of a
>   buffer from this VMA.
>3. The first packet (64KB) is sent successfully because the pages are
>   populated.
>4. The second packet allocation fails because GUP fast pins the first page
>   but GUP slow fails on the next unpopulated page due to PROT_WRITE-only
>   permissions.
>5. The iterator is advanced by the partially successful GUP (68KB total
>   advanced: 64KB from first packet + 4KB from second), but the send loop
>   breaks and only reports 64KB sent. This creates a 4KB desync.
>6. The next retry starts with a non-zero iov_offset, disabling zerocopy
>   and falling back to copy mode.
>7. In copy mode, the transmission succeeds for the next packets but
>   exhausts the iterator early because of the desync.
>8. The final retry sees an empty iterator but zerocopy is re-enabled
>   (offset resets). It attempts to send the remaining bytes with zerocopy
>   but pins 0 pages, creating an empty packet.
>9. The transport sends the empty packet, triggering the warning because
>   the returned bytes (header only) do not match the expected payload size.
>10. The loop continues to spin, allocating ubuf_info each time, eventually
>    exhausting sysctl_optmem_max and returning -ENOMEM to userspace.
>
>Restore msg_iter to its original state before the packet allocation
>and transmission attempt if they fail.
>
>Fixes: e0718bd82e27 ("vsock: enable setting SO_ZEROCOPY")
>Reported-by: syzbot+28e5f3d207b14bae122a@syzkaller.appspotmail.com
>Closes: https://syzkaller.appspot.com/bug?extid=28e5f3d207b14bae122a
>Assisted-by: gemini:gemini-3.1-pro
>Signed-off-by: Octavian Purdila <tavip@google.com>
>---
> net/vmw_vsock/virtio_transport_common.c | 13 +++++++++++++
> 1 file changed, 13 insertions(+)

Thanks, looks much better to me now!

Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>


^ permalink raw reply

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Dr. David Alan Gilbert @ 2026-06-16 12:57 UTC (permalink / raw)
  To: Keith Busch
  Cc: Vjaceslavs Klimovs, Thorsten Leemhuis, trnka, linux-block,
	dm-devel, Linux kernel regressions list
In-Reply-To: <ajCTaUaACV9eNmWo@kbusch-mbp>

* Keith Busch (kbusch@kernel.org) wrote:
> On Mon, Jun 15, 2026 at 04:16:12PM -0700, Vjaceslavs Klimovs wrote:
> > Your trace looks like what the two earlier reports hit: a read reaching
> > a leaf device with sectors > 0 but phys_seg 0 (an empty bio). One aside
> > that may help read the trace: blk_io_trace.error is a __u16, so the
> > bracketed values on your C lines are errnos as u16 (65514 = -EINVAL,
> > 65531 = -EIO).
> > 
> > The WARN itself is new, the bad bio isn't. bio_add_page() only started
> > rejecting len == 0 in 643893647cac ("block: reject zero length in
> > bio_add_page()", v7.1-rc1); on 7.0.8 the same empty bio tripped
> > scsi_alloc_sgtables()'s !nr_segs instead, which matches what you saw.
> > That fits your "not a recent regression": the condition is older, v7.1
> > just made it loud.
> > 
> > For Tomas's and my reports (QEMU O_DIRECT to the LV block device) the
> > origin looks like 5ff3f74e145a ("block: simplify direct io validity
> > check", v6.18): blkdev_dio_invalid() now checks only aggregate
> > ki_pos | count alignment and dropped the per-segment
> > bdev_iter_is_aligned() walk, so a degenerate or misaligned O_DIRECT no
> > longer gets -EINVAL at the fops boundary. But your reproducer reads a
> > file, which goes through the filesystem O_DIRECT path and never calls
> > blkdev_dio_invalid(), and still makes the empty bio. So it isn't only
> > that one entry point.
> > 
> > dm-mirror then hangs because Keith's f7b24c7b41f2 only covers md
> > raid1/raid10; legacy dm-mirror (dm-raid1.c) has no equivalent and
> > rebuilds the empty read onto the other leg. Note the leg's status isn't
> > even consistent (your SATA path returns BLK_STS_IOERR, not
> > BLK_STS_INVAL), so copying that status check into dm-mirror probably
> > wouldn't catch every case.
> > 
> > For what it's worth, that points me toward rejecting the empty or
> > misaligned bio once, at submission, with -EINVAL, rather than teaching
> > each consumer to tolerate it. But you'll know the tradeoffs far better
> > than I do.
> > 
> > I have a small QEMU + LVM raid1/mirror setup that reproduces the
> > block-device variant and bisects to 5ff3f74e. Happy to run your file
> > reproducer with some instrumentation at the dm-mirror read entry
> > (bi_size vs bio_sectors vs bvec lengths) to see whether the bio is
> > already empty on arrival or built that way on the retry, and to test
> > any patch.
> 
> Thanks for following up here. I didn't initially see your follow-up
> until Thorsten linked it. I apologize for missing that, this feature is
> important so I don't want to see anything regress for it.
> 
> There is a known bug fix I think future tests should include:
> 
>   https://lore.kernel.org/linux-block/20260612223205.465913-1-kbusch@meta.com/

> This likely isn't the fix you're looking for, but including it rules out
> conditions that are not important here.
> 
> After that, can we try this suggestion and see if the hang goes away?
> 
>   https://lore.kernel.org/linux-block/ajBb8tK-0aJBpIgF@kbusch-mbp/

With just that one in, the machine survives - thanks!

It does give:

[  505.208354] device-mapper: raid1: Mirror read failed from 252:24. Trying alternative device.
[  505.239376] device-mapper: raid1: All sides of mirror have failed.
[  505.239389] device-mapper: raid1: Read failure on mirror device 252:25.  Failing I/O.
[  505.239394] device-mapper: raid1: Mirror read failed.

Although as far as I can tell the RAID hasn't errored and is still in sync.

If I turn the test case into a write (just s/pread/pwrite/ ) - the machine
still survives but then it does lose raid sync, and the raid resync
seems to stick until I do a 'lvchange --refresh main/lvol0'
which recovers after having spat out a:

[  865.319527] Buffer I/O error on dev dm-26, logical block 262128, async page read

> I expect the original test case to still return an error (and I think it
> was designed to), but it shouldn't produce the warn or bug splats with a
> stuck uninterruptible task.

It's not clear to me if it was designed to fail or not; I've not had
a chance to rerun the original qemu block tests yet, and I don't know
if old kernels successfully used O_DIRECT in this case.

It still feels that my pwrite case above shouldn't cause a raid de-sync
(especially since a normal user can do it).

Dave
-- 
 -----Open up your eyes, open up your mind, open up your code -------   
/ Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
\        dave @ treblig.org |                               | In Hex /
 \ _________________________|_____ http://www.treblig.org   |_______/

^ permalink raw reply

* Re: [PATCH v4 0/3] btrfs: use IOMAP_DIO_BOUNCE flag instead of falling back to buffered IO
From: Christoph Hellwig @ 2026-06-16 12:45 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, linux-block, linux-fsdevel, linux-xfs
In-Reply-To: <cover.1781597506.git.wqu@suse.com>

Note: You'll need to include Jens for the block bits to get either an
ACK or a merge through the block tree.


^ permalink raw reply

* Re: [PATCH v4 2/3] block: respect iov_iter::nofault flag in bio_iov_iter_bounce_write()
From: Christoph Hellwig @ 2026-06-16 12:44 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, linux-block, linux-fsdevel, linux-xfs
In-Reply-To: <9c165a314022b61566eb247852eb773ca6c70889.1781597506.git.wqu@suse.com>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply

* Re: [PATCH v4 1/3] block: revert the iov_iter after a short copy in bio_iov_iter_bounce_write()
From: Christoph Hellwig @ 2026-06-16 12:44 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs, linux-block, linux-fsdevel, linux-xfs
In-Reply-To: <c400989f227343b134110773d5acaaacf7024574.1781597506.git.wqu@suse.com>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply

* Re: [PATCH 0/3] mm/zram: route block swap I/O through swap_ops
From: Christoph Hellwig @ 2026-06-16 12:36 UTC (permalink / raw)
  To: Jianyue Wu
  Cc: Andrew Morton, Christoph Hellwig, Chris Li, Baoquan He, Nhat Pham,
	Barry Song, Kairui Song, Kemeng Shi, Youngjun Park, Minchan Kim,
	Sergey Senozhatsky, Jens Axboe, Matthew Wilcox (Oracle), Jan Kara,
	linux-mm, linux-kernel, linux-block, linux-doc
In-Reply-To: <20260614-zram-swap-ops-block-register-v1-0-6c1a6639c222@gmail.com>

I fear this is going entirely in the wrong direction.

Yes, we have to keep zram around as a legacy interface for now,
but the right place to deal with compressed swap is in the core.

So please don't add more hacks for 'magic' block devices.

^ permalink raw reply

* Re: [PATCH net v2 1/2] iov_iter: export iov_iter_restore
From: Stefano Garzarella @ 2026-06-16 12:35 UTC (permalink / raw)
  To: Octavian Purdila
  Cc: netdev, Alexander Viro, Andrew Morton, Arseniy Krasnov,
	David S. Miller, Eric Dumazet, Eugenio Pérez, Jakub Kicinski,
	Jason Wang, kvm, linux-block, linux-fsdevel, linux-kernel,
	Michael S. Tsirkin, Paolo Abeni, Simon Horman, Stefan Hajnoczi,
	virtualization, Xuan Zhuo
In-Reply-To: <20260613000953.467473-2-tavip@google.com>

On Sat, Jun 13, 2026 at 12:09:52AM +0000, Octavian Purdila wrote:
>Export iov_iter_restore so that it can be used by modules.
>
>This is needed by the virtio vsock transport (which can be built as a
>module) to restore the msg_iter state when transmission fails.
>
>Signed-off-by: Octavian Purdila <tavip@google.com>
>---
> lib/iov_iter.c | 1 +
> 1 file changed, 1 insertion(+)

Acked-by: Stefano Garzarella <sgarzare@redhat.com>

>
>diff --git a/lib/iov_iter.c b/lib/iov_iter.c
>index 243662af1af73..067e745f9ef53 100644
>--- a/lib/iov_iter.c
>+++ b/lib/iov_iter.c
>@@ -1491,6 +1491,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
> 		i->__iov -= state->nr_segs - i->nr_segs;
> 	i->nr_segs = state->nr_segs;
> }
>+EXPORT_SYMBOL(iov_iter_restore);
>
> /*
>  * Extract a list of contiguous pages from an ITER_FOLIOQ iterator.  This does
>-- 
>2.54.0.1136.gdb2ca164c4-goog
>


^ permalink raw reply

* Re: [PATCH v2 2/5] block: split bdev_yield_claim() out of bdev_fput()
From: Jan Kara @ 2026-06-16 12:35 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Chris Mason, Jens Axboe, David Sterba, Jan Kara, Naohiro Aota,
	Josef Bacik, linux-btrfs, linux-block, linux-fsdevel
In-Reply-To: <20260616-work-super-freeze_deny_upstream-v2-2-b3567c7f994b@kernel.org>

On Tue 16-06-26 13:58:15, Christian Brauner wrote:
> bdev_fput() yields the holder claim and then closes the file, which is a
> deferred operation.  Split the yield half into bdev_yield_claim() so a caller
> can give up the holder while the file - and therefore the block device - is
> still open, act on the device, and only then bdev_fput().
> 
> A filesystem that made a device unfreezable for a membership change with
> bdev_deny_freeze() undoes the deny on release with
> 
> 	bdev_yield_claim(bdev_file);
> 	bdev_allow_freeze(file_bdev(bdev_file));
> 	bdev_fput(bdev_file);
> 
> Re-allowing only after the holder is yielded avoids stranding the filesystem
> on a racing freeze, and doing it while the file is still open avoids touching
> the block device after bdev_fput().  bdev_fput() yields again, which is a
> no-op once the claim has already been given up.
> 
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  block/bdev.c           | 50 ++++++++++++++++++++++++++++++++++----------------
>  include/linux/blkdev.h |  1 +
>  2 files changed, 35 insertions(+), 16 deletions(-)
> 
> diff --git a/block/bdev.c b/block/bdev.c
> index a83a3809380c..54b35a084c36 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -1200,6 +1200,39 @@ void bdev_release(struct file *bdev_file)
>  	blkdev_put_no_open(bdev);
>  }
>  
> +/**
> + * bdev_yield_claim - give up the holder claim on an open block device
> + * @bdev_file: open block device
> + *
> + * Yield the holder and any write access for @bdev_file without closing it, so
> + * the caller can still act on the device - e.g. bdev_allow_freeze() it - before
> + * the final bdev_fput().  bdev_fput() yields too, so calling it afterwards is
> + * safe.
> + */
> +void bdev_yield_claim(struct file *bdev_file)
> +{
> +	struct block_device *bdev;
> +	struct gendisk *disk;
> +
> +	if (!bdev_file->private_data)
> +		return;
> +
> +	bdev = file_bdev(bdev_file);
> +	disk = bdev->bd_disk;
> +
> +	mutex_lock(&disk->open_mutex);
> +	bdev_yield_write_access(bdev_file);
> +	bd_yield_claim(bdev_file);
> +	/*
> +	 * Tell release we already gave up our hold on the
> +	 * device and if write restrictions are available that
> +	 * we already gave up write access to the device.
> +	 */
> +	bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
> +	mutex_unlock(&disk->open_mutex);
> +}
> +EXPORT_SYMBOL_GPL(bdev_yield_claim);
> +
>  /**
>   * bdev_fput - yield claim to the block device and put the file
>   * @bdev_file: open block device
> @@ -1213,22 +1246,7 @@ void bdev_fput(struct file *bdev_file)
>  	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
>  		return;
>  
> -	if (bdev_file->private_data) {
> -		struct block_device *bdev = file_bdev(bdev_file);
> -		struct gendisk *disk = bdev->bd_disk;
> -
> -		mutex_lock(&disk->open_mutex);
> -		bdev_yield_write_access(bdev_file);
> -		bd_yield_claim(bdev_file);
> -		/*
> -		 * Tell release we already gave up our hold on the
> -		 * device and if write restrictions are available that
> -		 * we already gave up write access to the device.
> -		 */
> -		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
> -		mutex_unlock(&disk->open_mutex);
> -	}
> -
> +	bdev_yield_claim(bdev_file);
>  	fput(bdev_file);
>  }
>  EXPORT_SYMBOL(bdev_fput);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index cf1951caadb2..9fc16e3c8075 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1832,6 +1832,7 @@ int bdev_thaw(struct block_device *bdev);
>  int bdev_deny_freeze(struct block_device *bdev);
>  void bdev_allow_freeze(struct block_device *bdev);
>  void bdev_fput(struct file *bdev_file);
> +void bdev_yield_claim(struct file *bdev_file);
>  
>  struct io_comp_batch {
>  	struct rq_list req_list;
> 
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christoph Hellwig @ 2026-06-16 12:34 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Christoph Hellwig, Jan Kara, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260602-work-super-bdev_holder_global-v1-2-bb0fd82f3861@kernel.org>

On Tue, Jun 02, 2026 at 12:10:08PM +0200, Christian Brauner wrote:
> fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> forces the holder to be exactly one superblock and prevents several
> superblocks from sharing one block device. That's what erofs is doing.
> 
> Introduce a global dev_t-keyed rhltable mapping each block device to the
> superblock(s) using it. The holder argument becomes purely the block
> layer's exclusivity token (a superblock, or a file_system_type for
> shared devices) and is no longer needed by the fs specific callbacks.

Err, no.  Block devices need to have a specific owner.  If erofs wants
to share a device between superblocks it needs to come up with an entity
that owns the block devices which is not a superblock.

IMHO sharing devices between superblocks is a bad idea.  That ship
has sailed, but please keep it contained inside of erofs.


^ permalink raw reply

* [PATCH v2 5/5] btrfs: deny freezing devices undergoing a replace
From: Christian Brauner @ 2026-06-16 11:58 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-freeze_deny_upstream-v2-0-b3567c7f994b@kernel.org>

A device replace opens a target and, on success, frees the source on a live
filesystem from btrfs_dev_replace_finishing() - which cannot fail and also
runs from a kthread on mount resume.  A bdev_freeze() racing the source free
or the target swap-in would freeze the filesystem through a claim that is
being torn down or replaced, leaving nothing for bdev_thaw() to rebalance.

Make both devices unfreezable for the whole replace, with the invariant that
a STARTED replace holds one deny on each device and any other state holds
none.  The target is denied at open (btrfs_open_device_deny_freeze(), undone
on btrfs_init_dev_replace_tgtdev()'s error unwind); the source is denied at
the start of btrfs_dev_replace_start(), before mark_block_group_to_copy() so
every 'leave' unwind sees both denied.

The deny tracks the STARTED state and is dropped whenever the replace leaves
it: btrfs_dev_replace_finishing() re-allows the target it makes a member and
frees the source through btrfs_close_bdev(allow_freeze=true), and its
scrub-error path re-allows both as it cancels.  Its early failures (before
the device swap) keep the replace STARTED and resumable, so both stay denied.
Suspending for unmount re-allows both, so they are reopened freezable at the
next mount where btrfs_resume_dev_replace_async() re-denies them (staying
suspended if a device is frozen right then); a replace cancelled from the
suspended state therefore destroys the target without re-allowing freezing,
since a suspended replace holds no deny.
btrfs_close_bdev() and btrfs_destroy_dev_replace_tgtdev() take an allow_freeze
argument to carry this distinction; the unmount path
(btrfs_close_one_device()) passes false.

On resume, a failed kthread_run() re-allows both devices and goes through the
suspend path, resetting the replace to SUSPENDED and finishing the exclusive
operation instead of returning straight away.  The (re)mount still aborts on
that error; routing it through suspend keeps the deny balanced against the
unmount teardown and additionally drops BTRFS_EXCLOP_DEV_REPLACE, closing a
pre-existing leak that was harmless on the failed mount that frees the fs but
would have wedged future exclusive operations after a failed remount-rw.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/dev-replace.c | 65 ++++++++++++++++++++++++++++++++++++++++++++------
 fs/btrfs/volumes.c     | 18 +++++++++-----
 fs/btrfs/volumes.h     |  3 ++-
 3 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 8f8fa14886de..4ae34acb89e8 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -247,8 +247,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 		return -EINVAL;
 	}
 
-	bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
-					   fs_info->sb, &fs_holder_ops);
+	/* Unfreezable for the whole replace; see btrfs_dev_replace_start(). */
+	bdev_file = btrfs_open_device_deny_freeze(device_path, fs_info->sb);
 	if (IS_ERR(bdev_file)) {
 		btrfs_err(fs_info, "target device %s is invalid!", device_path);
 		return PTR_ERR(bdev_file);
@@ -325,7 +325,8 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	return 0;
 
 error:
-	bdev_fput(bdev_file);
+	/* Undo the open-time freeze deny. */
+	btrfs_release_device_allow_freeze(bdev_file);
 	return ret;
 }
 
@@ -622,6 +623,15 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	if (ret)
 		return ret;
 
+	/* Deny the source before mark, so every 'leave' unwinds both denied. */
+	if (src_device->bdev) {
+		ret = bdev_deny_freeze(src_device->bdev);
+		if (ret) {
+			btrfs_destroy_dev_replace_tgtdev(tgt_device, true);
+			return ret;
+		}
+	}
+
 	ret = mark_block_group_to_copy(fs_info, src_device);
 	if (ret)
 		return ret;
@@ -706,7 +716,9 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 	return ret;
 
 leave:
-	btrfs_destroy_dev_replace_tgtdev(tgt_device);
+	if (src_device->bdev)
+		bdev_allow_freeze(src_device->bdev);
+	btrfs_destroy_dev_replace_tgtdev(tgt_device, true);
 	return ret;
 }
 
@@ -887,6 +899,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	 */
 	ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
 	if (ret) {
+		/* Stays started/resumable; keep both denied. */
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 		return ret;
 	}
@@ -900,6 +913,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	while (1) {
 		trans = btrfs_start_transaction(root, 0);
 		if (IS_ERR(trans)) {
+			/* Stays started/resumable; keep both denied. */
 			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 			return PTR_ERR(trans);
 		}
@@ -952,7 +966,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		mutex_unlock(&fs_devices->device_list_mutex);
 		btrfs_rm_dev_replace_blocked(fs_info);
 		if (tgt_device)
-			btrfs_destroy_dev_replace_tgtdev(tgt_device);
+			btrfs_destroy_dev_replace_tgtdev(tgt_device, true);
+		/* The source stays a member; re-allow freezing it. */
+		if (src_device->bdev)
+			bdev_allow_freeze(src_device->bdev);
 		btrfs_rm_dev_replace_unblocked(fs_info);
 		mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
@@ -1018,6 +1035,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 
 	mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 
+	/* The target is now a member; the source is freed (allow + release). */
+	bdev_allow_freeze(tgt_device->bdev);
 	btrfs_rm_dev_replace_free_srcdev(src_device);
 
 	return 0;
@@ -1146,8 +1165,9 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 			btrfs_dev_name(src_device), src_device->devid,
 			btrfs_dev_name(tgt_device));
 
+		/* A suspended replace never re-denied freezing; do not allow. */
 		if (tgt_device)
-			btrfs_destroy_dev_replace_tgtdev(tgt_device);
+			btrfs_destroy_dev_replace_tgtdev(tgt_device, false);
 		break;
 	default:
 		up_write(&dev_replace->rwsem);
@@ -1177,6 +1197,11 @@ void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 		dev_replace->time_stopped = ktime_get_real_seconds();
 		dev_replace->item_needs_writeback = 1;
 		btrfs_info(fs_info, "suspending dev_replace for unmount");
+		/* Reopened freezable next mount; resume re-denies. */
+		if (dev_replace->srcdev && dev_replace->srcdev->bdev)
+			bdev_allow_freeze(dev_replace->srcdev->bdev);
+		if (dev_replace->tgtdev && dev_replace->tgtdev->bdev)
+			bdev_allow_freeze(dev_replace->tgtdev->bdev);
 		break;
 	}
 
@@ -1189,6 +1214,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 {
 	struct task_struct *task;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	int ret = 0;
 
 	down_write(&dev_replace->rwsem);
 
@@ -1232,8 +1258,33 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 		return 0;
 	}
 
+	/* Re-deny for the resumed replace; stay suspended if frozen now. */
+	if (dev_replace->srcdev->bdev &&
+	    bdev_deny_freeze(dev_replace->srcdev->bdev))
+		goto suspend;
+	if (bdev_deny_freeze(dev_replace->tgtdev->bdev)) {
+		if (dev_replace->srcdev->bdev)
+			bdev_allow_freeze(dev_replace->srcdev->bdev);
+		goto suspend;
+	}
+
 	task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
-	return PTR_ERR_OR_ZERO(task);
+	if (IS_ERR(task)) {
+		bdev_allow_freeze(dev_replace->tgtdev->bdev);
+		if (dev_replace->srcdev->bdev)
+			bdev_allow_freeze(dev_replace->srcdev->bdev);
+		/* Undo the deny and suspend, but still fail the mount. */
+		ret = PTR_ERR(task);
+		goto suspend;
+	}
+	return 0;
+
+suspend:
+	btrfs_exclop_finish(fs_info);
+	down_write(&dev_replace->rwsem);
+	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
+	up_write(&dev_replace->rwsem);
+	return ret;
 }
 
 static int btrfs_dev_replace_kthread(void *data)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 167a1c3d0fca..9ffc5329f6b2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1128,7 +1128,7 @@ void btrfs_release_device_allow_freeze(struct file *bdev_file)
 	bdev_fput(bdev_file);
 }
 
-static void btrfs_close_bdev(struct btrfs_device *device)
+static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze)
 {
 	if (!device->bdev)
 		return;
@@ -1138,7 +1138,11 @@ static void btrfs_close_bdev(struct btrfs_device *device)
 		invalidate_bdev(device->bdev);
 	}
 
-	bdev_fput(device->bdev_file);
+	/* @allow_freeze undoes a replace-time deny; unmount-close was never denied. */
+	if (allow_freeze)
+		btrfs_release_device_allow_freeze(device->bdev_file);
+	else
+		bdev_fput(device->bdev_file);
 }
 
 static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1159,7 +1163,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
 		fs_devices->missing_devices--;
 	}
 
-	btrfs_close_bdev(device);
+	btrfs_close_bdev(device, false);
 	if (device->bdev) {
 		fs_devices->open_devices--;
 		device->bdev = NULL;
@@ -2511,7 +2515,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
 
 	mutex_lock(&uuid_mutex);
 
-	btrfs_close_bdev(srcdev);
+	/* The source was made unfreezable for the replace; undo it. */
+	btrfs_close_bdev(srcdev, true);
 	synchronize_rcu();
 	btrfs_free_device(srcdev);
 
@@ -2532,7 +2537,8 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
 	mutex_unlock(&uuid_mutex);
 }
 
-void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev,
+				      bool allow_freeze)
 {
 	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
 
@@ -2553,7 +2559,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
 
 	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);
 
-	btrfs_close_bdev(tgtdev);
+	btrfs_close_bdev(tgtdev, allow_freeze);
 	synchronize_rcu();
 	btrfs_free_device(tgtdev);
 }
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 75c7963f5d4c..65de9504d887 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -790,7 +790,8 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
-void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev,
+				      bool allow_freeze);
 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 				    u64 logical);
 u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 4/5] btrfs: deny freezing a device while it is being added
From: Christian Brauner @ 2026-06-16 11:58 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-freeze_deny_upstream-v2-0-b3567c7f994b@kernel.org>

btrfs_init_new_device() opens and claims the new device on a live
superblock without holding the write count, so a bdev_freeze() racing the
window between the claim being published and the device becoming a member
could freeze the filesystem through a claim the add may still abort and tear
down.

Add btrfs_open_device_deny_freeze(): it opens the device once
non-exclusively to take the freeze deny, then claims it by the same dev_t,
so the holder is only ever published while the device is already
unfreezable.  Keep it denied until the add is durable: bdev_allow_freeze()
on each success return (the device is now a committed member),
btrfs_release_device_allow_freeze() on the error unwind.  The deny spans the
whole add, including the seeding tail whose late failures still release the
device.  A device already frozen when the add starts is refused with -EBUSY.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/volumes.c | 46 +++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/volumes.h |  2 ++
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 36f9835f65e3..167a1c3d0fca 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2822,6 +2822,37 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
 	return 0;
 }
 
+/*
+ * Open @path for @sb with freezing denied before the holder claim is published,
+ * so a racing bdev_freeze() can never reach a claim a device add or replace may
+ * still abort.  The deny is taken on a throwaway non-holder probe open, then the
+ * holder is opened by the probe's dev_t.  Balanced by the caller.
+ */
+struct file *btrfs_open_device_deny_freeze(const char *path,
+					   struct super_block *sb)
+{
+	struct file *probe_file, *bdev_file;
+	int ret;
+
+	/* WRITE so bdev_file_open_by_path() rejects a read-only device. */
+	probe_file = bdev_file_open_by_path(path, BLK_OPEN_WRITE, NULL, NULL);
+	if (IS_ERR(probe_file))
+		return probe_file;
+
+	ret = bdev_deny_freeze(file_bdev(probe_file));
+	if (ret) {
+		bdev_fput(probe_file);
+		return ERR_PTR(ret);
+	}
+
+	bdev_file = bdev_file_open_by_dev(file_bdev(probe_file)->bd_dev,
+					  BLK_OPEN_WRITE, sb, &fs_holder_ops);
+	if (IS_ERR(bdev_file))
+		bdev_allow_freeze(file_bdev(probe_file));
+	bdev_fput(probe_file);
+	return bdev_file;
+}
+
 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
 {
 	struct btrfs_root *root = fs_info->dev_root;
@@ -2840,8 +2871,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	if (sb_rdonly(sb) && !fs_devices->seeding)
 		return -EROFS;
 
-	bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
-					   fs_info->sb, &fs_holder_ops);
+	/* Forbid freezing until the device is a committed member (or unwound). */
+	bdev_file = btrfs_open_device_deny_freeze(device_path, fs_info->sb);
 	if (IS_ERR(bdev_file))
 		return PTR_ERR(bdev_file);
 
@@ -3006,8 +3037,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 		up_write(&sb->s_umount);
 		locked = false;
 
-		if (ret) /* transaction commit */
+		if (ret) { /* transaction commit */
+			bdev_allow_freeze(file_bdev(bdev_file));
 			return ret;
+		}
 
 		ret = btrfs_relocate_sys_chunks(fs_info);
 		if (ret < 0)
@@ -3015,8 +3048,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
 		trans = btrfs_attach_transaction(root);
 		if (IS_ERR(trans)) {
-			if (PTR_ERR(trans) == -ENOENT)
+			if (PTR_ERR(trans) == -ENOENT) {
+				bdev_allow_freeze(file_bdev(bdev_file));
 				return 0;
+			}
 			ret = PTR_ERR(trans);
 			trans = NULL;
 			goto error_sysfs;
@@ -3036,6 +3071,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	/* Update ctime/mtime for blkid or udev */
 	update_dev_time(device_path);
 
+	bdev_allow_freeze(file_bdev(bdev_file));
 	return ret;
 
 error_sysfs:
@@ -3065,7 +3101,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 error_free_device:
 	btrfs_free_device(device);
 error:
-	bdev_fput(bdev_file);
+	btrfs_release_device_allow_freeze(bdev_file);
 	if (locked) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 60e82c15881a..75c7963f5d4c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -769,6 +769,8 @@ struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices
 				       const struct btrfs_dev_lookup_args *args);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path);
+struct file *btrfs_open_device_deny_freeze(const char *path,
+					   struct super_block *sb);
 int btrfs_balance(struct btrfs_fs_info *fs_info,
 		  struct btrfs_balance_control *bctl,
 		  struct btrfs_ioctl_balance_args *bargs);

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 3/5] btrfs: deny freezing a device while it is being removed
From: Christian Brauner @ 2026-06-16 11:58 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-freeze_deny_upstream-v2-0-b3567c7f994b@kernel.org>

btrfs_rm_device() runs under mnt_want_write_file(), but the claim on the
removed device is released by the ioctl after mnt_drop_write_file(), so a
bdev_freeze() racing that window could freeze the filesystem through the
device just as its claim is torn down, leaving nothing for bdev_thaw() to
rebalance.

The window cannot be closed by reordering the teardown.  btrfs_rm_device()
hands the final bdev_fput() back to the ioctl, run only after
mnt_drop_write_file(), because bdev_release() takes the disk ->open_mutex and
its dependency chain, which must not nest under the superblock's freeze/write
protection -- freeze_super() drops s_umount before draining writers precisely
to keep sb_start_write ordered above s_umount.  Holding mnt_want_write across
bdev_fput() would reintroduce that inversion, so the holder teardown is forced
outside the write-protected section.  A freeze landing in the resulting gap
resolves the still-live holder, rides in, and strands when the claim is
released; no ordering of the close against the drop removes the gap.  The
device itself therefore has to refuse freezing for the whole removal.

Deny freezing the device for the duration of the removal: bdev_deny_freeze()
at the start of btrfs_rm_device() (it cannot be frozen yet, the ioctl holds
the write count), and release it through btrfs_release_device_allow_freeze()
in the ioctls on success, or bdev_allow_freeze() on the error paths that keep
the device a member.  A device frozen before the removal begins is refused
with -EBUSY.

btrfs_release_device_allow_freeze() yields the holder, re-allows freezing,
then closes the device, so the re-allow neither strands the filesystem on a
racing freeze nor touches the block device after the final fput.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/ioctl.c   |  4 ++--
 fs/btrfs/volumes.c | 20 ++++++++++++++++++++
 fs/btrfs/volumes.h |  1 +
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b2e447f5005c..fc3e06445211 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2579,7 +2579,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 err_drop:
 	mnt_drop_write_file(file);
 	if (bdev_file)
-		bdev_fput(bdev_file);
+		btrfs_release_device_allow_freeze(bdev_file);
 out:
 	btrfs_put_dev_args_from_path(&args);
 	kfree(vol_args);
@@ -2630,7 +2630,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)

 	mnt_drop_write_file(file);
 	if (bdev_file)
-		bdev_fput(bdev_file);
+		btrfs_release_device_allow_freeze(bdev_file);
 out:
 	btrfs_put_dev_args_from_path(&args);
 out_free:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a88e68f90564..36f9835f65e3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1119,6 +1119,15 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
 	mutex_unlock(&uuid_mutex);
 }

+/* Release a device that was made unfreezable for a membership change. */
+void btrfs_release_device_allow_freeze(struct file *bdev_file)
+{
+	/* Yield before allow (strand-safe); file still open for the allow (UAF-safe). */
+	bdev_yield_claim(bdev_file);
+	bdev_allow_freeze(file_bdev(bdev_file));
+	bdev_fput(bdev_file);
+}
+
 static void btrfs_close_bdev(struct btrfs_device *device)
 {
 	if (!device->bdev)
@@ -2336,6 +2345,13 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 	    fs_info->fs_devices->rw_devices == 1)
 		return BTRFS_ERROR_DEV_ONLY_WRITABLE;

+	/* Removal and freezing are mutually exclusive; refuse if frozen now. */
+	if (device->bdev) {
+		ret = bdev_deny_freeze(device->bdev);
+		if (ret)
+			return ret;
+	}
+
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_del_init(&device->dev_alloc_list);
@@ -2362,6 +2378,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 			   device->devid, ret);
 		btrfs_abort_transaction(trans, ret);
 		btrfs_end_transaction(trans);
+		if (device->bdev)
+			bdev_allow_freeze(device->bdev);
 		return ret;
 	}

@@ -2447,6 +2465,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 	return btrfs_commit_transaction(trans);

 error_undo:
+	if (device->bdev)
+		bdev_allow_freeze(device->bdev);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_add(&device->dev_alloc_list,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 0082c166af91..60e82c15881a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -744,6 +744,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 struct btrfs_device *btrfs_scan_one_device(const char *path, bool mount_arg_dev);
 int btrfs_forget_devices(dev_t devt);
 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
+void btrfs_release_device_allow_freeze(struct file *bdev_file);
 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices);
 void btrfs_assign_next_active_device(struct btrfs_device *device,
 				     struct btrfs_device *this_dev);

-- 
2.47.3

^ permalink raw reply related

* [PATCH v2 2/5] block: split bdev_yield_claim() out of bdev_fput()
From: Christian Brauner @ 2026-06-16 11:58 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-freeze_deny_upstream-v2-0-b3567c7f994b@kernel.org>

bdev_fput() yields the holder claim and then closes the file, which is a
deferred operation.  Split the yield half into bdev_yield_claim() so a caller
can give up the holder while the file - and therefore the block device - is
still open, act on the device, and only then bdev_fput().

A filesystem that made a device unfreezable for a membership change with
bdev_deny_freeze() undoes the deny on release with

	bdev_yield_claim(bdev_file);
	bdev_allow_freeze(file_bdev(bdev_file));
	bdev_fput(bdev_file);

Re-allowing only after the holder is yielded avoids stranding the filesystem
on a racing freeze, and doing it while the file is still open avoids touching
the block device after bdev_fput().  bdev_fput() yields again, which is a
no-op once the claim has already been given up.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 block/bdev.c           | 50 ++++++++++++++++++++++++++++++++++----------------
 include/linux/blkdev.h |  1 +
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index a83a3809380c..54b35a084c36 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1200,6 +1200,39 @@ void bdev_release(struct file *bdev_file)
 	blkdev_put_no_open(bdev);
 }
 
+/**
+ * bdev_yield_claim - give up the holder claim on an open block device
+ * @bdev_file: open block device
+ *
+ * Yield the holder and any write access for @bdev_file without closing it, so
+ * the caller can still act on the device - e.g. bdev_allow_freeze() it - before
+ * the final bdev_fput().  bdev_fput() yields too, so calling it afterwards is
+ * safe.
+ */
+void bdev_yield_claim(struct file *bdev_file)
+{
+	struct block_device *bdev;
+	struct gendisk *disk;
+
+	if (!bdev_file->private_data)
+		return;
+
+	bdev = file_bdev(bdev_file);
+	disk = bdev->bd_disk;
+
+	mutex_lock(&disk->open_mutex);
+	bdev_yield_write_access(bdev_file);
+	bd_yield_claim(bdev_file);
+	/*
+	 * Tell release we already gave up our hold on the
+	 * device and if write restrictions are available that
+	 * we already gave up write access to the device.
+	 */
+	bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
+	mutex_unlock(&disk->open_mutex);
+}
+EXPORT_SYMBOL_GPL(bdev_yield_claim);
+
 /**
  * bdev_fput - yield claim to the block device and put the file
  * @bdev_file: open block device
@@ -1213,22 +1246,7 @@ void bdev_fput(struct file *bdev_file)
 	if (WARN_ON_ONCE(bdev_file->f_op != &def_blk_fops))
 		return;
 
-	if (bdev_file->private_data) {
-		struct block_device *bdev = file_bdev(bdev_file);
-		struct gendisk *disk = bdev->bd_disk;
-
-		mutex_lock(&disk->open_mutex);
-		bdev_yield_write_access(bdev_file);
-		bd_yield_claim(bdev_file);
-		/*
-		 * Tell release we already gave up our hold on the
-		 * device and if write restrictions are available that
-		 * we already gave up write access to the device.
-		 */
-		bdev_file->private_data = BDEV_I(bdev_file->f_mapping->host);
-		mutex_unlock(&disk->open_mutex);
-	}
-
+	bdev_yield_claim(bdev_file);
 	fput(bdev_file);
 }
 EXPORT_SYMBOL(bdev_fput);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cf1951caadb2..9fc16e3c8075 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1832,6 +1832,7 @@ int bdev_thaw(struct block_device *bdev);
 int bdev_deny_freeze(struct block_device *bdev);
 void bdev_allow_freeze(struct block_device *bdev);
 void bdev_fput(struct file *bdev_file);
+void bdev_yield_claim(struct file *bdev_file);
 
 struct io_comp_batch {
 	struct rq_list req_list;

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 1/5] block: allow making a block device unfreezable
From: Christian Brauner @ 2026-06-16 11:58 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-freeze_deny_upstream-v2-0-b3567c7f994b@kernel.org>

Add bdev_deny_freeze() and bdev_allow_freeze(), modeled on
deny_write_access()/allow_write_access().  bd_fsfreeze_count becomes a
signed counter: > 0 counts active freezes, < 0 counts deniers, and the
two regimes are mutually exclusive.  bdev_freeze() refuses with -EBUSY
while a deny is held, and bdev_deny_freeze() refuses while the device is
frozen.

A filesystem that changes its device membership (e.g. a btrfs device add,
remove or replace) denies freezing of the affected device for the
duration, so that a holder claim which a freeze walk might act on is never
added or torn down behind the freezer's back.

The deny/allow helpers are each a single atomic operation on
bd_fsfreeze_count and take no lock, so they can be called while holding
s_umount without inverting against bdev_freeze()'s
bd_fsfreeze_mutex -> s_umount lock order.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 block/bdev.c              | 63 +++++++++++++++++++++++++++++++++++++++--------
 include/linux/blk_types.h |  2 +-
 include/linux/blkdev.h    |  2 ++
 3 files changed, 56 insertions(+), 11 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index bb0ffa3bb4df..a83a3809380c 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -304,7 +304,12 @@ int bdev_freeze(struct block_device *bdev)
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
-	if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
+	/* Refuse while a freeze deny is held (count < 0), e.g. device add/remove. */
+	if (!atomic_inc_unless_negative(&bdev->bd_fsfreeze_count)) {
+		mutex_unlock(&bdev->bd_fsfreeze_mutex);
+		return -EBUSY;
+	}
+	if (atomic_read(&bdev->bd_fsfreeze_count) > 1) {
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		return 0;
 	}
@@ -340,18 +345,18 @@ int bdev_thaw(struct block_device *bdev)
 
 	mutex_lock(&bdev->bd_fsfreeze_mutex);
 
-	/*
-	 * If this returns < 0 it means that @bd_fsfreeze_count was
-	 * already 0 and no decrement was performed.
-	 */
-	nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
-	if (nr_freeze < 0)
+	/* <= 0: not frozen (0) or a freeze deny is held (< 0); leave it. */
+	nr_freeze = atomic_read(&bdev->bd_fsfreeze_count);
+	if (nr_freeze <= 0)
 		goto out;
 
 	error = 0;
-	if (nr_freeze > 0)
+	if (nr_freeze > 1) {
+		atomic_dec(&bdev->bd_fsfreeze_count);
 		goto out;
+	}
 
+	/* Keep the count positive across the thaw so a deny is refused. */
 	mutex_lock(&bdev->bd_holder_lock);
 	if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
 		error = bdev->bd_holder_ops->thaw(bdev);
@@ -360,14 +365,52 @@ int bdev_thaw(struct block_device *bdev)
 		mutex_unlock(&bdev->bd_holder_lock);
 	}
 
-	if (error)
-		atomic_inc(&bdev->bd_fsfreeze_count);
+	if (!error)
+		atomic_dec(&bdev->bd_fsfreeze_count);
 out:
 	mutex_unlock(&bdev->bd_fsfreeze_mutex);
 	return error;
 }
 EXPORT_SYMBOL(bdev_thaw);
 
+/**
+ * bdev_deny_freeze - make a block device unfreezable
+ * @bdev: block device
+ *
+ * Reserve @bdev against bdev_freeze() the way deny_write_access() reserves a
+ * file against writers.  bd_fsfreeze_count is sign-encoded: > 0 counts active
+ * freezes, < 0 counts deniers, so a deny succeeds only while no freeze is in
+ * progress.  While held, bdev_freeze() returns -EBUSY.  Pair with
+ * bdev_allow_freeze().
+ *
+ * A filesystem removing, adding or replacing a member device denies freezes on
+ * it for the duration, so a claim a freeze walk might act on is never torn down
+ * behind the freezer's back.  The deny is device-scoped, not (device,
+ * superblock)-scoped: a device shared by several superblocks is refused for all
+ * of them.  No in-tree filesystem removes a shared claim from a live superblock.
+ *
+ * Return: 0, or -EBUSY if the device is currently frozen.
+ */
+int bdev_deny_freeze(struct block_device *bdev)
+{
+	return atomic_dec_unless_positive(&bdev->bd_fsfreeze_count) ? 0 : -EBUSY;
+}
+EXPORT_SYMBOL_GPL(bdev_deny_freeze);
+
+/**
+ * bdev_allow_freeze - allow freezing a block device again
+ * @bdev: block device
+ *
+ * Undo one bdev_deny_freeze().
+ */
+void bdev_allow_freeze(struct block_device *bdev)
+{
+	/* A deny must be held, i.e. the count must be negative. */
+	WARN_ON_ONCE(atomic_read(&bdev->bd_fsfreeze_count) >= 0);
+	atomic_inc(&bdev->bd_fsfreeze_count);
+}
+EXPORT_SYMBOL_GPL(bdev_allow_freeze);
+
 /*
  * pseudo-fs
  */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c..5a725a0cd35f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -66,7 +66,7 @@ struct block_device {
 	int			bd_holders;
 	struct kobject		*bd_holder_dir;
 
-	atomic_t		bd_fsfreeze_count; /* number of freeze requests */
+	atomic_t		bd_fsfreeze_count; /* >0 freeze requests, <0 freeze deniers */
 	struct mutex		bd_fsfreeze_mutex; /* serialize freeze/thaw */
 
 	struct partition_meta_info *bd_meta_info;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 890128cdea1c..cf1951caadb2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1829,6 +1829,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev)
 
 int bdev_freeze(struct block_device *bdev);
 int bdev_thaw(struct block_device *bdev);
+int bdev_deny_freeze(struct block_device *bdev);
+void bdev_allow_freeze(struct block_device *bdev);
 void bdev_fput(struct file *bdev_file);
 
 struct io_comp_batch {

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 0/5] block,btrfs: fix frozen-superblock strand on device add/remove/replace
From: Christian Brauner @ 2026-06-16 11:58 UTC (permalink / raw)
  To: Chris Mason, Jens Axboe, David Sterba, Jan Kara
  Cc: Naohiro Aota, Josef Bacik, linux-btrfs, linux-block,
	linux-fsdevel, Christian Brauner (Amutable)

This is another series of fixes that fell out of the device to
superblock hashtable work. These are all pre-existing bugs.

A block-device freeze that races a btrfs device membership change can leave
the whole filesystem stuck frozen, recoverable only with a manual FITHAW.

btrfs holds each of its devices open with the superblock as the block-device
holder.  bdev_freeze() - issued by "dmsetup suspend" or an LVM snapshot -
resolves that holder to freeze the filesystem, and bdev_thaw() ("dmsetup
resume") resolves it again to thaw.  If a freeze lands while btrfs is adding,
removing or replacing a device, it rides in on the device's holder link and
freezes the filesystem; the membership change then drops that link, so the
matching thaw can no longer find the superblock.  The filesystem stays frozen
with no way back short of FITHAW.

To reproduce on the remove path: build a two-device btrfs with one member
behind a dm-linear target, write enough data that removing that member
relocates for a few seconds, start "btrfs device remove" on it, and
"dmsetup suspend" the dm device while the removal is underway.  The suspend's
freeze blocks on the remove ioctl's write access and rides in as the ioctl
drops it; the removal then clears the device's holder link, so the matching
"dmsetup resume" can no longer reach the superblock.  On an unpatched kernel
the filesystem is left frozen and the next write hangs in D state until a
manual FITHAW (fsfreeze -u).

The fix lets a filesystem forbid freezing a device for the duration of a
membership change, modelled on deny_write_access()/allow_write_access().
bd_fsfreeze_count becomes signed: > 0 counts active freezes, < 0 counts deny
holders, and the two are mutually exclusive.  bdev_deny_freeze() reserves the
device (bdev_freeze() then returns -EBUSY) and bdev_allow_freeze() releases
it; each is a single lockless atomic operation, so a filesystem can deny
under s_umount without inverting against bdev_freeze()'s
bd_fsfreeze_mutex -> s_umount lock order.  btrfs
denies the device across each add, remove and replace, so a racing freeze is
refused instead of riding in, while a normal freeze of a settled member
still works.

To re-allow freezing safely on release, bdev_yield_claim() is split out of
bdev_fput(): the caller yields the holder while the device file is still
open, re-allows freezing on the now-holderless device, and only then closes
it. Re-allowing after the holder is gone avoids re-stranding on a racing
freeze; doing it while the file is still open keeps the block device alive
without referencing it after the final fput.

With the fix the racing suspend is refused with -EBUSY mid-removal and the
filesystem stays writable.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
Changes in v2:
- block: bdev_thaw() now keeps bd_fsfreeze_count positive across the thaw
  and only drops it to 0 on success, so a bdev_deny_freeze() racing the thaw
  is refused instead of slipping in on a transient 0 and corrupting the
  sign-encoded counter.
- block: bdev_allow_freeze() WARN_ON_ONCE()s an unbalanced call (Jan Kara).
- block: bdev_yield_claim() early-returns instead of wrapping its body in an
  if (Johannes Thumshirn).
- btrfs: btrfs_open_device_deny_freeze() opens the probe BLK_OPEN_WRITE so a
  read-only device is rejected at "device add"; the by-dev open of the
  holder skipped the read-only check the previous by-path open enforced.
- Reword the cover: FIFREEZE freezes the superblock, not the bare device.
- Link to v1: https://patch.msgid.link/20260615-work-super-freeze_deny_upstream-v1-0-a6c72b840e7d@kernel.org

---
Christian Brauner (5):
      block: allow making a block device unfreezable
      block: split bdev_yield_claim() out of bdev_fput()
      btrfs: deny freezing a device while it is being removed
      btrfs: deny freezing a device while it is being added
      btrfs: deny freezing devices undergoing a replace

 block/bdev.c              | 113 +++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/dev-replace.c    |  65 +++++++++++++++++++++++---
 fs/btrfs/ioctl.c          |   4 +-
 fs/btrfs/volumes.c        |  84 +++++++++++++++++++++++++++++-----
 fs/btrfs/volumes.h        |   6 ++-
 include/linux/blk_types.h |   2 +-
 include/linux/blkdev.h    |   3 ++
 7 files changed, 229 insertions(+), 48 deletions(-)
---
base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
change-id: 20260615-work-super-freeze_deny_upstream-498ae64761a0

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox