Linux block layer
 help / color / mirror / Atom feed
* [PATCH 2/2] dm-raid1: don't fail the mirror for invalid I/O errors
From: Keith Busch @ 2026-06-16 15:58 UTC (permalink / raw)
  To: Keith Busch
  Cc: dm-devel, linux-block, mpatocka, Dr. David Alan Gilbert,
	Vjaceslavs Klimovs
In-Reply-To: <20260616150554.1686662-1-kbusch@meta.com>

BLK_STS_INVAL indicates the I/O request itself was invalid (for example a
misaligned direct I/O), not that the device has failed. dm-raid1 treated
any read or write completion error as a device failure: it failed the
mirror leg, retried on the alternatives - which fail identically - and
eventually returned EIO while spuriously degrading the array.

Since commit 5ff3f74e145a ("block: simplify direct io validity check") the
direct I/O path no longer rejects misaligned buffers up front, so an
invalid bio now reaches the lower block layers, which fail it with
BLK_STS_INVAL. dm-io collapses the block status into a per-region error
bit before invoking the completion callback, so record BLK_STS_INVAL on
the originating bio and have the dm-raid1 read, write and end_io paths
propagate it instead of failing the device.

This mirrors the raid1/raid10 fix in commit f7b24c7b41f23
("md/raid1,raid10: don't fail devices for invalid IO errors") for the
device-mapper mirror target.

Fixes: 7eac33186957 ("iomap: simplify direct io validity check")
Fixes: 5ff3f74e145a ("block: simplify direct io validity check")
Reported-by: Dr. David Alan Gilbert <linux@treblig.org>
Reported-by: Vjaceslavs Klimovs <vklimovs@gmail.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
Resending patch 2/2 from a different machine. For some reason, only 1/2
is getting through with git-send-email, so manually replying to the
thread with the missing second patch.

 drivers/md/dm-io.c    | 14 +++++++++++++-
 drivers/md/dm-raid1.c | 28 +++++++++++++++++++++++++++-
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 28adfeb58f240..f382e9f9be059 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -37,6 +37,7 @@ struct io {
 	struct dm_io_client *client;
 	io_notify_fn callback;
 	void *context;
+	struct bio *orig_bio;
 	void *vma_invalidate_address;
 	unsigned long vma_invalidate_size;
 } __aligned(DM_IO_MAX_REGIONS);
@@ -132,8 +133,18 @@ static void complete_io(struct io *io)
 
 static void dec_count(struct io *io, unsigned int region, blk_status_t error)
 {
-	if (error)
+	if (error) {
 		set_bit(region, &io->error_bits);
+		/*
+		 * BLK_STS_INVAL means the bio was not valid for the underlying
+		 * device (e.g. a misaligned direct I/O), which is a caller error
+		 * rather than a device failure. Record it on the original bio so
+		 * bio-based targets can propagate it instead of treating it as a
+		 * media error and failing the device.
+		 */
+		if (error == BLK_STS_INVAL && io->orig_bio)
+			io->orig_bio->bi_status = error;
+	}
 
 	if (atomic_dec_and_test(&io->count))
 		complete_io(io);
@@ -398,6 +409,7 @@ static void async_io(struct dm_io_client *client, unsigned int num_regions,
 	io->client = client;
 	io->callback = fn;
 	io->context = context;
+	io->orig_bio = dp->orig_bio;
 
 	io->vma_invalidate_address = dp->vma_invalidate_address;
 	io->vma_invalidate_size = dp->vma_invalidate_size;
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index de5c00704e69c..022ad791c2957 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -524,6 +524,17 @@ static void read_callback(unsigned long error, void *context)
 		return;
 	}
 
+	/*
+	 * BLK_STS_INVAL means the bio was not valid for the underlying device,
+	 * e.g. a misaligned direct I/O. That is a caller error, not a device
+	 * failure, so propagate it rather than failing the mirror and retrying
+	 * on the other legs, which would fail the same way.
+	 */
+	if (bio->bi_status == BLK_STS_INVAL) {
+		bio_endio(bio);
+		return;
+	}
+
 	fail_mirror(m, DM_RAID1_READ_ERROR);
 
 	if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
@@ -622,6 +633,16 @@ static void write_callback(unsigned long error, void *context)
 		return;
 	}
 
+	/*
+	 * BLK_STS_INVAL means the bio was not valid for the underlying device,
+	 * e.g. a misaligned direct I/O. Propagate the error without degrading
+	 * the array.
+	 */
+	if (bio->bi_status == BLK_STS_INVAL) {
+		bio_endio(bio);
+		return;
+	}
+
 	/*
 	 * If the bio is discard, return an error, but do not
 	 * degrade the array.
@@ -1262,7 +1283,12 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 		return DM_ENDIO_DONE;
 	}
 
-	if (*error == BLK_STS_NOTSUPP)
+	/*
+	 * BLK_STS_INVAL means the bio was not valid for the underlying device,
+	 * e.g. a misaligned direct I/O. Propagate it rather than failing the
+	 * mirror and retrying, which would fail the same way on every leg.
+	 */
+	if (*error == BLK_STS_NOTSUPP || *error == BLK_STS_INVAL)
 		goto out;
 
 	if (bio->bi_opf & REQ_RAHEAD)
-- 
2.52.0



^ permalink raw reply related

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Mikulas Patocka @ 2026-06-16 15:55 UTC (permalink / raw)
  To: Vjaceslavs Klimovs
  Cc: Dr. David Alan Gilbert, Thorsten Leemhuis, kbusch, trnka,
	Zdenek Kabelac, linux-block, dm-devel,
	Linux kernel regressions list
In-Reply-To: <CAC_j7i0eDccVWzPeRafM50mZEOFHPz2cwd=RZqqx6TK2EVRFvw@mail.gmail.com>

Hi


On Mon, 15 Jun 2026, Vjaceslavs Klimovs wrote:

> Hi Dave, all,
> 
> I'm one of the original reporters and very much a user, not a block/dm
> developer, so please sanity-check all of this.
> 
> Your trace looks like what the two earlier reports hit: a read reaching
> a leaf device with sectors > 0 but phys_seg 0 (an empty bio). One aside
> that may help read the trace: blk_io_trace.error is a __u16, so the
> bracketed values on your C lines are errnos as u16 (65514 = -EINVAL,
> 65531 = -EIO).
> 
> The WARN itself is new, the bad bio isn't. bio_add_page() only started
> rejecting len == 0 in 643893647cac ("block: reject zero length in
> bio_add_page()", v7.1-rc1); on 7.0.8 the same empty bio tripped
> scsi_alloc_sgtables()'s !nr_segs instead, which matches what you saw.
> That fits your "not a recent regression": the condition is older, v7.1
> just made it loud.
> 
> For Tomas's and my reports (QEMU O_DIRECT to the LV block device) the
> origin looks like 5ff3f74e145a ("block: simplify direct io validity
> check", v6.18): blkdev_dio_invalid() now checks only aggregate
> ki_pos | count alignment and dropped the per-segment
> bdev_iter_is_aligned() walk, so a degenerate or misaligned O_DIRECT no
> longer gets -EINVAL at the fops boundary. But your reproducer reads a
> file, which goes through the filesystem O_DIRECT path and never calls
> blkdev_dio_invalid(), and still makes the empty bio. So it isn't only
> that one entry point.

I thought that reverting 5ff3f74e145a and re-introducing the alignment 
check in block/fops.c:blkdev_dio_invalid would fix it - but it wouldn't.

The same problem existed even before 5ff3f74e145a, with the pvmove 
command.

Suppose that the administrator needs to move a logical volume from one 
disk to another and uses pvmove. Pvmove inserts a new dm-mirror target 
underneath the logical volume and uses it to copy the data. Now, the 
dm-mirror target crashes whenever it receives a bio with unaligned vectors.

So, I think that the proper way to fix this is to teach dm-mirror/dm-io to 
deal with unaligned bio vectors and handle them properly.

Mikulas


^ permalink raw reply

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Dr. David Alan Gilbert @ 2026-06-16 15:55 UTC (permalink / raw)
  To: Keith Busch
  Cc: zkabelac, Vjaceslavs Klimovs, Thorsten Leemhuis, trnka,
	linux-block, dm-devel, Linux kernel regressions list
In-Reply-To: <ajFbglSvcLrFH8Z-@kbusch-mbp>

* Keith Busch (kbusch@kernel.org) wrote:
> On Tue, Jun 16, 2026 at 01:08:52PM +0000, Dr. David Alan Gilbert wrote:
> > ( lvcreate  -m 1 -L 1G main /dev/sda2 /dev/sdb2 ) rather than
> > the old mirror with the same patch, then:
> > 
> >   a) I get no log errors with either read or write
> >   b) read still gives EIO
> 
> I've a follow up patch to handle the error properly. You want to see
> EINVAL, not EIO, and that error shouldn't be considered for determining
> the raid health. Something like what f7b24c7b41f23b5 does, but it's a
> little more complicated in this path since it doesn't see the lower
> level error status and just converts everything to EIO.

OK, thanks for your help, and I'll be happy to test that when it's done.

Dave
-- 
 -----Open up your eyes, open up your mind, open up your code -------   
/ Dr. David Alan Gilbert    |       Running GNU/Linux       | Happy  \ 
\        dave @ treblig.org |                               | In Hex /
 \ _________________________|_____ http://www.treblig.org   |_______/

^ permalink raw reply

* [PATCH] block: fix IORING_URING_CMD_REISSUE flags check in blkdev_uring_cmd
From: Yitang Yang @ 2026-06-16 15:51 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Yitang Yang

blkdev_uring_cmd() checks IORING_URING_CMD_REISSUE to determine whether
this is the first issue. However, this flag lives in cmd->flags instead
of issue_flags.

Coincidentally, IO_URING_F_NONBLOCK shares bit 31 with
IORING_URING_CMD_REISSUE. As a result, the SQE read was never performed,
bic->len remained zero, and every BLOCK_URING_CMD_DISCARD failed with
-EINVAL.

Fix it by checking cmd->flags as intended.

Fixes: 212ec34e4e72 ("block: only read from sqe on initial invocation of blkdev_uring_cmd")
Signed-off-by: Yitang Yang <yi1tang.yang@gmail.com>
---
 block/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index ab2c9ed79946..3d4ea1537457 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -951,7 +951,7 @@ int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
 	u32 cmd_op = cmd->cmd_op;
 
 	/* Read what we need from the SQE on the first issue */
-	if (!(issue_flags & IORING_URING_CMD_REISSUE)) {
+	if (!(cmd->flags & IORING_URING_CMD_REISSUE)) {
 		const struct io_uring_sqe *sqe = cmd->sqe;
 
 		if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH] block: check bio split for unaligned bvec
From: Keith Busch @ 2026-06-16 15:36 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-block, axboe, Carlos Maiolino
In-Reply-To: <ajB36Wt9lU_F-r7h@kbusch-mbp>

On Mon, Jun 15, 2026 at 04:08:41PM -0600, Keith Busch wrote:
>   3: can handle arbitrary memory but advertise default dma_alignment=511
>       (brd, pmem, zram, ps3vram, simdisk - "limits lie")

That's actually not right because they iterate with
bio_for_each_segment, which requires that every bv_len be a multiple of
the sector size, so their default limits are correct. It's just that no
one is enforcing them right now.

^ permalink raw reply

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christian Brauner @ 2026-06-16 15:19 UTC (permalink / raw)
  To: Christoph Hellwig, Jan Kara
  Cc: Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260616-fragil-duktus-nachverfolgen-60f54584c206@brauner>

On Tue, Jun 16, 2026 at 04:59:53PM +0200, Christian Brauner wrote:
> On Tue, Jun 16, 2026 at 02:34:43PM +0200, Christoph Hellwig wrote:
> > On Tue, Jun 02, 2026 at 12:10:08PM +0200, Christian Brauner wrote:
> > > fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> > > forces the holder to be exactly one superblock and prevents several
> > > superblocks from sharing one block device. That's what erofs is doing.
> > > 
> > > Introduce a global dev_t-keyed rhltable mapping each block device to the
> > > superblock(s) using it. The holder argument becomes purely the block
> > > layer's exclusivity token (a superblock, or a file_system_type for
> > > shared devices) and is no longer needed by the fs specific callbacks.
> > 
> > Err, no.  block devices need to have a specific owner.  If erofs wants
> > to share a device between superblock it needs to come up with an entity
> > that owns the block devices which is not a superblock.
> 
> It already did.
> 
> > IMHO sharing devices between superblocks is a bad idea, but that ship
> > has sailed, but please keep it contained inside of erofs.
> 
> We need a simple device number to superblock mapping anyway and that can
> simply be centralized in the vfs. And it can work with anon device
> numbers and block device numbers uniformly.

Plus, after we're done we then also have a central place where we can
intercept what devices can be mounted by a filesystem uniformly.

My first approach for this was of course to just add fs_file_open_by_*()
wrappers and move the relevant security hook into there. But while doing
this - ignoring the ton of bugs I found - I realized that having a
mapping so we can go from device number to superblock is very helpful.

We could of course keep the mapping just local to erofs but I see no
reason why the vfs cannot just provide this ability natively given that
it has all the required machinery. I'll let Jan chime in as well.

^ permalink raw reply

* [PATCH 1/2] dm-io: clone the source bio instead of copying its biovec
From: Keith Busch @ 2026-06-16 15:05 UTC (permalink / raw)
  To: dm-devel
  Cc: linux-block, mpatocka, Keith Busch, Dr. David Alan Gilbert,
	Vjaceslavs Klimovs

From: Keith Busch <kbusch@kernel.org>

For DM_IO_BIO requests, do_region() built each destination bio by walking
the source bio's biovec and re-adding the pages one at a time, tracking
the remaining transfer in sectors. The vector lengths are byte granular
and need not be sector aligned (e.g. a misaligned O_DIRECT buffer split
across pages), so the sector-based accounting could lose a sub-sector
fragment: to_sector() truncated the remainder and the outer loop spun
forever submitting empty bios, hanging the I/O.

There is no need to rebuild the biovec at all. The destination reads into
(or writes from) exactly the same pages as the source bio, so the bio can
simply clone the source's biovec with bio_alloc_clone() and remap it to
the target device. The clone inherits the source's iterator and alignment,
and the block layer splits it to the target's limits on submission, so the
whole region maps to a single cloned bio with no manual page copying or
sector accounting.

This removes the per-page copy path (and its open-coded bvec dpages
helpers) for bio-backed I/O and fixes the hang on misaligned direct I/O to
a dm-mirror device. Page-list, vma and kmem sources keep the existing copy
path.

Fixes: 7eac33186957 ("iomap: simplify direct io validity check")
Fixes: 5ff3f74e145a ("block: simplify direct io validity check")
Reported-by: Dr. David Alan Gilbert <linux@treblig.org>
Reported-by: Vjaceslavs Klimovs <vklimovs@gmail.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/md/dm-io.c | 67 +++++++++++++++++-----------------------------
 1 file changed, 24 insertions(+), 43 deletions(-)

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 1db565b376200..28adfeb58f240 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -170,12 +170,11 @@ struct dpages {
 			 struct page **p, unsigned long *len, unsigned int *offset);
 	void (*next_page)(struct dpages *dp);
 
-	union {
-		unsigned int context_u;
-		struct bvec_iter context_bi;
-	};
+	unsigned int context_u;
 	void *context_ptr;
 
+	struct bio *orig_bio;
+
 	void *vma_invalidate_address;
 	unsigned long vma_invalidate_size;
 };
@@ -210,44 +209,6 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned int o
 	dp->context_ptr = pl;
 }
 
-/*
- * Functions for getting the pages from a bvec.
- */
-static void bio_get_page(struct dpages *dp, struct page **p,
-			 unsigned long *len, unsigned int *offset)
-{
-	struct bio_vec bvec = bvec_iter_bvec((struct bio_vec *)dp->context_ptr,
-					     dp->context_bi);
-
-	*p = bvec.bv_page;
-	*len = bvec.bv_len;
-	*offset = bvec.bv_offset;
-
-	/* avoid figuring it out again in bio_next_page() */
-	dp->context_bi.bi_sector = (sector_t)bvec.bv_len;
-}
-
-static void bio_next_page(struct dpages *dp)
-{
-	unsigned int len = (unsigned int)dp->context_bi.bi_sector;
-
-	bvec_iter_advance((struct bio_vec *)dp->context_ptr,
-			  &dp->context_bi, len);
-}
-
-static void bio_dp_init(struct dpages *dp, struct bio *bio)
-{
-	dp->get_page = bio_get_page;
-	dp->next_page = bio_next_page;
-
-	/*
-	 * We just use bvec iterator to retrieve pages, so it is ok to
-	 * access the bvec table directly here
-	 */
-	dp->context_ptr = bio->bi_io_vec;
-	dp->context_bi = bio->bi_iter;
-}
-
 /*
  * Functions for getting the pages from a VMA.
  */
@@ -332,6 +293,21 @@ static void do_region(const blk_opf_t opf, unsigned int region,
 		return;
 	}
 
+	if (dp->orig_bio) {
+		bio = bio_alloc_clone(where->bdev, dp->orig_bio, GFP_NOIO,
+				      &io->client->bios);
+		bio->bi_iter.bi_sector = where->sector;
+		bio->bi_iter.bi_size = where->count << SECTOR_SHIFT;
+		bio->bi_opf = opf;
+		bio->bi_end_io = endio;
+		bio->bi_ioprio = ioprio;
+		store_io_and_region_in_bio(bio, io, region);
+
+		atomic_inc(&io->count);
+		submit_bio(bio);
+		return;
+	}
+
 	/*
 	 * where->count may be zero if op holds a flush and we need to
 	 * send a zero-sized flush.
@@ -468,6 +444,7 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
 
 	dp->vma_invalidate_address = NULL;
 	dp->vma_invalidate_size = 0;
+	dp->orig_bio = NULL;
 
 	switch (io_req->mem.type) {
 	case DM_IO_PAGE_LIST:
@@ -475,7 +452,11 @@ static int dp_init(struct dm_io_request *io_req, struct dpages *dp,
 		break;
 
 	case DM_IO_BIO:
-		bio_dp_init(dp, io_req->mem.ptr.bio);
+		/*
+		 * The destination bios clone this bio's biovec directly, so
+		 * there are no per-page accessors to set up here.
+		 */
+		dp->orig_bio = io_req->mem.ptr.bio;
 		break;
 
 	case DM_IO_VMA:
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christian Brauner @ 2026-06-16 14:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260616123443.GA21024@lst.de>

On Tue, Jun 16, 2026 at 02:34:43PM +0200, Christoph Hellwig wrote:
> On Tue, Jun 02, 2026 at 12:10:08PM +0200, Christian Brauner wrote:
> > fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> > forces the holder to be exactly one superblock and prevents several
> > superblocks from sharing one block device. That's what erofs is doing.
> > 
> > Introduce a global dev_t-keyed rhltable mapping each block device to the
> > superblock(s) using it. The holder argument becomes purely the block
> > layer's exclusivity token (a superblock, or a file_system_type for
> > shared devices) and is no longer needed by the fs specific callbacks.
> 
> Err, no.  block devices need to have a specific owner.  If erofs wants
> to share a device between superblock it needs to come up with an entity
> that owns the block devices which is not a superblock.

It already did.

> IMHO sharing devices between superblocks is a bad idea, but that ship
> has sailed, but please keep it contained inside of erofs.

We need a simple device number to superblock mapping anyway and that can
simply be centralized in the vfs. And it can work with anon device
numbers and block device numbers uniformly.

^ permalink raw reply

* [PATCH V2] block: Remove redundant plug in __submit_bio()
From: wenxiong @ 2026-06-16 14:31 UTC (permalink / raw)
  To: linux-block, axboe; +Cc: tom.leiming, yukuai, stable, wenxiong, Wen Xiong

From: Wen Xiong <wenxiong@linux.ibm.com>

The patch removes the automatic plug/unplug operations from __submit_bio()
that were added to cache nsecs time when no explicit plug is used.

The plug mechanism is most effective when batching multiple I/O
operations together. Creating a plug for every bio submission
provides minimal benefit while adding function call overhead and
stack usage for every I/O operation.

Below is performance comparison with the latest upstream kernel.

Iotype  qd nj  rmix  mpstat busy  mpstat busy without plug
Randrw  1  20  100       53%                 24%
Randrw  1  40  100       70%                 24%
Randrw  1  20  70        40%                 24%
Randrw  1  40  70        60%                 26%
Randrw  1  20  0         14%                 6%
Randrw  1  40  0         20%                 7%

Fixes: 060406c61c7c ("block: add plug while submitting IO")
Signed-off-by: Wen Xiong <wenxiong@linux.ibm.com>
Reviewed-by: Ming Lei <tom.leiming@gmail.com>
---
 block/blk-core.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 73a41df98c9a..365641266c9e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -669,11 +669,6 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
 
 static void __submit_bio(struct bio *bio)
 {
-	/* If plug is not used, add new plug here to cache nsecs time. */
-	struct blk_plug plug;
-
-	blk_start_plug(&plug);
-
 	if (!bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
 		blk_mq_submit_bio(bio);
 	} else if (likely(bio_queue_enter(bio) == 0)) {
@@ -686,8 +681,6 @@ static void __submit_bio(struct bio *bio)
 			disk->fops->submit_bio(bio);
 		blk_queue_exit(disk->queue);
 	}
-
-	blk_finish_plug(&plug);
 }
 
 /*
-- 
2.52.0


^ permalink raw reply related

* Re: Repeatable, raid1+O_DIRECT, hang/warn
From: Keith Busch @ 2026-06-16 14:19 UTC (permalink / raw)
  To: Dr. David Alan Gilbert
  Cc: zkabelac, Vjaceslavs Klimovs, Thorsten Leemhuis, trnka,
	linux-block, dm-devel, Linux kernel regressions list
In-Reply-To: <ajFK5NXkxd6jU5zu@gallifrey>

On Tue, Jun 16, 2026 at 01:08:52PM +0000, Dr. David Alan Gilbert wrote:
> ( lvcreate  -m 1 -L 1G main /dev/sda2 /dev/sdb2 ) rather than
> the old mirror with the same patch, then:
> 
>   a) I get no log errors with either read or write
>   b) read still gives EIO

I've a follow up patch to handle the error properly. You want to see
EINVAL, not EIO, and that error shouldn't be considered for determining
the raid health. Something like what f7b24c7b41f23b5 does, but it's a
little more complicated in this path since it doesn't see the lower
level error status and just converts everything to EIO.

^ permalink raw reply

* [PATCH 2/2] block: invalidate cached plug timestamp after task switch
From: Usama Arif @ 2026-06-16 14:15 UTC (permalink / raw)
  To: axboe, linux-block, bsegall, dietmar.eggemann, juri.lelli,
	kprateek.nayak, linux-kernel, mgorman, mingo, peterz, rostedt,
	vincent.guittot, vschneid
  Cc: shakeel.butt, hannes, riel, kernel-team, Usama Arif, stable
In-Reply-To: <20260616141604.328820-1-usama.arif@linux.dev>

blk_time_get_ns() caches ktime_get_ns() in current->plug->cur_ktime
and marks the task with PF_BLOCK_TS. That cache is only valid while the
task keeps running; if the task is switched out, wall-clock time
advances and the cached value must not be reused when the task runs again.

The existing invalidation covers explicit plug flushes through
__blk_flush_plug(), and the schedule() / rtmutex paths through
sched_update_worker(). It does not cover in-kernel preemption paths such
as preempt_schedule(), preempt_schedule_notrace(), and
preempt_schedule_irq(), which enter __schedule(SM_PREEMPT) directly and
return without calling sched_update_worker().

As a result, a task preempted while holding a plug with PF_BLOCK_TS set
can reuse a stale plug->cur_ktime after it is scheduled back in. blk-iocost
then consumes that stale timestamp through ioc_now(), producing stale vnow
values for throttle decisions, and through ioc_rqos_done(), inflating
on-queue time and feeding false missed-QoS samples into vrate
adjustment.

Move the schedule-side invalidation to finish_task_switch(), which runs
for the scheduled-in task after every actual context switch regardless
of which schedule entry point was used. Keep __blk_flush_plug() as the
explicit flush/finish-plug invalidation path, and remove only the
PF_BLOCK_TS handling from sched_update_worker().

Fixes: 06b23f92af87 ("block: update cached timestamp post schedule/preemption")
Cc: stable@vger.kernel.org
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
 include/linux/blkdev.h | 16 ++++++----------
 kernel/sched/core.c    | 12 ++++++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 57e84d59a642..c285a4d9837d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1216,16 +1216,12 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async)
 		__blk_flush_plug(plug, async);
 }
 
-/*
- * tsk == current here
- */
-static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+static __always_inline void blk_plug_invalidate_ts(void)
 {
-	struct blk_plug *plug = tsk->plug;
-
-	if (plug)
-		plug->cur_ktime = 0;
-	current->flags &= ~PF_BLOCK_TS;
+	if (unlikely(current->flags & PF_BLOCK_TS)) {
+		current->plug->cur_ktime = 0;
+		current->flags &= ~PF_BLOCK_TS;
+	}
 }
 
 int blkdev_issue_flush(struct block_device *bdev);
@@ -1251,7 +1247,7 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async)
 {
 }
 
-static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+static inline void blk_plug_invalidate_ts(void)
 {
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b791e9e9f67..e97e98c33be5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5368,6 +5368,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 */
 	kmap_local_sched_in();
 
+	/*
+	 * Any cached block-layer timestamp (plug->cur_ktime) is stale now,
+	 * invalidate it.
+	 */
+	blk_plug_invalidate_ts();
+
 	fire_sched_in_preempt_notifiers(current);
 	/*
 	 * When switching through a kernel thread, the loop in
@@ -7290,12 +7296,10 @@ static inline void sched_submit_work(struct task_struct *tsk)
 
 static void sched_update_worker(struct task_struct *tsk)
 {
-	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
-		if (tsk->flags & PF_BLOCK_TS)
-			blk_plug_invalidate_ts(tsk);
+	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
-		else if (tsk->flags & PF_IO_WORKER)
+		else
 			io_wq_worker_running(tsk);
 	}
 }
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH 1/2] kernel/fork: clear PF_BLOCK_TS in copy_process()
From: Usama Arif @ 2026-06-16 14:15 UTC (permalink / raw)
  To: axboe, linux-block, bsegall, dietmar.eggemann, juri.lelli,
	kprateek.nayak, linux-kernel, mgorman, mingo, peterz, rostedt,
	vincent.guittot, vschneid
  Cc: shakeel.butt, hannes, riel, kernel-team, Usama Arif, stable
In-Reply-To: <20260616141604.328820-1-usama.arif@linux.dev>

PF_BLOCK_TS is only set in blk_time_get_ns() when current->plug is
non-NULL, and blk_finish_plug() clears it via __blk_flush_plug()
before NULLing the plug pointer.  copy_process() breaks the
invariant by inheriting PF_BLOCK_TS from the parent while resetting
the child's plug to NULL.

Clear PF_BLOCK_TS alongside that assignment so callers can rely on
"PF_BLOCK_TS set implies current->plug != NULL" and dereference
current->plug unguarded.

Fixes: 06b23f92af87 ("block: update cached timestamp post schedule/preemption")
Cc: stable@vger.kernel.org
Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
 kernel/fork.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/fork.c b/kernel/fork.c
index 892a95214c54..13e38e89a1f3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2338,6 +2338,7 @@ __latent_entropy struct task_struct *copy_process(
 
 #ifdef CONFIG_BLOCK
 	p->plug = NULL;
+	p->flags &= ~PF_BLOCK_TS;
 #endif
 	futex_init_task(p);
 
-- 
2.53.0-Meta


^ permalink raw reply related

* [PATCH 0/2] block: invalidate cached plug timestamp on context switch
From: Usama Arif @ 2026-06-16 14:15 UTC (permalink / raw)
  To: axboe, linux-block, bsegall, dietmar.eggemann, juri.lelli,
	kprateek.nayak, linux-kernel, mgorman, mingo, peterz, rostedt,
	vincent.guittot, vschneid
  Cc: shakeel.butt, hannes, riel, kernel-team, Usama Arif

The details are in patch 2. The main purpose of this series is to
invalidate the cached timestamp on every context switch. Previously this
was done only in sched_update_worker(), which resulted in blk-iocost
reading stale timestamps and throttling based on wrong information.

Patch 1 is a prerequisite to create the invariant that
PF_BLOCK_TS set implies current->plug != NULL.

v2 -> v3:
  https://lore.kernel.org/all/20260612094042.3350401-1-usama.arif@linux.dev/
  - Add patch 1 to clear PF_BLOCK_TS in copy_process() so the
    invariant survives fork.
  - Drop the if (plug) NULL check inside blk_plug_invalidate_ts(),
    relying on the invariant established by patch 1. (Peter Zijlstra)

v1 -> v2:
  https://lore.kernel.org/all/20260611231428.345098-1-usama.arif@linux.dev/
  - Move the PF_BLOCK_TS check into blk_plug_invalidate_ts() and
    upgrade it to __always_inline (Peter Zijlstra).
  - Drop the tsk parameter; the helper only ever operates on current.
 
Usama Arif (2):
  kernel/fork: clear PF_BLOCK_TS in copy_process()
  block: invalidate cached plug timestamp after task switch

 include/linux/blkdev.h | 16 ++++++----------
 kernel/fork.c          |  1 +
 kernel/sched/core.c    | 12 ++++++++----
 3 files changed, 15 insertions(+), 14 deletions(-)

-- 
2.53.0-Meta


^ permalink raw reply

* [PATCH RFC v2 18/18] selftests/filesystems: add ustat() coverage
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

user_get_super() is now backed by the global device-to-superblock table
instead of a walk of the super_blocks list. ustat(2) is its most direct
user-visible consumer but nothing in the tree exercises it.

Add a small regression test: the device number of a mounted tmpfs (an
anonymous device, registered in the table by sget_fc()) must resolve,
it must stop resolving after the unmount (the entry is dropped again in
kill_super_notify()), and bogus device numbers keep reporting EINVAL.

The test passes on kernels before the conversion: it pins down the
semantics the table-backed lookup must preserve.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 tools/testing/selftests/filesystems/.gitignore   |   1 +
 tools/testing/selftests/filesystems/Makefile     |   2 +-
 tools/testing/selftests/filesystems/ustat_test.c | 135 +++++++++++++++++++++++
 3 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index 64ac0dfa46b7..1bd53d54553c 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -5,3 +5,4 @@ fclog
 file_stressor
 anon_inode_test
 kernfs_test
+ustat_test
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 85427d7f19b9..bbdd40b167fa 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog
+TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog ustat_test
 TEST_GEN_PROGS_EXTENDED := dnotify_test
 
 include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/ustat_test.c b/tools/testing/selftests/filesystems/ustat_test.c
new file mode 100644
index 000000000000..d429fd18d779
--- /dev/null
+++ b/tools/testing/selftests/filesystems/ustat_test.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test ustat(2): looking up superblocks by device number.
+ *
+ * ustat() resolves a device number to a mounted superblock via
+ * user_get_super(). Check that the device number of a mounted tmpfs (an
+ * anonymous device) resolves, that it stops resolving once the filesystem
+ * is unmounted and that bogus device numbers report EINVAL.
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+/* struct ustat is not exported through UAPI, mirror include/linux/types.h. */
+struct ustat_buf {
+	int		f_tfree;
+	unsigned long	f_tinode;
+	char		f_fname[6];
+	char		f_fpack[6];
+	/* slack in case an architecture lays the struct out differently */
+	char		pad[64];
+};
+
+#ifdef __NR_ustat
+
+/*
+ * The kernel decodes @dev with new_decode_dev(), which matches the low 32
+ * bits of the st_dev encoding stat(2) returns for any major below 4096.
+ */
+static int sys_ustat(unsigned int dev, struct ustat_buf *buf)
+{
+	return syscall(__NR_ustat, dev, buf);
+}
+
+static int write_string(const char *path, const char *string)
+{
+	ssize_t len = strlen(string);
+	int fd;
+
+	fd = open(path, O_WRONLY);
+	if (fd < 0)
+		return -1;
+	if (write(fd, string, len) != len) {
+		close(fd);
+		return -1;
+	}
+	return close(fd);
+}
+
+/* Enter namespaces in which mounting a tmpfs instance is allowed. */
+static int setup_namespaces(void)
+{
+	uid_t uid = getuid();
+	gid_t gid = getgid();
+	char map[64];
+
+	if (unshare(CLONE_NEWNS | (uid ? CLONE_NEWUSER : 0)))
+		return -1;
+
+	if (uid) {
+		if (write_string("/proc/self/setgroups", "deny"))
+			return -1;
+		snprintf(map, sizeof(map), "0 %d 1", uid);
+		if (write_string("/proc/self/uid_map", map))
+			return -1;
+		snprintf(map, sizeof(map), "0 %d 1", gid);
+		if (write_string("/proc/self/gid_map", map))
+			return -1;
+	}
+
+	return mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
+}
+
+TEST(resolves_mounted_superblock)
+{
+	char dir[] = "/tmp/ustat_test.XXXXXX";
+	struct ustat_buf ub;
+	struct stat st;
+
+	ASSERT_NE(NULL, mkdtemp(dir));
+
+	if (setup_namespaces()) {
+		rmdir(dir);
+		SKIP(return, "cannot set up namespaces: %s", strerror(errno));
+	}
+
+	ASSERT_EQ(0, mount("ustat_test", dir, "tmpfs", 0, NULL));
+	ASSERT_EQ(0, stat(dir, &st));
+
+	memset(&ub, 0xff, sizeof(ub));
+	ASSERT_EQ(0, sys_ustat(st.st_dev, &ub))
+		TH_LOG("ustat(%u): %s", (unsigned int)st.st_dev,
+		       strerror(errno));
+
+	ASSERT_EQ(0, umount(dir));
+
+	/* The unmount removed the superblock, the device is gone. */
+	ASSERT_EQ(-1, sys_ustat(st.st_dev, &ub));
+	ASSERT_EQ(EINVAL, errno);
+
+	rmdir(dir);
+}
+
+TEST(bogus_device_numbers)
+{
+	struct ustat_buf ub;
+
+	ASSERT_EQ(-1, sys_ustat(0, &ub));
+	ASSERT_EQ(EINVAL, errno);
+
+	/* major 4095, minor 1048575: nothing plausible lives there */
+	ASSERT_EQ(-1, sys_ustat((0xfffu << 8) | 0xffu | (0xfff00u << 12), &ub));
+	ASSERT_EQ(EINVAL, errno);
+}
+
+#else /* !__NR_ustat */
+
+TEST(unsupported)
+{
+	SKIP(return, "ustat(2) is not available on this architecture");
+}
+
+#endif /* __NR_ustat */
+
+TEST_HARNESS_MAIN

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 17/18] fs: look up the superblock via the device table in user_get_super()
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

user_get_super() still finds the superblock for a device number by
walking the global super_blocks list under sb_lock. Every superblock is
registered in the device table under its s_dev since sget_fc() inserts
it there, including superblocks on anonymous devices, so use the table
instead.

The refcount-pinning cursor helpers super_dev_{get,first,next}() only
touch table state and do not depend on CONFIG_BLOCK, so drop the
CONFIG_BLOCK guard around them: their new caller serves anonymous
devices as well (ustat() on e.g. tmpfs) and is built without
CONFIG_BLOCK. The guard is dropped in this patch rather than in a
separate one because, without this caller, the helpers would be unused
in builds without CONFIG_BLOCK.

The pinned entry holds a passive reference on the superblock so
super_lock() can be called directly; once the superblock is locked grab
a passive reference for the caller before dropping the pin.

The device table contains more than the old walk could find: a
superblock is also registered for every additional device it claims
(the xfs log and realtime devices, btrfs member devices, the ext4
external journal, erofs blob devices). Don't filter those out:
specifying any device a filesystem uses now resolves to that
filesystem, so ustat() and quotactl() work on e.g. the xfs log device
or a btrfs member device (the latter used to fail outright as btrfs
superblocks carry an anonymous s_dev that never matches a member
device). When several superblocks share a device (erofs blob devices)
the first live superblock wins.

The cursor also keeps scanning past dying superblocks where the old
walk gave up after the first s_dev match, so a mount racing with the
unmount of the same device (or with the reuse of a recycled anonymous
dev_t) finds the live superblock where the old walk could spuriously
return NULL.

This removes the last s_dev-keyed walk of the super_blocks list and
takes ustat() and quotactl()'s block device lookup off sb_lock
entirely.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 2d0a07861bfc..93f24aea75c4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -501,7 +501,6 @@ static int super_dev_register(struct super_block *sb)
 	return err;
 }
 
-#ifdef CONFIG_BLOCK
 static struct super_dev *super_dev_get(struct rhlist_head *pos)
 {
 	struct super_dev *sb_dev;
@@ -535,7 +534,6 @@ static struct super_dev *super_dev_next(struct super_dev *prev)
 	super_dev_put(prev);
 	return sb_dev;
 }
-#endif
 
 static void kill_super_notify(struct super_block *sb)
 {
@@ -1044,29 +1042,19 @@ EXPORT_SYMBOL(iterate_supers_type);
 
 struct super_block *user_get_super(dev_t dev, bool excl)
 {
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		bool locked;
+	struct super_dev *sb_dev;
 
-		if (sb->s_dev != dev)
-			continue;
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		struct super_block *sb = sb_dev->sd_sb;
 
-		if (!refcount_inc_not_zero(&sb->s_passive))
+		if (!super_lock(sb, excl))
 			continue;
 
-		spin_unlock(&sb_lock);
-
-		locked = super_lock(sb, excl);
-		if (locked)
-			return sb;
-
-		put_super(sb);
-		spin_lock(&sb_lock);
-		break;
+		/* The pinned entry holds a passive reference, take our own. */
+		refcount_inc(&sb->s_passive);
+		super_dev_put(sb_dev);
+		return sb;
 	}
-	spin_unlock(&sb_lock);
 	return NULL;
 }
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 16/18] super: make fs_holder_ops private
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Now that filesystems open and claim their block devices through
fs_bdev_file_open_by_{dev,path}(), nothing outside fs/super.c references
fs_holder_ops. Make it static and drop its declaration from blkdev.h.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c             | 3 +--
 include/linux/blkdev.h | 7 -------
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index a83f58755cf8..2d0a07861bfc 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1624,13 +1624,12 @@ static int fs_bdev_thaw(struct block_device *bdev)
 	return error;
 }
 
-const struct blk_holder_ops fs_holder_ops = {
+static const struct blk_holder_ops fs_holder_ops = {
 	.mark_dead		= fs_bdev_mark_dead,
 	.sync			= fs_bdev_sync,
 	.freeze			= fs_bdev_freeze,
 	.thaw			= fs_bdev_thaw,
 };
-EXPORT_SYMBOL_GPL(fs_holder_ops);
 
 static struct super_dev *super_dev_lookup(dev_t dev, struct super_block *sb)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cee548184a7b..45225b4f7193 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1772,13 +1772,6 @@ struct blk_holder_ops {
 		__releases(&bdev->bd_holder_lock);
 };
 
-/*
- * For filesystems using @fs_holder_ops, the @holder argument passed to
- * helpers used to open and claim block devices via
- * bd_prepare_to_claim() must point to a superblock.
- */
-extern const struct blk_holder_ops fs_holder_ops;
-
 /*
  * Return the correct open flags for blkdev_get_by_* for super block flags
  * as stored in sb->s_flags.

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 15/18] f2fs: open via dedicated fs bdev helpers
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route the extra device opens of a multi-device f2fs through
fs_bdev_file_open_by_path() so each device is registered against the
superblock, and convert the matching release in destroy_device_list()
to fs_bdev_file_release(). The first device aliases the main bdev file
opened by setup_bdev_super() and is already registered through it.

f2fs opened its extra devices without holder ops, so a freeze, sync, or
removal of one of them was never propagated to the superblock.
Registering them wires those events up: every device now freezes,
thaws, syncs, and shuts down the filesystem like the main device does.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/f2fs/super.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index ccf806b676f5..49349262564f 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1970,7 +1970,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
 
 	for (i = 0; i < sbi->s_ndevs; i++) {
 		if (i > 0)
-			bdev_fput(FDEV(i).bdev_file);
+			fs_bdev_file_release(FDEV(i).bdev_file, sbi->sb);
 #ifdef CONFIG_BLK_DEV_ZONED
 		kvfree(FDEV(i).blkz_seq);
 #endif
@@ -4840,8 +4840,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 				FDEV(i).end_blk = FDEV(i).start_blk +
 						SEGS_TO_BLKS(sbi,
 						FDEV(i).total_segments) - 1;
-				FDEV(i).bdev_file = bdev_file_open_by_path(
-					FDEV(i).path, mode, sbi->sb, NULL);
+				FDEV(i).bdev_file = fs_bdev_file_open_by_path(
+					FDEV(i).path, mode, sbi->sb, sbi->sb);
 			}
 		}
 		if (IS_ERR(FDEV(i).bdev_file))

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 14/18] erofs: open via dedicated fs bdev helpers
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable),
	Gao Xiang
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route opens through fs_bdev_file_open_by_path() so each external device
is registered against the correct superblock, and convert the matching
releases.

Gao Xiang: I think typical immutable filesystems don't need .shutdown()
and .remove_bdev() for the following reasons:

  - blk_mark_disk_dead() sets GD_DEAD in advance of fs_bdev_mark_dead()
    so that the following bios will fail immediately; block_device
    references are still valid so it seems overkill to handle dead
    blockdevs in the deep filesystem I/O submission path.

  - Immutable filesystems like EROFS don't have write paths and journals,
    so they don't need to block writes (i.e., new dirty pages), metadata
    changes, and abort journals.

  - The comment above loop_change_fd() documents a valid read-only use
    case we need to support anyway, but it calls disk_force_media_change(),
    which will call fs_bdev_mark_dead() later: we don't want loop_change_fd()
    to shut down the active filesystems and return -EIO unconditionally.

Currently I think the default behavior (shrink_dcache_sb + evict_inodes)
in fs_bdev_mark_dead() is enough for immutable filesystems; I have tried
to document that in this commit message for later reference.

Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/erofs/super.c | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 802add6652fd..def9cbfbc9d8 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -153,8 +153,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 	} else if (!sbi->devs->flatdev) {
 		file = erofs_is_fileio_mode(sbi) ?
 				filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
-				bdev_file_open_by_path(dif->path,
-						BLK_OPEN_READ, sb->s_type, NULL);
+				fs_bdev_file_open_by_path(dif->path,
+						BLK_OPEN_READ, sb->s_type, sb);
 		if (IS_ERR(file)) {
 			if (file == ERR_PTR(-ENOTBLK))
 				return -EINVAL;
@@ -843,11 +843,16 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
 
 static int erofs_release_device_info(int id, void *ptr, void *data)
 {
+	struct super_block *sb = data;
 	struct erofs_device_info *dif = ptr;
 
 	fs_put_dax(dif->dax_dev, NULL);
-	if (dif->file)
-		fput(dif->file);
+	if (dif->file) {
+		if (S_ISBLK(file_inode(dif->file)->i_mode))
+			fs_bdev_file_release(dif->file, sb);
+		else
+			fput(dif->file);
+	}
 	erofs_fscache_unregister_cookie(dif->fscache);
 	dif->fscache = NULL;
 	kfree(dif->path);
@@ -855,18 +860,19 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
 	return 0;
 }
 
-static void erofs_free_dev_context(struct erofs_dev_context *devs)
+static void erofs_free_dev_context(struct erofs_dev_context *devs,
+				   struct super_block *sb)
 {
 	if (!devs)
 		return;
-	idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+	idr_for_each(&devs->tree, &erofs_release_device_info, sb);
 	idr_destroy(&devs->tree);
 	kfree(devs);
 }
 
-static void erofs_sb_free(struct erofs_sb_info *sbi)
+static void erofs_sb_free(struct erofs_sb_info *sbi, struct super_block *sb)
 {
-	erofs_free_dev_context(sbi->devs);
+	erofs_free_dev_context(sbi->devs, sb);
 	kfree(sbi->fsid);
 	kfree_sensitive(sbi->domain_id);
 	if (sbi->dif0.file)
@@ -879,8 +885,13 @@ static void erofs_fc_free(struct fs_context *fc)
 {
 	struct erofs_sb_info *sbi = fc->s_fs_info;
 
-	if (sbi) /* free here if an error occurs before transferring to sb */
-		erofs_sb_free(sbi);
+	/*
+	 * Freed here only if an error occurs before the sb is set up; at that
+	 * point no block-backed device has been claimed (that happens in
+	 * fill_super), so the NULL sb never reaches fs_bdev_file_release().
+	 */
+	if (sbi)
+		erofs_sb_free(sbi, NULL);
 }
 
 static const struct fs_context_operations erofs_context_ops = {
@@ -936,7 +947,7 @@ static void erofs_kill_sb(struct super_block *sb)
 	erofs_drop_internal_inodes(sbi);
 	fs_put_dax(sbi->dif0.dax_dev, NULL);
 	erofs_fscache_unregister_fs(sb);
-	erofs_sb_free(sbi);
+	erofs_sb_free(sbi, sb);
 	sb->s_fs_info = NULL;
 }
 
@@ -948,7 +959,7 @@ static void erofs_put_super(struct super_block *sb)
 	erofs_shrinker_unregister(sb);
 	erofs_xattr_prefixes_cleanup(sb);
 	erofs_drop_internal_inodes(sbi);
-	erofs_free_dev_context(sbi->devs);
+	erofs_free_dev_context(sbi->devs, sb);
 	sbi->devs = NULL;
 	erofs_fscache_unregister_fs(sb);
 }

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 13/18] fs: tolerate per-superblock freeze errors on shared devices
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

When several superblocks share a device, keep the device frozen even if
some of them failed to freeze, and swallow the error: rolling the others
back via thaw_super() can fail too, so neither is a clear win. A single
filesystem still reports its error, and a sync_blockdev() failure is
always reported. Thaw follows the same rule.

A device can only be shared once superblocks claim it with a common
exclusivity token, which erofs starts doing in the next patch; for
everyone else the loop visits exactly one superblock and the behavior
is unchanged.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/super.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 236e868209a4..a83f58755cf8 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1548,13 +1548,15 @@ static void fs_bdev_sync(struct block_device *bdev)
  * devices is frozen once per device and stays frozen until all are thawed; the
  * block layer nests these freezes so the count stays balanced.
  *
- * Return: 0, or the first error from freezing a superblock or syncing the
- *         block device.
+ * Return: 0, or the error from the one superblock on a single-fs device.  When
+ *         several superblocks share @bdev a per-superblock failure is swallowed
+ *         (see below), but a sync_blockdev() failure is always reported.
  */
 static int fs_bdev_freeze(struct block_device *bdev)
 {
 	dev_t dev = bdev->bd_dev;
 	struct super_dev *sb_dev;
+	unsigned int count = 0;
 	int error = 0, err;
 
 	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
@@ -1568,8 +1570,17 @@ static int fs_bdev_freeze(struct block_device *bdev)
 		if (err && !error)
 			error = err;
 		deactivate_super(sb_dev->sd_sb);
+		count++;
 	}
 
+	/*
+	 * When several superblocks share the device, keep it frozen even if some
+	 * of them failed to freeze and swallow the error: rolling the rest back
+	 * via thaw_super() can fail too, so neither is a clear win. A single
+	 * filesystem (count == 1) still reports its error.
+	 */
+	if (error && count > 1)
+		error = 0;
 	if (!error)
 		error = sync_blockdev(bdev);
 	return error;
@@ -1583,12 +1594,14 @@ static int fs_bdev_freeze(struct block_device *bdev)
  * A zero return does not imply a superblock is fully unfrozen; it may have been
  * frozen more than once (by the kernel or via another device).
  *
- * Return: 0, or the first error from thawing a superblock.
+ * Return: 0, or the first error on a single-fs device; a shared device swallows
+ *         per-superblock errors, as fs_bdev_freeze() does.
  */
 static int fs_bdev_thaw(struct block_device *bdev)
 {
 	dev_t dev = bdev->bd_dev;
 	struct super_dev *sb_dev;
+	unsigned int count = 0;
 	int error = 0, err;
 
 	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
@@ -1602,8 +1615,12 @@ static int fs_bdev_thaw(struct block_device *bdev)
 		if (err && !error)
 			error = err;
 		deactivate_super(sb_dev->sd_sb);
+		count++;
 	}
 
+	/* Shared device: swallow per-superblock errors, like fs_bdev_freeze(). */
+	if (error && count > 1)
+		error = 0;
 	return error;
 }
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 12/18] fs: look up superblocks via the device table in fs_holder_ops
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Switch the fs_holder_ops callbacks from recovering the single owning
superblock out of bdev->bd_holder to walking the device-to-superblock
table and acting on every superblock registered for the device. The
holder argument becomes purely the block layer's exclusivity token and
is no longer needed by the fs specific callbacks.

All devices opened with fs_holder_ops are registered by now: the main
device since setup_bdev_super() switched to fs_bdev_file_open_by_dev()
and the extra devices (xfs log and realtime devices, btrfs member
devices, the ext4 external journal) since the preceding per-filesystem
conversions. So no event is lost in the switchover.

The walk uses a refcount-pinning cursor: each step takes a reference on
the entry via sd_ref and resumes from its sd_node. Unlinking an entry
is deferred to the last unpin, so a cursor never resumes from a removed
node.

mark_dead and sync only need the passive reference the entry holds plus
s_umount, which they take with super_lock_shared(). freeze and thaw
additionally need an active reference and acquire it with
get_active_super(), which waits for the superblock to be born before
taking s_active. Taking s_active before the superblock is born would
pin a still-mounting superblock so a racing mount that aborts could
never drop s_active to zero and reach SB_DYING, deadlocking the wait
for SB_BORN. This is how filesystems_freeze() and filesystems_thaw()
acquire it too.

One semantic change: when no live superblock uses the device anymore
(the holder is dying or was never registered), fs_bdev_freeze() and
fs_bdev_thaw() now return 0 - freeze after syncing the block device -
where they used to return -EINVAL.

The freeze-deny release path moves to the table in the same switchover.
A device made unfreezable for a btrfs membership change must drop its
table entry before re-allowing freezing; otherwise a freeze racing the
release reaches the superblock through the still-registered entry and is
stranded once the release unlinks it. Split fs_bdev_unregister() out of
fs_bdev_file_release() - the inverse of fs_bdev_register() - so
btrfs_release_device_allow_freeze() can drop the {dev, sb} entry, re-allow
freezing on the still-open device, then close it. Re-allowing only after
the entry is gone keeps a racing freeze from reaching the superblock, and
doing it while the file is still open avoids touching the block device
after the close. btrfs previously yielded bd_holder before re-allowing,
which this commit makes irrelevant to freeze resolution.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/volumes.c       |   6 +-
 fs/super.c               | 269 +++++++++++++++++++++++------------------------
 include/linux/fs/super.h |   1 +
 3 files changed, 138 insertions(+), 138 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 02abbfce5ea3..d827d83722c1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1137,10 +1137,10 @@ void btrfs_release_device_allow_freeze(struct file *bdev_file)
 {
 	struct super_block *sb = bdev_file->private_data;
 
-	/* Yield before allow (strand-safe); file still open for the allow (UAF-safe). */
-	bdev_yield_claim(bdev_file);
+	/* Unregister before re-allowing (strand-safe); file still open (UAF-safe). */
+	fs_bdev_unregister(bdev_file, sb);
 	bdev_allow_freeze(file_bdev(bdev_file));
-	fs_bdev_file_release(bdev_file, sb);
+	bdev_fput(bdev_file);
 }
 
 static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze)
diff --git a/fs/super.c b/fs/super.c
index 3d166c7f578a..236e868209a4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -501,6 +501,42 @@ static int super_dev_register(struct super_block *sb)
 	return err;
 }
 
+#ifdef CONFIG_BLOCK
+static struct super_dev *super_dev_get(struct rhlist_head *pos)
+{
+	struct super_dev *sb_dev;
+
+	for (; pos; pos = rcu_dereference_all(pos->next)) {
+		sb_dev = container_of(pos, struct super_dev, sd_node);
+		if (refcount_inc_not_zero(&sb_dev->sd_ref))
+			return sb_dev;
+	}
+	return NULL;
+}
+
+static struct super_dev *super_dev_first(dev_t dev)
+{
+	struct super_dev *sb_dev;
+
+	rcu_read_lock();
+	sb_dev = super_dev_get(rhltable_lookup(&super_dev_table, &dev, super_dev_params));
+	rcu_read_unlock();
+	return sb_dev;
+}
+
+static struct super_dev *super_dev_next(struct super_dev *prev)
+{
+	struct super_dev *sb_dev;
+
+	rcu_read_lock();
+	sb_dev = super_dev_get(rcu_dereference_all(prev->sd_node.next));
+	rcu_read_unlock();
+
+	super_dev_put(prev);
+	return sb_dev;
+}
+#endif
+
 static void kill_super_notify(struct super_block *sb)
 {
 	lockdep_assert_not_held(&sb->s_umount);
@@ -1443,185 +1479,131 @@ struct super_block *sget_dev(struct fs_context *fc, dev_t dev)
 EXPORT_SYMBOL(sget_dev);
 
 #ifdef CONFIG_BLOCK
-/*
- * Lock the superblock that is holder of the bdev. Returns the superblock
- * pointer if we successfully locked the superblock and it is alive. Otherwise
- * we return NULL and just unlock bdev->bd_holder_lock.
- *
- * The function must be called with bdev->bd_holder_lock and releases it.
- */
-static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
-	__releases(&bdev->bd_holder_lock)
+static int fs_super_freeze(struct super_block *sb)
 {
-	struct super_block *sb = bdev->bd_holder;
-	bool locked;
-
-	lockdep_assert_held(&bdev->bd_holder_lock);
-	lockdep_assert_not_held(&sb->s_umount);
-	lockdep_assert_not_held(&bdev->bd_disk->open_mutex);
-
-	/* Make sure sb doesn't go away from under us */
-	refcount_inc(&sb->s_passive);
-
-	mutex_unlock(&bdev->bd_holder_lock);
-
-	locked = super_lock(sb, excl);
-
-	/*
-	 * If the superblock wasn't already SB_DYING then we hold
-	 * s_umount and can safely drop our temporary reference.
-         */
-	put_super(sb);
-
-	if (!locked)
-		return NULL;
-
-	if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
-		super_unlock(sb, excl);
-		return NULL;
-	}
+	if (sb->s_op->freeze_super)
+		return sb->s_op->freeze_super(sb,
+				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
+	return freeze_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
+}
 
-	return sb;
+static int fs_super_thaw(struct super_block *sb)
+{
+	if (sb->s_op->thaw_super)
+		return sb->s_op->thaw_super(sb,
+				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
+	return thaw_super(sb, FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
 }
 
 static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
 {
-	struct super_block *sb;
+	struct super_dev *sb_dev;
+	dev_t dev = bdev->bd_dev;
 
-	sb = bdev_super_lock(bdev, false);
-	if (!sb)
-		return;
+	mutex_unlock(&bdev->bd_holder_lock);
 
-	if (sb->s_op->remove_bdev) {
-		int ret;
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		struct super_block *sb = sb_dev->sd_sb;
 
-		ret = sb->s_op->remove_bdev(sb, bdev);
-		if (!ret) {
-			super_unlock_shared(sb);
-			return;
+		if (!super_lock_shared(sb))
+			continue;
+		if (sb->s_root && (sb->s_flags & SB_ACTIVE)) {
+			if (!sb->s_op->remove_bdev ||
+			    sb->s_op->remove_bdev(sb, bdev)) {
+				if (!surprise)
+					sync_filesystem(sb);
+				shrink_dcache_sb(sb);
+				evict_inodes(sb);
+				if (sb->s_op->shutdown)
+					sb->s_op->shutdown(sb);
+			}
 		}
-		/* Fallback to shutdown. */
+		super_unlock_shared(sb);
 	}
-
-	if (!surprise)
-		sync_filesystem(sb);
-	shrink_dcache_sb(sb);
-	evict_inodes(sb);
-	if (sb->s_op->shutdown)
-		sb->s_op->shutdown(sb);
-
-	super_unlock_shared(sb);
 }
 
 static void fs_bdev_sync(struct block_device *bdev)
 {
-	struct super_block *sb;
-
-	sb = bdev_super_lock(bdev, false);
-	if (!sb)
-		return;
+	struct super_dev *sb_dev;
+	dev_t dev = bdev->bd_dev;
 
-	sync_filesystem(sb);
-	super_unlock_shared(sb);
-}
+	mutex_unlock(&bdev->bd_holder_lock);
 
-static struct super_block *get_bdev_super(struct block_device *bdev)
-{
-	bool active = false;
-	struct super_block *sb;
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		struct super_block *sb = sb_dev->sd_sb;
 
-	sb = bdev_super_lock(bdev, true);
-	if (sb) {
-		active = atomic_inc_not_zero(&sb->s_active);
-		super_unlock_excl(sb);
+		if (!super_lock_shared(sb))
+			continue;
+		if (sb->s_root && (sb->s_flags & SB_ACTIVE))
+			sync_filesystem(sb);
+		super_unlock_shared(sb);
 	}
-	if (!active)
-		return NULL;
-	return sb;
 }
 
 /**
- * fs_bdev_freeze - freeze owning filesystem of block device
+ * fs_bdev_freeze - freeze every superblock using a block device
  * @bdev: block device
  *
- * Freeze the filesystem that owns this block device if it is still
- * active.
- *
- * A filesystem that owns multiple block devices may be frozen from each
- * block device and won't be unfrozen until all block devices are
- * unfrozen. Each block device can only freeze the filesystem once as we
- * nest freezes for block devices in the block layer.
+ * Freeze each live superblock using @bdev.  A superblock owning several block
+ * devices is frozen once per device and stays frozen until all are thawed; the
+ * block layer nests these freezes so the count stays balanced.
  *
- * Return: If the freeze was successful zero is returned. If the freeze
- *         failed a negative error code is returned.
+ * Return: 0, or the first error from freezing a superblock or syncing the
+ *         block device.
  */
 static int fs_bdev_freeze(struct block_device *bdev)
 {
-	struct super_block *sb;
-	int error = 0;
+	dev_t dev = bdev->bd_dev;
+	struct super_dev *sb_dev;
+	int error = 0, err;
 
 	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
 
-	sb = get_bdev_super(bdev);
-	if (!sb)
-		return -EINVAL;
+	mutex_unlock(&bdev->bd_holder_lock);
+
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		if (!get_active_super(sb_dev->sd_sb))
+			continue;
+		err = fs_super_freeze(sb_dev->sd_sb);
+		if (err && !error)
+			error = err;
+		deactivate_super(sb_dev->sd_sb);
+	}
 
-	if (sb->s_op->freeze_super)
-		error = sb->s_op->freeze_super(sb,
-				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
-	else
-		error = freeze_super(sb,
-				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
 	if (!error)
 		error = sync_blockdev(bdev);
-	deactivate_super(sb);
 	return error;
 }
 
 /**
- * fs_bdev_thaw - thaw owning filesystem of block device
+ * fs_bdev_thaw - thaw every superblock using a block device
  * @bdev: block device
  *
- * Thaw the filesystem that owns this block device.
+ * The counterpart to fs_bdev_freeze(): thaw each live superblock using @bdev.
+ * A zero return does not imply a superblock is fully unfrozen; it may have been
+ * frozen more than once (by the kernel or via another device).
  *
- * A filesystem that owns multiple block devices may be frozen from each
- * block device and won't be unfrozen until all block devices are
- * unfrozen. Each block device can only freeze the filesystem once as we
- * nest freezes for block devices in the block layer.
- *
- * Return: If the thaw was successful zero is returned. If the thaw
- *         failed a negative error code is returned. If this function
- *         returns zero it doesn't mean that the filesystem is unfrozen
- *         as it may have been frozen multiple times (kernel may hold a
- *         freeze or might be frozen from other block devices).
+ * Return: 0, or the first error from thawing a superblock.
  */
 static int fs_bdev_thaw(struct block_device *bdev)
 {
-	struct super_block *sb;
-	int error;
+	dev_t dev = bdev->bd_dev;
+	struct super_dev *sb_dev;
+	int error = 0, err;
 
 	lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
 
-	/*
-	 * The block device may have been frozen before it was claimed by a
-	 * filesystem. Concurrently another process might try to mount that
-	 * frozen block device and has temporarily claimed the block device for
-	 * that purpose causing a concurrent fs_bdev_thaw() to end up here. The
-	 * mounter is already about to abort mounting because they still saw an
-	 * elevanted bdev->bd_fsfreeze_count so get_bdev_super() will return
-	 * NULL in that case.
-	 */
-	sb = get_bdev_super(bdev);
-	if (!sb)
-		return -EINVAL;
+	mutex_unlock(&bdev->bd_holder_lock);
+
+	for (sb_dev = super_dev_first(dev); sb_dev; sb_dev = super_dev_next(sb_dev)) {
+		if (!get_active_super(sb_dev->sd_sb))
+			continue;
+		err = fs_super_thaw(sb_dev->sd_sb);
+		if (err && !error)
+			error = err;
+		deactivate_super(sb_dev->sd_sb);
+	}
 
-	if (sb->s_op->thaw_super)
-		error = sb->s_op->thaw_super(sb,
-				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
-	else
-		error = thaw_super(sb,
-				FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL);
-	deactivate_super(sb);
 	return error;
 }
 
@@ -1752,14 +1734,18 @@ struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
 EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_path);
 
 /**
- * fs_bdev_file_release - release a block device claimed for a superblock
+ * fs_bdev_unregister - drop a superblock's claim on a block device
  * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
  * @sb: superblock the device was claimed for
  *
- * Drop one claim on the {dev, @sb} entry; the last claim unregisters it (a
- * pinning cursor defers the actual unlink).  Then close the block device.
+ * The inverse of fs_bdev_register(): drop one claim on the {dev, @sb} entry
+ * (the last claim unregisters it; a pinning cursor defers the actual unlink)
+ * without closing the device.  A caller that must act on the still-open device
+ * between unregistering and closing - e.g. re-allow freezing one denied for a
+ * membership change - pairs this with bdev_fput().  fs_bdev_file_release() is
+ * the common unregister-and-close.
  */
-void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
+void fs_bdev_unregister(struct file *bdev_file, struct super_block *sb)
 {
 	dev_t dev = file_bdev(bdev_file)->bd_dev;
 	struct super_dev *sb_dev;
@@ -1768,6 +1754,19 @@ void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
 	sb_dev = super_dev_lookup(dev, sb);
 	rcu_read_unlock();
 	super_dev_put(sb_dev);
+}
+EXPORT_SYMBOL_GPL(fs_bdev_unregister);
+
+/**
+ * fs_bdev_file_release - release a block device claimed for a superblock
+ * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
+ * @sb: superblock the device was claimed for
+ *
+ * Unregister the {dev, @sb} entry, then close the block device.
+ */
+void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
+{
+	fs_bdev_unregister(bdev_file, sb);
 	bdev_fput(bdev_file);
 }
 EXPORT_SYMBOL_GPL(fs_bdev_file_release);
diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
index 721d842e3b24..8c3987040ed1 100644
--- a/include/linux/fs/super.h
+++ b/include/linux/fs/super.h
@@ -240,6 +240,7 @@ struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
 				      struct super_block *sb);
 struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
 				       void *holder, struct super_block *sb);
+void fs_bdev_unregister(struct file *bdev_file, struct super_block *sb);
 void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb);
 
 #endif /* _LINUX_FS_SUPER_H */

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 11/18] ext4: open via dedicated fs bdev helpers
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route the external journal device open through fs_bdev_file_open_by_dev()
so it is registered against the superblock, and convert the matching
releases to fs_bdev_file_release().

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/super.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7283108d7609..2b5301a3bcfb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5793,7 +5793,7 @@ failed_mount8: __maybe_unused
 	brelse(sbi->s_sbh);
 	if (sbi->s_journal_bdev_file) {
 		invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
-		bdev_fput(sbi->s_journal_bdev_file);
+		fs_bdev_file_release(sbi->s_journal_bdev_file, sb);
 	}
 out_fail:
 	invalidate_bdev(sb->s_bdev);
@@ -5972,9 +5972,9 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
 	struct ext4_super_block *es;
 	int errno;
 
-	bdev_file = bdev_file_open_by_dev(j_dev,
+	bdev_file = fs_bdev_file_open_by_dev(j_dev,
 		BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
-		sb, &fs_holder_ops);
+		sb, sb);
 	if (IS_ERR(bdev_file)) {
 		ext4_msg(sb, KERN_ERR,
 			 "failed to open journal device unknown-block(%u,%u) %ld",
@@ -6034,7 +6034,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
 out_bh:
 	brelse(bh);
 out_bdev:
-	bdev_fput(bdev_file);
+	fs_bdev_file_release(bdev_file, sb);
 	return ERR_PTR(errno);
 }
 
@@ -6073,7 +6073,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
 out_journal:
 	ext4_journal_destroy(EXT4_SB(sb), journal);
 out_bdev:
-	bdev_fput(bdev_file);
+	fs_bdev_file_release(bdev_file, sb);
 	return ERR_PTR(errno);
 }
 
@@ -7490,7 +7490,7 @@ static void ext4_kill_sb(struct super_block *sb)
 	kill_block_super(sb);
 
 	if (bdev_file)
-		bdev_fput(bdev_file);
+		fs_bdev_file_release(bdev_file, sb);
 }
 
 static struct file_system_type ext4_fs_type = {

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 10/18] btrfs: open via dedicated fs bdev helpers
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable),
	syzbot
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route the device opens through fs_bdev_file_open_by_path() so each device
is registered against the superblock, and convert the matching releases
to fs_bdev_file_release().

The temporary identification opens, which only read the superblock and
close again, pass a NULL holder and keep using bdev_fput().

On the close path the superblock is taken from bdev_file->private_data
(the holder set at open) rather than device->fs_info->sb: a mount that
fails before btrfs_init_devices_late() runs leaves device->fs_info NULL,
which close_fs_devices() would otherwise dereference.

Tested-by: syzbot@syzkaller.appspotmail.com
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/volumes.c | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2d9e2ca09c5f..02abbfce5ea3 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -480,7 +480,12 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
 	struct block_device *bdev;
 	int ret;
 
-	*bdev_file = bdev_file_open_by_path(device_path, flags, holder, &fs_holder_ops);
+	if (holder)
+		*bdev_file = fs_bdev_file_open_by_path(device_path, flags,
+						       holder, holder);
+	else
+		*bdev_file = bdev_file_open_by_path(device_path, flags, NULL,
+						    NULL);
 
 	if (IS_ERR(*bdev_file)) {
 		ret = PTR_ERR(*bdev_file);
@@ -495,7 +500,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
 	if (holder) {
 		ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
 		if (ret) {
-			bdev_fput(*bdev_file);
+			fs_bdev_file_release(*bdev_file, holder);
 			goto error;
 		}
 	}
@@ -503,7 +508,10 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
 	*disk_super = btrfs_read_disk_super(bdev, 0, false);
 	if (IS_ERR(*disk_super)) {
 		ret = PTR_ERR(*disk_super);
-		bdev_fput(*bdev_file);
+		if (holder)
+			fs_bdev_file_release(*bdev_file, holder);
+		else
+			bdev_fput(*bdev_file);
 		goto error;
 	}
 
@@ -727,7 +735,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 
 error_free_page:
 	btrfs_release_disk_super(disk_super);
-	bdev_fput(bdev_file);
+	fs_bdev_file_release(bdev_file, holder);
 
 	return -EINVAL;
 }
@@ -1087,7 +1095,7 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 			continue;
 
 		if (device->bdev_file) {
-			bdev_fput(device->bdev_file);
+			fs_bdev_file_release(device->bdev_file, device->bdev_file->private_data);
 			device->bdev = NULL;
 			device->bdev_file = NULL;
 			fs_devices->open_devices--;
@@ -1127,10 +1135,12 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
 /* Release a device that was made unfreezable for a membership change. */
 void btrfs_release_device_allow_freeze(struct file *bdev_file)
 {
+	struct super_block *sb = bdev_file->private_data;
+
 	/* Yield before allow (strand-safe); file still open for the allow (UAF-safe). */
 	bdev_yield_claim(bdev_file);
 	bdev_allow_freeze(file_bdev(bdev_file));
-	bdev_fput(bdev_file);
+	fs_bdev_file_release(bdev_file, sb);
 }
 
 static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze)
@@ -1147,7 +1157,8 @@ static void btrfs_close_bdev(struct btrfs_device *device, bool allow_freeze)
 	if (allow_freeze)
 		btrfs_release_device_allow_freeze(device->bdev_file);
 	else
-		bdev_fput(device->bdev_file);
+		fs_bdev_file_release(device->bdev_file,
+				     device->bdev_file->private_data);
 }
 
 static void btrfs_close_one_device(struct btrfs_device *device)
@@ -2894,8 +2905,8 @@ struct file *btrfs_open_device_deny_freeze(const char *path,
 		return ERR_PTR(ret);
 	}
 
-	bdev_file = bdev_file_open_by_dev(file_bdev(probe_file)->bd_dev,
-					  BLK_OPEN_WRITE, sb, &fs_holder_ops);
+	bdev_file = fs_bdev_file_open_by_dev(file_bdev(probe_file)->bd_dev,
+					     BLK_OPEN_WRITE, sb, sb);
 	if (IS_ERR(bdev_file))
 		bdev_allow_freeze(file_bdev(probe_file));
 	bdev_fput(probe_file);

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 09/18] xfs: port to fs_bdev_file_open_by_path()
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Route the log and rt device opens through fs_bdev_file_open_by_path() so
each external device is registered against mp->m_super, and convert the
matching releases to fs_bdev_file_release(). The data device is still
opened and released by setup_bdev_super()/kill_block_super(); when the
log lives on the data device the open resolves to the existing (dev, sb)
entry so the superblock is acted on once.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/xfs/xfs_buf.c   |  2 +-
 fs/xfs/xfs_super.c | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 3ce12fe1c307..2eddd60aaa67 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1615,7 +1615,7 @@ xfs_free_buftarg(
 	fs_put_dax(btp->bt_daxdev, btp->bt_mount);
 	/* the main block device is closed by kill_block_super */
 	if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
-		bdev_fput(btp->bt_file);
+		fs_bdev_file_release(btp->bt_file, btp->bt_mount->m_super);
 	kfree(btp);
 }
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8531d526fc44..d1c622f0a957 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -400,8 +400,8 @@ xfs_blkdev_get(
 	blk_mode_t		mode;
 
 	mode = sb_open_mode(mp->m_super->s_flags);
-	*bdev_filep = bdev_file_open_by_path(name, mode,
-			mp->m_super, &fs_holder_ops);
+	*bdev_filep = fs_bdev_file_open_by_path(name, mode,
+			mp->m_super, mp->m_super);
 	if (IS_ERR(*bdev_filep)) {
 		error = PTR_ERR(*bdev_filep);
 		*bdev_filep = NULL;
@@ -526,7 +526,7 @@ xfs_open_devices(
 		mp->m_logdev_targp = mp->m_ddev_targp;
 		/* Handle won't be used, drop it */
 		if (logdev_file)
-			bdev_fput(logdev_file);
+			fs_bdev_file_release(logdev_file, mp->m_super);
 	}
 
 	return 0;
@@ -541,10 +541,10 @@ xfs_open_devices(
 	mp->m_ddev_targp = NULL;
  out_close_rtdev:
 	 if (rtdev_file)
-		bdev_fput(rtdev_file);
+		fs_bdev_file_release(rtdev_file, mp->m_super);
  out_close_logdev:
 	if (logdev_file)
-		bdev_fput(logdev_file);
+		fs_bdev_file_release(logdev_file, mp->m_super);
 	return error;
 }
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 08/18] fs: add dedicated block device open helpers for filesystems
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Add fs_bdev_file_open_by_{dev,path}() and fs_bdev_file_release(). They
open the device with fs_holder_ops and register a claim in the
device-to-superblock table. Claims on the same (device, superblock)
pair share one entry, so when a filesystem claims a device it already
uses (xfs with its log on the data device), no second entry is added
and each superblock will be acted on once.

The holder argument remains purely the block layer's exclusivity token:
a superblock, or a file_system_type for a device shared by several
superblocks of that type. The shared case only becomes usable once the
fs_holder_ops callbacks resolve superblocks through the table instead
of bdev->bd_holder.

Convert the main device's open and release paths, setup_bdev_super()
and kill_block_super(): the open finds the entry registered by
sget_fc() and claims it again. cramfs and romfs bypass
kill_block_super() so they can handle
MTD mounts and release the main device with a plain bdev_fput(), which
would leave the claim behind: the (dev, sb) entry would never be
unregistered and the passive reference it holds would keep the
superblock alive forever. Convert their release paths in the same
step.

The frozen-device check stays in setup_bdev_super() for the primary
device and is added to fs_bdev_register() for new claims, i.e. every
additional device a filesystem opens through the helpers. Only a
(device, superblock) pair the superblock claimed earlier may be
reopened while frozen (xfs with its log on the data device): the freeze
already covers that superblock through the existing claim, so nothing
escapes it. Without the setup_bdev_super() check a device frozen before
the mount even started (dm lock_fs, loop) could be mounted and written
to (journal replay) under an active freeze, because the primary open
reuses the entry registered by sget_fc() and never takes the new-claim
path.

Both checks read bd_fsfreeze_count only after the entry is published
(by sget_fc() for the primary, by fs_bdev_register() for new claims)
and pair with bdev_freeze() incrementing the count before walking the
table: either the mount sees the elevated freeze count and fails with
EBUSY, or the freeze finds the published entry and converges once
SB_BORN is set.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/cramfs/inode.c        |   2 +-
 fs/romfs/super.c         |   2 +-
 fs/super.c               | 154 ++++++++++++++++++++++++++++++++++++++++++++---
 include/linux/fs/super.h |   7 +++
 4 files changed, 155 insertions(+), 10 deletions(-)

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 4edbfccd0bbe..d4cd03f4f60d 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -504,7 +504,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 		sb->s_mtd = NULL;
 	} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		bdev_fput(sb->s_bdev_file);
+		fs_bdev_file_release(sb->s_bdev_file, sb);
 	}
 	kfree(sbi);
 }
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index ac55193bf398..43eb897197c0 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -587,7 +587,7 @@ static void romfs_kill_sb(struct super_block *sb)
 #ifdef CONFIG_ROMFS_ON_BLOCK
 	if (sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		bdev_fput(sb->s_bdev_file);
+		fs_bdev_file_release(sb->s_bdev_file, sb);
 	}
 #endif
 }
diff --git a/fs/super.c b/fs/super.c
index ff5e305d0ab4..3d166c7f578a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1633,6 +1633,145 @@ const struct blk_holder_ops fs_holder_ops = {
 };
 EXPORT_SYMBOL_GPL(fs_holder_ops);
 
+static struct super_dev *super_dev_lookup(dev_t dev, struct super_block *sb)
+{
+	struct super_dev *it;
+	struct rhlist_head *list, *pos;
+
+	RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious super_dev_lookup() usage");
+	VFS_WARN_ON_ONCE(!dev);
+	VFS_WARN_ON_ONCE(!sb);
+
+	list = rhltable_lookup(&super_dev_table, &dev, super_dev_params);
+	rhl_for_each_entry_rcu(it, pos, list, sd_node) {
+		if (it->sd_sb == sb)
+			return it;
+	}
+
+	return NULL;
+}
+
+static int fs_bdev_register(struct file *bdev_file, struct super_block *sb)
+{
+	struct super_dev *sb_dev __free(kfree) = NULL;
+	dev_t dev = file_bdev(bdev_file)->bd_dev;
+	int err;
+
+	scoped_guard(rcu) {
+		sb_dev = super_dev_lookup(dev, sb);
+		if (sb_dev && refcount_inc_not_zero(&sb_dev->sd_ref)) {
+			retain_and_null_ptr(sb_dev);
+			return 0;
+		}
+	}
+
+	sb_dev = super_dev_alloc(dev, sb);
+	if (!sb_dev)
+		return -ENOMEM;
+
+	err = super_dev_insert(sb_dev);
+	if (err)
+		return err;
+
+	/* Publish the entry before reading the count; pairs with bdev_freeze(). */
+	smp_mb();
+	if (atomic_read(&file_bdev(bdev_file)->bd_fsfreeze_count) > 0) {
+		err = -EBUSY;
+		super_dev_put(sb_dev);
+	}
+
+	retain_and_null_ptr(sb_dev);
+	return err;
+}
+
+/**
+ * fs_bdev_file_open_by_dev - claim a block device on behalf of a superblock
+ * @dev: block device number
+ * @mode: open mode
+ * @holder: block-layer exclusivity token (a superblock, or the file_system_type
+ *          when the device may be shared by several superblocks of that type)
+ * @sb: superblock to drive fs_holder_ops events for
+ *
+ * Open @dev with &fs_holder_ops and register that @sb uses it, so device
+ * removal/sync/freeze/thaw are propagated to @sb (and any other superblock
+ * sharing @dev).  Must be paired with fs_bdev_file_release().
+ *
+ * Return: an opened block-device file or an ERR_PTR().
+ */
+struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+				      struct super_block *sb)
+{
+	struct file *bdev_file;
+	int err;
+
+	bdev_file = bdev_file_open_by_dev(dev, mode, holder, &fs_holder_ops);
+	if (IS_ERR(bdev_file))
+		return bdev_file;
+
+	err = fs_bdev_register(bdev_file, sb);
+	if (err) {
+		bdev_fput(bdev_file);
+		return ERR_PTR(err);
+	}
+	return bdev_file;
+}
+EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_dev);
+
+/**
+ * fs_bdev_file_open_by_path - claim a block device on behalf of a superblock
+ * @path: path to the block device
+ * @mode: open mode
+ * @holder: block-layer exclusivity token (a superblock, or the file_system_type
+ *          when the device may be shared by several superblocks of that type)
+ * @sb: superblock to drive fs_holder_ops events for
+ *
+ * Open the block device at @path with &fs_holder_ops and register that @sb
+ * uses it, so device removal/sync/freeze/thaw are propagated to @sb (and any
+ * other superblock sharing the device).  Must be paired with
+ * fs_bdev_file_release().
+ *
+ * Return: an opened block-device file or an ERR_PTR().
+ */
+struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
+				       void *holder, struct super_block *sb)
+{
+	struct file *bdev_file;
+	int err;
+
+	bdev_file = bdev_file_open_by_path(path, mode, holder, &fs_holder_ops);
+	if (IS_ERR(bdev_file))
+		return bdev_file;
+
+	err = fs_bdev_register(bdev_file, sb);
+	if (err) {
+		bdev_fput(bdev_file);
+		return ERR_PTR(err);
+	}
+	return bdev_file;
+}
+EXPORT_SYMBOL_GPL(fs_bdev_file_open_by_path);
+
+/**
+ * fs_bdev_file_release - release a block device claimed for a superblock
+ * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
+ * @sb: superblock the device was claimed for
+ *
+ * Drop one claim on the {dev, @sb} entry; the last claim unregisters it (a
+ * pinning cursor defers the actual unlink).  Then close the block device.
+ */
+void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
+{
+	dev_t dev = file_bdev(bdev_file)->bd_dev;
+	struct super_dev *sb_dev;
+
+	rcu_read_lock();
+	sb_dev = super_dev_lookup(dev, sb);
+	rcu_read_unlock();
+	super_dev_put(sb_dev);
+	bdev_fput(bdev_file);
+}
+EXPORT_SYMBOL_GPL(fs_bdev_file_release);
+
 int setup_bdev_super(struct super_block *sb, int sb_flags,
 		struct fs_context *fc)
 {
@@ -1640,7 +1779,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	struct file *bdev_file;
 	struct block_device *bdev;
 
-	bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+	bdev_file = fs_bdev_file_open_by_dev(sb->s_dev, mode, sb, sb);
 	if (IS_ERR(bdev_file)) {
 		if (fc)
 			errorf(fc, "%s: Can't open blockdev", fc->source);
@@ -1654,20 +1793,19 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	 * writable from userspace even for a read-only block device.
 	 */
 	if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
-		bdev_fput(bdev_file);
+		fs_bdev_file_release(bdev_file, sb);
 		return -EACCES;
 	}
 
-	/*
-	 * It is enough to check bdev was not frozen before we set
-	 * s_bdev as freezing will wait until SB_BORN is set.
-	 */
+	/* The sget_fc() entry is already published; pairs with bdev_freeze(). */
+	smp_mb();
 	if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
 		if (fc)
 			warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
-		bdev_fput(bdev_file);
+		fs_bdev_file_release(bdev_file, sb);
 		return -EBUSY;
 	}
+
 	spin_lock(&sb_lock);
 	sb->s_bdev_file = bdev_file;
 	sb->s_bdev = bdev;
@@ -1756,7 +1894,7 @@ void kill_block_super(struct super_block *sb)
 	generic_shutdown_super(sb);
 	if (bdev) {
 		sync_blockdev(bdev);
-		bdev_fput(sb->s_bdev_file);
+		fs_bdev_file_release(sb->s_bdev_file, sb);
 	}
 }
 
diff --git a/include/linux/fs/super.h b/include/linux/fs/super.h
index f21ffbb6dea5..721d842e3b24 100644
--- a/include/linux/fs/super.h
+++ b/include/linux/fs/super.h
@@ -235,4 +235,11 @@ int freeze_super(struct super_block *super, enum freeze_holder who,
 int thaw_super(struct super_block *super, enum freeze_holder who,
 	       const void *freeze_owner);
 
+struct file;
+struct file *fs_bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+				      struct super_block *sb);
+struct file *fs_bdev_file_open_by_path(const char *path, blk_mode_t mode,
+				       void *holder, struct super_block *sb);
+void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb);
+
 #endif /* _LINUX_FS_SUPER_H */

-- 
2.47.3


^ permalink raw reply related

* [PATCH RFC v2 07/18] fs: maintain a global device-to-superblock table
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
forces the holder to be exactly one superblock and prevents several
superblocks from sharing one block device, which is what erofs does.

As a first step introduce a global dev_t-keyed rhltable mapping each
device to the superblock(s) using it. The entry is preallocated in
alloc_super() and registered under sb->s_dev by the set callback through
set_anon_super() and set_bdev_super(), the two helpers every set
callback assigns s_dev through. Registration is the final fallible act
of a set callback, so an insert failure unwinds through sget_fc()'s
existing set-failure path: the fs_context keeps ownership of s_fs_info
and the callers' error paths stay correct. set_anon_super() releases
the anonymous dev it allocated when registration fails. Unwinding
through deactivate_locked_super() instead would run kill_sb() and free
s_fs_info behind the caller's back: nfs and ceph free that object
through a local pointer when sget_fc() fails and would double-free.

The superblock stashes the entry in sb->s_super_dev and
kill_super_notify() drops the claim through it, so teardown doesn't
depend on s_dev staying stable; an entry that was never registered is
freed together with the superblock in destroy_super_work().

Each table entry holds a passive reference (s_passive) on its
superblock, so the struct stays valid for as long as the entry is
reachable. Entries are claim-counted through sd_ref: additional claims
on the same (device, superblock) pair share the entry, and the unlink
is deferred to the last put, so a later iteration cursor never resumes
from a removed node.

The table is initialized from mnt_init(): the first superblocks (the
tmpfs shm mount and rootfs) are created from start_kernel() long before
any initcall runs, so an initcall would be too late.

The table has no readers yet; the fs_holder_ops callbacks are switched
over once all devices a filesystem claims are registered.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/internal.h                  |   1 +
 fs/namespace.c                 |   2 +
 fs/super.c                     | 102 ++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs/super_types.h |   2 +
 4 files changed, 105 insertions(+), 2 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index d77578d66d42..83eb3e2a0f85 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -137,6 +137,7 @@ extern int reconfigure_super(struct fs_context *);
 extern bool super_trylock_shared(struct super_block *sb);
 struct super_block *user_get_super(dev_t, bool excl);
 void put_super(struct super_block *sb);
+void __init super_dev_init(void);
 extern bool mount_capable(struct fs_context *);
 int sb_init_dio_done_wq(struct super_block *sb);
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 3d5cd5bf3b05..7cef6dae0854 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -6262,6 +6262,8 @@ void __init mnt_init(void)
 	if (!mount_hashtable || !mountpoint_hashtable)
 		panic("Failed to allocate mount hash table\n");
 
+	super_dev_init();
+
 	kernfs_init();
 
 	err = sysfs_init();
diff --git a/fs/super.c b/fs/super.c
index a771a0ad4c9a..ff5e305d0ab4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,6 +24,7 @@
 #include <linux/export.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/rhashtable.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/writeback.h>		/* for the emergency remount stuff */
@@ -272,6 +273,8 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	return total_objects;
 }
 
+static struct super_dev *super_dev_alloc(dev_t dev, struct super_block *sb);
+
 static void destroy_super_work(struct work_struct *work)
 {
 	struct super_block *s = container_of(work, struct super_block,
@@ -279,6 +282,8 @@ static void destroy_super_work(struct work_struct *work)
 	fsnotify_sb_free(s);
 	security_sb_free(s);
 	put_user_ns(s->s_user_ns);
+	/* Only an unregistered entry is still owned by the superblock. */
+	kfree(s->s_super_dev);
 	kfree(s->s_subtype);
 	for (int i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
@@ -392,6 +397,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 		goto fail;
 	if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
 		goto fail;
+	s->s_super_dev = super_dev_alloc(0, s);
+	if (!s->s_super_dev)
+		goto fail;
+
 	s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
 	return s;
 
@@ -421,6 +430,77 @@ void put_super(struct super_block *s)
 	}
 }
 
+struct super_dev {
+	dev_t			sd_dev;
+	struct super_block	*sd_sb;
+	refcount_t		sd_ref;
+	struct rhlist_head	sd_node;
+	struct rcu_head		sd_rcu;
+};
+
+static struct rhltable super_dev_table;
+static const struct rhashtable_params super_dev_params = {
+	.key_len	= sizeof(dev_t),
+	.key_offset	= offsetof(struct super_dev, sd_dev),
+	.head_offset	= offsetof(struct super_dev, sd_node),
+};
+
+static struct super_dev *super_dev_alloc(dev_t dev, struct super_block *sb)
+{
+	struct super_dev *fsd;
+
+	fsd = kzalloc_obj(*fsd);
+	if (!fsd)
+		return NULL;
+	fsd->sd_dev = dev;
+	fsd->sd_sb = sb;
+	refcount_set(&fsd->sd_ref, 1);
+	return fsd;
+}
+
+static void super_dev_put(struct super_dev *fsd)
+{
+	/* Unlink only once unpinned, so a cursor never resumes from a removed node. */
+	if (fsd && refcount_dec_and_test(&fsd->sd_ref)) {
+		rhltable_remove(&super_dev_table, &fsd->sd_node, super_dev_params);
+		put_super(fsd->sd_sb);
+		kfree_rcu(fsd, sd_rcu);
+	}
+}
+
+void __init super_dev_init(void)
+{
+	if (rhltable_init(&super_dev_table, &super_dev_params))
+		panic("VFS: Cannot initialise super_dev_table\n");
+}
+
+static int super_dev_insert(struct super_dev *fsd)
+{
+	int err;
+
+	err = rhltable_insert(&super_dev_table, &fsd->sd_node, super_dev_params);
+	if (!err)
+		refcount_inc(&fsd->sd_sb->s_passive);
+	return err;
+}
+
+/* Register @sb under @sb->s_dev as the final fallible act of a set callback. */
+static int super_dev_register(struct super_block *sb)
+{
+	struct super_dev *fsd = sb->s_super_dev;
+	int err;
+
+	lockdep_assert_held(&sb_lock);
+	VFS_WARN_ON_ONCE(!sb->s_dev);
+	VFS_WARN_ON_ONCE(!fsd || fsd->sd_dev);
+
+	fsd->sd_dev = sb->s_dev;
+	err = super_dev_insert(fsd);
+	if (err)
+		fsd->sd_dev = 0;
+	return err;
+}
+
 static void kill_super_notify(struct super_block *sb)
 {
 	lockdep_assert_not_held(&sb->s_umount);
@@ -440,6 +520,12 @@ static void kill_super_notify(struct super_block *sb)
 	hlist_del_init(&sb->s_instances);
 	spin_unlock(&sb_lock);
 
+	/* Drop sget_fc()'s claim; a never-registered entry stays with the sb. */
+	if (sb->s_super_dev->sd_dev) {
+		super_dev_put(sb->s_super_dev);
+		sb->s_super_dev = NULL;
+	}
+
 	/*
 	 * Let concurrent mounts know that this thing is really dead.
 	 * We don't need @sb->s_umount here as every concurrent caller
@@ -750,6 +836,7 @@ struct super_block *sget_fc(struct fs_context *fc,
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
+
 		s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
@@ -759,11 +846,13 @@ struct super_block *sget_fc(struct fs_context *fc,
 	s->s_fs_info = fc->s_fs_info;
 	err = set(s, fc);
 	if (err) {
+		VFS_WARN_ON_ONCE(s->s_super_dev->sd_dev);
 		s->s_fs_info = NULL;
 		spin_unlock(&sb_lock);
 		destroy_unused_super(s);
 		return ERR_PTR(err);
 	}
+	VFS_WARN_ON_ONCE(!s->s_super_dev->sd_dev);
 	fc->s_fs_info = NULL;
 	s->s_type = fc->fs_type;
 	s->s_iflags |= fc->s_iflags;
@@ -1217,7 +1306,16 @@ EXPORT_SYMBOL(free_anon_bdev);
 
 int set_anon_super(struct super_block *s, void *data)
 {
-	return get_anon_bdev(&s->s_dev);
+	int error;
+
+	error = get_anon_bdev(&s->s_dev);
+	if (error)
+		return error;
+
+	error = super_dev_register(s);
+	if (error)
+		free_anon_bdev(s->s_dev);
+	return error;
 }
 EXPORT_SYMBOL(set_anon_super);
 
@@ -1303,7 +1401,7 @@ EXPORT_SYMBOL(get_tree_keyed);
 static int set_bdev_super(struct super_block *s, void *data)
 {
 	s->s_dev = *(dev_t *)data;
-	return 0;
+	return super_dev_register(s);
 }
 
 static int super_s_dev_set(struct super_block *s, struct fs_context *fc)
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 68747182abf9..c8172558750f 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -30,6 +30,7 @@ struct mount;
 struct mtd_info;
 struct quotactl_ops;
 struct shrinker;
+struct super_dev;
 struct unicode_map;
 struct user_namespace;
 struct workqueue_struct;
@@ -132,6 +133,7 @@ struct super_operations {
 struct super_block {
 	struct list_head			s_list;		/* Keep this first */
 	dev_t					s_dev;		/* search index; _not_ kdev_t */
+	struct super_dev			*s_super_dev;	/* sget_fc()'s device table claim */
 	unsigned char				s_blocksize_bits;
 	unsigned long				s_blocksize;
 	loff_t					s_maxbytes;	/* Max file size */

-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox