Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* [PATCH v7 12/22] xfs: don't allow to enable DAX on fs-verity sealed inode
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

fs-verity doesn't support DAX. Forbid filesystem to enable DAX on
inodes which already have fs-verity enabled. The opposite is checked
when fs-verity is enabled, it won't be enabled if DAX is.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/xfs/xfs_iops.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca369eb96561..17efc83a86ed 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1387,6 +1387,8 @@ xfs_inode_should_enable_dax(
 		return false;
 	if (!xfs_inode_supports_dax(ip))
 		return false;
+	if (ip->i_diflags2 & XFS_DIFLAG2_VERITY)
+		return false;
 	if (xfs_has_dax_always(ip->i_mount))
 		return true;
 	if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 11/22] xfs: initialize fs-verity on file open
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

fs-verity will read and attach metadata (not the tree itself) from
a disk for those inodes which already have fs-verity enabled.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/xfs/xfs_file.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 6246f34df9fd..a980ac5196a8 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -37,6 +37,7 @@
 #include <linux/fadvise.h>
 #include <linux/mount.h>
 #include <linux/filelock.h>
+#include <linux/fsverity.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -1640,11 +1641,18 @@ xfs_file_open(
 	struct inode	*inode,
 	struct file	*file)
 {
+	int		error;
+
 	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
 		return -EIO;
 	file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
 	if (xfs_get_atomic_write_min(XFS_I(inode)) > 0)
 		file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+
+	error = fsverity_file_open(inode, file);
+	if (error)
+		return error;
+
 	return generic_file_open(inode, file);
 }
 
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 10/22] xfs: introduce fsverity on-disk changes
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

Introduce XFS_DIFLAG2_VERITY for inodes with fsverity. This flag
indicates that inode has fs-verity enabled (i.e. descriptor exist,
tree is built and file is read-only).

Introduce XFS_SB_FEAT_RO_COMPAT_VERITY for filesystems having
fsverity inodes. As on-disk changes applies to fsverity inodes only, let
older kernels read-only access. This will be enabled in the further
patch after full fsverity support.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/xfs/libxfs/xfs_format.h     | 30 +++++++++++++++++++++++++++++-
 fs/xfs/libxfs/xfs_inode_buf.c  |  8 ++++++++
 fs/xfs/libxfs/xfs_inode_util.c |  2 ++
 fs/xfs/libxfs/xfs_sb.c         |  2 ++
 fs/xfs/xfs_iops.c              |  2 ++
 fs/xfs/xfs_mount.h             |  2 ++
 6 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 779dac59b1f3..4dff29659e40 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -374,6 +374,7 @@ xfs_sb_has_compat_feature(
 #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
 #define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
 #define XFS_SB_FEAT_RO_COMPAT_INOBTCNT (1 << 3)		/* inobt block counts */
+#define XFS_SB_FEAT_RO_COMPAT_VERITY   (1 << 4)		/* fs-verity */
 #define XFS_SB_FEAT_RO_COMPAT_ALL \
 		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
 		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
@@ -1230,16 +1231,21 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
  */
 #define XFS_DIFLAG2_METADATA_BIT	5
 
+/* inodes sealed with fs-verity */
+#define XFS_DIFLAG2_VERITY_BIT		6
+
 #define XFS_DIFLAG2_DAX		(1ULL << XFS_DIFLAG2_DAX_BIT)
 #define XFS_DIFLAG2_REFLINK	(1ULL << XFS_DIFLAG2_REFLINK_BIT)
 #define XFS_DIFLAG2_COWEXTSIZE	(1ULL << XFS_DIFLAG2_COWEXTSIZE_BIT)
 #define XFS_DIFLAG2_BIGTIME	(1ULL << XFS_DIFLAG2_BIGTIME_BIT)
 #define XFS_DIFLAG2_NREXT64	(1ULL << XFS_DIFLAG2_NREXT64_BIT)
 #define XFS_DIFLAG2_METADATA	(1ULL << XFS_DIFLAG2_METADATA_BIT)
+#define XFS_DIFLAG2_VERITY	(1ULL << XFS_DIFLAG2_VERITY_BIT)
 
 #define XFS_DIFLAG2_ANY \
 	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE | \
-	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA)
+	 XFS_DIFLAG2_BIGTIME | XFS_DIFLAG2_NREXT64 | XFS_DIFLAG2_METADATA | \
+	 XFS_DIFLAG2_VERITY)
 
 static inline bool xfs_dinode_has_bigtime(const struct xfs_dinode *dip)
 {
@@ -2021,4 +2027,26 @@ struct xfs_acl {
 #define SGI_ACL_FILE_SIZE	(sizeof(SGI_ACL_FILE)-1)
 #define SGI_ACL_DEFAULT_SIZE	(sizeof(SGI_ACL_DEFAULT)-1)
 
+/*
+ * At maximum of 8 levels with 128 hashes per block (32 bytes SHA-256) maximum
+ * tree size is ((128^8 − 1)/(128 − 1)) = 567*10^12 blocks. This should fit in
+ * 53 bits address space.
+ *
+ * At this Merkle tree size we can cover 295EB large file. This is much larger
+ * than the currently supported file size.
+ *
+ * For sha512 the largest file we can cover ends at 1 << 50 offset, this is also
+ * good.
+ */
+#define XFS_FSVERITY_LARGEST_FILE	((loff_t)1ULL << 53)
+
+/*
+ * Alignment of the fsverity metadata placement. This is largest supported PAGE
+ * SIZE for fsverity. This is used to space out data and metadata in page cache.
+ * The spacing is necessary for non-exposure of metadata to userspace and
+ * correct merkle tree synethesis in the iomap.
+ */
+#define XFS_FSVERITY_START_ALIGN	(65536)
+
+
 #endif /* __XFS_FORMAT_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 3794e5412eba..f2181c1bed54 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -760,6 +760,14 @@ xfs_dinode_verify(
 	    !xfs_has_rtreflink(mp))
 		return __this_address;
 
+	/* only regular files can have fsverity */
+	if (flags2 & XFS_DIFLAG2_VERITY) {
+		if (!xfs_has_verity(mp))
+			return __this_address;
+		if (!S_ISREG(mode))
+			return __this_address;
+	}
+
 	if (xfs_has_zoned(mp) &&
 	    dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) {
 		if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents)
diff --git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c
index 551fa51befb6..6b1e20a4bb9b 100644
--- a/fs/xfs/libxfs/xfs_inode_util.c
+++ b/fs/xfs/libxfs/xfs_inode_util.c
@@ -126,6 +126,8 @@ xfs_ip2xflags(
 			flags |= FS_XFLAG_DAX;
 		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
 			flags |= FS_XFLAG_COWEXTSIZE;
+		if (ip->i_diflags2 & XFS_DIFLAG2_VERITY)
+			flags |= FS_XFLAG_VERITY;
 	}
 
 	if (xfs_inode_has_attr_fork(ip))
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 47322adb7690..a15510ebd2f1 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -165,6 +165,8 @@ xfs_sb_version_to_features(
 		features |= XFS_FEAT_REFLINK;
 	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT)
 		features |= XFS_FEAT_INOBTCNT;
+	if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_VERITY)
+		features |= XFS_FEAT_VERITY;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE)
 		features |= XFS_FEAT_FTYPE;
 	if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 208543e57eda..ca369eb96561 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1415,6 +1415,8 @@ xfs_diflags_to_iflags(
 		flags |= S_NOATIME;
 	if (init && xfs_inode_should_enable_dax(ip))
 		flags |= S_DAX;
+	if (xflags & FS_XFLAG_VERITY)
+		flags |= S_VERITY;
 
 	/*
 	 * S_DAX can only be set during inode initialization and is never set by
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index ddd4028be8d6..07f6aa3c3f26 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -385,6 +385,7 @@ typedef struct xfs_mount {
 #define XFS_FEAT_EXCHANGE_RANGE	(1ULL << 27)	/* exchange range */
 #define XFS_FEAT_METADIR	(1ULL << 28)	/* metadata directory tree */
 #define XFS_FEAT_ZONED		(1ULL << 29)	/* zoned RT device */
+#define XFS_FEAT_VERITY		(1ULL << 30)	/* fs-verity */
 
 /* Mount features */
 #define XFS_FEAT_NOLIFETIME	(1ULL << 47)	/* disable lifetime hints */
@@ -442,6 +443,7 @@ __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE)
 __XFS_HAS_FEAT(metadir, METADIR)
 __XFS_HAS_FEAT(zoned, ZONED)
 __XFS_HAS_FEAT(nolifetime, NOLIFETIME)
+__XFS_HAS_FEAT(verity, VERITY)
 
 static inline bool xfs_has_rtgroups(const struct xfs_mount *mp)
 {
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 09/22] iomap: introduce iomap_fsverity_write() for writing fsverity metadata
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

This is just a wrapper around iomap_file_buffered_write() to create
necessary iterator over metadata.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/iomap/buffered-io.c | 25 +++++++++++++++++++++++++
 include/linux/iomap.h  |  3 +++
 2 files changed, 28 insertions(+)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 843314852663..5c9fd925a62f 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1287,6 +1287,31 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+int iomap_fsverity_write(struct file *file, loff_t pos, size_t length,
+		const void *buf, const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops)
+{
+	int			ret;
+	struct iov_iter		iiter;
+	struct kvec		kvec = {
+		.iov_base	= (void *)buf,
+		.iov_len	= length,
+	};
+	struct kiocb		iocb = {
+		.ki_filp	= file,
+		.ki_ioprio	= get_current_ioprio(),
+		.ki_pos		= pos,
+	};
+
+	iov_iter_kvec(&iiter, WRITE, &kvec, 1, length);
+
+	ret = iomap_file_buffered_write(&iocb, &iiter, ops, write_ops, NULL);
+	if (ret < 0)
+		return ret;
+	return ret == length ? 0 : -EIO;
+}
+EXPORT_SYMBOL_GPL(iomap_fsverity_write);
+
 static void iomap_write_delalloc_ifs_punch(struct inode *inode,
 		struct folio *folio, loff_t start_byte, loff_t end_byte,
 		struct iomap *iomap, iomap_punch_t punch)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4d9202cae29f..83586f09f365 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -359,6 +359,9 @@ static inline bool iomap_want_unshare_iter(const struct iomap_iter *iter)
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops,
 		const struct iomap_write_ops *write_ops, void *private);
+int iomap_fsverity_write(struct file *file, loff_t pos, size_t length,
+		const void *buf, const struct iomap_ops *ops,
+		const struct iomap_write_ops *write_ops);
 void iomap_read_folio(const struct iomap_ops *ops,
 		struct iomap_read_folio_ctx *ctx, void *private);
 void iomap_readahead(const struct iomap_ops *ops,
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 08/22] iomap: teach iomap to read files with fsverity
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

Obtain fsverity info for folios with file data and fsverity metadata.
Filesystem can pass vi down to ioend and then to fsverity for
verification. This is different from other filesystems ext4, f2fs, btrfs
supporting fsverity, these filesystems don't need fsverity_info for
reading fsverity metadata. While reading merkle tree iomap requires
fsverity info to synthesize hashes for zeroed data block.

fsverity metadata has two kinds of holes - ones in merkle tree and one
after fsverity descriptor.

Merkle tree holes are blocks full of hashes of zeroed data blocks. These
are not stored on the disk but synthesized on the fly. This saves a bit
of space for sparse files. Due to this iomap also need to lookup
fsverity_info for folios with fsverity metadata. ->vi has a hash of the
zeroed data block which will be used to fill the merkle tree block.

The hole past descriptor is interpreted as end of metadata region. As we
don't have EOF here we use this hole as an indication that rest of the
folio is empty. This patch marks rest of the folio beyond fsverity
descriptor as uptodate.

For file data, fsverity needs to verify consistency of the whole file
against the root hash, hashes of holes are included in the merkle tree.
Verify them too.

Issue reading of fsverity merkle tree on the fsverity inodes. This way
metadata will be available at I/O completion time.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/iomap/buffered-io.c | 41 +++++++++++++++++++++++++++++++++++++++--
 include/linux/iomap.h  |  2 ++
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 38c9592fba43..843314852663 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -9,6 +9,7 @@
 #include <linux/swap.h>
 #include <linux/migrate.h>
 #include <linux/fserror.h>
+#include <linux/fsverity.h>
 #include "internal.h"
 #include "trace.h"
 
@@ -561,9 +562,27 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
 		if (plen == 0)
 			return 0;
 
-		/* zero post-eof blocks as the page may be mapped */
-		if (iomap_block_needs_zeroing(iter, pos)) {
+		/*
+		 * Handling of fsverity "holes". We hit this for two case:
+		 *   1. No need to go further, the hole after fsverity
+		 *	descriptor is the end of the fsverity metadata.
+		 *
+		 *   2. This folio contains merkle tree blocks which need to be
+		 *	synthesized. If we already have fsverity info (ctx->vi)
+		 *	synthesize these blocks.
+		 */
+		if ((iomap->flags & IOMAP_F_FSVERITY) &&
+		    iomap->type == IOMAP_HOLE) {
+			if (ctx->vi)
+				fsverity_fill_zerohash(folio, poff, plen,
+						       ctx->vi);
+			iomap_set_range_uptodate(folio, poff, plen);
+		} else if (iomap_block_needs_zeroing(iter, pos)) {
+			/* zero post-eof blocks as the page may be mapped */
 			folio_zero_range(folio, poff, plen);
+			if (ctx->vi &&
+			    !fsverity_verify_blocks(ctx->vi, folio, plen, poff))
+				return -EIO;
 			iomap_set_range_uptodate(folio, poff, plen);
 		} else {
 			if (!*bytes_submitted)
@@ -614,6 +633,15 @@ void iomap_read_folio(const struct iomap_ops *ops,
 
 	trace_iomap_readpage(iter.inode, 1);
 
+	/*
+	 * Fetch fsverity_info for both data and fsverity metadata, as iomap
+	 * needs zeroed hash for merkle tree block synthesis
+	 */
+	ctx->vi = fsverity_get_info(iter.inode);
+	if (ctx->vi && iter.pos < i_size_read(iter.inode))
+		fsverity_readahead(ctx->vi, folio->index,
+				   folio_nr_pages(folio));
+
 	while ((ret = iomap_iter(&iter, ops)) > 0)
 		iter.status = iomap_read_folio_iter(&iter, ctx,
 				&bytes_submitted);
@@ -681,6 +709,15 @@ void iomap_readahead(const struct iomap_ops *ops,
 
 	trace_iomap_readahead(rac->mapping->host, readahead_count(rac));
 
+	/*
+	 * Fetch fsverity_info for both data and fsverity metadata, as iomap
+	 * needs zeroed hash for merkle tree block synthesis
+	 */
+	ctx->vi = fsverity_get_info(iter.inode);
+	if (ctx->vi && iter.pos < i_size_read(iter.inode))
+		fsverity_readahead(ctx->vi, readahead_index(rac),
+				readahead_count(rac));
+
 	while (iomap_iter(&iter, ops) > 0)
 		iter.status = iomap_readahead_iter(&iter, ctx,
 					&cur_bytes_submitted);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4506a99d5285..4d9202cae29f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -435,6 +435,7 @@ struct iomap_ioend {
 	loff_t			io_offset;	/* offset in the file */
 	sector_t		io_sector;	/* start sector of ioend */
 	void			*io_private;	/* file system private data */
+	struct fsverity_info	*io_vi;		/* fsverity info */
 	struct bio		io_bio;		/* MUST BE LAST! */
 };
 
@@ -509,6 +510,7 @@ struct iomap_read_folio_ctx {
 	struct readahead_control *rac;
 	void			*read_ctx;
 	loff_t			read_ctx_file_offset;
+	struct fsverity_info	*vi;
 };
 
 struct iomap_read_ops {
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 07/22] iomap: introduce IOMAP_F_FSVERITY and teach writeback to handle fsverity
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

This flag indicates that I/O is for fsverity metadata.

In the write path skip i_size check and i_size updates as metadata is
past EOF. In writeback don't update i_size and continue writeback if
even folio is beyond EOF. In read path don't zero fsverity folios, again
they are past EOF.

The iomap_block_needs_zeroing() is also called from write path. For
folios of larger order we don't want to zero out pages in the folio as
these could contain other merkle tree blocks. For fsverity, filesystem
will request to read PAGE_SIZE memory regions. For data folios, iomap
will zero the rest of the folio for anything which is beyond EOF. We
don't want this for fsverity folios.

Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 fs/iomap/buffered-io.c | 43 +++++++++++++++++++++++++++++++++---------
 fs/iomap/trace.h       |  3 ++-
 include/linux/iomap.h  |  8 ++++++++
 3 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e4b6886e5c3c..38c9592fba43 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -353,9 +353,26 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
 
-	return srcmap->type != IOMAP_MAPPED ||
-		(srcmap->flags & IOMAP_F_NEW) ||
-		pos >= i_size_read(iter->inode);
+	/*
+	 * If this block has not been written, there's nothing to read
+	 */
+	if (srcmap->type != IOMAP_MAPPED)
+		return true;
+
+	/*
+	 * Newly allocated blocks have not been written
+	 */
+	if (srcmap->flags & IOMAP_F_NEW)
+		return true;
+
+	/*
+	 * fsverity metadata is stored past i_size, we need to read it instead
+	 * of zeroing
+	 */
+	if (srcmap->flags & IOMAP_F_FSVERITY)
+		return false;
+
+	return pos >= i_size_read(iter->inode);
 }
 
 /**
@@ -1167,13 +1184,14 @@ static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i,
 		 * unlock and release the folio.
 		 */
 		old_size = iter->inode->i_size;
-		if (pos + written > old_size) {
+		if (pos + written > old_size &&
+		    !(iter->iomap.flags & IOMAP_F_FSVERITY)) {
 			i_size_write(iter->inode, pos + written);
 			iter->iomap.flags |= IOMAP_F_SIZE_CHANGED;
 		}
 		__iomap_put_folio(iter, write_ops, written, folio);
 
-		if (old_size < pos)
+		if (old_size < pos && !(iter->iomap.flags & IOMAP_F_FSVERITY))
 			pagecache_isize_extended(iter->inode, old_size, pos);
 
 		cond_resched();
@@ -1801,13 +1819,20 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
  * Check interaction of the folio with the file end.
  *
  * If the folio is entirely beyond i_size, return false.  If it straddles
- * i_size, adjust end_pos and zero all data beyond i_size.
+ * i_size, adjust end_pos and zero all data beyond i_size. Don't skip fsverity
+ * folios as those are beyond i_size.
  */
-static bool iomap_writeback_handle_eof(struct folio *folio, struct inode *inode,
-		u64 *end_pos)
+static bool iomap_writeback_handle_eof(struct folio *folio,
+		struct iomap_writepage_ctx *wpc, u64 *end_pos)
 {
+	struct inode *inode = wpc->inode;
 	u64 isize = i_size_read(inode);
 
+	if (wpc->iomap.flags & IOMAP_F_FSVERITY) {
+		WARN_ON_ONCE(folio_pos(folio) < isize);
+		return true;
+	}
+
 	if (*end_pos > isize) {
 		size_t poff = offset_in_folio(folio, isize);
 		pgoff_t end_index = isize >> PAGE_SHIFT;
@@ -1873,7 +1898,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
 
 	trace_iomap_writeback_folio(inode, pos, folio_size(folio));
 
-	if (!iomap_writeback_handle_eof(folio, inode, &end_pos))
+	if (!iomap_writeback_handle_eof(folio, wpc, &end_pos))
 		return 0;
 	WARN_ON_ONCE(end_pos <= pos);
 
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 532787277b16..5252051cc137 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -118,7 +118,8 @@ DEFINE_RANGE_EVENT(iomap_zero_iter);
 	{ IOMAP_F_ATOMIC_BIO,	"ATOMIC_BIO" }, \
 	{ IOMAP_F_PRIVATE,	"PRIVATE" }, \
 	{ IOMAP_F_SIZE_CHANGED,	"SIZE_CHANGED" }, \
-	{ IOMAP_F_STALE,	"STALE" }
+	{ IOMAP_F_STALE,	"STALE" }, \
+	{ IOMAP_F_FSVERITY,	"FSVERITY" }
 
 
 #define IOMAP_DIO_STRINGS \
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 531f9ebdeeae..4506a99d5285 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -87,6 +87,14 @@ struct vm_fault;
 #define IOMAP_F_INTEGRITY	0
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
+/*
+ * Indicates reads and writes of fsverity metadata.
+ *
+ * Fsverity metadata is stored after the regular file data and thus beyond
+ * i_size.
+ */
+#define IOMAP_F_FSVERITY	(1U << 10)
+
 /*
  * Flag reserved for file system specific usage
  */
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 06/22] fsverity: hoist pagecache_read from f2fs/ext4 to fsverity
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

This is the same function to read from pageache. XFS will also need
this, so move this to core fsverity.

Note that f2fs and ext4 functions diverged a bit, as ext4 operated over
folios and f2fs operated over pages. The common one will operate over
folios.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/ext4/verity.c         | 32 +++-----------------------------
 fs/f2fs/verity.c         | 30 +-----------------------------
 fs/verity/pagecache.c    | 33 +++++++++++++++++++++++++++++++++
 include/linux/fsverity.h |  2 ++
 4 files changed, 39 insertions(+), 58 deletions(-)

diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 347945ac23a4..ac5c133f5529 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -34,32 +34,6 @@ static inline loff_t ext4_verity_metadata_pos(const struct inode *inode)
 	return round_up(inode->i_size, 65536);
 }
 
-/*
- * Read some verity metadata from the inode.  __vfs_read() can't be used because
- * we need to read beyond i_size.
- */
-static int pagecache_read(struct inode *inode, void *buf, size_t count,
-			  loff_t pos)
-{
-	while (count) {
-		struct folio *folio;
-		size_t n;
-
-		folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT,
-					 NULL);
-		if (IS_ERR(folio))
-			return PTR_ERR(folio);
-
-		n = memcpy_from_file_folio(buf, folio, pos, count);
-		folio_put(folio);
-
-		buf += n;
-		pos += n;
-		count -= n;
-	}
-	return 0;
-}
-
 /*
  * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
  * kernel_write() can't be used because the file descriptor is readonly.
@@ -311,8 +285,8 @@ static int ext4_get_verity_descriptor_location(struct inode *inode,
 		goto bad;
 	desc_size_pos -= sizeof(desc_size_disk);
 
-	err = pagecache_read(inode, &desc_size_disk, sizeof(desc_size_disk),
-			     desc_size_pos);
+	err = fsverity_pagecache_read(inode, &desc_size_disk,
+				      sizeof(desc_size_disk), desc_size_pos);
 	if (err)
 		return err;
 	desc_size = le32_to_cpu(desc_size_disk);
@@ -352,7 +326,7 @@ static int ext4_get_verity_descriptor(struct inode *inode, void *buf,
 	if (buf_size) {
 		if (desc_size > buf_size)
 			return -ERANGE;
-		err = pagecache_read(inode, buf, desc_size, desc_pos);
+		err = fsverity_pagecache_read(inode, buf, desc_size, desc_pos);
 		if (err)
 			return err;
 	}
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index b3b3e71604ac..5ea0a9b40443 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -36,34 +36,6 @@ static inline loff_t f2fs_verity_metadata_pos(const struct inode *inode)
 	return round_up(inode->i_size, 65536);
 }
 
-/*
- * Read some verity metadata from the inode.  __vfs_read() can't be used because
- * we need to read beyond i_size.
- */
-static int pagecache_read(struct inode *inode, void *buf, size_t count,
-			  loff_t pos)
-{
-	while (count) {
-		size_t n = min_t(size_t, count,
-				 PAGE_SIZE - offset_in_page(pos));
-		struct page *page;
-
-		page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
-					 NULL);
-		if (IS_ERR(page))
-			return PTR_ERR(page);
-
-		memcpy_from_page(buf, page, offset_in_page(pos), n);
-
-		put_page(page);
-
-		buf += n;
-		pos += n;
-		count -= n;
-	}
-	return 0;
-}
-
 /*
  * Write some verity metadata to the inode for FS_IOC_ENABLE_VERITY.
  * kernel_write() can't be used because the file descriptor is readonly.
@@ -248,7 +220,7 @@ static int f2fs_get_verity_descriptor(struct inode *inode, void *buf,
 	if (buf_size) {
 		if (size > buf_size)
 			return -ERANGE;
-		res = pagecache_read(inode, buf, size, pos);
+		res = fsverity_pagecache_read(inode, buf, size, pos);
 		if (res)
 			return res;
 	}
diff --git a/fs/verity/pagecache.c b/fs/verity/pagecache.c
index 99f5f53eea98..9d82e6b74ba1 100644
--- a/fs/verity/pagecache.c
+++ b/fs/verity/pagecache.c
@@ -78,3 +78,36 @@ void fsverity_fill_zerohash(struct folio *folio, size_t offset, size_t len,
 				vi->tree_params.digest_size);
 }
 EXPORT_SYMBOL_GPL(fsverity_fill_zerohash);
+
+/**
+ * fsverity_pagecache_read() - read page and copy data to buffer
+ * @inode:	copy from this inode's address space
+ * @buf:	buffer to copy to
+ * @count:	number of bytes to copy
+ * @pos:	position of the folio to copy from
+ *
+ * Read some verity metadata from the inode.  __vfs_read() can't be used because
+ * we need to read beyond i_size.
+ */
+int fsverity_pagecache_read(struct inode *inode, void *buf, size_t count,
+			  loff_t pos)
+{
+	while (count) {
+		struct folio *folio;
+		size_t n;
+
+		folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT,
+					 NULL);
+		if (IS_ERR(folio))
+			return PTR_ERR(folio);
+
+		n = memcpy_from_file_folio(buf, folio, pos, count);
+		folio_put(folio);
+
+		buf += n;
+		pos += n;
+		count -= n;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fsverity_pagecache_read);
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index a5645ec07aa8..a2ae5cc649ad 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -328,5 +328,7 @@ void fsverity_cleanup_inode(struct inode *inode);
 struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index);
 void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index,
 				   unsigned long nr_pages);
+int fsverity_pagecache_read(struct inode *inode, void *buf, size_t count,
+			    loff_t pos);
 
 #endif	/* _LINUX_FSVERITY_H */
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 05/22] fsverity: pass digest size and hash of the all-zeroes block to ->write
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

Let filesystem iterate over hashes in the block and check if these are
hashes of zeroed data blocks. XFS will use this to decide if it want to
store tree block full of these hashes.

Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
Acked-by: Eric Biggers <ebiggers@kernel.org>
---
 fs/btrfs/verity.c        | 6 +++++-
 fs/ext4/verity.c         | 4 +++-
 fs/f2fs/verity.c         | 4 +++-
 fs/verity/enable.c       | 4 +++-
 include/linux/fsverity.h | 6 +++++-
 5 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index 0062b3a55781..fd3696d3f4ce 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -773,11 +773,15 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode,
  * @buf:	Merkle tree block to write
  * @pos:	the position of the block in the Merkle tree (in bytes)
  * @size:	the Merkle tree block size (in bytes)
+ * @zero_digest:	the hash of the all-zeroes block
+ * @digest_size:	size of zero_digest, in bytes
  *
  * Returns 0 on success or negative error code on failure
  */
 static int btrfs_write_merkle_tree_block(struct file *file, const void *buf,
-					 u64 pos, unsigned int size)
+					 u64 pos, unsigned int size,
+					 const u8 *zero_digest,
+					 unsigned int digest_size)
 {
 	struct inode *inode = file_inode(file);
 	loff_t merkle_pos = merkle_file_pos(inode);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index ca61da53f313..347945ac23a4 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -374,7 +374,9 @@ static void ext4_readahead_merkle_tree(struct inode *inode, pgoff_t index,
 }
 
 static int ext4_write_merkle_tree_block(struct file *file, const void *buf,
-					u64 pos, unsigned int size)
+					u64 pos, unsigned int size,
+					const u8 *zero_digest,
+					unsigned int digest_size)
 {
 	pos += ext4_verity_metadata_pos(file_inode(file));
 
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 92ebcc19cab0..b3b3e71604ac 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -270,7 +270,9 @@ static void f2fs_readahead_merkle_tree(struct inode *inode, pgoff_t index,
 }
 
 static int f2fs_write_merkle_tree_block(struct file *file, const void *buf,
-					u64 pos, unsigned int size)
+					u64 pos, unsigned int size,
+					const u8 *zero_digest,
+					unsigned int digest_size)
 {
 	pos += f2fs_verity_metadata_pos(file_inode(file));
 
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 42dfed1ce0ce..ad4ff71d7dd9 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -50,7 +50,9 @@ static int write_merkle_tree_block(struct file *file, const u8 *buf,
 	int err;
 
 	err = inode->i_sb->s_vop->write_merkle_tree_block(file, buf, pos,
-							  params->block_size);
+							  params->block_size,
+							  params->zero_digest,
+							  params->digest_size);
 	if (err)
 		fsverity_err(inode, "Error %d writing Merkle tree block %lu",
 			     err, index);
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index e4503312d114..a5645ec07aa8 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -124,6 +124,8 @@ struct fsverity_operations {
 	 * @buf: the Merkle tree block to write
 	 * @pos: the position of the block in the Merkle tree (in bytes)
 	 * @size: the Merkle tree block size (in bytes)
+	 * @zero_digest: the hash of the all-zeroes block
+	 * @digest_size: size of zero_digest, in bytes
 	 *
 	 * This is only called between ->begin_enable_verity() and
 	 * ->end_enable_verity().
@@ -131,7 +133,9 @@ struct fsverity_operations {
 	 * Return: 0 on success, -errno on failure
 	 */
 	int (*write_merkle_tree_block)(struct file *file, const void *buf,
-				       u64 pos, unsigned int size);
+				       u64 pos, unsigned int size,
+				       const u8 *zero_digest,
+				       unsigned int digest_size);
 };
 
 #ifdef CONFIG_FS_VERITY
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 04/22] fsverity: generate and store zero-block hash
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

Compute the hash of one filesystem block's worth of zeros. A filesystem
implementation can decide to elide merkle tree blocks containing only
this hash and synthesize the contents at read time.

Let's pretend that there's a file containing 131 data block and whose
merkle tree looks roughly like this:

root
 +--leaf0
 |   +--data0
 |   +--data1
 |   +--...
 |   `--data128
 `--leaf1
     +--data129
     +--data130
     `--data131

If data[0-128] are sparse holes, then leaf0 will contain a repeating
sequence of @zero_digest.  Therefore, leaf0 need not be written to disk
because its contents can be synthesized.

A subsequent xfs patch will use this to reduce the size of the merkle
tree when dealing with sparse gold master disk images and the like.

Note that this works only on the first-level (data holes). fsverity
doesn't store/generate zero_digest for any higher levels.

Add a helper to pre-fill folio with hashes of empty blocks. This will be
used by iomap to synthesize blocks full of zero hashes on the fly.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Acked-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/verity/fsverity_private.h |  3 +++
 fs/verity/measure.c          |  4 ++--
 fs/verity/open.c             |  3 +++
 fs/verity/pagecache.c        | 22 ++++++++++++++++++++++
 include/linux/fsverity.h     |  8 ++++++++
 5 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index 6e6854c19078..881d46f25e08 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -53,6 +53,9 @@ struct merkle_tree_params {
 	u64 tree_size;			/* Merkle tree size in bytes */
 	unsigned long tree_pages;	/* Merkle tree size in pages */
 
+	/* the hash of an all-zeroes block */
+	u8 zero_digest[FS_VERITY_MAX_DIGEST_SIZE];
+
 	/*
 	 * Starting block index for each tree level, ordered from leaf level (0)
 	 * to root level ('num_levels - 1')
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index 6a35623ebdf0..818083507885 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -68,8 +68,8 @@ EXPORT_SYMBOL_GPL(fsverity_ioctl_measure);
  * @alg: (out) the digest's algorithm, as a FS_VERITY_HASH_ALG_* value
  * @halg: (out) the digest's algorithm, as a HASH_ALGO_* value
  *
- * Retrieves the fsverity digest of the given file.  The file must have been
- * opened at least once since the inode was last loaded into the inode cache;
+ * Retrieves the fsverity digest of the given file. The
+ * fsverity_ensure_verity_info() must be called on the inode beforehand;
  * otherwise this function will not recognize when fsverity is enabled.
  *
  * The file's fsverity digest consists of @raw_digest in combination with either
diff --git a/fs/verity/open.c b/fs/verity/open.c
index d32d0899df25..875e8850ccba 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -153,6 +153,9 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
 		goto out_err;
 	}
 
+	fsverity_hash_block(params, page_address(ZERO_PAGE(0)),
+			    params->zero_digest);
+
 	params->tree_size = offset << log_blocksize;
 	params->tree_pages = PAGE_ALIGN(params->tree_size) >> PAGE_SHIFT;
 	return 0;
diff --git a/fs/verity/pagecache.c b/fs/verity/pagecache.c
index 1819314ecaa3..99f5f53eea98 100644
--- a/fs/verity/pagecache.c
+++ b/fs/verity/pagecache.c
@@ -2,6 +2,7 @@
 /*
  * Copyright 2019 Google LLC
  */
+#include "fsverity_private.h"
 
 #include <linux/export.h>
 #include <linux/fsverity.h>
@@ -56,3 +57,24 @@ void generic_readahead_merkle_tree(struct inode *inode, pgoff_t index,
 		folio_put(folio);
 }
 EXPORT_SYMBOL_GPL(generic_readahead_merkle_tree);
+
+/**
+ * fsverity_fill_zerohash() - fill folio with hashes of zero data block
+ * @folio:	folio to fill
+ * @offset:	offset in the folio to start
+ * @len:	length of the range to fill with hashes
+ * @vi:		fsverity info
+ */
+void fsverity_fill_zerohash(struct folio *folio, size_t offset, size_t len,
+			      struct fsverity_info *vi)
+{
+	size_t off = offset;
+
+	WARN_ON_ONCE(!IS_ALIGNED(offset, vi->tree_params.digest_size));
+	WARN_ON_ONCE(!IS_ALIGNED(len, vi->tree_params.digest_size));
+
+	for (; off < (offset + len); off += vi->tree_params.digest_size)
+		memcpy_to_folio(folio, off, vi->tree_params.zero_digest,
+				vi->tree_params.digest_size);
+}
+EXPORT_SYMBOL_GPL(fsverity_fill_zerohash);
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index 5562271bd628..e4503312d114 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -201,6 +201,8 @@ bool fsverity_verify_blocks(struct fsverity_info *vi, struct folio *folio,
 			    size_t len, size_t offset);
 void fsverity_verify_bio(struct fsverity_info *vi, struct bio *bio);
 void fsverity_enqueue_verify_work(struct work_struct *work);
+void fsverity_fill_zerohash(struct folio *folio, size_t poff, size_t plen,
+			      struct fsverity_info *vi);
 
 #else /* !CONFIG_FS_VERITY */
 
@@ -281,6 +283,12 @@ static inline void fsverity_enqueue_verify_work(struct work_struct *work)
 	WARN_ON_ONCE(1);
 }
 
+static inline void fsverity_fill_zerohash(struct folio *folio, size_t poff,
+		size_t plen, struct fsverity_info *vi)
+{
+	WARN_ON_ONCE(1);
+}
+
 #endif	/* !CONFIG_FS_VERITY */
 
 static inline bool fsverity_verify_folio(struct fsverity_info *vi,
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 03/22] ovl: use core fsverity ensure info interface
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

fsverity now exposes fsverity_ensure_verity_info() which could be used
instead of opening file to ensure that fsverity info is loaded and
attached to inode.

Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/overlayfs/util.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 2ea769f311c3..8bde5dc31d7d 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -16,6 +16,7 @@
 #include <linux/namei.h>
 #include <linux/ratelimit.h>
 #include <linux/overflow.h>
+#include <linux/fsverity.h>
 #include "overlayfs.h"
 
 /* Get write access to upper mnt - may fail if upper sb was remounted ro */
@@ -1377,18 +1378,9 @@ char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int pa
 int ovl_ensure_verity_loaded(const struct path *datapath)
 {
 	struct inode *inode = d_inode(datapath->dentry);
-	struct file *filp;
 
-	if (!fsverity_active(inode) && IS_VERITY(inode)) {
-		/*
-		 * If this inode was not yet opened, the verity info hasn't been
-		 * loaded yet, so we need to do that here to force it into memory.
-		 */
-		filp = kernel_file_open(datapath, O_RDONLY, current_cred());
-		if (IS_ERR(filp))
-			return PTR_ERR(filp);
-		fput(filp);
-	}
+	if (fsverity_active(inode))
+		fsverity_ensure_verity_info(inode);
 
 	return 0;
 }
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 02/22] fsverity: expose ensure_fsverity_info()
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

This function will be used by XFS's scrub to force fsverity activation,
therefore, to read fsverity context.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Acked-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/verity/open.c         | 22 ++++++++++++++++++++--
 include/linux/fsverity.h |  2 ++
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/fs/verity/open.c b/fs/verity/open.c
index dfa0d1afe0fe..d32d0899df25 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -344,7 +344,24 @@ int fsverity_get_descriptor(struct inode *inode,
 	return 0;
 }
 
-static int ensure_verity_info(struct inode *inode)
+/**
+ * fsverity_ensure_verity_info() - cache verity info if it's not already cached
+ * @inode: the inode for which verity info should be cached
+ *
+ * Ensure this inode has verity info attached to it, it's assumed the inode
+ * already has fsverity enabled. Read fsverity descriptor and creates verity
+ * based on that.
+ *
+ * This needs to be called at least once before any of the inode's data
+ * can be verified (and thus read at all) or the inode's fsverity digest
+ * retrieved.  fsverity_file_open() calls this already, which handles
+ * normal file accesses.  If a filesystem does any internal (i.e. not
+ * associated with a file descriptor) reads of the file's data or
+ * fsverity digest, it must call this explicitly before doing so.
+ *
+ * Return: 0 on success, -errno on failure
+ */
+int fsverity_ensure_verity_info(struct inode *inode)
 {
 	struct fsverity_info *vi = fsverity_get_info(inode), *found;
 	struct fsverity_descriptor *desc;
@@ -380,12 +397,13 @@ static int ensure_verity_info(struct inode *inode)
 	kfree(desc);
 	return err;
 }
+EXPORT_SYMBOL_GPL(fsverity_ensure_verity_info);
 
 int __fsverity_file_open(struct inode *inode, struct file *filp)
 {
 	if (filp->f_mode & FMODE_WRITE)
 		return -EPERM;
-	return ensure_verity_info(inode);
+	return fsverity_ensure_verity_info(inode);
 }
 EXPORT_SYMBOL_GPL(__fsverity_file_open);
 
diff --git a/include/linux/fsverity.h b/include/linux/fsverity.h
index a8f9aa75b792..5562271bd628 100644
--- a/include/linux/fsverity.h
+++ b/include/linux/fsverity.h
@@ -309,6 +309,8 @@ static inline int fsverity_file_open(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+int fsverity_ensure_verity_info(struct inode *inode);
+
 void fsverity_cleanup_inode(struct inode *inode);
 
 struct page *generic_read_merkle_tree_page(struct inode *inode, pgoff_t index);
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 01/22] fsverity: report validation errors through fserror to fsnotify
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-1-aalbersh@kernel.org>

Reported verification errors to fsnotify through recently added fserror
interface.

Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
---
 fs/verity/verify.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 4004a1d42875..db8c350234bb 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -9,6 +9,7 @@
 
 #include <linux/bio.h>
 #include <linux/export.h>
+#include <linux/fserror.h>
 
 #define FS_VERITY_MAX_PENDING_BLOCKS 2
 
@@ -205,6 +206,8 @@ static bool verify_data_block(struct fsverity_info *vi,
 		if (memchr_inv(dblock->data, 0, params->block_size)) {
 			fsverity_err(inode,
 				     "FILE CORRUPTED!  Data past EOF is not zeroed");
+			fserror_report_data_lost(inode, data_pos,
+						 params->block_size, GFP_NOFS);
 			return false;
 		}
 		return true;
@@ -312,6 +315,7 @@ static bool verify_data_block(struct fsverity_info *vi,
 		data_pos, level - 1, params->hash_alg->name, hsize, want_hash,
 		params->hash_alg->name, hsize,
 		level == 0 ? dblock->real_hash : real_hash);
+	fserror_report_data_lost(inode, data_pos, params->block_size, GFP_NOFS);
 error:
 	for (; level > 0; level--) {
 		kunmap_local(hblocks[level - 1].addr);
-- 
2.51.2


^ permalink raw reply related

* [PATCH v7 00/22] fs-verity support for XFS with post EOF merkle tree
From: Andrey Albershteyn @ 2026-04-09 13:13 UTC (permalink / raw)
  To: linux-xfs, fsverity, linux-fsdevel, ebiggers
  Cc: Andrey Albershteyn, hch, linux-ext4, linux-f2fs-devel,
	linux-btrfs, djwong, david

Hi all,

This patch series adds fs-verity support for XFS. This version stores
merkle tree beyond end of the file, the same way as ext4 does it. The
difference is that verity descriptor is stored at the next aligned 64k
block after the merkle tree last block. This is done due to sparse
merkle tree which doesn't store hashes of zero data blocks.

The patchset starts with a few fs-verity preparation patches. Then, a
few patches to allow iomap to work in post EOF region. The XFS fs-verity
implementation follows.

The tree is read by iomap into page cache at offset of next largest
folio past end of file. The same offset is used for on-disk.

This patchsets also synthesizes merkle tree block full of hashes of
zeroed data blocks. This merkle blocks are not stored on disk, they are
holes in the tree.

Testing. The -g verity is passing for 1k, 8k and 4k with/without quota
on 4k and 64k page size systems. Tested -g quick for enabled/disabled
fsverity. Also, overlay/080 overlay/089 with XFS as base.

This series based on v7.0-rc6 with patchset fs generated integrity
information [1] applied.

kernel:
https://git.kernel.org/pub/scm/linux/kernel/git/aalbersh/xfs-linux.git/log/?h=b4/fsverity

xfsprogs:
https://github.com/alberand/xfsprogs/tree/b4/fsverity

xfstests:
https://github.com/alberand/xfstests/tree/b4/fsverity

Cc: fsverity@lists.linux.dev
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-xfs@vger.kernel.org

Cc: david@fromorbit.com
Cc: djwong@kernel.org
Cc: ebiggers@kernel.org
Cc: hch@lst.de

[1]: https://lore.kernel.org/linux-xfs/20260223132021.292832-1-hch@lst.de/

---
Changes in v7:
- Move kerneldoc to fsverity_ensure_verity_info() definition
- Drop patch adding XFS traces
- Fix overly long line in the comment
- Make order of fserror and fsverity_error consistent
- Add overlay patch converting to fsverity_ensure_verity_info()
Changes in v6:
- Removed stub for fsverity_ensure_verity_info() as it's optimized out
- Rename fsverity_folio_zero_hash() to fsverify_fill_zerohash()
- Merge patches 8 to 10 into one
- Merge patch gerating zero_hash and fsverity_fill_zerohash() into one
- Add kerneldoc to fsverity_ensure_verity_info()
- Add comments to iomap_block_needs_zeroing()
Changes in v5:
- Add fserror_report_data_lost() for data blocks in page spanning EOF
- Issue fsverity metadata readahead in data readahead
- iomap_fsverity_write() return type fix
- Use of S_ISREG(mode)
- Make 65536 #define instead of open-coded
- Use transaction per unwritten extent removal
- Fetch fsverity_info for all fsverity metadata
- Revert fsverity_folio_zero_hash() stub as used in iomap
- Extend cancel_unwritten to whole file range to remove cow leftovers
- Drop delayed allocation on the COW fork on fsverity completion
Changes in v4:
- Use fserror interface in fsverity instead of fs callback
- Hoist pagecache_read from f2fs/ext4 to fsverity
- Refactor iomap code
- Fetch fsverity_info only for file data and merkle tree holes
- Do not disable preallocation, remove unwritten extents instead
- Offload fsverity hash I/O to fsverity workqueue in read path
- Store merkle tree at round_up(i_size, 64k)
- Add a spacing between merkle tree and fsverity descriptor as next 64k
  aligned block
- Squash helpers into first user commits
- Squash on-disk format changes into single commit
- Drop different offset for pagecache/on-disk
- Don't zero out pages in higher order folios in write path
- Link to v3: https://lore.kernel.org/fsverity/20260217231937.1183679-1-aalbersh@kernel.org/T/#t
Changes in v3:
- Different on-disk and pagecache offset
- Use read path ioends
- Switch to hashtable fsverity info
- Synthesize merkle tree blocks full of zeroes
- Other minor refactors
- Link to v2: https://lore.kernel.org/fsverity/20260114164210.GO15583@frogsfrogsfrogs/T/#t
Changes in v2:
- Move to VFS interface for merkle tree block reading
- Drop patchset for per filesystem workqueues
- Change how offsets of the descriptor and tree metadata is calculated
- Store fs-verity descriptor in data fork side by side with merkle tree
- Simplify iomap changes, remove interface for post eof read/write
- Get rid of extended attribute implementation
- Link to v1: https://lore.kernel.org/r/20250728-fsverity-v1-0-9e5443af0e34@kernel.org

Andrey Albershteyn (20):
  fsverity: report validation errors through fserror to fsnotify
  fsverity: expose ensure_fsverity_info()
  ovl: use core fsverity ensure info interface
  fsverity: generate and store zero-block hash
  fsverity: pass digest size and hash of the all-zeroes block to ->write
  fsverity: hoist pagecache_read from f2fs/ext4 to fsverity
  iomap: introduce IOMAP_F_FSVERITY and teach writeback to handle
    fsverity
  iomap: teach iomap to read files with fsverity
  iomap: introduce iomap_fsverity_write() for writing fsverity metadata
  xfs: introduce fsverity on-disk changes
  xfs: initialize fs-verity on file open
  xfs: don't allow to enable DAX on fs-verity sealed inode
  xfs: disable direct read path for fs-verity files
  xfs: handle fsverity I/O in write/read path
  xfs: use read ioend for fsverity data verification
  xfs: add fs-verity support
  xfs: remove unwritten extents after preallocations in fsverity
    metadata
  xfs: add fs-verity ioctls
  xfs: introduce health state for corrupted fsverity metadata
  xfs: enable ro-compat fs-verity flag

Darrick J. Wong (2):
  xfs: advertise fs-verity being available on filesystem
  xfs: check and repair the verity inode flag state

 fs/btrfs/verity.c              |   6 +-
 fs/ext4/verity.c               |  36 +--
 fs/f2fs/verity.c               |  34 +--
 fs/iomap/buffered-io.c         | 109 +++++++-
 fs/iomap/trace.h               |   3 +-
 fs/overlayfs/util.c            |  14 +-
 fs/verity/enable.c             |   4 +-
 fs/verity/fsverity_private.h   |   3 +
 fs/verity/measure.c            |   4 +-
 fs/verity/open.c               |  25 +-
 fs/verity/pagecache.c          |  55 ++++
 fs/verity/verify.c             |   4 +
 fs/xfs/Makefile                |   1 +
 fs/xfs/libxfs/xfs_bmap.c       |   7 +
 fs/xfs/libxfs/xfs_format.h     |  35 ++-
 fs/xfs/libxfs/xfs_fs.h         |   2 +
 fs/xfs/libxfs/xfs_health.h     |   4 +-
 fs/xfs/libxfs/xfs_inode_buf.c  |   8 +
 fs/xfs/libxfs/xfs_inode_util.c |   2 +
 fs/xfs/libxfs/xfs_sb.c         |   4 +
 fs/xfs/scrub/attr.c            |   7 +
 fs/xfs/scrub/common.c          |  53 ++++
 fs/xfs/scrub/common.h          |   2 +
 fs/xfs/scrub/inode.c           |   7 +
 fs/xfs/scrub/inode_repair.c    |  36 +++
 fs/xfs/xfs_aops.c              |  62 ++++-
 fs/xfs/xfs_bmap_util.c         |   8 +
 fs/xfs/xfs_file.c              |  19 +-
 fs/xfs/xfs_fsverity.c          | 455 +++++++++++++++++++++++++++++++++
 fs/xfs/xfs_fsverity.h          |  28 ++
 fs/xfs/xfs_health.c            |   1 +
 fs/xfs/xfs_inode.h             |   6 +
 fs/xfs/xfs_ioctl.c             |  14 +
 fs/xfs/xfs_iomap.c             |  15 +-
 fs/xfs/xfs_iops.c              |   4 +
 fs/xfs/xfs_message.c           |   4 +
 fs/xfs/xfs_message.h           |   1 +
 fs/xfs/xfs_mount.h             |   4 +
 fs/xfs/xfs_super.c             |   7 +
 include/linux/fsverity.h       |  18 +-
 include/linux/iomap.h          |  13 +
 41 files changed, 1017 insertions(+), 107 deletions(-)
 create mode 100644 fs/xfs/xfs_fsverity.c
 create mode 100644 fs/xfs/xfs_fsverity.h

-- 
2.51.2


^ permalink raw reply

* Re: BUG: net-next (7.0-rc6 based and later) fails to boot on Jetson Xavier NX
From: Will Deacon @ 2026-04-09 12:24 UTC (permalink / raw)
  To: Russell King (Oracle)
  Cc: Robin Murphy, netdev, linux-arm-kernel, linux-kernel, iommu,
	linux-ext4, Linus Torvalds, dmaengine, Marek Szyprowski,
	Theodore Ts'o, Andreas Dilger, Vinod Koul, Frank Li
In-Reply-To: <adayAMR_dEA6W5vW@shell.armlinux.org.uk>

On Wed, Apr 08, 2026 at 08:52:32PM +0100, Russell King (Oracle) wrote:
> What's the status on the iommu fix? Is it merged into mainline yet?
> If it isn't already, that means net-next remains unbootable going
> into the merge window without manually carrying the fix locally.

I'll pick it up for 7.0 in the iommu tree.

Will

^ permalink raw reply

* [PATCH v2] jbd2: fix deadlock in jbd2_journal_cancel_revoke()
From: Zhang Yi @ 2026-04-09 11:42 UTC (permalink / raw)
  To: linux-ext4
  Cc: linux-fsdevel, linux-kernel, dave, tytso, adilger.kernel, jack,
	ojaswin, ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai

From: Zhang Yi <yi.zhang@huawei.com>

Commit f76d4c28a46a ("fs/jbd2: use sleeping version of
__find_get_block()") changed jbd2_journal_cancel_revoke() to use
__find_get_block_nonatomic() which holds the folio lock instead of
i_private_lock. This breaks the lock ordering (folio -> buffer) and
causes an ABBA deadlock when the filesystem blocksize < pagesize:

     T1                                T2
ext4_mkdir()
 ext4_init_new_dir()
  ext4_append()
   ext4_getblk()
    lock_buffer()    <- A
                                   sync_blockdev()
                                    blkdev_writepages()
                                     writeback_iter()
                                      writeback_get_folio()
                                       folio_lock()   <- B
     ext4_journal_get_create_access()
      jbd2_journal_cancel_revoke()
       __find_get_block_nonatomic()
        folio_lock()  <- B
                                     block_write_full_folio()
                                      lock_buffer()   <- A

This can occasionally cause generic/013 to hang.

Fix by only calling __find_get_block_nonatomic() when the passed
buffer_head doesn't belong to the bdev, which is the only case that we
need to look up its bdev alias. Otherwise, the lookup is redundant since
the found buffer_head is equal to the one we passed in.

Fixes: f76d4c28a46a ("fs/jbd2: use sleeping version of __find_get_block()")
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
v1->v2:
 - Switch to using sb_is_blkdev_sb() to check whether the bh belongs to
   the bdev.

 fs/jbd2/revoke.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 9016ddb82447..e4c2fbd381f1 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -428,6 +428,7 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	journal_t *journal = handle->h_transaction->t_journal;
 	int need_cancel;
 	struct buffer_head *bh = jh2bh(jh);
+	struct address_space *bh_mapping = bh->b_folio->mapping;
 
 	jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh);
 
@@ -464,13 +465,14 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	 * buffer_head?  If so, we'd better make sure we clear the
 	 * revoked status on any hashed alias too, otherwise the revoke
 	 * state machine will get very upset later on. */
-	if (need_cancel) {
+	if (need_cancel && !sb_is_blkdev_sb(bh_mapping->host->i_sb)) {
 		struct buffer_head *bh2;
+
 		bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
 						 bh->b_size);
 		if (bh2) {
-			if (bh2 != bh)
-				clear_buffer_revoked(bh2);
+			WARN_ON_ONCE(bh2 == bh);
+			clear_buffer_revoked(bh2);
 			__brelse(bh2);
 		}
 	}
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH v2 3/3] ext4: derive f_fsid from block device to avoid collisions
From: Anand Jain @ 2026-04-09  9:45 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Christoph Hellwig, Darrick J. Wong, linux-ext4, linux-btrfs,
	linux-xfs, Anand Jain, dsterba
In-Reply-To: <20260409041035.GC99725@macsyma-wired.lan>

> So if the two file systems are identical, the (f_fsid, ino) will
> uniquely determines a file.  And that's *fine* if f_fsid is the same
> for statfs(A) and statfs(B).

Got it. Do you mean that since both filesystems are identical,
statfs(A) and statfs(B) can legitimately return the same values?

And if users want them to diverge, the expectation is that they
should update the UUID (via tune2fs), after which f_fsid would
also differ. If the UUID isn't changed, then the ambiguity is
effectively a user responsibility.

I'm not entirely sure what the correct expectation for f_fsid
should be.

My initial idea was to make f_fsid behavior consistent across
major filesystems so that user space benefits from predictable
semantics.

Currently, XFS and Btrfs provide f_fsid values that are unique
per filesystem, including clones, while ext4 provides uniqueness
only if the UUID is changed by the user.

That said, it may be fine for ext4 to retain this behavior,
since statfs(2) itself has some inherent ambiguity.

If there are no further concerns, I'll update the submitted
test case generic/791 accordingly.

Thanks, Anand

^ permalink raw reply

* Re: [PATCH v2 3/3] ext4: derive f_fsid from block device to avoid collisions
From: Theodore Tso @ 2026-04-09  4:10 UTC (permalink / raw)
  To: Anand Jain
  Cc: Christoph Hellwig, Darrick J. Wong, linux-ext4, linux-btrfs,
	linux-xfs, Anand Jain
In-Reply-To: <3c9e478a-42ef-446f-a8cc-1b4ac970d2ef@gmail.com>

On Thu, Apr 09, 2026 at 06:28:32AM +0800, Anand Jain wrote:
> Some A/B testing use cases require the filesystem to remain
> byte-for-byte identical. In those scenarios, changing the UUID
> isn't an option.

But in that case, where the file systems A and B are bit-fot-bit
identical, why do we care whether statfs returns different fsid's for
statfs(A) and statfs(B).  After all, f_fsid is only defined as "File
system ID", and there is no other definition.  Which is why I say that
people *really* shouldn't depending on its semantics, because it's not
well defined.

Quoting from the statfs(2) man page:

    Nobody knows what f_fsid is supposed to contain (but see below).

    ...

    The general idea is that f_fsid contains some random stuff such
    that the pair (f_fsid,ino) uniquely determines a file. Some
    operating systems use (a variation on) the device number, or the
    device number combined with the filesystem type. Several operating
    systems restrict giving out the f_fsid field to the superuser only
    (and zero it for unprivileged users), because this field is used
    in the filehandle of the filesystem when NFS-exported, and giving
    it out is a security concern.

So if the two file systems are identical, the (f_fsid, ino) will
uniquely determines a file.  And that's *fine* if f_fsid is the same
for statfs(A) and statfs(B).

No?

					- Ted

^ permalink raw reply

* Re: [PATCH 0/3] show orphan file inode detail info
From: Theodore Tso @ 2026-04-09  2:51 UTC (permalink / raw)
  To: Jan Kara; +Cc: Ye Bin, adilger.kernel, linux-ext4, linux-fsdevel
In-Reply-To: <fdil2jz7kizzrlew7airdlpvnz7qyzigigaw3huktm66455wuh@yyqhacmxe4qk>

On Wed, Apr 08, 2026 at 03:36:10PM +0200, Jan Kara wrote:
> > And inodes on the orphan inode list have been unlinked, so we don't want
> > to allow people to be able to open them.
> 
> Why? You can reopen unlinked files using magic links in proc or file
> handles just fine today (just tested this if I'm not missing anything in
> the code). Only once the inode is really deleted you cannot open using the
> handle anymore.

You're right.  I don't know why I thought that would fail, but it does
indeed work.  I think I was thinking of linkat(2) with AT_EMPTY_FLAG.

So yeah, your suggestion of a synthetic fd that returns a list of file
handles sounds good to me.

					- Ted

^ permalink raw reply

* Re: [PATCH] jbd2: fix deadlock in jbd2_journal_cancel_revoke()
From: Zhang Yi @ 2026-04-09  2:45 UTC (permalink / raw)
  To: linux-ext4
  Cc: linux-fsdevel, linux-kernel, dave, tytso, adilger.kernel, jack,
	ojaswin, ritesh.list, libaokun, yi.zhang, yizhang089, yangerkun,
	yukuai
In-Reply-To: <20260409010730.5508-1-yi.zhang@huaweicloud.com>

Please ignore this patch; this solution does not correct.

Best Regards.
Yi.

On 4/9/2026 9:07 AM, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Commit f76d4c28a46a ("fs/jbd2: use sleeping version of
> __find_get_block()") changed jbd2_journal_cancel_revoke() to use
> __find_get_block_nonatomic() which holds the folio lock instead of
> i_private_lock. This breaks the lock ordering (folio -> buffer) and
> causes an ABBA deadlock when the filesystem blocksize < pagesize:
> 
>      T1                                T2
> ext4_mkdir()
>  ext4_init_new_dir()
>   ext4_append()
>    ext4_getblk()
>     lock_buffer()    <- A
>                                    sync_blockdev()
>                                     blkdev_writepages()
>                                      writeback_iter()
>                                       writeback_get_folio()
>                                        folio_lock()   <- B
>      ext4_journal_get_create_access()
>       jbd2_journal_cancel_revoke()
>        __find_get_block_nonatomic()
>         folio_lock()  <- B
>                                      block_write_full_folio()
>                                       lock_buffer()   <- A
> 
> This can occasionally cause generic/013 to hang.
> 
> Fix by only calling __find_get_block_nonatomic() when the passed
> buffer_head doesn't belong to the bdev (i.e., !bh->b_bdev), which is the
> only case that we need to look up its bdev alias. Otherwise, the lookup
> is redundant since the found buffer_head is equal to the one we passed
> in.
> 
> Fixes: f76d4c28a46a ("fs/jbd2: use sleeping version of __find_get_block()")
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/jbd2/revoke.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
> index 9016ddb82447..8b52d40c27c9 100644
> --- a/fs/jbd2/revoke.c
> +++ b/fs/jbd2/revoke.c
> @@ -464,13 +464,14 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
>  	 * buffer_head?  If so, we'd better make sure we clear the
>  	 * revoked status on any hashed alias too, otherwise the revoke
>  	 * state machine will get very upset later on. */
> -	if (need_cancel) {
> +	if (need_cancel && !bh->b_bdev) {
>  		struct buffer_head *bh2;
> +
>  		bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
>  						 bh->b_size);
>  		if (bh2) {
> -			if (bh2 != bh)
> -				clear_buffer_revoked(bh2);
> +			WARN_ON_ONCE(bh2 == bh);
> +			clear_buffer_revoked(bh2);
>  			__brelse(bh2);
>  		}
>  	}


^ permalink raw reply

* [PATCH] jbd2: fix deadlock in jbd2_journal_cancel_revoke()
From: Zhang Yi @ 2026-04-09  1:07 UTC (permalink / raw)
  To: linux-ext4
  Cc: linux-fsdevel, linux-kernel, dave, tytso, adilger.kernel, jack,
	ojaswin, ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai

From: Zhang Yi <yi.zhang@huawei.com>

Commit f76d4c28a46a ("fs/jbd2: use sleeping version of
__find_get_block()") changed jbd2_journal_cancel_revoke() to use
__find_get_block_nonatomic() which holds the folio lock instead of
i_private_lock. This breaks the lock ordering (folio -> buffer) and
causes an ABBA deadlock when the filesystem blocksize < pagesize:

     T1                                T2
ext4_mkdir()
 ext4_init_new_dir()
  ext4_append()
   ext4_getblk()
    lock_buffer()    <- A
                                   sync_blockdev()
                                    blkdev_writepages()
                                     writeback_iter()
                                      writeback_get_folio()
                                       folio_lock()   <- B
     ext4_journal_get_create_access()
      jbd2_journal_cancel_revoke()
       __find_get_block_nonatomic()
        folio_lock()  <- B
                                     block_write_full_folio()
                                      lock_buffer()   <- A

This can occasionally cause generic/013 to hang.

Fix by only calling __find_get_block_nonatomic() when the passed
buffer_head doesn't belong to the bdev (i.e., !bh->b_bdev), which is the
only case that we need to look up its bdev alias. Otherwise, the lookup
is redundant since the found buffer_head is equal to the one we passed
in.

Fixes: f76d4c28a46a ("fs/jbd2: use sleeping version of __find_get_block()")
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/jbd2/revoke.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 9016ddb82447..8b52d40c27c9 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -464,13 +464,14 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	 * buffer_head?  If so, we'd better make sure we clear the
 	 * revoked status on any hashed alias too, otherwise the revoke
 	 * state machine will get very upset later on. */
-	if (need_cancel) {
+	if (need_cancel && !bh->b_bdev) {
 		struct buffer_head *bh2;
+
 		bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr,
 						 bh->b_size);
 		if (bh2) {
-			if (bh2 != bh)
-				clear_buffer_revoked(bh2);
+			WARN_ON_ONCE(bh2 == bh);
+			clear_buffer_revoked(bh2);
 			__brelse(bh2);
 		}
 	}
-- 
2.52.0


^ permalink raw reply related

* Re: [RFC PATCH v1 0/6] provenance_time (ptime): a new settable timestamp for cross-filesystem provenance
From: Sean Smith @ 2026-04-09  0:15 UTC (permalink / raw)
  To: Theodore Tso
  Cc: linux-fsdevel, linux-ext4, linux-btrfs, dsterba, david, brauner,
	osandov, hirofumi, linkinjeon
In-Reply-To: <20260408133324.GA18443@macsyma-wired.lan>

On 4/8/2026 08:33, Theodore Tso wrote:
> AI models which don't consider secord order effects (or used by people 
> who have different priorities than others) can result in...
> suboptimal results.>
> And this is why it's important why it's important to have humans in
> the loop instead of blindly trusting AI models.
> 
> I'd also suggest that you consider the value of patience.  Linux is 35
> years old as of this writing.  One of the reasons why Linux has stuck
> around this long is because we take the long view.  Sure, it might
> take longer to shift the ecosystem to use some new interface or new
> feature; but everyone will have to live with muddled interface
> semantics *forever*.

> MacOS doesn't have this Windows-style timestamp support, 
> and it hasn't stopped many users from switching from Windows 
> to MacOS.

I think this thread demonstrates the value of humans in the
loop. Despite my efforts to be careful, my earlier claim that
"Windows, macOS, and SMB have supported a settable creation
timestamp for decades — Linux is the outlier" was wrong. macOS
doesn't preserve date created on copy either. Windows is now
the outlier. Working with AI, it feels like drowning in
information which no single individual has the time or ability
to fully verify. Your years of experience caught what I and
agents did not.

From the sound of it, the issues with rename() will block an
upstream merge of a kernel-based ptime. I can maintain my
kernel patch on GitHub for those that want it. The file_attr
design work that came out of Darrick Wong's engagement was
productive and worth doing despite this outcome.

I see long-term value in our discussion. As agents become more
involved in projects, this LKML thread will establish ptime
as a solution to this specific problem. Someone migrating from
Windows to Linux who asks their agents how to preserve
date created are going to find this thread.

The tools I used to build my kernel patch didn't exist six
months ago. My AI harness is just as much a dependency, and
it didn't exist three months ago. My harness undergoes rapid
development and increases in capabilities, even without newer,
smarter models releasing every few months.

I think there's a real risk of a negative feedback loop. As
implementation costs drop, people who could contribute to
upstream will find it easier to build and maintain a custom
stack rather than try to get changes through a multi-year
review process. I think experienced developers need to lead
a shift in how contributions are reviewed and organized. I
think the key challenge for maintainers is figuring out how
to work with a new class of contributor, and direct them so
they work in a coordinated manner.

Thanks for taking the time to engage with me.

- Sean

^ permalink raw reply

* Re: [PATCH v2 3/3] ext4: derive f_fsid from block device to avoid collisions
From: Anand Jain @ 2026-04-08 22:28 UTC (permalink / raw)
  To: Theodore Tso, Christoph Hellwig
  Cc: Darrick J. Wong, linux-ext4, linux-btrfs, linux-xfs, Anand Jain
In-Reply-To: <20260407144709.GA81690@macsyma-wired.lan>

On 7/4/26 22:47, Theodore Tso wrote:
> On Mon, Apr 06, 2026 at 10:22:16PM -0700, Christoph Hellwig wrote:
>>> Dilemma:
>>> While statfs(2) [1] suggests f_fsid is "some random stuff," we know
>>> userspace (NFS, systemd) often treats it as a persistent handle.
>>>
>>> Do you prefer one of the names above, or is there a more idiomatic ext4
>>> naming convention I should follow?
>>>
>>
>> My take is that anything that should persist should be an on-disk
>> feature flag, not a mount option.  But I'm not in charge for ext4
> 
> My take is that f_fsid is random stuff, as documented by the
> specification, so anyone who tries to depend on it needs to be kept in
> a padding room where they can't hurt themselves or their users.
> 
> And as far as NFS is concerned, file handles should be based on
> the super block UUID, not statfs's f_fsid, and anyone who wants to
> mount a snapshot as an NFS exported file system at the same time that
> the original file system is mounted is _also_ should be gently coaxed
> into a padding room where they can't hurt themselves or their users.

> The solution that we've used for people who are cloning block devices
> for things like cloud images has been for *years* has been to use
> "tune2fs -U random /dev/sda1".  And this works on mounted file system,
> and (for example) built into various cloud images for Google Cloud
> Engine.

Ted,

Thanks for the feedback.
Some A/B testing use cases require the filesystem to remain
byte-for-byte identical. In those scenarios, changing the UUID
isn't an option. This was discussed on the mailing list a year
or two ago (I don't have the link handy), and is similar to [1].

  [1] https://source.android.com/docs/core/ota/ab/ab_implement.

> If we want to change statfs's f_fsid, from one set of "Random stuff"
> to another set of "Random stuff", I don't really mind, but I don't
> think it's worth *either* a mount option, *or* a feature flag, as
> either would be confusing for system adminsitrators when some file
> systems behave one way, and other file systems behave another.

I agree that a new mount option or flag is too heavy, and I wasn't
a fan of that approach either. Let's drop the ext4 patch for now.
The `f_fsid` collision in cloned ext4 filesystems is currently
only theoretical; we can revisit this if it becomes necessary.

Thanks, Anand

^ permalink raw reply

* Re: [RFC v6 6/7] ext4: fast commit: add lock_updates tracepoint
From: Steven Rostedt @ 2026-04-08 20:04 UTC (permalink / raw)
  To: Li Chen
  Cc: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-kernel, linux-trace-kernel
In-Reply-To: <20260408112020.716706-7-me@linux.beauty>

On Wed,  8 Apr 2026 19:20:17 +0800
Li Chen <me@linux.beauty> wrote:

> Commit-time fast commit snapshots run under jbd2_journal_lock_updates(),
> so it is useful to quantify the time spent with updates locked and to
> understand why snapshotting can fail.
> 
> Add a new tracepoint, ext4_fc_lock_updates, reporting the time spent in
> the updates-locked window along with the number of snapshotted inodes
> and ranges. Record the first snapshot failure reason in a stable snap_err
> field for tooling.
> 

[..]

> @@ -1338,13 +1375,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  	if (ret)
>  		return ret;
>  
> -
>  	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
>  	if (ret)
>  		return ret;
>  
>  	/* Step 4: Mark all inodes as being committed. */
>  	jbd2_journal_lock_updates(journal);
> +	lock_start = ktime_get();

ktime_get() is rather quick but if you care about micro-optimizations, you
could have:

	if (trace_ext4_fc_lock_updates_enabled())
		lock_start = ktime_get();
	else	
		lock_start = 0;

>  	/*
>  	 * The journal is now locked. No more handles can start and all the
>  	 * previous handles are now drained. Snapshotting happens in this
> @@ -1358,8 +1395,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  	}
>  	ext4_fc_unlock(sb, alloc_ctx);
>  
> -	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
> +	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
> +				      &snap_inodes, &snap_ranges, &snap_err);
>  	jbd2_journal_unlock_updates(journal);
> +	if (trace_ext4_fc_lock_updates_enabled()) {

	if (trace_ext4_fc_lock_updates_enabled() && lock_start) {

But feel free to ignore this if the overhead of always calling ktime_get()
is not an issue.


> +		locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
> +		trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns,
> +					   snap_inodes, snap_ranges, ret,
> +					   snap_err);
> +	}
>  	kvfree(inodes);
>  	if (ret)
>  		return ret;
> @@ -1564,7 +1608,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
>  		journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
>  	set_task_ioprio(current, journal_ioprio);
>  	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
> -	ret = ext4_fc_perform_commit(journal);
> +	ret = ext4_fc_perform_commit(journal, commit_tid);
>  	if (ret < 0) {
>  		if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
>  			status = EXT4_FC_STATUS_INELIGIBLE;
> diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
> index f493642cf121..7028a28316fa 100644
> --- a/include/trace/events/ext4.h
> +++ b/include/trace/events/ext4.h
> @@ -107,6 +107,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
>  TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
>  TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
>  
> +#undef EM
> +#undef EMe
> +#define EM(a)	TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
> +#define EMe(a)	TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
> +
> +#define TRACE_SNAP_ERR						\
> +	EM(NONE)						\
> +	EM(ES_MISS)						\
> +	EM(ES_DELAYED)						\
> +	EM(ES_OTHER)						\
> +	EM(INODES_CAP)						\
> +	EM(RANGES_CAP)						\
> +	EM(NOMEM)						\
> +	EMe(INODE_LOC)
> +
> +TRACE_SNAP_ERR
> +
> +#undef EM
> +#undef EMe
> +
>  #define show_fc_reason(reason)						\
>  	__print_symbolic(reason,					\
>  		{ EXT4_FC_REASON_XATTR,		"XATTR"},		\
> @@ -2818,6 +2838,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
>  		  __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
>  );
>  
> +#define EM(a)	{ EXT4_FC_SNAP_ERR_##a, #a },
> +#define EMe(a)	{ EXT4_FC_SNAP_ERR_##a, #a }
> +
> +TRACE_EVENT(ext4_fc_lock_updates,
> +	    TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
> +		     unsigned int nr_inodes, unsigned int nr_ranges, int err,
> +		     int snap_err),
> +
> +	TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
> +
> +	TP_STRUCT__entry(/* entry */
> +		__field(dev_t, dev)
> +		__field(tid_t, tid)
> +		__field(u64, locked_ns)
> +		__field(unsigned int, nr_inodes)
> +		__field(unsigned int, nr_ranges)
> +		__field(int, err)
> +		__field(int, snap_err)
> +	),
> +
> +	TP_fast_assign(/* assign */
> +		__entry->dev = sb->s_dev;
> +		__entry->tid = commit_tid;
> +		__entry->locked_ns = locked_ns;
> +		__entry->nr_inodes = nr_inodes;
> +		__entry->nr_ranges = nr_ranges;
> +		__entry->err = err;
> +		__entry->snap_err = snap_err;
> +	),
> +
> +	TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
> +		  __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
> +		  __entry->err, __print_symbolic(__entry->snap_err,
> +						 TRACE_SNAP_ERR))
> +);
> +
> +#undef EM
> +#undef EMe
> +#undef TRACE_SNAP_ERR
> +
>  #define FC_REASON_NAME_STAT(reason)					\
>  	show_fc_reason(reason),						\
>  	__entry->fc_ineligible_rc[reason]

As for the rest:

Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>

[ Please add this reviewed-by to any new versions so I remember I already
  looked at it. ]

Thanks,

-- Steve

^ permalink raw reply

* Re: BUG: net-next (7.0-rc6 based and later) fails to boot on Jetson Xavier NX
From: Russell King (Oracle) @ 2026-04-08 19:52 UTC (permalink / raw)
  To: Robin Murphy
  Cc: netdev, linux-arm-kernel, linux-kernel, iommu, linux-ext4,
	Linus Torvalds, dmaengine, Marek Szyprowski, Theodore Ts'o,
	Andreas Dilger, Vinod Koul, Frank Li
In-Reply-To: <3a1d0520-3402-47b2-9d7b-4e14a3cd07a4@arm.com>

On Wed, Apr 08, 2026 at 05:40:48PM +0100, Robin Murphy wrote:
> On 2026-04-08 5:16 pm, Russell King (Oracle) wrote:
> > On Wed, Apr 08, 2026 at 05:08:34PM +0100, Russell King (Oracle) wrote:
> > > The rebase is still progressing, but it's landed on:
> > > 
> > > c7d812e33f3e dmaengine: xilinx: xilinx_dma: Fix unmasked residue subtraction
> 
> FWIW I don't see a Tegra having the Xilinx IP in it anyway - judging by the
> DT it has their own tegra-gpcdma engine...
> 
> There's a fair chance this could be 90c5def10bea ("iommu: Do not call
> drivers for empty gathers"), which JonH also reported causing boot issues on
> Tegras - in short, SMMU TLB maintenance may not be completed properly which
> could lead to recycled DMA addresses causing exactly this kind of random
> memory corruption. I CC'd you on a patch:
> 
> https://lore.kernel.org/linux-iommu/20260408162846.GE3357077@nvidia.com/T/#t

Okay, bisect complete, and... no idea. It seems to suggest that 7.0-rc6
is actually fine - it ended up blaming Linus' tagging of 7.0-rc6 which
only changed the makefile. So, my assumption that because rc6 was merged
into net-next last Thursday which fails, net-next+rc7 fails, rc7 also
fails, that rc6 would also fail seems to be false.

Right, rc7 built with the same .config that rc6 was built with
definitely fails, this time with:

Root device found: PARTUUID=741c0777-391a-4bce-a222-455e180ece2a
depmod: ERROR: could not open directory /lib/modules/7.0.0-rc7-bisect: No such file or directory
depmod: FATAL: could not search modules: No such file or directory
usb 2-3: new SuperSpeed Plus Gen 2x1 USB device number 2 using tegra-xusb
hub 2-3:1.0: USB hub found
hub 2-3:1.0: 4 ports detected
usb 1-3: new full-speed USB device number 3 using tegra-xusb
EXT4-fs (mmcblk0p1): VFS: Can't find ext4 filesystem
mount: /mnt: wrong fs type, bad option, bad superblock on /dev/mmcblk0p1, missing codepage or helper program, or other error.
mount: /mnt/: can't find PARTUUID=741c0777-391a-4bce-a222-455e180ece2a.
get_swap_device: Bad swap file entry 1800c00008
get_swap_device: Bad swap file entry 1800c00008
get_swap_device: Bad swap file entry 1800c00008

So, it seems rc6 -> rc7 => fails
net-next with rc5 -> net-next with rc6 => fails

However, before I test anything else, I've just built the same rc7
which failed above with your patch applied - and that boots fine.

Now, each Thursday, net-next gets updated as that's the day that the
net tree gets sent for merging into mainline. This causes net-next's
version to increase. So something in current net-next plus in rc7 is
causing this problem.

The commit you claim needs fixing is:

$ git describe --contains 90c5def10bea
v7.0-rc7~29^2~2

which I had assumed wouldn't be in net-next.

Now, mainline had this on Thursday:

commit f8f5627a8aeab15183eef8930bf75ba88a51622f
Merge: 4c2c526b5adf ec7067e66119
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu Apr 2 09:57:06 2026 -0700

    Merge tag 'net-7.0-rc7' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net

commit 4c2c526b5adfb580bd95316bf179327d5ee26da8
Merge: 2ec9074b28a0 8b72aa5704c7
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Thu Apr 2 09:53:16 2026 -0700

    Merge tag 'iommu-fixes-v7.0-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux

and merging iommu-fixes-v7.0-rc6 introduced the buggy 90c5def10bea
commit into -rc7.

However, as soon as Linus merged net-7.0-rc7, netdev maintainers merged
that exact commit back into net-next:

commit 8ffb33d7709b59ff60560f48960a73bd8a55be95
Merge: 269389ba5398 f8f5627a8aea
Author: Jakub Kicinski <kuba@kernel.org>
Date:   Thu Apr 2 10:57:09 2026 -0700

    Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net

Thereby bringing in that buggy commit into net-next, but with net-next
identifying itself as 7.0-rc6.

That's... confusing, but explains why current net-next which reports
itself as 7.0-rc6 _and_ rc7 both fail, but rc6 itself does not. It
also means I've wasted an entire afternoon running a useless bisect
between rc5 and rc6 due to the version numbers in net-next being
meaningless.

What's the status on the iommu fix? Is it merged into mainline yet?
If it isn't already, that means net-next remains unbootable going
into the merge window without manually carrying the fix locally.

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 80Mbps down 10Mbps up. Decent connectivity at last!

^ permalink raw reply

* Re: BUG: net-next (7.0-rc6 based and later) fails to boot on Jetson Xavier NX
From: Robin Murphy @ 2026-04-08 16:40 UTC (permalink / raw)
  To: Russell King (Oracle), netdev, linux-arm-kernel, linux-kernel,
	iommu, linux-ext4, Linus Torvalds, dmaengine
  Cc: Marek Szyprowski, Theodore Ts'o, Andreas Dilger, Vinod Koul,
	Frank Li
In-Reply-To: <adZ_ZmjcE8S22vR1@shell.armlinux.org.uk>

On 2026-04-08 5:16 pm, Russell King (Oracle) wrote:
> On Wed, Apr 08, 2026 at 05:08:34PM +0100, Russell King (Oracle) wrote:
>> The rebase is still progressing, but it's landed on:
>>
>> c7d812e33f3e dmaengine: xilinx: xilinx_dma: Fix unmasked residue subtraction

FWIW I don't see a Tegra having the Xilinx IP in it anyway - judging by 
the DT it has their own tegra-gpcdma engine...

There's a fair chance this could be 90c5def10bea ("iommu: Do not call 
drivers for empty gathers"), which JonH also reported causing boot 
issues on Tegras - in short, SMMU TLB maintenance may not be completed 
properly which could lead to recycled DMA addresses causing exactly this 
kind of random memory corruption. I CC'd you on a patch:

https://lore.kernel.org/linux-iommu/20260408162846.GE3357077@nvidia.com/T/#t

Thanks,
Robin.

>>
>> and while this boots to a login prompt, it spat out a BUG():
>>
>> BUG: sleeping function called from invalid context at kernel/locking/mutex.c:591
>> in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 56, name: kworker/u24:3
>> preempt_count: 0, expected: 0
>> RCU nest depth: 0, expected: 0
>> 3 locks held by kworker/u24:3/56:
>>   #0: ffff000080042148 ((wq_completion)events_unbound#2){+.+.}-{0:0}, at: process_one_work+0x184/0x780
>>   #1: ffff80008299bdf8 (deferred_probe_work){+.+.}-{0:0}, at: process_one_work+0x1ac/0x780
>>   #2: ffff0000808b48f8 (&dev->mutex){....}-{4:4}, at: __device_attach+0x2c/0x188
>> irq event stamp: 10872
>> hardirqs last  enabled at (10871): [<ffff80008013a410>] ktime_get+0x130/0x180
>> hardirqs last disabled at (10872): [<ffff800080d61ac8>] _raw_spin_lock_irqsave+0x84/0x88
>> softirqs last  enabled at (9216): [<ffff80008002807c>] fpsimd_save_and_flush_current_state+0x3c/0x80
>> softirqs last disabled at (9214): [<ffff800080028098>] fpsimd_save_and_flush_current_state+0x58/0x80
>> CPU: 5 UID: 0 PID: 56 Comm: kworker/u24:3 Not tainted 7.0.0-rc1-bisect+ #654 PREEMPT
>> Hardware name: NVIDIA NVIDIA Jetson Xavier NX Developer Kit/Jetson, BIOS 6.0-37391689 08/28/2024
>> Workqueue: events_unbound deferred_probe_work_func
>> Call trace:
>>   show_stack+0x18/0x30 (C)
>>   dump_stack_lvl+0x6c/0x94
>>   dump_stack+0x18/0x24
>>   __might_resched+0x154/0x220
>>   __might_sleep+0x48/0x80
>>   __mutex_lock+0x48/0x800
>>   mutex_lock_nested+0x24/0x30
>>   pinmux_disable_setting+0x9c/0x180
>>   pinctrl_commit_state+0x5c/0x260
>>   pinctrl_pm_select_idle_state+0x4c/0xa0
>>   tegra_i2c_runtime_suspend+0x2c/0x3c
>>   pm_generic_runtime_suspend+0x2c/0x44
>>   __rpm_callback+0x48/0x1ec
>>   rpm_callback+0x74/0x80
>>   rpm_suspend+0xec/0x630
>>   rpm_idle+0x2c0/0x420
>>   __pm_runtime_idle+0x44/0x160
>>   tegra_i2c_probe+0x2e4/0x640
>>   platform_probe+0x5c/0xa4
>>   really_probe+0xbc/0x2c0
>>   __driver_probe_device+0x78/0x120
>>   driver_probe_device+0x3c/0x160
>>   __device_attach_driver+0xbc/0x160
>>   bus_for_each_drv+0x70/0xb8
>>   __device_attach+0xa4/0x188
>>   device_initial_probe+0x50/0x54
>>   bus_probe_device+0x38/0xa4
>>   deferred_probe_work_func+0x90/0xcc
>>   process_one_work+0x204/0x780
>>   worker_thread+0x1c8/0x36c
>>   kthread+0x138/0x144
>>   ret_from_fork+0x10/0x20
>>
>> This is reproducible.
> 
> I've just realised that it's the Tegra I2C bug that is already known
> about, but took ages to be fixed in mainline - it's unrelated to the
> memory corruption, so can be ignored. Sorry for the noise.
> 


^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox