Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* [PATCH 6/9] fat: Fix possibly missing inode write on fsync(2)
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara
In-Reply-To: <20260511115725.28441-1-jack@suse.cz>

Use mmb inode buffer writeout infrastructure to reliably write out
inode's buffer on fsync(2).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fat/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 28f78df086ef..4ca00b7a618b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -907,6 +907,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
 	}
 	spin_unlock(&sbi->inode_hash_lock);
 	mark_buffer_dirty(bh);
+	MSDOS_I(inode)->i_metadata_bhs.inode_blk = bh->b_blocknr;
 	err = 0;
 	if (wait)
 		err = sync_dirty_buffer(bh);
@@ -925,7 +926,7 @@ static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
 		err = fat_clusters_flush(sb);
 		mutex_unlock(&MSDOS_SB(sb)->s_lock);
 	} else
-		err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+		err = __fat_write_inode(inode, 0);
 
 	return err;
 }
-- 
2.51.0


^ permalink raw reply related

* [PATCH 4/9] ext2: Fix possibly missing inode write on fsync(2)
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara
In-Reply-To: <20260511115725.28441-1-jack@suse.cz>

Use mmb inode buffer writeout infrastructure to reliably write out
inode's inode table block on fsync(2).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 74aca5eb572d..6ce832da944f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1612,6 +1612,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 	} else for (n = 0; n < EXT2_N_BLOCKS; n++)
 		raw_inode->i_block[n] = ei->i_data[n];
 	mark_buffer_dirty(bh);
+	ei->i_metadata_bhs.inode_blk = bh->b_blocknr;
 	if (do_sync) {
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh)) {
@@ -1627,7 +1628,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 
 int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+	return __ext2_write_inode(inode, 0);
 }
 
 int ext2_getattr(struct mnt_idmap *idmap, const struct path *path,
-- 
2.51.0


^ permalink raw reply related

* [PATCH 2/9] ext4: Allocate mapping_metadata_bhs struct on demand
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara
In-Reply-To: <20260511115725.28441-1-jack@suse.cz>

Currently every ext4 inode gets mapping_metadata_bhs struct although it
is only needed when running without a journal and only for inodes where
any metadata was dirtied. Allocate mapping_metadata_bhs struct on demand
when dirtying the first metadata buffer for the inode.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h      |  2 +-
 fs/ext4/ext4_jbd2.c | 24 +++++++++++++++++++++---
 fs/ext4/fsync.c     | 12 ++++++++----
 fs/ext4/inode.c     |  9 +++++----
 fs/ext4/super.c     |  8 +++++---
 5 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..6bb29a20420f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1117,7 +1117,7 @@ struct ext4_inode_info {
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
 	struct jbd2_inode *jinode;
-	struct mapping_metadata_bhs i_metadata_bhs;
+	struct mapping_metadata_bhs *i_metadata_bhs;
 
 	/*
 	 * File creation time. Its function is same as that of
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 9a8c225f2753..74f05bd0cdde 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -350,6 +350,21 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line,
 	return 0;
 }
 
+static void ext4_inode_attach_mmb(struct inode *inode)
+{
+	struct mapping_metadata_bhs *mmb;
+
+	/*
+	 * It's difficult to handle failure when marking buffer dirty without
+	 * leaving filesystem corrupted
+	 */
+	mmb = kmalloc_obj(*mmb, GFP_KERNEL | __GFP_NOFAIL);
+	mmb_init(mmb, inode->i_mapping);
+	/* Someone swapped another mmb before us? */
+	if (cmpxchg(&EXT4_I(inode)->i_metadata_bhs, NULL, mmb))
+		kfree(mmb);
+}
+
 int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 				 handle_t *handle, struct inode *inode,
 				 struct buffer_head *bh)
@@ -389,11 +404,14 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 					 err);
 		}
 	} else {
-		if (inode)
+		if (inode) {
+			if (!EXT4_I(inode)->i_metadata_bhs)
+				ext4_inode_attach_mmb(inode);
 			mmb_mark_buffer_dirty(bh,
-					      &EXT4_I(inode)->i_metadata_bhs);
-		else
+					      EXT4_I(inode)->i_metadata_bhs);
+		} else {
 			mark_buffer_dirty(bh);
+		}
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 924726dcc85f..e25d365e1179 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -46,6 +46,7 @@
 static int ext4_sync_parent(struct inode *inode)
 {
 	struct dentry *dentry, *next;
+	struct mapping_metadata_bhs *mmb;
 	int ret = 0;
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
@@ -68,9 +69,12 @@ static int ext4_sync_parent(struct inode *inode)
 		 * through ext4_evict_inode()) and so we are safe to flush
 		 * metadata blocks and the inode.
 		 */
-		ret = mmb_sync(&EXT4_I(inode)->i_metadata_bhs);
-		if (ret)
-			break;
+		mmb = READ_ONCE(EXT4_I(inode)->i_metadata_bhs);
+		if (mmb) {
+			ret = mmb_sync(mmb);
+			if (ret)
+				break;
+		}
 		ret = sync_inode_metadata(inode, 1);
 		if (ret)
 			break;
@@ -89,7 +93,7 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
 	};
 	int ret;
 
-	ret = mmb_fsync_noflush(file, &EXT4_I(inode)->i_metadata_bhs,
+	ret = mmb_fsync_noflush(file, EXT4_I(inode)->i_metadata_bhs,
 				start, end, datasync);
 	if (ret)
 		return ret;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..3e66e9510909 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -195,9 +195,8 @@ void ext4_evict_inode(struct inode *inode)
 			ext4_warning_inode(inode, "data will be lost");
 
 		truncate_inode_pages_final(&inode->i_data);
-		/* Avoid mballoc special inode which has no proper iops */
-		if (!EXT4_SB(inode->i_sb)->s_journal)
-			mmb_sync(&EXT4_I(inode)->i_metadata_bhs);
+		if (EXT4_I(inode)->i_metadata_bhs)
+			mmb_sync(EXT4_I(inode)->i_metadata_bhs);
 		goto no_delete;
 	}
 
@@ -3451,6 +3450,7 @@ static bool ext4_release_folio(struct folio *folio, gfp_t wait)
 static bool ext4_inode_datasync_dirty(struct inode *inode)
 {
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	struct mapping_metadata_bhs *mmb;
 
 	if (journal) {
 		if (jbd2_transaction_committed(journal,
@@ -3461,8 +3461,9 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
 		return true;
 	}
 
+	mmb = READ_ONCE(EXT4_I(inode)->i_metadata_bhs);
 	/* Any metadata buffers to write? */
-	if (mmb_has_buffers(&EXT4_I(inode)->i_metadata_bhs))
+	if (mmb && mmb_has_buffers(mmb))
 		return true;
 	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..92134ea4620c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1430,7 +1430,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
 	ext4_fc_init_inode(&ei->vfs_inode);
 	spin_lock_init(&ei->i_fc_lock);
-	mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+	ei->i_metadata_bhs = NULL;
 	return &ei->vfs_inode;
 }
 
@@ -1527,8 +1527,10 @@ static void destroy_inodecache(void)
 void ext4_clear_inode(struct inode *inode)
 {
 	ext4_fc_del(inode);
-	if (!EXT4_SB(inode->i_sb)->s_journal)
-		mmb_invalidate(&EXT4_I(inode)->i_metadata_bhs);
+	if (EXT4_I(inode)->i_metadata_bhs) {
+		mmb_invalidate(EXT4_I(inode)->i_metadata_bhs);
+		kfree(EXT4_I(inode)->i_metadata_bhs);
+	}
 	clear_inode(inode);
 	ext4_discard_preallocations(inode);
 	/*
-- 
2.51.0


^ permalink raw reply related

* [PATCH 5/9] udf: Fix possibly missing inode write on fsync(2)
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara
In-Reply-To: <20260511115725.28441-1-jack@suse.cz>

Use mmb inode buffer writeout infrastructure to reliably write out
inode's block on fsync(2).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 67bcf83758c8..4102d3482319 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1707,7 +1707,7 @@ void udf_update_extra_perms(struct inode *inode, umode_t mode)
 
 int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+	return udf_update_inode(inode, 0);
 }
 
 static int udf_sync_inode(struct inode *inode)
@@ -1937,6 +1937,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 
 	/* write the data blocks */
 	mark_buffer_dirty(bh);
+	iinfo->i_metadata_bhs.inode_blk = bh->b_blocknr;
 	if (do_sync) {
 		sync_dirty_buffer(bh);
 		if (buffer_write_io_error(bh)) {
-- 
2.51.0


^ permalink raw reply related

* [PATCH 1/9] affs: Drop support for metadata bh tracking
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara, David Sterba
In-Reply-To: <20260511115725.28441-1-jack@suse.cz>

AFFS did all the hard work of tracking metadata bhs dirtied for an inode
but it actually never used this information as affs_file_fsync() just
calls sync_blockdev() to writeback all filesystem metadata bhs. After a
discussion with the AFFS maintainer, nobody cares about AFFS performance,
so let's keep this affs_file_fsync() behavior and just drop all the
pointless tracking from AFFS.

CC: David Sterba <dsterba@suse.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/affs/affs.h     |  1 -
 fs/affs/amigaffs.c | 12 ++++++------
 fs/affs/file.c     | 25 +++++++++++--------------
 fs/affs/inode.c    | 13 +++++--------
 fs/affs/namei.c    |  9 ++++-----
 fs/affs/super.c    |  1 -
 6 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a0caf6ace860..406a0ef63e7b 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -44,7 +44,6 @@ struct affs_inode_info {
 	struct mutex i_link_lock;		/* Protects internal inode access. */
 	struct mutex i_ext_lock;		/* Protects internal inode access. */
 #define i_hash_lock i_ext_lock
-	struct mapping_metadata_bhs i_metadata_bhs;
 	u32	 i_blkcnt;			/* block count */
 	u32	 i_extcnt;			/* extended block count */
 	u32	*i_lc;				/* linear cache of extended blocks */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index bed4fc805e8e..6cc0fc9a4cbf 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -57,7 +57,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
 		AFFS_TAIL(sb, dir_bh)->hash_chain = cpu_to_be32(ino);
 
 	affs_adjust_checksum(dir_bh, ino);
-	mmb_mark_buffer_dirty(dir_bh, &AFFS_I(dir)->i_metadata_bhs);
+	mark_buffer_dirty(dir_bh);
 	affs_brelse(dir_bh);
 
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -100,7 +100,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
 			else
 				AFFS_TAIL(sb, bh)->hash_chain = ino;
 			affs_adjust_checksum(bh, be32_to_cpu(ino) - hash_ino);
-			mmb_mark_buffer_dirty(bh, &AFFS_I(dir)->i_metadata_bhs);
+			mark_buffer_dirty(bh);
 			AFFS_TAIL(sb, rem_bh)->parent = 0;
 			retval = 0;
 			break;
@@ -180,7 +180,7 @@ affs_remove_link(struct dentry *dentry)
 			affs_unlock_dir(dir);
 			goto done;
 		}
-		mmb_mark_buffer_dirty(link_bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(link_bh);
 
 		memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32);
 		retval = affs_insert_hash(dir, bh);
@@ -188,7 +188,7 @@ affs_remove_link(struct dentry *dentry)
 			affs_unlock_dir(dir);
 			goto done;
 		}
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 
 		affs_unlock_dir(dir);
 		iput(dir);
@@ -203,7 +203,7 @@ affs_remove_link(struct dentry *dentry)
 			__be32 ino2 = AFFS_TAIL(sb, link_bh)->link_chain;
 			AFFS_TAIL(sb, bh)->link_chain = ino2;
 			affs_adjust_checksum(bh, be32_to_cpu(ino2) - link_ino);
-			mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+			mark_buffer_dirty(bh);
 			retval = 0;
 			/* Fix the link count, if bh is a normal header block without links */
 			switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) {
@@ -306,7 +306,7 @@ affs_remove_header(struct dentry *dentry)
 	retval = affs_remove_hash(dir, bh);
 	if (retval)
 		goto done_unlock;
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 
 	affs_unlock_dir(dir);
 
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 144b17482d12..23e088a7ed4f 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -140,14 +140,14 @@ affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext)
 	AFFS_TAIL(sb, new_bh)->parent = cpu_to_be32(inode->i_ino);
 	affs_fix_checksum(sb, new_bh);
 
-	mmb_mark_buffer_dirty(new_bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(new_bh);
 
 	tmp = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
 	if (tmp)
 		affs_warning(sb, "alloc_ext", "previous extension set (%x)", tmp);
 	AFFS_TAIL(sb, bh)->extension = cpu_to_be32(blocknr);
 	affs_adjust_checksum(bh, blocknr - tmp);
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 
 	AFFS_I(inode)->i_extcnt++;
 	mark_inode_dirty(inode);
@@ -581,7 +581,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 		memset(AFFS_DATA(bh) + boff, 0, tmp);
 		be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp);
 		affs_fix_checksum(sb, bh);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		size += tmp;
 		bidx++;
 	} else if (bidx) {
@@ -603,7 +603,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
 		affs_fix_checksum(sb, bh);
 		bh->b_state &= ~(1UL << BH_New);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		if (prev_bh) {
 			u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
 
@@ -613,8 +613,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 					     bidx, tmp_next);
 			AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
 			affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
-			mmb_mark_buffer_dirty(prev_bh,
-					      &AFFS_I(inode)->i_metadata_bhs);
+			mark_buffer_dirty(prev_bh);
 			affs_brelse(prev_bh);
 		}
 		size += bsize;
@@ -733,7 +732,7 @@ static int affs_write_end_ofs(const struct kiocb *iocb,
 		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(
 			max(boff + tmp, be32_to_cpu(AFFS_DATA_HEAD(bh)->size)));
 		affs_fix_checksum(sb, bh);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		written += tmp;
 		from += tmp;
 		bidx++;
@@ -766,13 +765,12 @@ static int affs_write_end_ofs(const struct kiocb *iocb,
 						     bidx, tmp_next);
 				AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
 				affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
-				mmb_mark_buffer_dirty(prev_bh,
-					&AFFS_I(inode)->i_metadata_bhs);
+				mark_buffer_dirty(prev_bh);
 			}
 		}
 		affs_brelse(prev_bh);
 		affs_fix_checksum(sb, bh);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		written += bsize;
 		from += bsize;
 		bidx++;
@@ -801,14 +799,13 @@ static int affs_write_end_ofs(const struct kiocb *iocb,
 						     bidx, tmp_next);
 				AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
 				affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
-				mmb_mark_buffer_dirty(prev_bh,
-						&AFFS_I(inode)->i_metadata_bhs);
+				mark_buffer_dirty(prev_bh);
 			}
 		} else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp)
 			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
 		affs_brelse(prev_bh);
 		affs_fix_checksum(sb, bh);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		written += tmp;
 		from += tmp;
 		bidx++;
@@ -945,7 +942,7 @@ affs_truncate(struct inode *inode)
 	}
 	AFFS_TAIL(sb, ext_bh)->extension = 0;
 	affs_fix_checksum(sb, ext_bh);
-	mmb_mark_buffer_dirty(ext_bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(ext_bh);
 	affs_brelse(ext_bh);
 
 	if (inode->i_size) {
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 5dd1b016bcb0..d4a3f381c4bc 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -206,7 +206,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		}
 	}
 	affs_fix_checksum(sb, bh);
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_brelse(bh);
 	affs_free_prealloc(inode);
 	return 0;
@@ -266,11 +266,8 @@ affs_evict_inode(struct inode *inode)
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
 		affs_truncate(inode);
-	} else {
-		mmb_sync(&AFFS_I(inode)->i_metadata_bhs);
 	}
 
-	mmb_invalidate(&AFFS_I(inode)->i_metadata_bhs);
 	clear_inode(inode);
 	affs_free_prealloc(inode);
 	cache_page = (unsigned long)AFFS_I(inode)->i_lc;
@@ -305,7 +302,7 @@ affs_new_inode(struct inode *dir)
 	bh = affs_getzeroblk(sb, block);
 	if (!bh)
 		goto err_bh;
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_brelse(bh);
 
 	inode->i_uid     = current_fsuid();
@@ -393,17 +390,17 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 		AFFS_TAIL(sb, bh)->link_chain = chain;
 		AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block);
 		affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
-		mmb_mark_buffer_dirty(inode_bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(inode_bh);
 		set_nlink(inode, 2);
 		ihold(inode);
 	}
 	affs_fix_checksum(sb, bh);
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	dentry->d_fsdata = (void *)(long)bh->b_blocknr;
 
 	affs_lock_dir(dir);
 	retval = affs_insert_hash(dir, bh);
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_unlock_dir(dir);
 	affs_unlock_link(inode);
 
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index c3c6532da4b0..57d8d755aada 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -373,7 +373,7 @@ affs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	}
 	*p = 0;
 	inode->i_size = i + 1;
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_brelse(bh);
 	mark_inode_dirty(inode);
 
@@ -443,8 +443,7 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* TODO: move it back to old_dir, if error? */
 
 done:
-	mmb_mark_buffer_dirty(bh,
-			&AFFS_I(retval ? old_dir : new_dir)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_brelse(bh);
 	return retval;
 }
@@ -497,8 +496,8 @@ affs_xrename(struct inode *old_dir, struct dentry *old_dentry,
 	retval = affs_insert_hash(old_dir, bh_new);
 	affs_unlock_dir(old_dir);
 done:
-	mmb_mark_buffer_dirty(bh_old, &AFFS_I(new_dir)->i_metadata_bhs);
-	mmb_mark_buffer_dirty(bh_new, &AFFS_I(old_dir)->i_metadata_bhs);
+	mark_buffer_dirty(bh_old);
+	mark_buffer_dirty(bh_new);
 	affs_brelse(bh_old);
 	affs_brelse(bh_new);
 	return retval;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 079f36e1ddec..8451647f3fea 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -108,7 +108,6 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
 	i->i_lc = NULL;
 	i->i_ext_bh = NULL;
 	i->i_pa_cnt = 0;
-	mmb_init(&i->i_metadata_bhs, &i->vfs_inode.i_data);
 
 	return &i->vfs_inode;
 }
-- 
2.51.0


^ permalink raw reply related

* [PATCH 3/9] fs: Writeout inode buffer from mmb_sync()
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara
In-Reply-To: <20260511115725.28441-1-jack@suse.cz>

Currently metadata bh tracking does not track inode buffers because they
are usually shared by several inodes and so our linked list tracking
cannot be used. On fsync we instead call sync_inode_metadata() to write
the inode; filesystems' .write_inode methods detect data integrity
writeback and take care to submit the inode buffer to disk and wait for
it in that case. This is however racy: for example, the flush worker can
submit normal (WB_SYNC_NONE) inode writeback first, which makes the
inode clean and copies the inode to the buffer but doesn't submit the
buffer for IO. Thus the sync_inode_metadata() call does nothing and we
fail to persist the inode buffer to disk on fsync(2).

Fix the problem by allowing the filesystem to record, in the mmb
structure, the number of the block backing the inode; mmb_sync() then
takes care to write out the corresponding buffer and wait for it.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/buffer.c        | 34 +++++++++++++++++++++++-----------
 include/linux/fs.h |  1 +
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index b0b3792b1496..dba29a45346b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -477,12 +477,14 @@ EXPORT_SYMBOL(mark_buffer_async_write);
  * using RCU, grab the lock, verify we didn't race with somebody detaching the
  * bh / moving it to different inode and only then proceeding.
  */
+#define INVALID_BLK (~0ULL)
 
 void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping)
 {
 	spin_lock_init(&mmb->lock);
 	INIT_LIST_HEAD(&mmb->list);
 	mmb->mapping = mapping;
+	mmb->inode_blk = INVALID_BLK;
 }
 EXPORT_SYMBOL(mmb_init);
 
@@ -593,8 +595,18 @@ int mmb_sync(struct mapping_metadata_bhs *mmb)
 			}
 		}
 	}
-
 	spin_unlock(&mmb->lock);
+
+	/* Writeout inode buffer head */
+	if (mmb->inode_blk != INVALID_BLK) {
+		bh = sb_find_get_block(mmb->mapping->host->i_sb, mmb->inode_blk);
+		write_dirty_buffer(bh, REQ_SYNC);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			err = -EIO;
+		brelse(bh);
+	}
+
 	blk_finish_plug(&plug);
 	spin_lock(&mmb->lock);
 
@@ -646,18 +658,18 @@ int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb,
 	if (err)
 		return err;
 
-	if (mmb)
-		ret = mmb_sync(mmb);
 	if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
-		goto out;
+		goto sync_buffers;
 	if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
-		goto out;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-
-out:
+		goto sync_buffers;
+
+	ret = sync_inode_metadata(inode, 1);
+sync_buffers:
+	if (mmb) {
+		err = mmb_sync(mmb);
+		if (ret == 0)
+			ret = err;
+	}
 	/* check and advance again to catch errors after syncing out buffers */
 	err = file_check_and_advance_wb_err(file);
 	if (ret == 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..435a41e4c90f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -446,6 +446,7 @@ extern const struct address_space_operations empty_aops;
 /* Structure for tracking metadata buffer heads associated with the mapping */
 struct mapping_metadata_bhs {
 	struct address_space *mapping;	/* Mapping bhs are associated with */
+	sector_t inode_blk;	/* Number of block containing the inode */
 	spinlock_t lock;	/* Lock protecting bh list */
 	struct list_head list;	/* The list of bhs (b_assoc_buffers) */
 };
-- 
2.51.0


^ permalink raw reply related

* [PATCH 0/9] fs: Fix missed inode write during fsync
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Hello,

this patch series fixes the possibly missing inode write during fsync(2) for
filesystems using generic metadata bh tracking. The inherent problem is that
.write_inode methods clear the inode dirty bit but they only copy inode contents
into the buffer cache. Because the buffer carrying the inode is shared among
multiple inodes, it cannot be tracked by the generic metadata bh tracking
infrastructure and thus nothing tracks that the buffer needs to be written
out to maintain fsync(2) guarantees. Normally, this gets taken care of
by .write_inode checking for WB_SYNC_ALL writeback and submitting & waiting
for the buffer in that case. However, if the flush worker ends up writing the
inode before data integrity writeback, this mechanism is broken.

This patch series adds a way for filesystems to track the number of the
metadata block which contains the inode and then uses this information to
write out the buffer on fsync.

								Honza

^ permalink raw reply

* Re: [PATCH v2] iomap: add simple read path for small direct I/O
From: changfengnan @ 2026-05-11 12:09 UTC (permalink / raw)
  To: brauner, djwong, hch, ojaswin, dgc, linux-xfs, linux-fsdevel,
	linux-ext4, linux-kernel, lidiangang
In-Reply-To: <20260428114730.14384-1-changfengnan@bytedance.com>

Ping.

> From: "Fengnan Chang"<changfengnan@bytedance.com>
> Date:  Tue, Apr 28, 2026, 19:48
> Subject:  [PATCH v2] iomap: add simple read path for small direct I/O
> To: <brauner@kernel.org>, <djwong@kernel.org>, <hch@infradead.org>, <ojaswin@linux.ibm.com>, <dgc@kernel.org>, <linux-xfs@vger.kernel.org>, <linux-fsdevel@vger.kernel.org>, <linux-ext4@vger.kernel.org>, <linux-kernel@vger.kernel.org>, <lidiangang@bytedance.com>
> Cc: "Fengnan Chang"<changfengnan@bytedance.com>
> When running 4K random read workloads on high-performance Gen5 NVMe
> SSDs, the software overhead in the iomap direct I/O path
> (__iomap_dio_rw) becomes a significant bottleneck.
> 
> Using io_uring with poll mode for a 4K randread test on a raw block
> device:
> taskset -c 30 ./t/io_uring -p1 -d512 -b4096 -s32 -c32 -F1 -B1 -R1 -X1
> -n1 -P1 /dev/nvme10n1
> Result: ~3.2M IOPS
> 
> Running the exact same workload on ext4 and XFS:
> taskset -c 30 ./t/io_uring -p1 -d512 -b4096 -s32 -c32 -F1 -B1 -R1 -X1
> -n1 -P1 /mnt/testfile
> Result: ~1.84M IOPS
> 
> Profiling the ext4 workload reveals that a significant portion of CPU
> time is spent on memory allocation and the iomap state machine
> iteration:
>   5.33%  [kernel]  [k] __iomap_dio_rw
>   3.26%  [kernel]  [k] iomap_iter
>   2.37%  [kernel]  [k] iomap_dio_bio_iter
>   2.35%  [kernel]  [k] kfree
>   1.33%  [kernel]  [k] iomap_dio_complete
> 
> Introduce simple reads to reduce the overhead of iomap, simple read path
> is triggered when the request satisfies:
> - I/O size is <= inode blocksize (fits in a single block, no splits).
> - No custom `iomap_dio_ops` (dops) registered by the filesystem.
> 
> After this optimization, the heavy generic functions disappear from the
> profile, replaced by a single streamlined execution path:
>   4.83%  [kernel]  [k] iomap_dio_simple_read
> 
> With this patch, 4K random read IOPS on ext4 increases from 1.84M to
> 2.19M in the original single-core io_uring poll-mode workload.
> 
> Below are the test results using fio:
> 
>   fs    workload       qd    simple=0      simple=1      gain
>   ext4  libaio         1     18,738        18,761        +0.12%
>   ext4  libaio         128   455,383       471,473       +3.53%
>   ext4  libaio         256   453,273       468,555       +3.37%
>   ext4  libaio         512   447,320       469,036       +4.85%
>   ext4  io_uring       1     18,798        18,824        +0.14%
>   ext4  io_uring       128   503,834       528,353       +4.87%
>   ext4  io_uring       256   503,635       527,617       +4.76%
>   ext4  io_uring       512   501,802       527,882       +5.20%
>   ext4  io_uring_poll  1     19,246        19,270        +0.12%
>   ext4  io_uring_poll  128   1,463,343     1,565,019     +6.95%
>   ext4  io_uring_poll  256   1,651,112     1,888,182     +14.36%
>   ext4  io_uring_poll  512   1,632,641     1,893,259     +15.96%
>   xfs   libaio         1     18,715        18,734        +0.10%
>   xfs   libaio         128   452,974       473,459       +4.52%
>   xfs   libaio         256   454,435       470,855       +3.61%
>   xfs   libaio         512   456,796       473,047       +3.56%
>   xfs   io_uring       1     18,755        18,795        +0.21%
>   xfs   io_uring       128   509,459       534,819       +4.98%
>   xfs   io_uring       256   509,853       536,051       +5.14%
>   xfs   io_uring       512   507,926       533,558       +5.05%
>   xfs   io_uring_poll  1     19,230        19,269        +0.20%
>   xfs   io_uring_poll  128   1,467,398     1,567,840     +6.84%
>   xfs   io_uring_poll  256   1,636,852     1,878,917     +14.79%
>   xfs   io_uring_poll  512   1,639,495     1,874,813     +14.35%
> 
> Assisted-by: Gemini:gemini-3.1-pro-preview
> Assisted-by: Codex:gpt-5-5
> Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
> ---
>  fs/iomap/direct-io.c | 382 +++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 371 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index e911daedff65a..807d8c628a464 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -9,6 +9,9 @@
>  #include <linux/iomap.h>
>  #include <linux/task_io_accounting_ops.h>
>  #include <linux/fserror.h>
> +#include <linux/kobject.h>
> +#include <linux/sysfs.h>
> +#include <linux/init.h>
>  #include "internal.h"
>  #include "trace.h"
>  
> @@ -236,20 +239,26 @@ static void iomap_dio_done(struct iomap_dio *dio)
>          iomap_dio_complete_work(&dio->aio.work);
>  }
>  
> -static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
> +static inline void iomap_dio_bio_release_pages(struct bio *bio,
> +                unsigned int dio_flags, bool error)
>  {
> -        struct iomap_dio *dio = bio->bi_private;
> -
> -        if (dio->flags & IOMAP_DIO_BOUNCE) {
> -                bio_iov_iter_unbounce(bio, !!dio->error,
> -                                dio->flags & IOMAP_DIO_USER_BACKED);
> +        if (dio_flags & IOMAP_DIO_BOUNCE) {
> +                bio_iov_iter_unbounce(bio, error,
> +                                dio_flags & IOMAP_DIO_USER_BACKED);
>                  bio_put(bio);
> -        } else if (dio->flags & IOMAP_DIO_USER_BACKED) {
> +        } else if (dio_flags & IOMAP_DIO_USER_BACKED) {
>                  bio_check_pages_dirty(bio);
>          } else {
>                  bio_release_pages(bio, false);
>                  bio_put(bio);
>          }
> +}
> +
> +static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
> +{
> +        struct iomap_dio *dio = bio->bi_private;
> +
> +        iomap_dio_bio_release_pages(bio, dio->flags, !!dio->error);
>  
>          /* Do not touch bio below, we just gave up our reference. */
>  
> @@ -387,6 +396,14 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
>          return ret;
>  }
>  
> +static inline unsigned int iomap_dio_alignment(struct inode *inode,
> +                struct block_device *bdev, unsigned int dio_flags)
> +{
> +        if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
> +                return i_blocksize(inode);
> +        return bdev_logical_block_size(bdev);
> +}
> +
>  static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>  {
>          const struct iomap *iomap = &iter->iomap;
> @@ -405,10 +422,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>           * File systems that write out of place and always allocate new blocks
>           * need each bio to be block aligned as that's the unit of allocation.
>           */
> -        if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED)
> -                alignment = fs_block_size;
> -        else
> -                alignment = bdev_logical_block_size(iomap->bdev);
> +        alignment = iomap_dio_alignment(inode, iomap->bdev, dio->flags);
>  
>          if ((pos | length) & (alignment - 1))
>                  return -EINVAL;
> @@ -880,12 +894,350 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  }
>  EXPORT_SYMBOL_GPL(__iomap_dio_rw);
>  
> +struct iomap_dio_simple_read {
> +        struct kiocb                *iocb;
> +        size_t                        size;
> +        unsigned int                dio_flags;
> +        atomic_t                state;
> +        union {
> +                struct task_struct        *waiter;
> +                struct work_struct        work;
> +        };
> +        /*
> +         * Align @bio to a cacheline boundary so that, combined with the
> +         * front_pad passed to bioset_init(), the bio sits at the start of
> +         * a cacheline in memory returned by the (HWCACHE-aligned) bio
> +         * slab.  This keeps the hot fields block layer touches on submit
> +         * and completion (bi_iter, bi_status, ...) within a single line.
> +         */
> +        struct bio        bio ____cacheline_aligned_in_smp;
> +};
> +
> +static struct bio_set iomap_dio_simple_read_pool;
> +
> +/*
> + * In the async simple read path, we need to prevent bio_endio() from
> + * triggering iocb->ki_complete() before the submitter has returned
> + * -EIOCBQUEUED. Otherwise, the caller might free the iocb concurrently.
> + *
> + * We use a three-state rendezvous to synchronize the submitter and end_io:
> + *
> + * IOMAP_DIO_SIMPLE_SUBMITTING: Initial state set before submitting the bio.
> + *
> + * IOMAP_DIO_SIMPLE_QUEUED: The submitter has safely queued the IO and will
> + * return -EIOCBQUEUED. If end_io sees this state, it takes over and calls
> + * ki_complete().
> + *
> + * IOMAP_DIO_SIMPLE_DONE: end_io fired before the submitter finished the
> + * submit path. end_io sets this state and does nothing else. The submitter
> + * will see this state and handle the completion synchronously (bypassing
> + * ki_complete() and returning the actual result).
> + */
> +enum {
> +        IOMAP_DIO_SIMPLE_SUBMITTING = 0,
> +        IOMAP_DIO_SIMPLE_QUEUED,
> +        IOMAP_DIO_SIMPLE_DONE,
> +};
> +
> +static ssize_t iomap_dio_simple_read_finish(struct kiocb *iocb,
> +                struct bio *bio, ssize_t ret)
> +{
> +        struct inode *inode = file_inode(iocb->ki_filp);
> +        struct iomap_dio_simple_read *sr = bio->bi_private;
> +
> +        if (likely(!ret)) {
> +                ret = sr->size;
> +                iocb->ki_pos += ret;
> +        } else {
> +                fserror_report_io(inode, FSERR_DIRECTIO_READ, iocb->ki_pos,
> +                                  sr->size, ret, GFP_NOFS);
> +        }
> +
> +        iomap_dio_bio_release_pages(bio, sr->dio_flags, ret < 0);
> +
> +        return ret;
> +}
> +
> +static ssize_t iomap_dio_simple_read_complete(struct kiocb *iocb,
> +                struct bio *bio)
> +{
> +        struct inode *inode = file_inode(iocb->ki_filp);
> +        ssize_t ret;
> +
> +        WRITE_ONCE(iocb->private, NULL);
> +
> +        ret = iomap_dio_simple_read_finish(iocb, bio,
> +                        blk_status_to_errno(bio->bi_status));
> +
> +        inode_dio_end(inode);
> +        trace_iomap_dio_complete(iocb, ret < 0 ? ret : 0, ret > 0 ? ret : 0);
> +        return ret;
> +}
> +
> +static void iomap_dio_simple_read_complete_work(struct work_struct *work)
> +{
> +        struct iomap_dio_simple_read *sr =
> +                container_of(work, struct iomap_dio_simple_read, work);
> +        struct kiocb *iocb = sr->iocb;
> +        ssize_t ret;
> +
> +        ret = iomap_dio_simple_read_complete(iocb, &sr->bio);
> +        iocb->ki_complete(iocb, ret);
> +}
> +
> +static void iomap_dio_simple_read_async_done(struct iomap_dio_simple_read *sr)
> +{
> +        struct kiocb *iocb = sr->iocb;
> +
> +        if (unlikely(sr->bio.bi_status)) {
> +                struct inode *inode = file_inode(iocb->ki_filp);
> +
> +                INIT_WORK(&sr->work, iomap_dio_simple_read_complete_work);
> +                queue_work(inode->i_sb->s_dio_done_wq, &sr->work);
> +                return;
> +        }
> +
> +        iomap_dio_simple_read_complete_work(&sr->work);
> +}
> +
> +static void iomap_dio_simple_read_end_io(struct bio *bio)
> +{
> +        struct iomap_dio_simple_read *sr = bio->bi_private;
> +
> +        if (sr->waiter) {
> +                struct task_struct *waiter = sr->waiter;
> +
> +                WRITE_ONCE(sr->waiter, NULL);
> +                blk_wake_io_task(waiter);
> +                return;
> +        }
> +
> +        if (likely(atomic_read(&sr->state) == IOMAP_DIO_SIMPLE_QUEUED) ||
> +            atomic_cmpxchg(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING,
> +                           IOMAP_DIO_SIMPLE_DONE) == IOMAP_DIO_SIMPLE_QUEUED)
> +                iomap_dio_simple_read_async_done(sr);
> +}
> +
> +static inline bool iomap_dio_simple_read_supported(struct kiocb *iocb,
> +                struct iov_iter *iter, unsigned int dio_flags)
> +{
> +        struct inode *inode = file_inode(iocb->ki_filp);
> +        size_t count = iov_iter_count(iter);
> +
> +        if (iov_iter_rw(iter) != READ)
> +                return false;
> +        /*
> +         * Simple read is an optimization for small IO. Filter out large IO
> +         * early as it's the most common case to fail for typical direct IO
> +         * workloads.
> +         */
> +        if (count > inode->i_sb->s_blocksize)
> +                return false;
> +        if (dio_flags & (IOMAP_DIO_FORCE_WAIT | IOMAP_DIO_PARTIAL))
> +                return false;
> +        if (iocb->ki_pos + count > i_size_read(inode))
> +                return false;
> +
> +        return true;
> +}
> +
> +static ssize_t iomap_dio_simple_read(struct kiocb *iocb,
> +                struct iov_iter *iter, const struct iomap_ops *ops,
> +                void *private, unsigned int dio_flags)
> +{
> +        struct inode *inode = file_inode(iocb->ki_filp);
> +        size_t count = iov_iter_count(iter);
> +        int nr_pages;
> +        struct iomap_dio_simple_read *sr;
> +        unsigned int alignment;
> +        struct iomap_iter iomi = {
> +                .inode                = inode,
> +                .pos                = iocb->ki_pos,
> +                .len                = count,
> +                .flags                = IOMAP_DIRECT,
> +                .private        = private,
> +        };
> +        struct bio *bio;
> +        bool wait_for_completion = is_sync_kiocb(iocb);
> +        ssize_t ret;
> +
> +        if (dio_flags & IOMAP_DIO_BOUNCE)
> +                nr_pages = bio_iov_bounce_nr_vecs(iter, REQ_OP_READ);
> +        else
> +                nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
> +
> +        if (iocb->ki_flags & IOCB_NOWAIT)
> +                iomi.flags |= IOMAP_NOWAIT;
> +
> +        ret = kiocb_write_and_wait(iocb, count);
> +        if (ret)
> +                return ret;
> +
> +        inode_dio_begin(inode);
> +
> +        ret = ops->iomap_begin(inode, iomi.pos, count, iomi.flags,
> +                               &iomi.iomap, &iomi.srcmap);
> +        if (ret) {
> +                inode_dio_end(inode);
> +                return ret;
> +        }
> +
> +        if (iomi.iomap.type != IOMAP_MAPPED ||
> +            iomi.iomap.offset > iomi.pos ||
> +            iomi.iomap.offset + iomi.iomap.length < iomi.pos + count) {
> +                ret = -ENOTBLK;
> +                goto out_iomap_end;
> +        }
> +
> +        alignment = iomap_dio_alignment(inode, iomi.iomap.bdev, dio_flags);
> +        if ((iomi.pos | count) & (alignment - 1)) {
> +                ret = -EINVAL;
> +                goto out_iomap_end;
> +        }
> +
> +        if (unlikely(!inode->i_sb->s_dio_done_wq)) {
> +                ret = sb_init_dio_done_wq(inode->i_sb);
> +                if (ret < 0)
> +                        goto out_iomap_end;
> +        }
> +
> +        trace_iomap_dio_rw_begin(iocb, iter, dio_flags, 0);
> +
> +        if (user_backed_iter(iter))
> +                dio_flags |= IOMAP_DIO_USER_BACKED;
> +
> +        bio = bio_alloc_bioset(iomi.iomap.bdev, nr_pages,
> +                               REQ_OP_READ | REQ_SYNC | REQ_IDLE,
> +                               GFP_KERNEL, &iomap_dio_simple_read_pool);
> +        sr = container_of(bio, struct iomap_dio_simple_read, bio);
> +
> +        fscrypt_set_bio_crypt_ctx(bio, inode, iomi.pos >> inode->i_blkbits,
> +                                  GFP_KERNEL);
> +        sr->iocb = iocb;
> +        sr->dio_flags = dio_flags;
> +
> +        bio->bi_iter.bi_sector = iomap_sector(&iomi.iomap, iomi.pos);
> +        bio->bi_ioprio = iocb->ki_ioprio;
> +        bio->bi_private = sr;
> +        bio->bi_end_io = iomap_dio_simple_read_end_io;
> +
> +        if (dio_flags & IOMAP_DIO_BOUNCE)
> +                ret = bio_iov_iter_bounce(bio, iter);
> +        else
> +                ret = bio_iov_iter_get_pages(bio, iter, alignment - 1);
> +        if (unlikely(ret))
> +                goto out_bio_put;
> +
> +        if (bio->bi_iter.bi_size != count) {
> +                iov_iter_revert(iter, bio->bi_iter.bi_size);
> +                ret = -ENOTBLK;
> +                goto out_bio_release_pages;
> +        }
> +
> +        sr->size = bio->bi_iter.bi_size;
> +
> +        if ((dio_flags & IOMAP_DIO_USER_BACKED) &&
> +            !(dio_flags & IOMAP_DIO_BOUNCE))
> +                bio_set_pages_dirty(bio);
> +
> +        if (iocb->ki_flags & IOCB_NOWAIT)
> +                bio->bi_opf |= REQ_NOWAIT;
> +        if ((iocb->ki_flags & IOCB_HIPRI) && !wait_for_completion) {
> +                bio->bi_opf |= REQ_POLLED;
> +                bio_set_polled(bio, iocb);
> +                WRITE_ONCE(iocb->private, bio);
> +        }
> +
> +        if (wait_for_completion) {
> +                sr->waiter = current;
> +                blk_crypto_submit_bio(bio);
> +        } else {
> +                atomic_set(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING);
> +                sr->waiter = NULL;
> +                blk_crypto_submit_bio(bio);
> +                ret = -EIOCBQUEUED;
> +        }
> +
> +        if (ops->iomap_end)
> +                ops->iomap_end(inode, iomi.pos, count, count, iomi.flags,
> +                               &iomi.iomap);
> +
> +        if (wait_for_completion) {
> +                for (;;) {
> +                        set_current_state(TASK_UNINTERRUPTIBLE);
> +                        if (!READ_ONCE(sr->waiter))
> +                                break;
> +                        blk_io_schedule();
> +                }
> +                __set_current_state(TASK_RUNNING);
> +
> +                ret = iomap_dio_simple_read_finish(iocb, bio,
> +                                blk_status_to_errno(bio->bi_status));
> +                inode_dio_end(inode);
> +                trace_iomap_dio_complete(iocb, ret < 0 ? ret : 0,
> +                                         ret > 0 ? ret : 0);
> +        } else if (atomic_cmpxchg(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING,
> +                                  IOMAP_DIO_SIMPLE_QUEUED) ==
> +                   IOMAP_DIO_SIMPLE_DONE) {
> +                ret = iomap_dio_simple_read_complete(iocb, bio);
> +        } else {
> +                trace_iomap_dio_rw_queued(inode, iomi.pos, count);
> +        }
> +
> +        return ret;
> +
> +out_bio_release_pages:
> +        if (dio_flags & IOMAP_DIO_BOUNCE)
> +                bio_iov_iter_unbounce(bio, true, false);
> +        else
> +                bio_release_pages(bio, false);
> +out_bio_put:
> +        bio_put(bio);
> +out_iomap_end:
> +        if (ops->iomap_end)
> +                ops->iomap_end(inode, iomi.pos, count, 0, iomi.flags,
> +                               &iomi.iomap);
> +        inode_dio_end(inode);
> +        return ret;
> +}
> +
>  ssize_t
>  iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>                  const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
>                  unsigned int dio_flags, void *private, size_t done_before)
>  {
>          struct iomap_dio *dio;
> +        ssize_t ret;
> +
> +        /*
> +         * Fast path for small, block-aligned reads that map to a single
> +         * contiguous on-disk extent.
> +         *
> +         * @dops must be NULL: a non-NULL @dops means the caller wants its
> +         * ->end_io / ->submit_io hooks invoked, and in particular wants its
> +         * bios to be allocated from the filesystem-private @dops->bio_set
> +         * (whose front_pad sizes a filesystem-private wrapper around the
> +         * bio).  The fast path instead allocates from the shared
> +         * iomap_dio_simple_read_pool, whose front_pad matches
> +         * struct iomap_dio_simple_read; the two wrappers are not
> +         * interchangeable, so we must fall back to __iomap_dio_rw() in
> +         * that case.
> +         *
> +         * @done_before must be zero: a non-zero caller-accumulated residual
> +         * cannot be carried through a single-bio inline completion.
> +         *
> +         * -ENOTBLK is the private sentinel returned by iomap_dio_simple_read()
> +         * when it decides the request does not fit the fast path.
> +         * In that case we proceed to the generic __iomap_dio_rw() slow
> +         * path.  Any other errno is a real result and is propagated as-is,
> +         * in particular -EAGAIN for IOCB_NOWAIT must reach the caller.
> +         */
> +        if (!dops && !done_before &&
> +            iomap_dio_simple_read_supported(iocb, iter, dio_flags)) {
> +                ret = iomap_dio_simple_read(iocb, iter, ops, private, dio_flags);
> +                if (ret != -ENOTBLK)
> +                        return ret;
> +        }
>  
>          dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
>                               done_before);
> @@ -894,3 +1246,11 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>          return iomap_dio_complete(dio);
>  }
>  EXPORT_SYMBOL_GPL(iomap_dio_rw);
> +
> +static int __init iomap_dio_init(void)
> +{
> +        return bioset_init(&iomap_dio_simple_read_pool, 4,
> +                           offsetof(struct iomap_dio_simple_read, bio),
> +                           BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);
> +}
> +fs_initcall(iomap_dio_init);
> -- 
> 2.39.5 (Apple Git-154)
> 

^ permalink raw reply

* Re: [PATCH] jbd2: check for aborted handle in jbd2_journal_dirty_metadata()
From: Jan Kara @ 2026-05-11 11:33 UTC (permalink / raw)
  To: Deepanshu Kartikey
  Cc: tytso, jack, linux-ext4, linux-kernel,
	syzbot+98f651460e558a21baae
In-Reply-To: <20260507050605.50081-1-kartikey406@gmail.com>

On Thu 07-05-26 10:36:05, Deepanshu Kartikey wrote:
> jbd2_journal_dirty_metadata() unconditionally dereferences
> handle->h_transaction at function entry to obtain the journal pointer:
> 
> 	transaction_t *transaction = handle->h_transaction;
> 	journal_t *journal = transaction->t_journal;
> 
> However, h_transaction may legitimately be NULL for an aborted handle.
> The is_handle_aborted() helper in include/linux/jbd2.h explicitly
> treats !h_transaction as one of the aborted states:
> 
> 	if (handle->h_aborted || !handle->h_transaction)
> 		return 1;
> 
> Every other entry point in fs/jbd2/transaction.c
> (jbd2_journal_get_{write,undo,create}_access, jbd2_journal_extend,
> jbd2_journal_restart, jbd2_journal_stop, etc.) guards against this
> with an is_handle_aborted() check before any dereference of
> h_transaction. jbd2_journal_dirty_metadata() was missing this guard.
> 
> This is reachable from ocfs2's xattr code. ocfs2_xa_set() intentionally
> falls through to ocfs2_xa_journal_dirty() even after
> ocfs2_xa_prepare_entry() fails, on the assumption that the buffer
> needs to be journaled to record any partial modifications (see the
> comment above the out_dirty label in fs/ocfs2/xattr.c). If the failure
> was caused by the journal being aborted -- e.g. an underlying I/O
> error during a sub-operation such as __ocfs2_remove_xattr_range() --
> the handle's h_transaction has been cleared by the abort path, and
> the unconditional deref in jbd2_journal_dirty_metadata() becomes a
> NULL deref.
> 
> Reproduced by syzbot with a crafted ocfs2 image where I/O against the
> loop device backing the mount is sabotaged via LOOP_SET_STATUS64
> between two setxattr() calls, causing the second setxattr (which
> truncates an external xattr value) to abort the journal mid-flight:
> 
>   Oops: general protection fault, probably for non-canonical
>         address 0xdffffc0000000000
>   KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
>   RIP: jbd2_journal_dirty_metadata+0x4a/0xd30 fs/jbd2/transaction.c:1520
>   Call Trace:
>    ocfs2_journal_dirty+0x130/0x700 fs/ocfs2/journal.c:831
>    ocfs2_xa_journal_dirty fs/ocfs2/xattr.c:1483 [inline]
>    ocfs2_xa_set+0x15e3/0x2ec0 fs/ocfs2/xattr.c:2294
>    ocfs2_xattr_block_set+0x3e0/0x33c0 fs/ocfs2/xattr.c:3016
>    __ocfs2_xattr_set_handle+0x6b3/0xf50 fs/ocfs2/xattr.c:3418
>    ocfs2_xattr_set+0xf3f/0x13e0 fs/ocfs2/xattr.c:3681
>    __vfs_setxattr+0x43c/0x480 fs/xattr.c:218
>    ...
> 
> Fix by adding the standard is_handle_aborted() guard at the top of
> jbd2_journal_dirty_metadata() and returning -EROFS, matching the
> pattern used by every other entry point in this file.
> ocfs2_journal_dirty() already handles a non-zero return from
> jbd2_journal_dirty_metadata() correctly.
> 
> Reported-by: syzbot+98f651460e558a21baae@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=98f651460e558a21baae
> Tested-by: syzbot+98f651460e558a21baae@syzkaller.appspotmail.com
> Signed-off-by: Deepanshu Kartikey <kartikey406@gmail.com>

Looks sensible. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/jbd2/transaction.c | 9 +++++++--
>  1 file changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index 4885903bbd10..aa0be9e9c876 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -1516,14 +1516,19 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
>   */
>  int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
>  {
> -	transaction_t *transaction = handle->h_transaction;
> -	journal_t *journal = transaction->t_journal;
> +	transaction_t *transaction;
> +	journal_t *journal;
>  	struct journal_head *jh;
>  	int ret = 0;
>  
> +	if (is_handle_aborted(handle))
> +		return -EROFS;
>  	if (!buffer_jbd(bh))
>  		return -EUCLEAN;
>  
> +	transaction = handle->h_transaction;
> +	journal = transaction->t_journal;
> +
>  	/*
>  	 * We don't grab jh reference here since the buffer must be part
>  	 * of the running transaction.
> -- 
> 2.43.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v9 00/22] fs-verity support for XFS with post EOF merkle tree
From: Andrey Albershteyn @ 2026-05-11 11:15 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andrey Albershteyn, linux-xfs, fsverity, linux-fsdevel, ebiggers,
	linux-ext4, linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong,
	david
In-Reply-To: <20260507055217.GA19888@lst.de>

On 2026-05-07 07:52:17, Christoph Hellwig wrote:
> On Tue, Apr 28, 2026 at 10:33:06AM +0200, Andrey Albershteyn wrote:
> > This series is based on v7.0 with Christoph's read ioends patchset [1].
> 
> That's not a good baseline.  We'll need it on the -rc that has everything
> from the current merge window at least.  It might also interact with
> the fsverity fix that just went in.

Sure, I will send v10 based on the latest master with that fix soon.

> 
> This might also be time to come up with a merge plan to figure out through
> what tree(s) to merge it as there don't seem to be any maintainer
> objections.
> 
> 

-- 
- Andrey


^ permalink raw reply

* Re: [PATCH v4 11/23] iomap: correct the range of a partial dirty clear
From: Zhang Yi @ 2026-05-11  8:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, djwong, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <agGJcqFNl23qNLvx@infradead.org>

On 5/11/2026 3:46 PM, Christoph Hellwig wrote:
> Please send the iomap patches out separately, including to all the
> relevant lists from the iomap MAINTAINERS entry.
> 

OK, sure, will do.

Best Regards,
Yi


^ permalink raw reply

* [RFC v7 7/7] ext4: fast commit: export snapshot stats in fc_info
From: Li Chen @ 2026-05-11  8:43 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260511084304.1559557-1-me@linux.beauty>

Snapshot-based fast commit can fall back when the commit-time snapshot
cannot be built (e.g. extent status cache misses). It is useful to
quantify the updates-locked window and to see why snapshotting failed.

Add best-effort snapshot counters to the ext4 superblock and extend
/proc/fs/ext4/<sb_id>/fc_info to report the number of snapshotted
inodes and ranges, snapshot failure reasons, and the average/max time
spent with journal updates locked.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v7:
- Address Sashiko review by using READ_ONCE() + div64_u64() for the fc_info
  lock_updates average.

Changes in v6:
- Start consuming locked_ns in fc_info, so this patch intentionally moves
  lock_updates_ns_{total,max,samples} accounting here.
- Guard the tracepoint call with trace_ext4_fc_lock_updates_enabled() and
  use trace_call__ext4_fc_lock_updates() to avoid the double static_branch
  at the guarded call site.
- Keep the stats unconditionally while avoiding extra tracepoint
  overhead when ext4_fc_lock_updates is disabled.

 fs/ext4/ext4.h        | 31 +++++++++++++++++
 fs/ext4/fast_commit.c | 78 +++++++++++++++++++++++++++++++++++++------
 fs/ext4/super.c       |  1 +
 3 files changed, 100 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index df30f8705c98..3457b4950c02 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1550,6 +1550,36 @@ struct ext4_orphan_info {
 						 * file blocks */
 };
 
+/*
+ * Ext4 fast commit snapshot statistics.
+ *
+ * These are best-effort counters intended for debugging / performance
+ * introspection; they are not exact under concurrent updates.
+ */
+struct ext4_fc_snap_stats {
+	u64 lock_updates_ns_total;
+	u64 lock_updates_ns_max;
+	u64 lock_updates_samples;
+
+	u64 snap_inodes;
+	u64 snap_ranges;
+
+	u64 snap_fail_es_miss;
+	u64 snap_fail_es_delayed;
+	u64 snap_fail_es_other;
+
+	u64 snap_fail_inodes_cap;
+	u64 snap_fail_ranges_cap;
+	u64 snap_fail_nomem;
+	u64 snap_fail_inode_loc;
+
+	/*
+	 * Missing inode snapshots during log writing should never happen.
+	 * Keep this counter to help catch unexpected regressions.
+	 */
+	u64 snap_fail_no_snap;
+};
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1824,6 +1854,7 @@ struct ext4_sb_info {
 	struct mutex s_fc_lock;
 	struct buffer_head *s_fc_bh;
 	struct ext4_fc_stats s_fc_stats;
+	struct ext4_fc_snap_stats s_fc_snap_stats;
 	tid_t s_fc_ineligible_tid;
 #ifdef CONFIG_EXT4_DEBUG
 	int s_fc_debug_max_replay;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index c24984d8df83..1dfcccf4179e 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -874,13 +874,17 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 	int inode_len;
 	int ret;
 
-	if (!snap)
+	if (!snap) {
+		EXT4_SB(inode->i_sb)->s_fc_snap_stats.snap_fail_no_snap++;
 		return -ECANCELED;
+	}
 
 	src = snap->inode_buf;
 	inode_len = snap->inode_len;
-	if (!src || inode_len == 0)
+	if (!src || inode_len == 0) {
+		EXT4_SB(inode->i_sb)->s_fc_snap_stats.snap_fail_no_snap++;
 		return -ECANCELED;
+	}
 
 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
@@ -915,8 +919,10 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 	struct ext4_extent *ex;
 	struct ext4_fc_range *range;
 
-	if (!snap)
+	if (!snap) {
+		EXT4_SB(inode->i_sb)->s_fc_snap_stats.snap_fail_no_snap++;
 		return -ECANCELED;
+	}
 
 	list_for_each_entry(range, &snap->data_list, list) {
 		if (range->tag == EXT4_FC_TAG_DEL_RANGE) {
@@ -977,6 +983,8 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 				       int *snap_err)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
 	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
 	unsigned int nr_ranges = 0;
 
@@ -1004,11 +1012,13 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		u64 remaining = (u64)end_lblk - cur_lblk + 1;
 
 		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
+			stats->snap_fail_es_miss++;
 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
 			return -EAGAIN;
 		}
 
 		if (ext4_es_is_delayed(&es)) {
+			stats->snap_fail_es_delayed++;
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_ES_DELAYED);
 			return -EAGAIN;
@@ -1023,6 +1033,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		}
 
 		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
+			stats->snap_fail_ranges_cap++;
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_RANGES_CAP);
 			return -E2BIG;
@@ -1030,6 +1041,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 
 		range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
 		if (!range) {
+			stats->snap_fail_nomem++;
 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 			return -ENOMEM;
 		}
@@ -1057,6 +1069,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 				range->len = max;
 		} else {
 			kmem_cache_free(ext4_fc_range_cachep, range);
+			stats->snap_fail_es_other++;
 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
 			return -EAGAIN;
 		}
@@ -1080,6 +1093,8 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 				  unsigned int *nr_rangesp, int *snap_err)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
 	struct ext4_fc_inode_snap *snap;
 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 	struct ext4_iloc iloc;
@@ -1090,6 +1105,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 
 	ret = ext4_get_inode_loc_noio(inode, &iloc);
 	if (ret) {
+		stats->snap_fail_inode_loc++;
 		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
 		return ret;
 	}
@@ -1101,6 +1117,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 
 	snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
 	if (!snap) {
+		stats->snap_fail_nomem++;
 		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 		brelse(iloc.bh);
 		return -ENOMEM;
@@ -1125,6 +1142,8 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 	list_splice_tail_init(&ranges, &snap->data_list);
 	ext4_fc_unlock(inode->i_sb, alloc_ctx);
 
+	stats->snap_inodes++;
+	stats->snap_ranges += nr_ranges;
 	if (nr_rangesp)
 		*nr_rangesp = nr_ranges;
 	return 0;
@@ -1234,6 +1253,7 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 		if (i >= inodes_size) {
+			sbi->s_fc_snap_stats.snap_fail_inodes_cap++;
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_INODES_CAP);
 			ret = -E2BIG;
@@ -1259,6 +1279,7 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 			continue;
 
 		if (i >= inodes_size) {
+			sbi->s_fc_snap_stats.snap_fail_inodes_cap++;
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_INODES_CAP);
 			ret = -E2BIG;
@@ -1302,6 +1323,7 @@ static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
 	struct ext4_inode_info *iter;
 	struct ext4_fc_head head;
 	struct inode *inode;
@@ -1364,8 +1386,13 @@ static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 		return ret;
 
 	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
-	if (ret)
+	if (ret) {
+		if (ret == -E2BIG)
+			snap_stats->snap_fail_inodes_cap++;
+		else if (ret == -ENOMEM)
+			snap_stats->snap_fail_nomem++;
 		return ret;
+	}
 
 	/* Step 4: Mark all inodes as being committed. */
 	jbd2_journal_lock_updates(journal);
@@ -1386,12 +1413,15 @@ static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
 				      &snap_inodes, &snap_ranges, &snap_err);
 	jbd2_journal_unlock_updates(journal);
-	if (trace_ext4_fc_lock_updates_enabled()) {
-		locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
-		trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns,
-					   snap_inodes, snap_ranges, ret,
-					   snap_err);
-	}
+	locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
+	snap_stats->lock_updates_ns_total += locked_ns;
+	snap_stats->lock_updates_samples++;
+	if (locked_ns > snap_stats->lock_updates_ns_max)
+		snap_stats->lock_updates_ns_max = locked_ns;
+	if (trace_ext4_fc_lock_updates_enabled())
+		trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns,
+						 snap_inodes, snap_ranges,
+						 ret, snap_err);
 	kvfree(inodes);
 	if (ret)
 		return ret;
@@ -2667,11 +2697,23 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
 {
 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
+	struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
+	u64 lock_avg_ns = 0;
+	u64 lock_updates_samples;
+	u64 lock_updates_ns_total;
+	u64 lock_updates_ns_max;
 	int i;
 
 	if (v != SEQ_START_TOKEN)
 		return 0;
 
+	lock_updates_samples = READ_ONCE(snap_stats->lock_updates_samples);
+	lock_updates_ns_total = READ_ONCE(snap_stats->lock_updates_ns_total);
+	lock_updates_ns_max = READ_ONCE(snap_stats->lock_updates_ns_max);
+	if (lock_updates_samples)
+		lock_avg_ns = div64_u64(lock_updates_ns_total,
+					lock_updates_samples);
+
 	seq_printf(seq,
 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
 		   stats->fc_num_commits, stats->fc_ineligible_commits,
@@ -2682,6 +2724,22 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
 			stats->fc_ineligible_reason_count[i]);
 
+	seq_printf(seq,
+		   "Snapshot stats:\n%llu inodes\n%llu ranges\n%lluus lock_updates_avg\n%lluus lock_updates_max\n",
+		   snap_stats->snap_inodes, snap_stats->snap_ranges,
+		   div_u64(lock_avg_ns, 1000),
+		   div_u64(lock_updates_ns_max, 1000));
+	seq_printf(seq,
+		   "Snapshot failures:\n%llu es_miss\n%llu es_delayed\n%llu es_other\n%llu inodes_cap\n%llu ranges_cap\n%llu nomem\n%llu inode_loc\n%llu no_snap\n",
+		   snap_stats->snap_fail_es_miss,
+		   snap_stats->snap_fail_es_delayed,
+		   snap_stats->snap_fail_es_other,
+		   snap_stats->snap_fail_inodes_cap,
+		   snap_stats->snap_fail_ranges_cap,
+		   snap_stats->snap_fail_nomem,
+		   snap_stats->snap_fail_inode_loc,
+		   snap_stats->snap_fail_no_snap);
+
 	return 0;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3c869f0001c5..f1f8819a2a23 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4544,6 +4544,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
 	sbi->s_fc_ineligible_tid = 0;
 	mutex_init(&sbi->s_fc_lock);
 	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+	memset(&sbi->s_fc_snap_stats, 0, sizeof(sbi->s_fc_snap_stats));
 	sbi->s_fc_replay_state.fc_regions = NULL;
 	sbi->s_fc_replay_state.fc_regions_size = 0;
 	sbi->s_fc_replay_state.fc_regions_used = 0;
-- 
2.53.0

^ permalink raw reply related

* [RFC v7 6/7] ext4: fast commit: add lock_updates tracepoint
From: Li Chen @ 2026-05-11  8:43 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-ext4, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260511084304.1559557-1-me@linux.beauty>

Commit-time fast commit snapshots run under jbd2_journal_lock_updates(),
so it is useful to quantify the time spent with updates locked and to
understand why snapshotting can fail.

Add a new tracepoint, ext4_fc_lock_updates, reporting the time spent in
the updates-locked window along with the number of snapshotted inodes
and ranges. Record the first snapshot failure reason in a stable snap_err
field for tooling.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes in v7:
- Address Sashiko review by reporting successfully snapshotted inode counts
  in ext4_fc_lock_updates when snapshotting stops early.

Changes in v6:
- Drop explicit ext4_fc_snap_err assignments and rely on enum
  auto-increment.
- Treat locked_ns as trace-only in this patch and calculate it only when
  ext4_fc_lock_updates is enabled, as suggested by Steven Rostedt.

 fs/ext4/ext4.h              | 15 ++++++++
 fs/ext4/fast_commit.c       | 74 +++++++++++++++++++++++++++++--------
 include/trace/events/ext4.h | 61 ++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2a706acdfaf8..df30f8705c98 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1027,6 +1027,21 @@ enum {
 
 struct ext4_fc_inode_snap;
 
+/*
+ * Snapshot failure reasons for ext4_fc_lock_updates tracepoint.
+ * Keep these stable for tooling.
+ */
+enum ext4_fc_snap_err {
+	EXT4_FC_SNAP_ERR_NONE = 0,
+	EXT4_FC_SNAP_ERR_ES_MISS,
+	EXT4_FC_SNAP_ERR_ES_DELAYED,
+	EXT4_FC_SNAP_ERR_ES_OTHER,
+	EXT4_FC_SNAP_ERR_INODES_CAP,
+	EXT4_FC_SNAP_ERR_RANGES_CAP,
+	EXT4_FC_SNAP_ERR_NOMEM,
+	EXT4_FC_SNAP_ERR_INODE_LOC,
+};
+
 /*
  * fourth extended file system inode data in memory
  */
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 9fc17c1fa7af..c24984d8df83 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -194,6 +194,12 @@ static struct kmem_cache *ext4_fc_range_cachep;
 #define EXT4_FC_SNAPSHOT_MAX_INODES	1024
 #define EXT4_FC_SNAPSHOT_MAX_RANGES	2048
 
+static inline void ext4_fc_set_snap_err(int *snap_err, int err)
+{
+	if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE)
+		*snap_err = err;
+}
+
 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
 	BUFFER_TRACE(bh, "");
@@ -967,11 +973,12 @@ static void ext4_fc_free_inode_snap(struct inode *inode)
 static int ext4_fc_snapshot_inode_data(struct inode *inode,
 				       struct list_head *ranges,
 				       unsigned int nr_ranges_total,
-				       unsigned int *nr_rangesp)
+				       unsigned int *nr_rangesp,
+				       int *snap_err)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned int nr_ranges = 0;
 	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
+	unsigned int nr_ranges = 0;
 
 	spin_lock(&ei->i_fc_lock);
 	if (ei->i_fc_lblk_len == 0) {
@@ -996,11 +1003,16 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		ext4_lblk_t len;
 		u64 remaining = (u64)end_lblk - cur_lblk + 1;
 
-		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL))
+		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
 			return -EAGAIN;
+		}
 
-		if (ext4_es_is_delayed(&es))
+		if (ext4_es_is_delayed(&es)) {
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_ES_DELAYED);
 			return -EAGAIN;
+		}
 
 		len = es.es_len - (cur_lblk - es.es_lblk);
 		if (len > remaining)
@@ -1010,12 +1022,17 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 			continue;
 		}
 
-		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES)
+		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_RANGES_CAP);
 			return -E2BIG;
+		}
 
 		range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
-		if (!range)
+		if (!range) {
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 			return -ENOMEM;
+		}
 		nr_ranges++;
 
 		range->lblk = cur_lblk;
@@ -1040,6 +1057,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 				range->len = max;
 		} else {
 			kmem_cache_free(ext4_fc_range_cachep, range);
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
 			return -EAGAIN;
 		}
 
@@ -1059,7 +1077,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 
 static int ext4_fc_snapshot_inode(struct inode *inode,
 				  unsigned int nr_ranges_total,
-				  unsigned int *nr_rangesp)
+				  unsigned int *nr_rangesp, int *snap_err)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_inode_snap *snap;
@@ -1071,8 +1089,10 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 	int alloc_ctx;
 
 	ret = ext4_get_inode_loc_noio(inode, &iloc);
-	if (ret)
+	if (ret) {
+		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
 		return ret;
+	}
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
@@ -1081,6 +1101,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 
 	snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
 	if (!snap) {
+		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 		brelse(iloc.bh);
 		return -ENOMEM;
 	}
@@ -1091,7 +1112,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 	brelse(iloc.bh);
 
 	ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
-					  &nr_ranges);
+					  &nr_ranges, snap_err);
 	if (ret) {
 		kfree(snap);
 		ext4_fc_free_ranges(&ranges);
@@ -1192,7 +1213,10 @@ static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
 					 unsigned int *nr_inodesp);
 
 static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
-				   unsigned int inodes_size)
+				   unsigned int inodes_size,
+				   unsigned int *nr_inodesp,
+				   unsigned int *nr_rangesp,
+				   int *snap_err)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1210,6 +1234,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 		if (i >= inodes_size) {
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_INODES_CAP);
 			ret = -E2BIG;
 			goto unlock;
 		}
@@ -1233,6 +1259,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 			continue;
 
 		if (i >= inodes_size) {
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_INODES_CAP);
 			ret = -E2BIG;
 			goto unlock;
 		}
@@ -1257,16 +1285,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 		unsigned int inode_ranges = 0;
 
 		ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
-					     &inode_ranges);
+					     &inode_ranges, snap_err);
 		if (ret)
 			break;
 		nr_ranges += inode_ranges;
 	}
 
+	if (nr_inodesp)
+		*nr_inodesp = idx;
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
 	return ret;
 }
 
-static int ext4_fc_perform_commit(journal_t *journal)
+static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1275,10 +1307,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	struct inode *inode;
 	struct inode **inodes;
 	unsigned int inodes_size;
+	unsigned int snap_inodes = 0;
+	unsigned int snap_ranges = 0;
+	int snap_err = EXT4_FC_SNAP_ERR_NONE;
 	struct blk_plug plug;
 	int ret = 0;
 	u32 crc = 0;
 	int alloc_ctx;
+	ktime_t lock_start;
+	u64 locked_ns;
 
 	/*
 	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
@@ -1326,13 +1363,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	if (ret)
 		return ret;
 
-
 	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
 	if (ret)
 		return ret;
 
 	/* Step 4: Mark all inodes as being committed. */
 	jbd2_journal_lock_updates(journal);
+	lock_start = ktime_get();
 	/*
 	 * The journal is now locked. No more handles can start and all the
 	 * previous handles are now drained. Snapshotting happens in this
@@ -1346,8 +1383,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
 
-	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
+	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
+				      &snap_inodes, &snap_ranges, &snap_err);
 	jbd2_journal_unlock_updates(journal);
+	if (trace_ext4_fc_lock_updates_enabled()) {
+		locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
+		trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns,
+					   snap_inodes, snap_ranges, ret,
+					   snap_err);
+	}
 	kvfree(inodes);
 	if (ret)
 		return ret;
@@ -1552,7 +1596,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 		journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
 	set_task_ioprio(current, journal_ioprio);
 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
-	ret = ext4_fc_perform_commit(journal);
+	ret = ext4_fc_perform_commit(journal, commit_tid);
 	if (ret < 0) {
 		if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
 			status = EXT4_FC_STATUS_INELIGIBLE;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f493642cf121..7028a28316fa 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -107,6 +107,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
 TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
 TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
 
+#undef EM
+#undef EMe
+#define EM(a)	TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
+#define EMe(a)	TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
+
+#define TRACE_SNAP_ERR						\
+	EM(NONE)						\
+	EM(ES_MISS)						\
+	EM(ES_DELAYED)						\
+	EM(ES_OTHER)						\
+	EM(INODES_CAP)						\
+	EM(RANGES_CAP)						\
+	EM(NOMEM)						\
+	EMe(INODE_LOC)
+
+TRACE_SNAP_ERR
+
+#undef EM
+#undef EMe
+
 #define show_fc_reason(reason)						\
 	__print_symbolic(reason,					\
 		{ EXT4_FC_REASON_XATTR,		"XATTR"},		\
@@ -2818,6 +2838,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
 		  __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
 );
 
+#define EM(a)	{ EXT4_FC_SNAP_ERR_##a, #a },
+#define EMe(a)	{ EXT4_FC_SNAP_ERR_##a, #a }
+
+TRACE_EVENT(ext4_fc_lock_updates,
+	    TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
+		     unsigned int nr_inodes, unsigned int nr_ranges, int err,
+		     int snap_err),
+
+	TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
+
+	TP_STRUCT__entry(/* entry */
+		__field(dev_t, dev)
+		__field(tid_t, tid)
+		__field(u64, locked_ns)
+		__field(unsigned int, nr_inodes)
+		__field(unsigned int, nr_ranges)
+		__field(int, err)
+		__field(int, snap_err)
+	),
+
+	TP_fast_assign(/* assign */
+		__entry->dev = sb->s_dev;
+		__entry->tid = commit_tid;
+		__entry->locked_ns = locked_ns;
+		__entry->nr_inodes = nr_inodes;
+		__entry->nr_ranges = nr_ranges;
+		__entry->err = err;
+		__entry->snap_err = snap_err;
+	),
+
+	TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
+		  __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
+		  __entry->err, __print_symbolic(__entry->snap_err,
+						 TRACE_SNAP_ERR))
+);
+
+#undef EM
+#undef EMe
+#undef TRACE_SNAP_ERR
+
 #define FC_REASON_NAME_STAT(reason)					\
 	show_fc_reason(reason),						\
 	__entry->fc_ineligible_rc[reason]
-- 
2.53.0

^ permalink raw reply related

* [RFC v7 5/7] ext4: fast commit: avoid i_data_sem by dropping ext4_map_blocks() in snapshots
From: Li Chen @ 2026-05-11  8:43 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260511084304.1559557-1-me@linux.beauty>

Commit-time snapshots run under jbd2_journal_lock_updates(), so the work
done there must stay bounded.

The snapshot path still uses ext4_map_blocks() to build data ranges. This
can take i_data_sem and pulls the mapping code into the snapshot logic.
Build inode data range snapshots from the extent status tree instead.

The extent status tree is a cache, not an authoritative source. If the
needed information is missing or unstable (e.g. delayed allocation), treat
the transaction as fast commit ineligible and fall back to full commit.

Also cap the number of inodes and ranges snapshotted per fast commit and
allocate range records from a dedicated slab cache. The inode pointer
array is allocated outside the updates-locked window.

Testing: QEMU/KVM guest, virtio-pmem + dax, ext4 -O fast_commit, mounted
dax,noatime. Ran python3 500x {4K write + fsync}, fallocate 256M, and
python3 500x {creat + fsync(dir)} without lockdep splats or errors.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v7:
- Address Sashiko review by guarding snapshot range arithmetic near
  EXT_MAX_BLOCKS to avoid cur_lblk / remaining-range wraparound in the
  snapshot walk.

 fs/ext4/fast_commit.c | 257 +++++++++++++++++++++++++++++-------------
 1 file changed, 181 insertions(+), 76 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index f9bb18c0b549..9fc17c1fa7af 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -184,6 +184,15 @@
 
 #include <trace/events/ext4.h>
 static struct kmem_cache *ext4_fc_dentry_cachep;
+static struct kmem_cache *ext4_fc_range_cachep;
+
+/*
+ * Avoid spending unbounded time/memory snapshotting highly fragmented files
+ * under jbd2_journal_lock_updates(). If we exceed this limit, fall back to
+ * full commit.
+ */
+#define EXT4_FC_SNAPSHOT_MAX_INODES	1024
+#define EXT4_FC_SNAPSHOT_MAX_RANGES	2048
 
 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
@@ -938,7 +947,7 @@ static void ext4_fc_free_ranges(struct list_head *head)
 
 	list_for_each_entry_safe(range, range_n, head, list) {
 		list_del(&range->list);
-		kfree(range);
+		kmem_cache_free(ext4_fc_range_cachep, range);
 	}
 }
 
@@ -956,16 +965,19 @@ static void ext4_fc_free_inode_snap(struct inode *inode)
 }
 
 static int ext4_fc_snapshot_inode_data(struct inode *inode,
-				       struct list_head *ranges)
+				       struct list_head *ranges,
+				       unsigned int nr_ranges_total,
+				       unsigned int *nr_rangesp)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int nr_ranges = 0;
 	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
-	struct ext4_map_blocks map;
-	int ret;
 
 	spin_lock(&ei->i_fc_lock);
 	if (ei->i_fc_lblk_len == 0) {
 		spin_unlock(&ei->i_fc_lock);
+		if (nr_rangesp)
+			*nr_rangesp = 0;
 		return 0;
 	}
 	start_lblk = ei->i_fc_lblk_start;
@@ -979,61 +991,82 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		   (unsigned long long)inode->i_ino);
 
 	while (cur_lblk <= end_lblk) {
+		struct extent_status es;
 		struct ext4_fc_range *range;
+		ext4_lblk_t len;
+		u64 remaining = (u64)end_lblk - cur_lblk + 1;
 
-		map.m_lblk = cur_lblk;
-		map.m_len = end_lblk - cur_lblk + 1;
-		ret = ext4_map_blocks(NULL, inode, &map,
-				      EXT4_GET_BLOCKS_IO_SUBMIT |
-				      EXT4_EX_NOCACHE);
-		if (ret < 0)
-			return -ECANCELED;
+		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL))
+			return -EAGAIN;
+
+		if (ext4_es_is_delayed(&es))
+			return -EAGAIN;
 
-		if (map.m_len == 0) {
+		len = es.es_len - (cur_lblk - es.es_lblk);
+		if (len > remaining)
+			len = remaining;
+		if (len == 0) {
 			cur_lblk++;
 			continue;
 		}
 
-		range = kmalloc(sizeof(*range), GFP_NOFS);
+		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES)
+			return -E2BIG;
+
+		range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
 		if (!range)
 			return -ENOMEM;
+		nr_ranges++;
 
-		range->lblk = map.m_lblk;
-		range->len = map.m_len;
+		range->lblk = cur_lblk;
+		range->len = len;
 		range->pblk = 0;
 		range->unwritten = false;
 
-		if (ret == 0) {
+		if (ext4_es_is_hole(&es)) {
 			range->tag = EXT4_FC_TAG_DEL_RANGE;
-		} else {
-			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
-				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
-
-			/* Limit the number of blocks in one extent */
-			map.m_len = min(max, map.m_len);
+		} else if (ext4_es_is_written(&es) ||
+			   ext4_es_is_unwritten(&es)) {
+			unsigned int max;
 
 			range->tag = EXT4_FC_TAG_ADD_RANGE;
-			range->len = map.m_len;
-			range->pblk = map.m_pblk;
-			range->unwritten = !!(map.m_flags & EXT4_MAP_UNWRITTEN);
+			range->pblk = ext4_es_pblock(&es) +
+				      (cur_lblk - es.es_lblk);
+			range->unwritten = ext4_es_is_unwritten(&es);
+
+			max = range->unwritten ? EXT_UNWRITTEN_MAX_LEN :
+						 EXT_INIT_MAX_LEN;
+			if (range->len > max)
+				range->len = max;
+		} else {
+			kmem_cache_free(ext4_fc_range_cachep, range);
+			return -EAGAIN;
 		}
 
 		INIT_LIST_HEAD(&range->list);
 		list_add_tail(&range->list, ranges);
 
-		cur_lblk += map.m_len;
+		if ((u64)range->len > (u64)end_lblk - cur_lblk)
+			break;
+
+		cur_lblk += range->len;
 	}
 
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
 	return 0;
 }
 
-static int ext4_fc_snapshot_inode(struct inode *inode)
+static int ext4_fc_snapshot_inode(struct inode *inode,
+				  unsigned int nr_ranges_total,
+				  unsigned int *nr_rangesp)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_inode_snap *snap;
 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 	struct ext4_iloc iloc;
 	LIST_HEAD(ranges);
+	unsigned int nr_ranges = 0;
 	int ret;
 	int alloc_ctx;
 
@@ -1057,7 +1090,8 @@ static int ext4_fc_snapshot_inode(struct inode *inode)
 	memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len);
 	brelse(iloc.bh);
 
-	ret = ext4_fc_snapshot_inode_data(inode, &ranges);
+	ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
+					  &nr_ranges);
 	if (ret) {
 		kfree(snap);
 		ext4_fc_free_ranges(&ranges);
@@ -1070,10 +1104,11 @@ static int ext4_fc_snapshot_inode(struct inode *inode)
 	list_splice_tail_init(&ranges, &snap->data_list);
 	ext4_fc_unlock(inode->i_sb, alloc_ctx);
 
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
 	return 0;
 }
 
-
 /* Flushes data of all the inodes in the commit queue. */
 static int ext4_fc_flush_data(journal_t *journal)
 {
@@ -1152,49 +1187,32 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 	return 0;
 }
 
-static int ext4_fc_snapshot_inodes(journal_t *journal)
+static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
+					 struct inode ***inodesp,
+					 unsigned int *nr_inodesp);
+
+static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
+				   unsigned int inodes_size)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_inode_info *iter;
 	struct ext4_fc_dentry_update *fc_dentry;
-	struct inode **inodes;
-	unsigned int nr_inodes = 0;
 	unsigned int i = 0;
+	unsigned int idx;
+	unsigned int nr_ranges = 0;
 	int ret = 0;
 	int alloc_ctx;
 
-	alloc_ctx = ext4_fc_lock(sb);
-	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
-		nr_inodes++;
-
-	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
-		struct ext4_inode_info *ei;
-
-		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
-			continue;
-		if (list_empty(&fc_dentry->fcd_dilist))
-			continue;
-
-		/* See the comment in ext4_fc_commit_dentry_updates(). */
-		ei = list_first_entry(&fc_dentry->fcd_dilist,
-				      struct ext4_inode_info, i_fc_dilist);
-		if (!list_empty(&ei->i_fc_list))
-			continue;
-
-		nr_inodes++;
-	}
-	ext4_fc_unlock(sb, alloc_ctx);
-
-	if (!nr_inodes)
+	if (!inodes_size)
 		return 0;
 
-	inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS);
-	if (!inodes)
-		return -ENOMEM;
-
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+		if (i >= inodes_size) {
+			ret = -E2BIG;
+			goto unlock;
+		}
 		inodes[i++] = &iter->vfs_inode;
 	}
 
@@ -1214,6 +1232,10 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 		if (!list_empty(&ei->i_fc_list))
 			continue;
 
+		if (i >= inodes_size) {
+			ret = -E2BIG;
+			goto unlock;
+		}
 		/*
 		 * Create-only inodes may only be referenced via fcd_dilist and
 		 * not appear on s_fc_q[MAIN]. They may hit the last iput while
@@ -1225,15 +1247,22 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 		ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 		inodes[i++] = inode;
 	}
+unlock:
 	ext4_fc_unlock(sb, alloc_ctx);
 
-	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
-		ret = ext4_fc_snapshot_inode(inodes[nr_inodes]);
+	if (ret)
+		return ret;
+
+	for (idx = 0; idx < i; idx++) {
+		unsigned int inode_ranges = 0;
+
+		ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
+					     &inode_ranges);
 		if (ret)
 			break;
+		nr_ranges += inode_ranges;
 	}
 
-	kvfree(inodes);
 	return ret;
 }
 
@@ -1244,6 +1273,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	struct ext4_inode_info *iter;
 	struct ext4_fc_head head;
 	struct inode *inode;
+	struct inode **inodes;
+	unsigned int inodes_size;
 	struct blk_plug plug;
 	int ret = 0;
 	u32 crc = 0;
@@ -1296,6 +1327,10 @@ static int ext4_fc_perform_commit(journal_t *journal)
 		return ret;
 
 
+	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
+	if (ret)
+		return ret;
+
 	/* Step 4: Mark all inodes as being committed. */
 	jbd2_journal_lock_updates(journal);
 	/*
@@ -1311,8 +1346,9 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
 
-	ret = ext4_fc_snapshot_inodes(journal);
+	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
 	jbd2_journal_unlock_updates(journal);
+	kvfree(inodes);
 	if (ret)
 		return ret;
 
@@ -1368,6 +1404,64 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	return ret;
 }
 
+static unsigned int ext4_fc_count_snapshot_inodes(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_inode_info *iter;
+	struct ext4_fc_dentry_update *fc_dentry;
+	unsigned int nr_inodes = 0;
+	int alloc_ctx;
+
+	alloc_ctx = ext4_fc_lock(sb);
+	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
+		nr_inodes++;
+
+	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		struct ext4_inode_info *ei;
+
+		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
+			continue;
+		if (list_empty(&fc_dentry->fcd_dilist))
+			continue;
+
+		/* See the comment in ext4_fc_commit_dentry_updates(). */
+		ei = list_first_entry(&fc_dentry->fcd_dilist,
+				      struct ext4_inode_info, i_fc_dilist);
+		if (!list_empty(&ei->i_fc_list))
+			continue;
+
+		nr_inodes++;
+	}
+	ext4_fc_unlock(sb, alloc_ctx);
+
+	return nr_inodes;
+}
+
+static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
+					 struct inode ***inodesp,
+					 unsigned int *nr_inodesp)
+{
+	unsigned int nr_inodes = ext4_fc_count_snapshot_inodes(sb);
+	struct inode **inodes;
+
+	*inodesp = NULL;
+	*nr_inodesp = 0;
+
+	if (!nr_inodes)
+		return 0;
+
+	if (nr_inodes > EXT4_FC_SNAPSHOT_MAX_INODES)
+		return -E2BIG;
+
+	inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS);
+	if (!inodes)
+		return -ENOMEM;
+
+	*inodesp = inodes;
+	*nr_inodesp = nr_inodes;
+	return 0;
+}
+
 static void ext4_fc_update_stats(struct super_block *sb, int status,
 				 u64 commit_time, int nblks, tid_t commit_tid)
 {
@@ -1460,7 +1554,10 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
 	ret = ext4_fc_perform_commit(journal);
 	if (ret < 0) {
-		status = EXT4_FC_STATUS_FAILED;
+		if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
+			status = EXT4_FC_STATUS_INELIGIBLE;
+		else
+			status = EXT4_FC_STATUS_FAILED;
 		goto fallback;
 	}
 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
@@ -1544,34 +1641,35 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 
 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
-					     struct ext4_fc_dentry_update,
-					     fcd_list);
+						 struct ext4_fc_dentry_update,
+						 fcd_list);
 		list_del_init(&fc_dentry->fcd_list);
 		if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT &&
-		    !list_empty(&fc_dentry->fcd_dilist)) {
+			!list_empty(&fc_dentry->fcd_dilist)) {
 			/* See the comment in ext4_fc_commit_dentry_updates(). */
 			ei = list_first_entry(&fc_dentry->fcd_dilist,
-					      struct ext4_inode_info,
-					      i_fc_dilist);
+						  struct ext4_inode_info,
+						  i_fc_dilist);
 			ext4_fc_free_inode_snap(&ei->vfs_inode);
 			spin_lock(&ei->i_fc_lock);
 			ext4_clear_inode_state(&ei->vfs_inode,
-					       EXT4_STATE_FC_REQUEUE);
+						   EXT4_STATE_FC_REQUEUE);
 			ext4_clear_inode_state(&ei->vfs_inode,
-					       EXT4_STATE_FC_COMMITTING);
+						   EXT4_STATE_FC_COMMITTING);
 			spin_unlock(&ei->i_fc_lock);
 			/*
 			 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
-			 * visible before we send the wakeup. Pairs with implicit
-			 * barrier in prepare_to_wait() in ext4_fc_del().
+			 * visible before we send the wakeup. Pairs with
+			 * implicit barrier in prepare_to_wait() in
+			 * ext4_fc_del().
 			 */
 			smp_mb();
 #if (BITS_PER_LONG < 64)
 			wake_up_bit(&ei->i_state_flags,
-				    EXT4_STATE_FC_COMMITTING);
+					EXT4_STATE_FC_COMMITTING);
 #else
 			wake_up_bit(&ei->i_flags,
-				    EXT4_STATE_FC_COMMITTING);
+					EXT4_STATE_FC_COMMITTING);
 #endif
 		}
 		list_del_init(&fc_dentry->fcd_dilist);
@@ -2548,13 +2646,20 @@ int __init ext4_fc_init_dentry_cache(void)
 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
 					   SLAB_RECLAIM_ACCOUNT);
 
-	if (ext4_fc_dentry_cachep == NULL)
+	if (!ext4_fc_dentry_cachep)
 		return -ENOMEM;
 
+	ext4_fc_range_cachep = KMEM_CACHE(ext4_fc_range, SLAB_RECLAIM_ACCOUNT);
+	if (!ext4_fc_range_cachep) {
+		kmem_cache_destroy(ext4_fc_dentry_cachep);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
 void ext4_fc_destroy_dentry_cache(void)
 {
+	kmem_cache_destroy(ext4_fc_range_cachep);
 	kmem_cache_destroy(ext4_fc_dentry_cachep);
 }
-- 
2.53.0

^ permalink raw reply related

* [RFC v7 4/7] ext4: fast commit: avoid self-deadlock in inode snapshotting
From: Li Chen @ 2026-05-11  8:42 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260511084304.1559557-1-me@linux.beauty>

ext4_fc_snapshot_inodes() used igrab()/iput() to pin inodes while building
commit-time snapshots. With ext4_fc_del() waiting for
EXT4_STATE_FC_COMMITTING, iput() can trigger
ext4_clear_inode()->ext4_fc_del() in the commit thread and deadlock waiting
for the fast commit to finish.

Avoid taking extra references. Collect inode pointers under s_fc_lock and
rely on EXT4_STATE_FC_COMMITTING to pin inodes until ext4_fc_cleanup()
clears the bit.

Also set EXT4_STATE_FC_COMMITTING for create-only inodes referenced
from the dentry update queue, and wake up waiters when ext4_fc_cleanup()
clears the bit.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
 fs/ext4/fast_commit.c | 47 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 273bf34031ae..f9bb18c0b549 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1195,13 +1195,12 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-		inodes[i] = igrab(&iter->vfs_inode);
-		if (inodes[i])
-			i++;
+		inodes[i++] = &iter->vfs_inode;
 	}
 
 	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 		struct ext4_inode_info *ei;
+		struct inode *inode;
 
 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
 			continue;
@@ -1211,12 +1210,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 		/* See the comment in ext4_fc_commit_dentry_updates(). */
 		ei = list_first_entry(&fc_dentry->fcd_dilist,
 				      struct ext4_inode_info, i_fc_dilist);
+		inode = &ei->vfs_inode;
 		if (!list_empty(&ei->i_fc_list))
 			continue;
 
-		inodes[i] = igrab(&ei->vfs_inode);
-		if (inodes[i])
-			i++;
+		/*
+		 * Create-only inodes may only be referenced via fcd_dilist and
+		 * not appear on s_fc_q[MAIN]. They may hit the last iput while
+		 * we are snapshotting, but inode eviction calls ext4_fc_del(),
+		 * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING
+		 * so the inode stays pinned and the snapshot stays valid until
+		 * ext4_fc_cleanup().
+		 */
+		ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+		inodes[i++] = inode;
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
 
@@ -1226,10 +1233,6 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 			break;
 	}
 
-	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
-		if (inodes[nr_inodes])
-			iput(inodes[nr_inodes]);
-	}
 	kvfree(inodes);
 	return ret;
 }
@@ -1297,8 +1300,9 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	jbd2_journal_lock_updates(journal);
 	/*
 	 * The journal is now locked. No more handles can start and all the
-	 * previous handles are now drained. We now mark the inodes on the
-	 * commit queue as being committed.
+	 * previous handles are now drained. Snapshotting happens in this
+	 * window so log writing can consume only stable snapshots without
+	 * doing logical-to-physical mapping.
 	 */
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
@@ -1550,6 +1554,25 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 					      struct ext4_inode_info,
 					      i_fc_dilist);
 			ext4_fc_free_inode_snap(&ei->vfs_inode);
+			spin_lock(&ei->i_fc_lock);
+			ext4_clear_inode_state(&ei->vfs_inode,
+					       EXT4_STATE_FC_REQUEUE);
+			ext4_clear_inode_state(&ei->vfs_inode,
+					       EXT4_STATE_FC_COMMITTING);
+			spin_unlock(&ei->i_fc_lock);
+			/*
+			 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+			 * visible before we send the wakeup. Pairs with implicit
+			 * barrier in prepare_to_wait() in ext4_fc_del().
+			 */
+			smp_mb();
+#if (BITS_PER_LONG < 64)
+			wake_up_bit(&ei->i_state_flags,
+				    EXT4_STATE_FC_COMMITTING);
+#else
+			wake_up_bit(&ei->i_flags,
+				    EXT4_STATE_FC_COMMITTING);
+#endif
 		}
 		list_del_init(&fc_dentry->fcd_dilist);
 
-- 
2.53.0


^ permalink raw reply related

* [RFC v7 3/7] ext4: fast commit: avoid waiting for FC_COMMITTING
From: Li Chen @ 2026-05-11  8:42 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260511084304.1559557-1-me@linux.beauty>

ext4_fc_track_inode() can be called while holding i_data_sem (e.g.
fallocate). Waiting for EXT4_STATE_FC_COMMITTING in that case risks an
ABBA deadlock: i_data_sem -> wait(FC_COMMITTING) vs FC_COMMITTING ->
wait(i_data_sem) in the commit task.

Now that fast commit snapshots inode state at commit time, updates during
log writing do not need to block. Drop the wait and lockdep assertion in
ext4_fc_track_inode(), and make ext4_fc_del() wait for FC_COMMITTING so an
inode cannot be removed while the commit thread is still using it.

When an inode is modified during a fast commit, mark it with
EXT4_STATE_FC_REQUEUE so cleanup keeps it queued for the next fast commit.
This is needed because jbd2_fc_end_commit() invokes the cleanup callback
with tid == 0, so tid-based requeue logic would requeue every inode.

Testing: traced ext4:ext4_fc_commit_stop across two fsyncs in the same
transaction. nblks is the number of journal blocks written for that fast
commit. Before this change, the second fsync still wrote almost the same
fast commit log (nblks 10->9), because tid == 0 in jbd2_fc_end_commit()
caused the tid-based requeue logic to keep all inodes queued. After this
change, only inodes modified during the commit are requeued, and the
second fsync wrote a nearly empty fast commit (nblks 10->1).

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
 fs/ext4/ext4.h        |   1 +
 fs/ext4/fast_commit.c | 111 ++++++++++++++++++++----------------------
 2 files changed, 53 insertions(+), 59 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 05c8f67625b4..2a706acdfaf8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1991,6 +1991,7 @@ enum {
 	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
 	EXT4_STATE_FC_FLUSHING_DATA,	/* Fast commit flushing data */
 	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
+	EXT4_STATE_FC_REQUEUE,		/* Inode modified during fast commit */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index cd4eac4e7dcb..273bf34031ae 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -62,9 +62,8 @@
  *     setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state
  *     needed for log writing.
  * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows
- *     starting of new handles. If new handles try to start an update on
- *     any of the inodes that are being committed, ext4_fc_track_inode()
- *     will block until those inodes have finished the fast commit.
+ *     starting of new handles. Updates to inodes being fast committed are
+ *     tracked for requeue rather than blocking.
  * [6] Commit all the directory entry updates in the fast commit space.
  * [7] Commit all the changed inodes in the fast commit space.
  * [8] Write tail tag (this tag ensures the atomicity, please read the following
@@ -218,6 +217,7 @@ void ext4_fc_init_inode(struct inode *inode)
 
 	ext4_fc_reset_inode(inode);
 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+	ext4_clear_inode_state(inode, EXT4_STATE_FC_REQUEUE);
 	INIT_LIST_HEAD(&ei->i_fc_list);
 	INIT_LIST_HEAD(&ei->i_fc_dilist);
 	ei->i_fc_snap = NULL;
@@ -257,22 +257,30 @@ void ext4_fc_del(struct inode *inode)
 	}
 
 	/*
-	 * Since ext4_fc_del is called from ext4_evict_inode while having a
-	 * handle open, there is no need for us to wait here even if a fast
-	 * commit is going on. That is because, if this inode is being
-	 * committed, ext4_mark_inode_dirty would have waited for inode commit
-	 * operation to finish before we come here. So, by the time we come
-	 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
-	 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
-	 * here.
-	 *
-	 * We may come here without any handles open in the "no_delete" case of
-	 * ext4_evict_inode as well. However, if that happens, we first mark the
-	 * file system as fast commit ineligible anyway. So, even in that case,
-	 * it is okay to remove the inode from the fc list.
+	 * Wait for ongoing fast commit to finish. We cannot remove the inode
+	 * from fast commit lists while it is being committed.
 	 */
-	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
-		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+#if (BITS_PER_LONG < 64)
+		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+				EXT4_STATE_FC_COMMITTING);
+		wq = bit_waitqueue(&ei->i_state_flags,
+				   EXT4_STATE_FC_COMMITTING);
+#else
+		DEFINE_WAIT_BIT(wait, &ei->i_flags,
+				EXT4_STATE_FC_COMMITTING);
+		wq = bit_waitqueue(&ei->i_flags,
+				   EXT4_STATE_FC_COMMITTING);
+#endif
+		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+			ext4_fc_unlock(inode->i_sb, alloc_ctx);
+			schedule();
+			alloc_ctx = ext4_fc_lock(inode->i_sb);
+		}
+		finish_wait(wq, &wait.wq_entry);
+	}
+
 	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
 #if (BITS_PER_LONG < 64)
 		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
@@ -293,19 +301,22 @@ void ext4_fc_del(struct inode *inode)
 		}
 		finish_wait(wq, &wait.wq_entry);
 	}
+
 	ext4_fc_free_inode_snap(inode);
 	list_del_init(&ei->i_fc_list);
 
 	/*
-	 * Since this inode is getting removed, let's also remove all FC
-	 * dentry create references, since it is not needed to log it anyways.
+	 * Since this inode is getting removed, let's also remove all FC dentry
+	 * create references, since there is no need to log them anyway.
 	 */
 	if (list_empty(&ei->i_fc_dilist)) {
 		ext4_fc_unlock(inode->i_sb, alloc_ctx);
 		return;
 	}
 
-	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+	fc_dentry = list_first_entry(&ei->i_fc_dilist,
+				     struct ext4_fc_dentry_update,
+				     fcd_dilist);
 	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
 	list_del_init(&fc_dentry->fcd_list);
 	list_del_init(&fc_dentry->fcd_dilist);
@@ -377,6 +388,8 @@ static int ext4_fc_track_template(
 
 	tid = handle->h_transaction->t_tid;
 	spin_lock(&ei->i_fc_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+		ext4_set_inode_state(inode, EXT4_STATE_FC_REQUEUE);
 	if (tid == ei->i_sync_tid) {
 		update = true;
 	} else {
@@ -547,8 +560,6 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
 
 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	wait_queue_head_t *wq;
 	int ret;
 
 	if (S_ISDIR(inode->i_mode))
@@ -564,29 +575,11 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 		return;
 
 	/*
-	 * If we come here, we may sleep while waiting for the inode to
-	 * commit. We shouldn't be holding i_data_sem when we go to sleep since
-	 * the commit path needs to grab the lock while committing the inode.
+	 * Fast commit snapshots inode state at commit time, so there's no need
+	 * to wait for EXT4_STATE_FC_COMMITTING here. If the inode is already
+	 * on the commit queue, ext4_fc_cleanup() will requeue it for the new
+	 * transaction once the current commit finishes.
 	 */
-	lockdep_assert_not_held(&ei->i_data_sem);
-
-	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-#if (BITS_PER_LONG < 64)
-		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
-				EXT4_STATE_FC_COMMITTING);
-		wq = bit_waitqueue(&ei->i_state_flags,
-				   EXT4_STATE_FC_COMMITTING);
-#else
-		DEFINE_WAIT_BIT(wait, &ei->i_flags,
-				EXT4_STATE_FC_COMMITTING);
-		wq = bit_waitqueue(&ei->i_flags,
-				   EXT4_STATE_FC_COMMITTING);
-#endif
-		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
-			schedule();
-		finish_wait(wq, &wait.wq_entry);
-	}
 
 	/*
 	 * From this point on, this inode will not be committed either
@@ -1510,32 +1503,32 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 
 	alloc_ctx = ext4_fc_lock(sb);
 	while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+		bool requeue;
+
 		ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
 					struct ext4_inode_info,
 					i_fc_list);
 		list_del_init(&ei->i_fc_list);
 		ext4_fc_free_inode_snap(&ei->vfs_inode);
+		spin_lock(&ei->i_fc_lock);
+		if (full)
+			requeue = !tid_geq(tid, ei->i_sync_tid);
+		else
+			requeue = ext4_test_inode_state(&ei->vfs_inode,
+							EXT4_STATE_FC_REQUEUE);
+		if (!requeue)
+			ext4_fc_reset_inode(&ei->vfs_inode);
+		ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_REQUEUE);
 		ext4_clear_inode_state(&ei->vfs_inode,
 				       EXT4_STATE_FC_COMMITTING);
-		if (tid_geq(tid, ei->i_sync_tid)) {
-			ext4_fc_reset_inode(&ei->vfs_inode);
-		} else if (full) {
-			/*
-			 * We are called after a full commit, inode has been
-			 * modified while the commit was running. Re-enqueue
-			 * the inode into STAGING, which will then be splice
-			 * back into MAIN. This cannot happen during
-			 * fastcommit because the journal is locked all the
-			 * time in that case (and tid doesn't increase so
-			 * tid check above isn't reliable).
-			 */
+		spin_unlock(&ei->i_fc_lock);
+		if (requeue)
 			list_add_tail(&ei->i_fc_list,
 				      &sbi->s_fc_q[FC_Q_STAGING]);
-		}
 		/*
 		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
 		 * visible before we send the wakeup. Pairs with implicit
-		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
+		 * barrier in prepare_to_wait() in ext4_fc_del().
 		 */
 		smp_mb();
 #if (BITS_PER_LONG < 64)
-- 
2.53.0


^ permalink raw reply related

* [RFC v7 2/7] ext4: lockdep: handle i_data_sem subclassing for special inodes
From: Li Chen @ 2026-05-11  8:42 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260511084304.1559557-1-me@linux.beauty>

Fast commit can hold s_fc_lock while writing journal blocks. Mapping the
journal inode can take its i_data_sem. Normal inode update paths can take a
data inode i_data_sem and then s_fc_lock, which makes lockdep report a
circular dependency.

lockdep treats all i_data_sem instances as one lock class and cannot
distinguish the journal inode i_data_sem from a regular inode i_data_sem.
The journal inode is not tracked by fast commit and no FC waiters ever
depend on it, so this is not a real ABBA deadlock. Assign the journal inode
a dedicated i_data_sem lockdep subclass to avoid the false positive.

Inode cache objects can be recycled, so also reset the i_data_sem lockdep
subclass to I_DATA_SEM_NORMAL when allocating an ext4 inode. Otherwise a
new inode may inherit an old subclass (journal/quota/ea) and trigger
lockdep warnings.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v6:
- Rebase onto linux-next master as of 2026-04-08.
- Refresh the patch context around upstream ext4_alloc_inode() changes,
  without changing the subclassing logic.

 fs/ext4/ext4.h  | 4 +++-
 fs/ext4/super.c | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e01d00dbc077..05c8f67625b4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1015,12 +1015,14 @@ do {										\
  *			  than the first
  *  I_DATA_SEM_QUOTA  - Used for quota inodes only
  *  I_DATA_SEM_EA     - Used for ea_inodes only
+ *  I_DATA_SEM_JOURNAL - Used for journal inode only
  */
 enum {
 	I_DATA_SEM_NORMAL = 0,
 	I_DATA_SEM_OTHER,
 	I_DATA_SEM_QUOTA,
-	I_DATA_SEM_EA
+	I_DATA_SEM_EA,
+	I_DATA_SEM_JOURNAL
 };
 
 struct ext4_fc_inode_snap;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..3c869f0001c5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1431,6 +1431,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ext4_fc_init_inode(&ei->vfs_inode);
 	spin_lock_init(&ei->i_fc_lock);
 	mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_NORMAL);
+#endif
 	return &ei->vfs_inode;
 }
 
@@ -5910,6 +5913,11 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
 		return ERR_PTR(-EFSCORRUPTED);
 	}
 
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_subclass(&EXT4_I(journal_inode)->i_data_sem,
+			     I_DATA_SEM_JOURNAL);
+#endif
+
 	ext4_debug("Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	return journal_inode;
-- 
2.53.0


^ permalink raw reply related

* [RFC v7 1/7] ext4: fast commit: snapshot inode state before writing log
From: Li Chen @ 2026-05-11  8:42 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260511084304.1559557-1-me@linux.beauty>

Fast commit writes inode metadata and data range updates after unlocking
journal updates. New handles can start at that point, so the log writing
path must not look at live inode state.

Add a commit-time per-inode snapshot and populate it while journal updates
are locked and existing handles are drained. Store the snapshot behind
ext4_inode_info->i_fc_snap so ext4_inode_info only grows by one pointer.
The snapshot contains a copy of the on-disk inode plus the data range
records needed for fast commit TLVs.

Snapshotting runs under jbd2_journal_lock_updates(). Avoid triggering I/O
there by using ext4_get_inode_loc_noio() and falling back to full commit
if the inode table block is not present or not uptodate.

Log writing then only serializes the snapshot, so it no longer needs to
call ext4_map_blocks() and take i_data_sem under s_fc_lock. The snapshot
is installed and freed under s_fc_lock and is released from fast commit
cleanup and inode eviction.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v7:
- Drop the stale i_fc_wait initialization after rebasing onto the new
  linux-next base.

Changes in v6:
- Rebase onto linux-next master as of 2026-04-08.
- Fix the inode debug print format after rebasing.

 fs/ext4/ext4.h        |  22 ++-
 fs/ext4/fast_commit.c | 331 +++++++++++++++++++++++++++++++++++-------
 fs/ext4/inode.c       |  51 +++++++
 3 files changed, 352 insertions(+), 52 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..e01d00dbc077 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1023,6 +1023,7 @@ enum {
 	I_DATA_SEM_EA
 };
 
+struct ext4_fc_inode_snap;
 
 /*
  * fourth extended file system inode data in memory
@@ -1079,6 +1080,22 @@ struct ext4_inode_info {
 	/* End of lblk range that needs to be committed in this fast commit */
 	ext4_lblk_t i_fc_lblk_len;
 
+	/*
+	 * Commit-time fast commit snapshots.
+	 *
+	 * i_fc_snap is installed and freed under sbi->s_fc_lock. The fast
+	 * commit log writing path reads the snapshot under sbi->s_fc_lock while
+	 * serializing fast commit TLVs.
+	 *
+	 * The snapshot lifetime is bounded by EXT4_STATE_FC_COMMITTING and the
+	 * corresponding cleanup / eviction paths.
+	 *
+	 * i_fc_snap points to per-inode snapshot data for fast commit:
+	 * - a raw inode snapshot for EXT4_FC_TAG_INODE
+	 * - data range records for EXT4_FC_TAG_{ADD,DEL}_RANGE
+	 */
+	struct ext4_fc_inode_snap *i_fc_snap;
+
 	spinlock_t i_raw_lock;	/* protects updates to the raw inode */
 
 	/*
@@ -3080,8 +3097,9 @@ extern int  ext4_file_getattr(struct mnt_idmap *, const struct path *,
 			      struct kstat *, u32, unsigned int);
 extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
-extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
-extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc);
+int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc);
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
 			  struct ext4_iloc *iloc);
 extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b3c22636251d..cd4eac4e7dcb 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -56,21 +56,23 @@
  *     deleted while it is being flushed.
  * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
  *     state.
- * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
- *     all the exsiting handles finish and no new handles can start.
- * [4] Mark all the fast commit eligible inodes as undergoing fast commit
- *     by setting "EXT4_STATE_FC_COMMITTING" state.
- * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * [3] Lock the journal by calling jbd2_journal_lock_updates(). This ensures
+ *     that all the existing handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit by
+ *     setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state
+ *     needed for log writing.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows
  *     starting of new handles. If new handles try to start an update on
  *     any of the inodes that are being committed, ext4_fc_track_inode()
  *     will block until those inodes have finished the fast commit.
  * [6] Commit all the directory entry updates in the fast commit space.
- * [7] Commit all the changed inodes in the fast commit space and clear
- *     "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [7] Commit all the changed inodes in the fast commit space.
  * [8] Write tail tag (this tag ensures the atomicity, please read the following
  *     section for more details).
+ * [9] Clear "EXT4_STATE_FC_COMMITTING" and wake up waiters in
+ *     ext4_fc_cleanup().
  *
- * All the inode updates must be enclosed within jbd2_jounrnal_start()
+ * All the inode updates must be enclosed within jbd2_journal_start()
  * and jbd2_journal_stop() similar to JBD2 journaling.
  *
  * Fast Commit Ineligibility
@@ -200,6 +202,8 @@ static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 	unlock_buffer(bh);
 }
 
+static void ext4_fc_free_inode_snap(struct inode *inode);
+
 static inline void ext4_fc_reset_inode(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -216,6 +220,7 @@ void ext4_fc_init_inode(struct inode *inode)
 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 	INIT_LIST_HEAD(&ei->i_fc_list);
 	INIT_LIST_HEAD(&ei->i_fc_dilist);
+	ei->i_fc_snap = NULL;
 }
 
 static bool ext4_fc_disabled(struct super_block *sb)
@@ -246,6 +251,7 @@ void ext4_fc_del(struct inode *inode)
 
 	alloc_ctx = ext4_fc_lock(inode->i_sb);
 	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
+		ext4_fc_free_inode_snap(inode);
 		ext4_fc_unlock(inode->i_sb, alloc_ctx);
 		return;
 	}
@@ -287,6 +293,7 @@ void ext4_fc_del(struct inode *inode)
 		}
 		finish_wait(wq, &wait.wq_entry);
 	}
+	ext4_fc_free_inode_snap(inode);
 	list_del_init(&ei->i_fc_list);
 
 	/*
@@ -829,6 +836,21 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 	return true;
 }
 
+struct ext4_fc_range {
+	struct list_head list;
+	u16 tag;
+	ext4_lblk_t lblk;
+	ext4_lblk_t len;
+	ext4_fsblk_t pblk;
+	bool unwritten;
+};
+
+struct ext4_fc_inode_snap {
+	struct list_head data_list;
+	unsigned int inode_len;
+	u8 inode_buf[];
+};
+
 /*
  * Writes inode in the fast commit space under TLV with tag @tag.
  * Returns 0 on success, error on failure.
@@ -836,21 +858,21 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
-	int ret;
-	struct ext4_iloc iloc;
+	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
 	struct ext4_fc_inode fc_inode;
 	struct ext4_fc_tl tl;
 	u8 *dst;
+	u8 *src;
+	int inode_len;
+	int ret;
 
-	ret = ext4_get_inode_loc(inode, &iloc);
-	if (ret)
-		return ret;
+	if (!snap)
+		return -ECANCELED;
 
-	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
-		inode_len = EXT4_INODE_SIZE(inode->i_sb);
-	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
-		inode_len += ei->i_extra_isize;
+	src = snap->inode_buf;
+	inode_len = snap->inode_len;
+	if (!src || inode_len == 0)
+		return -ECANCELED;
 
 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
@@ -866,10 +888,9 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 	dst += EXT4_FC_TAG_BASE_LEN;
 	memcpy(dst, &fc_inode, sizeof(fc_inode));
 	dst += sizeof(fc_inode);
-	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
+	memcpy(dst, src, inode_len);
 	ret = 0;
 err:
-	brelse(iloc.bh);
 	return ret;
 }
 
@@ -879,12 +900,74 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
  */
 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 {
-	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_map_blocks map;
+	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
 	struct ext4_fc_add_range fc_ext;
 	struct ext4_fc_del_range lrange;
 	struct ext4_extent *ex;
+	struct ext4_fc_range *range;
+
+	if (!snap)
+		return -ECANCELED;
+
+	list_for_each_entry(range, &snap->data_list, list) {
+		if (range->tag == EXT4_FC_TAG_DEL_RANGE) {
+			lrange.fc_ino = cpu_to_le32(inode->i_ino);
+			lrange.fc_lblk = cpu_to_le32(range->lblk);
+			lrange.fc_len = cpu_to_le32(range->len);
+			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
+					     sizeof(lrange), (u8 *)&lrange, crc))
+				return -ENOSPC;
+			continue;
+		}
+
+		fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
+		ex = (struct ext4_extent *)&fc_ext.fc_ex;
+		ex->ee_block = cpu_to_le32(range->lblk);
+		ex->ee_len = cpu_to_le16(range->len);
+		ext4_ext_store_pblock(ex, range->pblk);
+		if (range->unwritten)
+			ext4_ext_mark_unwritten(ex);
+		else
+			ext4_ext_mark_initialized(ex);
+
+		if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
+				     sizeof(fc_ext), (u8 *)&fc_ext, crc))
+			return -ENOSPC;
+	}
+
+	return 0;
+}
+
+static void ext4_fc_free_ranges(struct list_head *head)
+{
+	struct ext4_fc_range *range, *range_n;
+
+	list_for_each_entry_safe(range, range_n, head, list) {
+		list_del(&range->list);
+		kfree(range);
+	}
+}
+
+static void ext4_fc_free_inode_snap(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
+
+	if (!snap)
+		return;
+
+	ext4_fc_free_ranges(&snap->data_list);
+	kfree(snap);
+	ei->i_fc_snap = NULL;
+}
+
+static int ext4_fc_snapshot_inode_data(struct inode *inode,
+				       struct list_head *ranges)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
+	struct ext4_map_blocks map;
 	int ret;
 
 	spin_lock(&ei->i_fc_lock);
@@ -892,18 +975,21 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 		spin_unlock(&ei->i_fc_lock);
 		return 0;
 	}
-	old_blk_size = ei->i_fc_lblk_start;
-	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
+	start_lblk = ei->i_fc_lblk_start;
+	end_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 	ei->i_fc_lblk_len = 0;
 	spin_unlock(&ei->i_fc_lock);
 
-	cur_lblk_off = old_blk_size;
-	ext4_debug("will try writing %d to %d for inode %llu\n",
-		   cur_lblk_off, new_blk_size, inode->i_ino);
+	cur_lblk = start_lblk;
+	ext4_debug("snapshot data ranges %u-%u for inode %llu\n",
+		   start_lblk, end_lblk,
+		   (unsigned long long)inode->i_ino);
+
+	while (cur_lblk <= end_lblk) {
+		struct ext4_fc_range *range;
 
-	while (cur_lblk_off <= new_blk_size) {
-		map.m_lblk = cur_lblk_off;
-		map.m_len = new_blk_size - cur_lblk_off + 1;
+		map.m_lblk = cur_lblk;
+		map.m_len = end_lblk - cur_lblk + 1;
 		ret = ext4_map_blocks(NULL, inode, &map,
 				      EXT4_GET_BLOCKS_IO_SUBMIT |
 				      EXT4_EX_NOCACHE);
@@ -911,17 +997,21 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 			return -ECANCELED;
 
 		if (map.m_len == 0) {
-			cur_lblk_off++;
+			cur_lblk++;
 			continue;
 		}
 
+		range = kmalloc(sizeof(*range), GFP_NOFS);
+		if (!range)
+			return -ENOMEM;
+
+		range->lblk = map.m_lblk;
+		range->len = map.m_len;
+		range->pblk = 0;
+		range->unwritten = false;
+
 		if (ret == 0) {
-			lrange.fc_ino = cpu_to_le32(inode->i_ino);
-			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
-			lrange.fc_len = cpu_to_le32(map.m_len);
-			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
-					    sizeof(lrange), (u8 *)&lrange, crc))
-				return -ENOSPC;
+			range->tag = EXT4_FC_TAG_DEL_RANGE;
 		} else {
 			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
@@ -929,26 +1019,67 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 			/* Limit the number of blocks in one extent */
 			map.m_len = min(max, map.m_len);
 
-			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
-			ex = (struct ext4_extent *)&fc_ext.fc_ex;
-			ex->ee_block = cpu_to_le32(map.m_lblk);
-			ex->ee_len = cpu_to_le16(map.m_len);
-			ext4_ext_store_pblock(ex, map.m_pblk);
-			if (map.m_flags & EXT4_MAP_UNWRITTEN)
-				ext4_ext_mark_unwritten(ex);
-			else
-				ext4_ext_mark_initialized(ex);
-			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
-					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
-				return -ENOSPC;
+			range->tag = EXT4_FC_TAG_ADD_RANGE;
+			range->len = map.m_len;
+			range->pblk = map.m_pblk;
+			range->unwritten = !!(map.m_flags & EXT4_MAP_UNWRITTEN);
 		}
 
-		cur_lblk_off += map.m_len;
+		INIT_LIST_HEAD(&range->list);
+		list_add_tail(&range->list, ranges);
+
+		cur_lblk += map.m_len;
 	}
 
 	return 0;
 }
 
+static int ext4_fc_snapshot_inode(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_inode_snap *snap;
+	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
+	struct ext4_iloc iloc;
+	LIST_HEAD(ranges);
+	int ret;
+	int alloc_ctx;
+
+	ret = ext4_get_inode_loc_noio(inode, &iloc);
+	if (ret)
+		return ret;
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+		inode_len = EXT4_INODE_SIZE(inode->i_sb);
+	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+		inode_len += ei->i_extra_isize;
+
+	snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
+	if (!snap) {
+		brelse(iloc.bh);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&snap->data_list);
+	snap->inode_len = inode_len;
+
+	memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len);
+	brelse(iloc.bh);
+
+	ret = ext4_fc_snapshot_inode_data(inode, &ranges);
+	if (ret) {
+		kfree(snap);
+		ext4_fc_free_ranges(&ranges);
+		return ret;
+	}
+
+	alloc_ctx = ext4_fc_lock(inode->i_sb);
+	ext4_fc_free_inode_snap(inode);
+	ei->i_fc_snap = snap;
+	list_splice_tail_init(&ranges, &snap->data_list);
+	ext4_fc_unlock(inode->i_sb, alloc_ctx);
+
+	return 0;
+}
+
 
 /* Flushes data of all the inodes in the commit queue. */
 static int ext4_fc_flush_data(journal_t *journal)
@@ -999,6 +1130,11 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 		 */
 		if (list_empty(&fc_dentry->fcd_dilist))
 			continue;
+		/*
+		 * For EXT4_FC_TAG_CREAT, fcd_dilist is linked on the created
+		 * inode's i_fc_dilist list (kept singular), so we can recover the
+		 * inode through it.
+		 */
 		ei = list_first_entry(&fc_dentry->fcd_dilist,
 				struct ext4_inode_info, i_fc_dilist);
 		inode = &ei->vfs_inode;
@@ -1023,6 +1159,88 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 	return 0;
 }
 
+static int ext4_fc_snapshot_inodes(journal_t *journal)
+{
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_inode_info *iter;
+	struct ext4_fc_dentry_update *fc_dentry;
+	struct inode **inodes;
+	unsigned int nr_inodes = 0;
+	unsigned int i = 0;
+	int ret = 0;
+	int alloc_ctx;
+
+	alloc_ctx = ext4_fc_lock(sb);
+	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
+		nr_inodes++;
+
+	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		struct ext4_inode_info *ei;
+
+		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
+			continue;
+		if (list_empty(&fc_dentry->fcd_dilist))
+			continue;
+
+		/* See the comment in ext4_fc_commit_dentry_updates(). */
+		ei = list_first_entry(&fc_dentry->fcd_dilist,
+				      struct ext4_inode_info, i_fc_dilist);
+		if (!list_empty(&ei->i_fc_list))
+			continue;
+
+		nr_inodes++;
+	}
+	ext4_fc_unlock(sb, alloc_ctx);
+
+	if (!nr_inodes)
+		return 0;
+
+	inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS);
+	if (!inodes)
+		return -ENOMEM;
+
+	alloc_ctx = ext4_fc_lock(sb);
+	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+		inodes[i] = igrab(&iter->vfs_inode);
+		if (inodes[i])
+			i++;
+	}
+
+	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		struct ext4_inode_info *ei;
+
+		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
+			continue;
+		if (list_empty(&fc_dentry->fcd_dilist))
+			continue;
+
+		/* See the comment in ext4_fc_commit_dentry_updates(). */
+		ei = list_first_entry(&fc_dentry->fcd_dilist,
+				      struct ext4_inode_info, i_fc_dilist);
+		if (!list_empty(&ei->i_fc_list))
+			continue;
+
+		inodes[i] = igrab(&ei->vfs_inode);
+		if (inodes[i])
+			i++;
+	}
+	ext4_fc_unlock(sb, alloc_ctx);
+
+	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
+		ret = ext4_fc_snapshot_inode(inodes[nr_inodes]);
+		if (ret)
+			break;
+	}
+
+	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
+		if (inodes[nr_inodes])
+			iput(inodes[nr_inodes]);
+	}
+	kvfree(inodes);
+	return ret;
+}
+
 static int ext4_fc_perform_commit(journal_t *journal)
 {
 	struct super_block *sb = journal->j_private;
@@ -1095,7 +1313,11 @@ static int ext4_fc_perform_commit(journal_t *journal)
 				     EXT4_STATE_FC_COMMITTING);
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
+
+	ret = ext4_fc_snapshot_inodes(journal);
 	jbd2_journal_unlock_updates(journal);
+	if (ret)
+		return ret;
 
 	/*
 	 * Step 5: If file system device is different from journal device,
@@ -1292,6 +1514,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 					struct ext4_inode_info,
 					i_fc_list);
 		list_del_init(&ei->i_fc_list);
+		ext4_fc_free_inode_snap(&ei->vfs_inode);
 		ext4_clear_inode_state(&ei->vfs_inode,
 				       EXT4_STATE_FC_COMMITTING);
 		if (tid_geq(tid, ei->i_sync_tid)) {
@@ -1327,6 +1550,14 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 					     struct ext4_fc_dentry_update,
 					     fcd_list);
 		list_del_init(&fc_dentry->fcd_list);
+		if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT &&
+		    !list_empty(&fc_dentry->fcd_dilist)) {
+			/* See the comment in ext4_fc_commit_dentry_updates(). */
+			ei = list_first_entry(&fc_dentry->fcd_dilist,
+					      struct ext4_inode_info,
+					      i_fc_dilist);
+			ext4_fc_free_inode_snap(&ei->vfs_inode);
+		}
 		list_del_init(&fc_dentry->fcd_dilist);
 
 		release_dentry_name_snapshot(&fc_dentry->fcd_name);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..4678612f82e8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5025,6 +5025,57 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 	return ret;
 }
 
+/*
+ * ext4_get_inode_loc_noio() is a best-effort variant of ext4_get_inode_loc().
+ * It looks up the inode table block in the buffer cache and returns -EAGAIN if
+ * the block is not present or not uptodate, without starting any I/O.
+ */
+int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *bh;
+	ext4_fsblk_t block;
+	int inodes_per_block, inode_offset;
+	unsigned long ino = inode->i_ino;
+
+	iloc->bh = NULL;
+	if (ino < EXT4_ROOT_INO ||
+	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+		return -EFSCORRUPTED;
+
+	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
+		return -EIO;
+
+	/* Figure out the offset within the block group inode table. */
+	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb));
+	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	block = ext4_inode_table(sb, gdp);
+	if (block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) ||
+	    block >= ext4_blocks_count(EXT4_SB(sb)->s_es)) {
+		ext4_error(sb,
+			   "Invalid inode table block %llu in block_group %u",
+			   block, iloc->block_group);
+		return -EFSCORRUPTED;
+	}
+	block += inode_offset / inodes_per_block;
+
+	bh = sb_find_get_block(sb, block);
+	if (!bh)
+		return -EAGAIN;
+	if (!ext4_buffer_uptodate(bh)) {
+		brelse(bh);
+		return -EAGAIN;
+	}
+
+	iloc->bh = bh;
+	return 0;
+}
+
 
 int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
 			  struct ext4_iloc *iloc)
-- 
2.53.0


^ permalink raw reply related

* [RFC v7 0/7] ext4: fast commit: snapshot inode state for FC log
From: Li Chen @ 2026-05-11  8:42 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-ext4,
	linux-trace-kernel, linux-kernel

Hi,

(This RFC v7 series is rebased onto linux-next master as of 2026-05-09,
commit e98d21c170b0 ("Add linux-next specific files for 20260508").)

Zhang Yi in RFC v3 review pointed out that postponing lockdep assertions only
masks the issue, and that sleeping in ext4_fc_track_inode() while holding
i_data_sem can form a real ABBA deadlock if the fast commit writer also needs
i_data_sem while the inode is in FC_COMMITTING.

Zhang Yi suggested two possible directions to address the root cause:

1. "Ha, the solution seems to have already been listed in the TODOs in
fast_commit.c.

  Change ext4_fc_commit() to lookup logical to physical mapping using extent
  status tree. This would get rid of the need to call ext4_fc_track_inode()
  before acquiring i_data_sem. To do that we would need to ensure that
  modified extents from the extent status tree are not evicted from memory."

2. "Alternatively, recording the mapped range of tracking might also be
feasible."

This series takes a hybrid approach: it implements approach 2 by snapshotting
the inode image and mapped ranges at commit time, and consuming only snapshots
during log writing.

Approach 2 still needs a mapping source while building the snapshot
(logical-to-physical and unwritten/hole semantics). Calling ext4_map_blocks()
there would take i_data_sem and can block inside the
jbd2_journal_lock_updates() window, which risks deadlocks or unbounded stalls.
So the snapshot path uses approach 1's extent status lookups as a best-effort
mapping source to avoid ext4_map_blocks().

I did not fully implement approach 1 (making extent status lookups
authoritative by preventing reclaim of needed entries) because that would need
additional pinning/integration under memory pressure and a larger correctness
surface. Instead, the extent status tree is treated as a cache and the
snapshot path falls back to full commit on cache misses or unstable mappings
(e.g. delayed allocation).

Lock inversion / deadlock model (before):

CPU0 (metadata update)               CPU1 (fast commit)
--------------------               -----------------
... hold i_data_sem (A)             mutex_lock(s_fc_lock) (B)
    ext4_fc_track_inode()             ext4_fc_write_inode_data()
      mutex_lock(s_fc_lock) (B)         ext4_map_blocks()
      wait FC_COMMITTING (sleep)          down_read(i_data_sem) (A)

This creates i_data_sem (A) -> s_fc_lock (B) on update paths, and
s_fc_lock (B) -> i_data_sem (A) on commit paths. Once CPU0 sleeps while
holding (A), CPU1 can block on (A) while holding (B), completing the ABBA
cycle.

New model (this series):

CPU0 (metadata update)               CPU1 (fast commit)
--------------------               -----------------
... maybe hold i_data_sem (A)        jbd2_journal_lock_updates()
    ext4_fc_track_*()                 snapshot inode + ranges (no map_blocks)
      mutex_lock(s_fc_lock) (B)       jbd2_journal_unlock_updates()
      if FC_COMMITTING: set FC_REQUEUE s_fc_lock (B)
      no sleep                         write FC log from snapshots only
                                    cleanup: clear COMMITTING, requeue if set

The commit path no longer takes i_data_sem while holding s_fc_lock, and
tracking no longer sleeps waiting for FC_COMMITTING. If an inode is updated
during a fast commit, EXT4_STATE_FC_REQUEUE records that fact and the inode
is moved to FC_Q_STAGING for the next commit.
The only remaining FC_COMMITTING waiter is ext4_fc_del(), which drops
s_fc_lock before sleeping.

This series snapshots the on-disk inode and tracked data ranges while journal
updates are locked and existing handles are drained. The log writing phase then
serializes only snapshots, so it no longer needs to call ext4_map_blocks() and
take i_data_sem under s_fc_lock. This is done in two steps: patch 1 drops
ext4_map_blocks() from log writing by introducing commit-time snapshots, and
patch 5 drops ext4_map_blocks() from the snapshot path by using the extent
status cache. The snapshot also records whether a mapped extent is unwritten,
so the ADD_RANGE records (and replay) preserve unwritten semantics.

Snapshotting runs under jbd2_journal_lock_updates(). Since a cache miss in
ext4_get_inode_loc() can start synchronous inode table I/O and stall handle
starts for milliseconds, patch 1 uses ext4_get_inode_loc_noio() and falls back
to full commit if the inode table block is not present or not uptodate.

ext4_fc_track_inode() also stops waiting for FC_COMMITTING. Updates during an
ongoing fast commit are marked with EXT4_STATE_FC_REQUEUE and are replayed in
the next fast commit, while ext4_fc_del() waits for FC_COMMITTING so an inode
cannot be removed while the commit thread is still using it.

The extent status tree is a cache, not an authoritative source, so the snapshot
path falls back to full commit on cache misses or unstable mappings (e.g.
delayed allocation). This includes cases where extent status entries are not
present (or have been reclaimed) under memory pressure. The snapshot path does
not try to rebuild mappings by calling ext4_map_blocks(); instead it simply
marks the transaction fast commit ineligible.

To keep the updates-locked window bounded, the snapshot path caps the number of
snapshotted inodes and ranges per fast commit (currently 1024 inodes and 2048
ranges) and falls back to full commit when the cap is exceeded. The series also
handles the journal inode i_data_sem lockdep false positive via subclassing;
journal inode mapping may still take i_data_sem even when data inode mapping is
avoided.

Patch 6 adds the ext4_fc_lock_updates tracepoint to quantify the updates-locked
window and snapshot fallback reasons. Patch 7 extends
/proc/fs/ext4/<sb_id>/fc_info with best-effort snapshot counters. If the /proc
interface is undesirable, I can drop patch 7 and keep the tracepoint only, or
even drop both.

Testing and measurement were done on a QEMU/KVM guest with virtio-pmem + dax
(ext4 -O fast_commit, mounted dax,noatime). The workload does python3 500x
{4K write + fsync}, fallocate 256M, and python3 500x {creat + fsync(dir)}.
Over 3 cold boots, ext4_fc_lock_updates reported locked_ns p50 2.88-2.92 us,
p99 <= 6.71 us, and max <= 102.71 us, with snap_err always 0. Under stress-ng
memory pressure (stress-ng --vm 4 --vm-bytes 75% --timeout 60s), locked_ns p50
2.94 us, p99 <= 4.97 us, and max <= 20.07 us. The fc_info snapshot failure
counters stayed at 0.
These hold times are in the low microseconds range, and the caps keep the
worst case bounded.

Comments and guidance are very welcome. Please let me know if there are any
concerns about correctness, corner cases, or better approaches.

RFC v6 -> RFC v7:
- Rebase onto linux-next master as of 2026-05-09, commit e98d21c170b0
  ("Add linux-next specific files for 20260508").
- Address Sashiko review feedback for RFC v6. [2]
- Fix the reported snapshot range arithmetic issue near EXT_MAX_BLOCKS to
  avoid cur_lblk / range wraparound in the snapshot walk.
- Report successfully snapshotted inode counts in ext4_fc_lock_updates when
  snapshotting stops early, as reported by Sashiko.
- Use READ_ONCE() + div64_u64() for the fc_info lock_updates average, as
  reported by Sashiko.

RFC v5 -> RFC v6:
- Rebase onto linux-next master as of 2026-04-08.
- Address tracepoint review feedback by relying on enum auto-increment for
  snap_err values and by switching the guarded ext4_fc_lock_updates call site
  to trace_call__ext4_fc_lock_updates() to avoid the double static_branch. [1]
- Keep lock window accounting unconditional for fc_info while using the guarded
  direct tracepoint call.
- Fix the inode debug print format exposed by the rebase.

RFC v4 -> RFC v5:
- Patch 6: Make ext4_fc_lock_updates snap_err human readable via
  TRACE_DEFINE_ENUM() + __print_symbolic(), using a single TRACE_SNAP_ERR
  mapping while keeping the enum values stable for tooling.

RFC v3 -> RFC v4:
- Replace lockdep_assert movement with removing the wait in
  ext4_fc_track_inode() and using EXT4_STATE_FC_REQUEUE to capture updates
  during an ongoing fast commit.
- Replace dropping s_fc_lock around log writing with commit-time snapshots of
  inode image and mapped ranges (recording the mapped range of tracking as
  suggested by Zhang Yi) so log writing consumes only snapshots.
- Avoid inode table I/O under jbd2_journal_lock_updates() via
  ext4_get_inode_loc_noio() and fallback to full commit on cache misses.
- Use the extent status cache for snapshot mappings and fall back to full
  commit on cache misses or unstable mappings (e.g. delayed allocation).
- Add tracepoint and /proc snapshot stats to quantify the updates-locked window
  and snapshot fallback reasons.

RFC v2 -> RFC v3:
- rebase on top of
  https://lore.kernel.org/linux-ext4/20251223131342.287864-1-me@linux.beauty/T/#u

RFC v1 -> RFC v2:
- patch 1: move comments to correct place
- patch 2: add it to patchset.
- add missing RFC prefix

RFC v1: https://lore.kernel.org/linux-ext4/20251222032655.87056-1-me@linux.beauty/T/#u
RFC v2: https://lore.kernel.org/linux-ext4/20251222151906.24607-1-me@linux.beauty/T/#t
RFC v3: https://lore.kernel.org/linux-ext4/20251224032943.134063-1-me@linux.beauty/
RFC v4: https://lore.kernel.org/all/20260120112538.132774-1-me@linux.beauty/
RFC v5: https://lore.kernel.org/all/20260317084624.457185-1-me@linux.beauty/t/#u
RFC v6: https://lore.kernel.org/all/20260408112020.716706-1-me@linux.beauty/

[1]: https://lore.kernel.org/all/acZJl8QUYEq8voqQ@BLRRASHENOY1.amd.com/T/#u
[2]: https://sashiko.dev/#/patchset/20260408112020.716706-1-me%40linux.beauty

Thanks,

Li Chen (7):
  ext4: fast commit: snapshot inode state before writing log
  ext4: lockdep: handle i_data_sem subclassing for special inodes
  ext4: fast commit: avoid waiting for FC_COMMITTING
  ext4: fast commit: avoid self-deadlock in inode snapshotting
  ext4: fast commit: avoid i_data_sem by dropping ext4_map_blocks() in
    snapshots
  ext4: fast commit: add lock_updates tracepoint
  ext4: fast commit: export snapshot stats in fc_info

 fs/ext4/ext4.h              |  73 +++-
 fs/ext4/fast_commit.c       | 716 +++++++++++++++++++++++++++++-------
 fs/ext4/inode.c             |  51 +++
 fs/ext4/super.c             |   9 +
 include/trace/events/ext4.h |  61 +++
 5 files changed, 776 insertions(+), 134 deletions(-)

-- 
2.53.0

^ permalink raw reply

* Re: [PATCH v4 11/23] iomap: correct the range of a partial dirty clear
From: Christoph Hellwig @ 2026-05-11  7:46 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ojaswin, ritesh.list, djwong, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260511072344.191271-12-yi.zhang@huaweicloud.com>

Please send the iomap patches out separately, including to all the
relevant lists from the iomap MAINTAINERS entry.


^ permalink raw reply

* [PATCH v4 19/23] ext4: update i_disksize to i_size on ordered I/O completion
From: Zhang Yi @ 2026-05-11  7:23 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Currently, i_disksize is updated after ordered data writeback to prevent
exposing stale data in the post-EOF block. However, operations like
append allocate, zero range and truncate update i_disksize directly. If
the new i_disksize exceeds the original value, metadata may be written
back before the zeroed data is persisted. To avoid this, we defer
i_disksize updates when i_ordered_len is non-zero, only applying them
after ordered I/O completes.

However, this deferral introduces a new problem: on ordered I/O
completion, i_disksize is updated only to the end of that specific I/O,
discarding any later updates (e.g., from fallocate) and causing
filesystem inconsistency. A potential fix would involve scanning for
dirty or writeback folios beyond the current position, then updating
i_disksize to the start of the first such folio or to i_size. However,
folio scanning is expensive and concurrency with operations like
fallocate makes this approach prohibitively complex.

Instead, when ordered zero I/O completes, update i_disksize directly to
i_size. This may expose zeroed data (if dirty data within the range is
not yet on disk after crash recovery), but it will never expose stale
data. This limitation is restricted to unaligned append writes and is
deemed acceptable.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h    | 29 +++++++++++++++++++++++++----
 fs/ext4/inode.c   | 30 ++++++++++++++++++++----------
 fs/ext4/page-io.c | 25 ++++++++++++++++++++-----
 3 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9ce2128eea3e..0a3bb44f1e6e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3493,13 +3493,21 @@ do {								\
 #define EXT4_FREECLUSTERS_WATERMARK 0
 #endif
 
-/* Update i_disksize. Requires i_rwsem to avoid races with truncate */
+/*
+ * Update i_disksize. Requires i_rwsem to avoid races with truncate.
+ *
+ * In the iomap buffered I/O path, a non-zero i_ordered_len indicates that
+ * an ordered I/O (zeroing the EOF partial block) is still in progress.
+ * In that case, i_disksize will be updated after the ordered data has
+ * been written out.
+ */
 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 {
 	WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
 		     !inode_is_locked(inode));
 	down_write(&EXT4_I(inode)->i_data_sem);
-	if (newsize > EXT4_I(inode)->i_disksize)
+	if (newsize > EXT4_I(inode)->i_disksize &&
+	    READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0)
 		WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
 	up_write(&EXT4_I(inode)->i_data_sem);
 }
@@ -3514,8 +3522,21 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
 		changed = 1;
 	}
 	if (newsize > EXT4_I(inode)->i_disksize) {
-		ext4_update_i_disksize(inode, newsize);
-		changed |= 2;
+		/*
+		 * Pairs with smp_store_release() in ext4_iomap_end_bio()
+		 * that clears i_ordered_len.  The smp_mb() ensures the
+		 * i_size store above is globally visible before we read
+		 * i_ordered_len.  This way, if we skip the i_disksize
+		 * update because i_ordered_len is still non-zero, the
+		 * ordered-I/O completion path (which reads i_size under
+		 * i_data_sem) is guaranteed to see the new i_size and will
+		 * update i_disksize correctly.
+		 */
+		smp_mb();
+		if (READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0) {
+			ext4_update_i_disksize(inode, newsize);
+			changed |= 2;
+		}
 	}
 	return changed;
 }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 11fb369efeb1..1e208b3fad34 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4868,9 +4868,6 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 	 * truncating up or performing an append write, because there might be
 	 * exposing stale on-disk data which may caused by concurrent post-EOF
 	 * mmap write during folio writeback.
-	 *
-	 * TODO: In the iomap path, handle this by updating i_disksize to
-	 * i_size after the zeroed data has been written back.
 	 */
 	if (did_zero && zero_written && !IS_DAX(inode)) {
 		if (ext4_should_order_data(inode)) {
@@ -4894,9 +4891,15 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 		 * for I/O completion before updating i_disksize if the write
 		 * extends beyond the zeroed boundary.
 		 *
-		 * TODO: Any other operation that extends i_disksize
-		 * (including truncate up and append fallocate) must wait for
-		 * the relevant I/O to complete before updating i_disksize.
+		 * When zeroed I/O is in progress, operations that extend
+		 * i_disksize are handled as follows:
+		 *
+		 *  - Truncate up, append fallocate and zero_range:
+		 *    Defer the update. The file size will be updated to
+		 *    i_size by the end_io handler once the ongoing I/O
+		 *    completes.
+		 *
+		 *  - TODO: handle insert range and collapse range.
 		 */
 		} else if (ext4_inode_buffered_iomap(inode)) {
 			err = ext4_iomap_submit_zero_block(inode, from, end);
@@ -6512,11 +6515,16 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
 }
 
 /*
- * Set i_size and i_disksize to 'newsize'.
+ * Set i_size and i_disksize to 'newsize'.  In the iomap buffered I/O path,
+ * if i_ordered_len is non-zero and newsize exceeds the current i_disksize,
+ * the actual i_disksize update is deferred until after the ordered data is
+ * written out.  In that case, i_disksize will be set to i_size upon I/O
+ * completion.
  *
  * Both i_rwsem and i_data_sem are required here to avoid races between
- * generic append writeback and concurrent truncate that also modify
- * i_size and i_disksize.
+ * generic append writeback (or ordered I/O writeback) and concurrent
+ * operations (e.g., fallocate, truncate) that also modify i_size and
+ * i_disksize.
  */
 static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
 {
@@ -6524,7 +6532,9 @@ static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	i_size_write(inode, newsize);
-	EXT4_I(inode)->i_disksize = newsize;
+	if (READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0 ||
+	    newsize < EXT4_I(inode)->i_disksize)
+		WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
 	up_write(&EXT4_I(inode)->i_data_sem);
 }
 
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ad05ebb49bf6..2ad9f900c9f3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -654,13 +654,13 @@ static void ext4_iomap_wb_ordered_wait(struct inode *inode,
 }
 
 static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
-					 loff_t end)
+					 loff_t end, bool is_ordered)
 {
-	loff_t new_disksize = end;
+	loff_t new_disksize, i_size;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	int ret;
 
-	if (new_disksize <= READ_ONCE(ei->i_disksize))
+	if (end <= READ_ONCE(ei->i_disksize) && !is_ordered)
 		return 0;
 
 	/*
@@ -668,7 +668,20 @@ static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
 	 * are avoided by checking i_size under i_data_sem.
 	 */
 	down_write(&ei->i_data_sem);
-	new_disksize = min(new_disksize, i_size_read(inode));
+	i_size = i_size_read(inode);
+
+	/*
+	 * Update i_disksize to i_size when completing an ordered I/O that
+	 * zeroes the old EOF partial block.  This is safe because we never
+	 * directly allocate written blocks during buffered writes.
+	 *
+	 * This ensures i_disksize is correctly advanced during truncate-up
+	 * or append fallocate on a block-unaligned file, preventing it
+	 * from remaining stale.  A downside is that zeroed data may be
+	 * exposed after crash recovery if the dirty data in this range is
+	 * not yet on disk, but stale data will never be exposed.
+	 */
+	new_disksize = is_ordered ? i_size : min(end, i_size);
 	if (new_disksize > ei->i_disksize)
 		ei->i_disksize = new_disksize;
 	up_write(&ei->i_data_sem);
@@ -685,6 +698,7 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
 	struct super_block *sb = inode->i_sb;
 	loff_t pos = ioend->io_offset;
 	size_t size = ioend->io_size;
+	unsigned long io_mode = (unsigned long)ioend->io_private;
 	handle_t *handle;
 	int credits;
 	int ret, err;
@@ -714,7 +728,8 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
 			goto out_journal;
 	}
 
-	ret = ext4_iomap_wb_update_disksize(handle, inode, pos + size);
+	ret = ext4_iomap_wb_update_disksize(handle, inode, pos + size,
+			io_mode == EXT4_IOMAP_IOEND_ORDER_IO);
 out_journal:
 	err = ext4_journal_stop(handle);
 	if (!ret)
-- 
2.52.0


^ permalink raw reply related

* [PATCH v4 11/23] iomap: correct the range of a partial dirty clear
From: Zhang Yi @ 2026-05-11  7:23 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

The block range calculation in ifs_clear_range_dirty() is incorrect when
partially clearing a range in a folio. We cannot clear the dirty bit of
the first block or the last block if the start or end offset is not
blocksize-aligned. This has not yet caused any issues since we always
clear a whole folio in iomap_writeback_folio().

Fix this by rounding up the first block to blocksize alignment, and
calculate the last block by rounding down (using truncation). Correct
the nr_blks calculation accordingly.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
This is modified from:
 https://lore.kernel.org/linux-fsdevel/20240812121159.3775074-2-yi.zhang@huaweicloud.com/
Changes:
 - Use round_up() instead of DIV_ROUND_UP() to prevent wasted integer
   division.

 fs/iomap/buffered-io.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d7b648421a70..64351a448a8b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -176,13 +176,17 @@ static void ifs_clear_range_dirty(struct folio *folio,
 {
 	struct inode *inode = folio->mapping->host;
 	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
-	unsigned int first_blk = (off >> inode->i_blkbits);
-	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
-	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned int first_blk = round_up(off, i_blocksize(inode)) >>
+				 inode->i_blkbits;
+	unsigned int last_blk = (off + len) >> inode->i_blkbits;
 	unsigned long flags;
 
+	if (first_blk >= last_blk)
+		return;
+
 	spin_lock_irqsave(&ifs->state_lock, flags);
-	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
+	bitmap_clear(ifs->state, first_blk + blks_per_folio,
+		     last_blk - first_blk);
 	spin_unlock_irqrestore(&ifs->state_lock, flags);
 }
 
-- 
2.52.0


^ permalink raw reply related

* [PATCH v4 15/23] ext4: add block mapping tracepoints for iomap buffered I/O path
From: Zhang Yi @ 2026-05-11  7:23 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Add tracepoints for iomap buffered read, write, partial block zeroing,
and writeback operations to help debug the iomap buffered I/O path.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c             |  6 +++++
 include/trace/events/ext4.h | 45 +++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e0dae2501292..239d387ffaf2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3961,6 +3961,8 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
 	if (ret < 0)
 		return ret;
 
+	trace_ext4_iomap_buffered_read_begin(inode, &map, offset, length,
+					     flags);
 	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
 	return 0;
 }
@@ -4034,6 +4036,8 @@ static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
 	if (ret < 0)
 		return ret;
 
+	trace_ext4_iomap_buffered_write_begin(inode, &map, offset, length,
+					      flags);
 	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
 	return 0;
 }
@@ -4136,6 +4140,7 @@ static int ext4_iomap_zero_begin(struct inode *inode,
 			map.m_len = (start >> blkbits) - map.m_lblk;
 	}
 
+	trace_ext4_iomap_zero_begin(inode, &map, offset, length, flags);
 	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
 	iomap->flags |= iomap_flags;
 
@@ -4308,6 +4313,7 @@ static int ext4_iomap_map_writeback_range(struct iomap_writepage_ctx *wpc,
 		return ret;
 	}
 out:
+	trace_ext4_iomap_map_writeback_range(inode, &map, offset, dirty_len, 0);
 	ext4_set_iomap(inode, &wpc->iomap, &map, offset, dirty_len, 0);
 	return 0;
 }
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f493642cf121..ebafa06cd191 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -3096,6 +3096,51 @@ TRACE_EVENT(ext4_move_extent_exit,
 		  __entry->ret)
 );
 
+DECLARE_EVENT_CLASS(ext4_set_iomap_class,
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+		 loff_t offset, loff_t length, unsigned int flags),
+	TP_ARGS(inode, map, offset, length, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u64, ino)
+		__field(ext4_lblk_t, m_lblk)
+		__field(unsigned int, m_len)
+		__field(unsigned int, m_flags)
+		__field(u64, m_seq)
+		__field(loff_t, offset)
+		__field(loff_t, length)
+		__field(unsigned int, iomap_flags)
+	),
+	TP_fast_assign(
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->ino		= inode->i_ino;
+		__entry->m_lblk		= map->m_lblk;
+		__entry->m_len		= map->m_len;
+		__entry->m_flags	= map->m_flags;
+		__entry->m_seq		= map->m_seq;
+		__entry->offset		= offset;
+		__entry->length		= length;
+		__entry->iomap_flags	= flags;
+
+	),
+	TP_printk("dev %d:%d ino %llu m_lblk %u m_len %u m_flags %s m_seq %llu orig_off 0x%llx orig_len 0x%llx iomap_flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->m_lblk, __entry->m_len,
+		  show_mflags(__entry->m_flags), __entry->m_seq,
+		  __entry->offset, __entry->length, __entry->iomap_flags)
+)
+
+#define DEFINE_SET_IOMAP_EVENT(name) \
+DEFINE_EVENT(ext4_set_iomap_class, name, \
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, \
+		 loff_t offset, loff_t length, unsigned int flags), \
+	TP_ARGS(inode, map, offset, length, flags))
+
+DEFINE_SET_IOMAP_EVENT(ext4_iomap_buffered_read_begin);
+DEFINE_SET_IOMAP_EVENT(ext4_iomap_buffered_write_begin);
+DEFINE_SET_IOMAP_EVENT(ext4_iomap_map_writeback_range);
+DEFINE_SET_IOMAP_EVENT(ext4_iomap_zero_begin);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
-- 
2.52.0


^ permalink raw reply related

* [PATCH v4 12/23] iomap: support invalidating partial folios
From: Zhang Yi @ 2026-05-11  7:23 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Current iomap_invalidate_folio() can only invalidate an entire folio. If
we truncate a partial folio on a filesystem where the block size is
smaller than the folio size, it will leave behind dirty bits for the
truncated or punched blocks. During the write-back process, it will
attempt to map the invalid hole range. Fortunately, this has not caused
any real problems so far because the ->writeback_range() function
corrects the length.

However, the implementation of FALLOC_FL_ZERO_RANGE in ext4 depends on
the support for invalidating partial folios. When ext4 partially zeroes
out a dirty and unwritten folio, it does not perform a flush first like
XFS. Therefore, if the dirty bits of the corresponding area cannot be
cleared, the zeroed area after writeback remains in the written state
rather than reverting to the unwritten state. Fix this by supporting
invalidation of partial folios.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
This is cherry picked from:
 https://lore.kernel.org/linux-fsdevel/20240812121159.3775074-3-yi.zhang@huaweicloud.com/
No code changes, only update the commit message to explain why Ext4
needs this.

 fs/iomap/buffered-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 64351a448a8b..876c2f507f58 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -761,6 +761,8 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 		WARN_ON_ONCE(folio_test_writeback(folio));
 		folio_cancel_dirty(folio);
 		ifs_free(folio);
+	} else {
+		iomap_clear_range_dirty(folio, offset, len);
 	}
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
-- 
2.52.0

^ permalink raw reply related

* [PATCH v4 14/23] ext4: implement partial block zero range path using iomap
From: Zhang Yi @ 2026-05-11  7:23 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with
ext4_iomap_block_zero_range() to implement block zeroing via the iomap
infrastructure for ext4.

ext4_iomap_block_zero_range() calls iomap_zero_range() with
ext4_iomap_zero_begin() as the callback. The callback locates and zeros
out either a mapped partial block or a dirty, unwritten partial block.

Important constraints:

Zeroing out under an active journal handle can cause deadlock, because
the order of acquiring the folio lock and starting a handle is
inconsistent with the iomap writeback path.

Therefore, ext4_iomap_block_zero_range():
- Must NOT be called under an active handle.
- Cannot rely on data=ordered mode to ensure zeroed data persistence
  before updating i_disksize (for the cases of post-EOF append write,
  post-EOF fallocate, and truncate up). In subsequent patches, we will
  address this by synchronizing commit I/O without waiting for
  completion, and updating i_disksize to i_size only after the zeroed
  data has been written back.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c6fe42d012fc..e0dae2501292 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4101,6 +4101,51 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
 	return 0;
 }
 
+static int ext4_iomap_zero_begin(struct inode *inode,
+		loff_t offset, loff_t length, unsigned int flags,
+		struct iomap *iomap, struct iomap *srcmap)
+{
+	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+	unsigned int iomap_flags = 0;
+	int ret;
+
+	ret = ext4_emergency_state(inode->i_sb);
+	if (unlikely(ret))
+		return ret;
+
+	if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
+		return -EINVAL;
+
+	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Look up dirty folios for unwritten mappings within EOF. Providing
+	 * this bypasses the flush iomap uses to trigger extent conversion
+	 * when unwritten mappings have dirty pagecache in need of zeroing.
+	 */
+	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+		loff_t start = ((loff_t)map.m_lblk) << blkbits;
+		loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;
+
+		iomap_fill_dirty_folios(iter, &start, end, &iomap_flags);
+		if ((start >> blkbits) < map.m_lblk + map.m_len)
+			map.m_len = (start >> blkbits) - map.m_lblk;
+	}
+
+	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+	iomap->flags |= iomap_flags;
+
+	return 0;
+}
+
+static const struct iomap_ops ext4_iomap_zero_ops = {
+	.iomap_begin = ext4_iomap_zero_begin,
+};
+
 /*
  * Since we always allocate unwritten extents, there is no need for
  * iomap_end to clean up allocated blocks on a short write.
@@ -4616,6 +4661,47 @@ static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from,
 	return err;
 }
 
+static int ext4_block_iomap_zero_range(struct inode *inode, loff_t from,
+				       loff_t length, bool *did_zero,
+				       bool *zero_written)
+{
+	int ret;
+
+	/*
+	 * Zeroing out under an active handle can cause deadlock since
+	 * the order of acquiring the folio lock and starting a handle is
+	 * inconsistent with the iomap writeback procedure.
+	 */
+	if (WARN_ON_ONCE(ext4_handle_valid(journal_current_handle())))
+		return -EINVAL;
+
+	/* The zeroing scope should not extend across a block. */
+	if (WARN_ON_ONCE((from >> inode->i_blkbits) !=
+			 ((from + length - 1) >> inode->i_blkbits)))
+		return -EINVAL;
+
+	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS) &&
+	    !(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
+		WARN_ON_ONCE(!inode_is_locked(inode) &&
+			!rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+
+	ret = iomap_zero_range(inode, from, length, did_zero,
+			       &ext4_iomap_zero_ops, &ext4_iomap_write_ops,
+			       NULL);
+	if (ret)
+		return ret;
+
+	/*
+	 * TODO: The iomap does not distinguish between different types of
+	 * zeroing and always sets zero_written if a zeroing operation is
+	 * performed, which may result in unnecessary order operations.
+	 */
+	if (did_zero && zero_written)
+		*zero_written = *did_zero;
+
+	return 0;
+}
+
 /*
  * Zeros out a mapping of length 'length' starting from file offset
  * 'from'.  The range to be zero'd must be contained with in one block.
@@ -4642,6 +4728,9 @@ static int ext4_block_zero_range(struct inode *inode,
 	} else if (ext4_should_journal_data(inode)) {
 		return ext4_block_journalled_zero_range(inode, from, length,
 							did_zero);
+	} else if (ext4_inode_buffered_iomap(inode)) {
+		return ext4_block_iomap_zero_range(inode, from, length,
+						   did_zero, zero_written);
 	}
 	return ext4_block_do_zero_range(inode, from, length, did_zero,
 					zero_written);
@@ -4682,6 +4771,9 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 	 * truncating up or performing an append write, because there might be
 	 * exposing stale on-disk data which may caused by concurrent post-EOF
 	 * mmap write during folio writeback.
+	 *
+	 * TODO: In the iomap path, handle this by updating i_disksize to
+	 * i_size after the zeroed data has been written back.
 	 */
 	if (ext4_should_order_data(inode) &&
 	    did_zero && zero_written && !IS_DAX(inode)) {
-- 
2.52.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox