Linux filesystem development
 help / color / mirror / Atom feed
* [PATCH 0/9] fs: Fix missed inode write during fsync
@ 2026-05-11 12:13 Jan Kara
  2026-05-11 12:13 ` [PATCH 1/9] affs: Drop support for metadata bh tracking Jan Kara
                   ` (9 more replies)
  0 siblings, 10 replies; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Hello,

this patch series fixes the possibly missing inode write during fsync(2) for
filesystems using generic metadata bh tracking. The inherent problem is that
.write_inode methods clear the inode dirty bit but they only copy inode
contents into the buffer cache. Because the buffer carrying the inode is shared among
multiple inodes, it cannot be tracked by the generic metadata bh tracking
infrastructure and thus nothing tracks that the buffer needs to be written
out to maintain fsync(2) guarantees. Normally, this gets taken care of
by .write_inode checking for WB_SYNC_ALL writeback and submitting & waiting
for the buffer in that case. However, if the flush worker ends up writing the
inode before data integrity writeback, this mechanism is broken.

This patch series adds a way for filesystems to track the number of the
metadata block which contains the inode and then uses this information to
write out the buffer on fsync.

								Honza

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH 1/9] affs: Drop support for metadata bh tracking
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 12:13 ` [PATCH 2/9] ext4: Allocate mapping_metadata_bhs struct on demand Jan Kara
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara, David Sterba

AFFS did all the hard work of tracking metadata bhs dirtied for an inode
but it actually never used this information as affs_file_fsync() just
calls sync_blockdev() to write back all filesystem metadata bhs. According
to a discussion with the AFFS maintainer, nobody cares about AFFS
performance, so let's keep this affs_file_fsync() behavior and just drop
all the pointless tracking from AFFS.

CC: David Sterba <dsterba@suse.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/affs/affs.h     |  1 -
 fs/affs/amigaffs.c | 12 ++++++------
 fs/affs/file.c     | 25 +++++++++++--------------
 fs/affs/inode.c    | 13 +++++--------
 fs/affs/namei.c    |  9 ++++-----
 fs/affs/super.c    |  1 -
 6 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index a0caf6ace860..406a0ef63e7b 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -44,7 +44,6 @@ struct affs_inode_info {
 	struct mutex i_link_lock;		/* Protects internal inode access. */
 	struct mutex i_ext_lock;		/* Protects internal inode access. */
 #define i_hash_lock i_ext_lock
-	struct mapping_metadata_bhs i_metadata_bhs;
 	u32	 i_blkcnt;			/* block count */
 	u32	 i_extcnt;			/* extended block count */
 	u32	*i_lc;				/* linear cache of extended blocks */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index bed4fc805e8e..6cc0fc9a4cbf 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -57,7 +57,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
 		AFFS_TAIL(sb, dir_bh)->hash_chain = cpu_to_be32(ino);
 
 	affs_adjust_checksum(dir_bh, ino);
-	mmb_mark_buffer_dirty(dir_bh, &AFFS_I(dir)->i_metadata_bhs);
+	mark_buffer_dirty(dir_bh);
 	affs_brelse(dir_bh);
 
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -100,7 +100,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
 			else
 				AFFS_TAIL(sb, bh)->hash_chain = ino;
 			affs_adjust_checksum(bh, be32_to_cpu(ino) - hash_ino);
-			mmb_mark_buffer_dirty(bh, &AFFS_I(dir)->i_metadata_bhs);
+			mark_buffer_dirty(bh);
 			AFFS_TAIL(sb, rem_bh)->parent = 0;
 			retval = 0;
 			break;
@@ -180,7 +180,7 @@ affs_remove_link(struct dentry *dentry)
 			affs_unlock_dir(dir);
 			goto done;
 		}
-		mmb_mark_buffer_dirty(link_bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(link_bh);
 
 		memcpy(AFFS_TAIL(sb, bh)->name, AFFS_TAIL(sb, link_bh)->name, 32);
 		retval = affs_insert_hash(dir, bh);
@@ -188,7 +188,7 @@ affs_remove_link(struct dentry *dentry)
 			affs_unlock_dir(dir);
 			goto done;
 		}
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 
 		affs_unlock_dir(dir);
 		iput(dir);
@@ -203,7 +203,7 @@ affs_remove_link(struct dentry *dentry)
 			__be32 ino2 = AFFS_TAIL(sb, link_bh)->link_chain;
 			AFFS_TAIL(sb, bh)->link_chain = ino2;
 			affs_adjust_checksum(bh, be32_to_cpu(ino2) - link_ino);
-			mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+			mark_buffer_dirty(bh);
 			retval = 0;
 			/* Fix the link count, if bh is a normal header block without links */
 			switch (be32_to_cpu(AFFS_TAIL(sb, bh)->stype)) {
@@ -306,7 +306,7 @@ affs_remove_header(struct dentry *dentry)
 	retval = affs_remove_hash(dir, bh);
 	if (retval)
 		goto done_unlock;
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 
 	affs_unlock_dir(dir);
 
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 144b17482d12..23e088a7ed4f 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -140,14 +140,14 @@ affs_alloc_extblock(struct inode *inode, struct buffer_head *bh, u32 ext)
 	AFFS_TAIL(sb, new_bh)->parent = cpu_to_be32(inode->i_ino);
 	affs_fix_checksum(sb, new_bh);
 
-	mmb_mark_buffer_dirty(new_bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(new_bh);
 
 	tmp = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
 	if (tmp)
 		affs_warning(sb, "alloc_ext", "previous extension set (%x)", tmp);
 	AFFS_TAIL(sb, bh)->extension = cpu_to_be32(blocknr);
 	affs_adjust_checksum(bh, blocknr - tmp);
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 
 	AFFS_I(inode)->i_extcnt++;
 	mark_inode_dirty(inode);
@@ -581,7 +581,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 		memset(AFFS_DATA(bh) + boff, 0, tmp);
 		be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp);
 		affs_fix_checksum(sb, bh);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		size += tmp;
 		bidx++;
 	} else if (bidx) {
@@ -603,7 +603,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
 		affs_fix_checksum(sb, bh);
 		bh->b_state &= ~(1UL << BH_New);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		if (prev_bh) {
 			u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next);
 
@@ -613,8 +613,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 					     bidx, tmp_next);
 			AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
 			affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
-			mmb_mark_buffer_dirty(prev_bh,
-					      &AFFS_I(inode)->i_metadata_bhs);
+			mark_buffer_dirty(prev_bh);
 			affs_brelse(prev_bh);
 		}
 		size += bsize;
@@ -733,7 +732,7 @@ static int affs_write_end_ofs(const struct kiocb *iocb,
 		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(
 			max(boff + tmp, be32_to_cpu(AFFS_DATA_HEAD(bh)->size)));
 		affs_fix_checksum(sb, bh);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		written += tmp;
 		from += tmp;
 		bidx++;
@@ -766,13 +765,12 @@ static int affs_write_end_ofs(const struct kiocb *iocb,
 						     bidx, tmp_next);
 				AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
 				affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
-				mmb_mark_buffer_dirty(prev_bh,
-					&AFFS_I(inode)->i_metadata_bhs);
+				mark_buffer_dirty(prev_bh);
 			}
 		}
 		affs_brelse(prev_bh);
 		affs_fix_checksum(sb, bh);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		written += bsize;
 		from += bsize;
 		bidx++;
@@ -801,14 +799,13 @@ static int affs_write_end_ofs(const struct kiocb *iocb,
 						     bidx, tmp_next);
 				AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr);
 				affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next);
-				mmb_mark_buffer_dirty(prev_bh,
-						&AFFS_I(inode)->i_metadata_bhs);
+				mark_buffer_dirty(prev_bh);
 			}
 		} else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp)
 			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
 		affs_brelse(prev_bh);
 		affs_fix_checksum(sb, bh);
-		mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(bh);
 		written += tmp;
 		from += tmp;
 		bidx++;
@@ -945,7 +942,7 @@ affs_truncate(struct inode *inode)
 	}
 	AFFS_TAIL(sb, ext_bh)->extension = 0;
 	affs_fix_checksum(sb, ext_bh);
-	mmb_mark_buffer_dirty(ext_bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(ext_bh);
 	affs_brelse(ext_bh);
 
 	if (inode->i_size) {
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 5dd1b016bcb0..d4a3f381c4bc 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -206,7 +206,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		}
 	}
 	affs_fix_checksum(sb, bh);
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_brelse(bh);
 	affs_free_prealloc(inode);
 	return 0;
@@ -266,11 +266,8 @@ affs_evict_inode(struct inode *inode)
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
 		affs_truncate(inode);
-	} else {
-		mmb_sync(&AFFS_I(inode)->i_metadata_bhs);
 	}
 
-	mmb_invalidate(&AFFS_I(inode)->i_metadata_bhs);
 	clear_inode(inode);
 	affs_free_prealloc(inode);
 	cache_page = (unsigned long)AFFS_I(inode)->i_lc;
@@ -305,7 +302,7 @@ affs_new_inode(struct inode *dir)
 	bh = affs_getzeroblk(sb, block);
 	if (!bh)
 		goto err_bh;
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_brelse(bh);
 
 	inode->i_uid     = current_fsuid();
@@ -393,17 +390,17 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 		AFFS_TAIL(sb, bh)->link_chain = chain;
 		AFFS_TAIL(sb, inode_bh)->link_chain = cpu_to_be32(block);
 		affs_adjust_checksum(inode_bh, block - be32_to_cpu(chain));
-		mmb_mark_buffer_dirty(inode_bh, &AFFS_I(inode)->i_metadata_bhs);
+		mark_buffer_dirty(inode_bh);
 		set_nlink(inode, 2);
 		ihold(inode);
 	}
 	affs_fix_checksum(sb, bh);
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	dentry->d_fsdata = (void *)(long)bh->b_blocknr;
 
 	affs_lock_dir(dir);
 	retval = affs_insert_hash(dir, bh);
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_unlock_dir(dir);
 	affs_unlock_link(inode);
 
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index c3c6532da4b0..57d8d755aada 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -373,7 +373,7 @@ affs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	}
 	*p = 0;
 	inode->i_size = i + 1;
-	mmb_mark_buffer_dirty(bh, &AFFS_I(inode)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_brelse(bh);
 	mark_inode_dirty(inode);
 
@@ -443,8 +443,7 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/* TODO: move it back to old_dir, if error? */
 
 done:
-	mmb_mark_buffer_dirty(bh,
-			&AFFS_I(retval ? old_dir : new_dir)->i_metadata_bhs);
+	mark_buffer_dirty(bh);
 	affs_brelse(bh);
 	return retval;
 }
@@ -497,8 +496,8 @@ affs_xrename(struct inode *old_dir, struct dentry *old_dentry,
 	retval = affs_insert_hash(old_dir, bh_new);
 	affs_unlock_dir(old_dir);
 done:
-	mmb_mark_buffer_dirty(bh_old, &AFFS_I(new_dir)->i_metadata_bhs);
-	mmb_mark_buffer_dirty(bh_new, &AFFS_I(old_dir)->i_metadata_bhs);
+	mark_buffer_dirty(bh_old);
+	mark_buffer_dirty(bh_new);
 	affs_brelse(bh_old);
 	affs_brelse(bh_new);
 	return retval;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 079f36e1ddec..8451647f3fea 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -108,7 +108,6 @@ static struct inode *affs_alloc_inode(struct super_block *sb)
 	i->i_lc = NULL;
 	i->i_ext_bh = NULL;
 	i->i_pa_cnt = 0;
-	mmb_init(&i->i_metadata_bhs, &i->vfs_inode.i_data);
 
 	return &i->vfs_inode;
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/9] ext4: Allocate mapping_metadata_bhs struct on demand
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
  2026-05-11 12:13 ` [PATCH 1/9] affs: Drop support for metadata bh tracking Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 12:13 ` [PATCH 3/9] fs: Writeout inode buffer from mmb_sync() Jan Kara
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Currently every ext4 inode gets mapping_metadata_bhs struct although it
is only needed when running without a journal and only for inodes where
any metadata was dirtied. Allocate mapping_metadata_bhs struct on demand
when dirtying the first metadata buffer for the inode.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h      |  2 +-
 fs/ext4/ext4_jbd2.c | 24 +++++++++++++++++++++---
 fs/ext4/fsync.c     | 12 ++++++++----
 fs/ext4/inode.c     |  9 +++++----
 fs/ext4/super.c     |  8 +++++---
 5 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..6bb29a20420f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1117,7 +1117,7 @@ struct ext4_inode_info {
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
 	struct jbd2_inode *jinode;
-	struct mapping_metadata_bhs i_metadata_bhs;
+	struct mapping_metadata_bhs *i_metadata_bhs;
 
 	/*
 	 * File creation time. Its function is same as that of
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 9a8c225f2753..74f05bd0cdde 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -350,6 +350,21 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line,
 	return 0;
 }
 
+static void ext4_inode_attach_mmb(struct inode *inode)
+{
+	struct mapping_metadata_bhs *mmb;
+
+	/*
+	 * It's difficult to handle failure when marking buffer dirty without
+	 * leaving the filesystem corrupted
+	 */
+	mmb = kmalloc_obj(*mmb, GFP_KERNEL | __GFP_NOFAIL);
+	mmb_init(mmb, inode->i_mapping);
+	/* Someone swapped another mmb before us? */
+	if (cmpxchg(&EXT4_I(inode)->i_metadata_bhs, NULL, mmb))
+		kfree(mmb);
+}
+
 int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 				 handle_t *handle, struct inode *inode,
 				 struct buffer_head *bh)
@@ -389,11 +404,14 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 					 err);
 		}
 	} else {
-		if (inode)
+		if (inode) {
+			if (!EXT4_I(inode)->i_metadata_bhs)
+				ext4_inode_attach_mmb(inode);
 			mmb_mark_buffer_dirty(bh,
-					      &EXT4_I(inode)->i_metadata_bhs);
-		else
+					      EXT4_I(inode)->i_metadata_bhs);
+		} else {
 			mark_buffer_dirty(bh);
+		}
 		if (inode && inode_needs_sync(inode)) {
 			sync_dirty_buffer(bh);
 			if (buffer_req(bh) && !buffer_uptodate(bh)) {
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 924726dcc85f..e25d365e1179 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -46,6 +46,7 @@
 static int ext4_sync_parent(struct inode *inode)
 {
 	struct dentry *dentry, *next;
+	struct mapping_metadata_bhs *mmb;
 	int ret = 0;
 
 	if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
@@ -68,9 +69,12 @@ static int ext4_sync_parent(struct inode *inode)
 		 * through ext4_evict_inode()) and so we are safe to flush
 		 * metadata blocks and the inode.
 		 */
-		ret = mmb_sync(&EXT4_I(inode)->i_metadata_bhs);
-		if (ret)
-			break;
+		mmb = READ_ONCE(EXT4_I(inode)->i_metadata_bhs);
+		if (mmb) {
+			ret = mmb_sync(mmb);
+			if (ret)
+				break;
+		}
 		ret = sync_inode_metadata(inode, 1);
 		if (ret)
 			break;
@@ -89,7 +93,7 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
 	};
 	int ret;
 
-	ret = mmb_fsync_noflush(file, &EXT4_I(inode)->i_metadata_bhs,
+	ret = mmb_fsync_noflush(file, EXT4_I(inode)->i_metadata_bhs,
 				start, end, datasync);
 	if (ret)
 		return ret;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..3e66e9510909 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -195,9 +195,8 @@ void ext4_evict_inode(struct inode *inode)
 			ext4_warning_inode(inode, "data will be lost");
 
 		truncate_inode_pages_final(&inode->i_data);
-		/* Avoid mballoc special inode which has no proper iops */
-		if (!EXT4_SB(inode->i_sb)->s_journal)
-			mmb_sync(&EXT4_I(inode)->i_metadata_bhs);
+		if (EXT4_I(inode)->i_metadata_bhs)
+			mmb_sync(EXT4_I(inode)->i_metadata_bhs);
 		goto no_delete;
 	}
 
@@ -3451,6 +3450,7 @@ static bool ext4_release_folio(struct folio *folio, gfp_t wait)
 static bool ext4_inode_datasync_dirty(struct inode *inode)
 {
 	journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+	struct mapping_metadata_bhs *mmb;
 
 	if (journal) {
 		if (jbd2_transaction_committed(journal,
@@ -3461,8 +3461,9 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
 		return true;
 	}
 
+	mmb = READ_ONCE(EXT4_I(inode)->i_metadata_bhs);
 	/* Any metadata buffers to write? */
-	if (mmb_has_buffers(&EXT4_I(inode)->i_metadata_bhs))
+	if (mmb && mmb_has_buffers(mmb))
 		return true;
 	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
 }
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..92134ea4620c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1430,7 +1430,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
 	ext4_fc_init_inode(&ei->vfs_inode);
 	spin_lock_init(&ei->i_fc_lock);
-	mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+	ei->i_metadata_bhs = NULL;
 	return &ei->vfs_inode;
 }
 
@@ -1527,8 +1527,10 @@ static void destroy_inodecache(void)
 void ext4_clear_inode(struct inode *inode)
 {
 	ext4_fc_del(inode);
-	if (!EXT4_SB(inode->i_sb)->s_journal)
-		mmb_invalidate(&EXT4_I(inode)->i_metadata_bhs);
+	if (EXT4_I(inode)->i_metadata_bhs) {
+		mmb_invalidate(EXT4_I(inode)->i_metadata_bhs);
+		kfree(EXT4_I(inode)->i_metadata_bhs);
+	}
 	clear_inode(inode);
 	ext4_discard_preallocations(inode);
 	/*
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 3/9] fs: Writeout inode buffer from mmb_sync()
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
  2026-05-11 12:13 ` [PATCH 1/9] affs: Drop support for metadata bh tracking Jan Kara
  2026-05-11 12:13 ` [PATCH 2/9] ext4: Allocate mapping_metadata_bhs struct on demand Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 13:27   ` Christian Brauner
  2026-05-11 12:13 ` [PATCH 4/9] ext2: Fix possibly missing inode write on fsync(2) Jan Kara
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Currently metadata bh tracking does not track inode buffers because they
are usually shared by several inodes and so our linked list tracking
cannot be used. Instead, on fsync we call sync_inode_metadata() to write the
inode, relying on filesystems' .write_inode methods to detect data integrity
writeback and to submit the inode buffer to disk and wait for it in that
case. This is however racy: for example, the flush worker can submit normal
(WB_SYNC_NONE) inode writeback first, which makes the inode clean and copies
the inode to the buffer but doesn't submit the buffer for IO. Thus the
sync_inode_metadata() call does nothing and we fail to persist the inode
buffer to disk on fsync(2).

Fix the problem by allowing the filesystem to record the number of the block
backing the inode in the mmb structure; mmb_sync() then takes care to write
out the corresponding buffer and wait for it.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/buffer.c        | 34 +++++++++++++++++++++++-----------
 include/linux/fs.h |  1 +
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index b0b3792b1496..dba29a45346b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -477,12 +477,14 @@ EXPORT_SYMBOL(mark_buffer_async_write);
  * using RCU, grab the lock, verify we didn't race with somebody detaching the
  * bh / moving it to different inode and only then proceeding.
  */
+#define INVALID_BLK (~0ULL)
 
 void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping)
 {
 	spin_lock_init(&mmb->lock);
 	INIT_LIST_HEAD(&mmb->list);
 	mmb->mapping = mapping;
+	mmb->inode_blk = INVALID_BLK;
 }
 EXPORT_SYMBOL(mmb_init);
 
@@ -593,8 +595,18 @@ int mmb_sync(struct mapping_metadata_bhs *mmb)
 			}
 		}
 	}
-
 	spin_unlock(&mmb->lock);
+
+	/* Writeout inode buffer head */
+	if (mmb->inode_blk != INVALID_BLK) {
+		bh = sb_find_get_block(mmb->mapping->host->i_sb, mmb->inode_blk);
+		write_dirty_buffer(bh, REQ_SYNC);
+		wait_on_buffer(bh);
+		if (!buffer_uptodate(bh))
+			err = -EIO;
+		brelse(bh);
+	}
+
 	blk_finish_plug(&plug);
 	spin_lock(&mmb->lock);
 
@@ -646,18 +658,18 @@ int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb,
 	if (err)
 		return err;
 
-	if (mmb)
-		ret = mmb_sync(mmb);
 	if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
-		goto out;
+		goto sync_buffers;
 	if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
-		goto out;
-
-	err = sync_inode_metadata(inode, 1);
-	if (ret == 0)
-		ret = err;
-
-out:
+		goto sync_buffers;
+
+	ret = sync_inode_metadata(inode, 1);
+sync_buffers:
+	if (mmb) {
+		err = mmb_sync(mmb);
+		if (ret == 0)
+			ret = err;
+	}
 	/* check and advance again to catch errors after syncing out buffers */
 	err = file_check_and_advance_wb_err(file);
 	if (ret == 0)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..435a41e4c90f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -446,6 +446,7 @@ extern const struct address_space_operations empty_aops;
 /* Structure for tracking metadata buffer heads associated with the mapping */
 struct mapping_metadata_bhs {
 	struct address_space *mapping;	/* Mapping bhs are associated with */
+	sector_t inode_blk;	/* Number of block containing the inode */
 	spinlock_t lock;	/* Lock protecting bh list */
 	struct list_head list;	/* The list of bhs (b_assoc_buffers) */
 };
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 4/9] ext2: Fix possibly missing inode write on fsync(2)
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
                   ` (2 preceding siblings ...)
  2026-05-11 12:13 ` [PATCH 3/9] fs: Writeout inode buffer from mmb_sync() Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 12:13 ` [PATCH 5/9] udf: " Jan Kara
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Use mmb inode buffer writeout infrastructure to reliably write out
inode's inode table block on fsync(2).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 74aca5eb572d..6ce832da944f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1612,6 +1612,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 	} else for (n = 0; n < EXT2_N_BLOCKS; n++)
 		raw_inode->i_block[n] = ei->i_data[n];
 	mark_buffer_dirty(bh);
+	ei->i_metadata_bhs.inode_blk = bh->b_blocknr;
 	if (do_sync) {
 		sync_dirty_buffer(bh);
 		if (buffer_req(bh) && !buffer_uptodate(bh)) {
@@ -1627,7 +1628,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync)
 
 int ext2_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+	return __ext2_write_inode(inode, 0);
 }
 
 int ext2_getattr(struct mnt_idmap *idmap, const struct path *path,
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 5/9] udf: Fix possibly missing inode write on fsync(2)
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
                   ` (3 preceding siblings ...)
  2026-05-11 12:13 ` [PATCH 4/9] ext2: Fix possibly missing inode write on fsync(2) Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 12:13 ` [PATCH 6/9] fat: " Jan Kara
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Use mmb inode buffer writeout infrastructure to reliably write out
inode's block on fsync(2).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 67bcf83758c8..4102d3482319 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1707,7 +1707,7 @@ void udf_update_extra_perms(struct inode *inode, umode_t mode)
 
 int udf_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
-	return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+	return udf_update_inode(inode, 0);
 }
 
 static int udf_sync_inode(struct inode *inode)
@@ -1937,6 +1937,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 
 	/* write the data blocks */
 	mark_buffer_dirty(bh);
+	iinfo->i_metadata_bhs.inode_blk = bh->b_blocknr;
 	if (do_sync) {
 		sync_dirty_buffer(bh);
 		if (buffer_write_io_error(bh)) {
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 6/9] fat: Fix possibly missing inode write on fsync(2)
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
                   ` (4 preceding siblings ...)
  2026-05-11 12:13 ` [PATCH 5/9] udf: " Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 14:32   ` OGAWA Hirofumi
  2026-05-11 12:13 ` [PATCH 7/9] minix: " Jan Kara
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Use mmb inode buffer writeout infrastructure to reliably write out
inode's buffer on fsync(2).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/fat/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 28f78df086ef..4ca00b7a618b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -907,6 +907,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
 	}
 	spin_unlock(&sbi->inode_hash_lock);
 	mark_buffer_dirty(bh);
+	MSDOS_I(inode)->i_metadata_bhs.inode_blk = bh->b_blocknr;
 	err = 0;
 	if (wait)
 		err = sync_dirty_buffer(bh);
@@ -925,7 +926,7 @@ static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
 		err = fat_clusters_flush(sb);
 		mutex_unlock(&MSDOS_SB(sb)->s_lock);
 	} else
-		err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
+		err = __fat_write_inode(inode, 0);
 
 	return err;
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 7/9] minix: Fix possibly missing inode write on fsync(2)
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
                   ` (5 preceding siblings ...)
  2026-05-11 12:13 ` [PATCH 6/9] fat: " Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 12:13 ` [PATCH 8/9] bfs: " Jan Kara
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Use mmb inode buffer writeout infrastructure to reliably write out
inode's buffer on fsync(2).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/minix/inode.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 9c6bac248907..e3e05c9308bd 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -693,14 +693,7 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc)
 		bh = V2_minix_update_inode(inode);
 	if (!bh)
 		return -EIO;
-	if (wbc->sync_mode == WB_SYNC_ALL && buffer_dirty(bh)) {
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
-			printk("IO error syncing minix inode [%s:%08llx]\n",
-				inode->i_sb->s_id, inode->i_ino);
-			err = -EIO;
-		}
-	}
+	minix_i(inode)->i_metadata_bhs.inode_blk = bh->b_blocknr;
 	brelse (bh);
 	return err;
 }
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 8/9] bfs: Fix possibly missing inode write on fsync(2)
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
                   ` (6 preceding siblings ...)
  2026-05-11 12:13 ` [PATCH 7/9] minix: " Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 12:13 ` [PATCH 9/9] ext4: Use mmb infrastructure for inode buffer writeout Jan Kara
  2026-05-11 20:49 ` [syzbot ci] Re: fs: Fix missed inode write during fsync syzbot ci
  9 siblings, 0 replies; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Use mmb inode buffer writeout infrastructure to reliably write out
inode's buffer on fsync(2).

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/bfs/inode.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 19e49c8cf750..16d351b2f122 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -165,11 +165,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
-	if (wbc->sync_mode == WB_SYNC_ALL) {
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh))
-			err = -EIO;
-	}
+	BFS_I(inode)->i_metadata_bhs.inode_blk = bh->b_blocknr;
 	brelse(bh);
 	mutex_unlock(&info->bfs_lock);
 	return err;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 9/9] ext4: Use mmb infrastructure for inode buffer writeout
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
                   ` (7 preceding siblings ...)
  2026-05-11 12:13 ` [PATCH 8/9] bfs: " Jan Kara
@ 2026-05-11 12:13 ` Jan Kara
  2026-05-11 13:30   ` Christian Brauner
  2026-05-11 20:49 ` [syzbot ci] Re: fs: Fix missed inode write during fsync syzbot ci
  9 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2026-05-11 12:13 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Christian Brauner, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4, Jan Kara

Use mmb inode buffer writeout infrastructure to reliably write out
inode's inode table block on fsync(2) in nojournal mode (from
ext4_sync_parent() and ext4_fsync_nojournal()). This significantly
simplifies the code as we don't have to explicitly handle inode buffer
writeback in ext4_write_inode() and thus we can also remove
sync_inode_metadata() calls from ext4_sync_parent() and
ext4_write_inode() call from ext4_fsync_nojournal().

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4_jbd2.c |  2 +-
 fs/ext4/ext4_jbd2.h |  2 ++
 fs/ext4/fsync.c     | 12 ------------
 fs/ext4/inode.c     | 24 +++++-------------------
 4 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 74f05bd0cdde..6bbaf72108fd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -350,7 +350,7 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line,
 	return 0;
 }
 
-static void ext4_inode_attach_mmb(struct inode *inode)
+void ext4_inode_attach_mmb(struct inode *inode)
 {
 	struct mapping_metadata_bhs *mmb;
 
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 63d17c5201b5..2a01b8279c88 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -122,6 +122,8 @@
 #define EXT4_HT_EXT_CONVERT     11
 #define EXT4_HT_MAX             12
 
+void ext4_inode_attach_mmb(struct inode *inode);
+
 int
 ext4_mark_iloc_dirty(handle_t *handle,
 		     struct inode *inode,
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e25d365e1179..af84489e57c6 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -75,9 +75,6 @@ static int ext4_sync_parent(struct inode *inode)
 			if (ret)
 				break;
 		}
-		ret = sync_inode_metadata(inode, 1);
-		if (ret)
-			break;
 	}
 	dput(dentry);
 	return ret;
@@ -87,10 +84,6 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
 				int datasync, bool *needs_barrier)
 {
 	struct inode *inode = file->f_inode;
-	struct writeback_control wbc = {
-		.sync_mode = WB_SYNC_ALL,
-		.nr_to_write = 0,
-	};
 	int ret;
 
 	ret = mmb_fsync_noflush(file, EXT4_I(inode)->i_metadata_bhs,
@@ -98,11 +91,6 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
 	if (ret)
 		return ret;
 
-	/* Force writeout of inode table buffer to disk */
-	ret = ext4_write_inode(inode, &wbc);
-	if (ret)
-		return ret;
-
 	ret = ext4_sync_parent(inode);
 
 	if (test_opt(inode->i_sb, BARRIER))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e66e9510909..09506b4de1b2 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5786,24 +5786,6 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
 
 		err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
 						EXT4_I(inode)->i_sync_tid);
-	} else {
-		struct ext4_iloc iloc;
-
-		err = __ext4_get_inode_loc_noinmem(inode, &iloc);
-		if (err)
-			return err;
-		/*
-		 * sync(2) will flush the whole buffer cache. No need to do
-		 * it here separately for each inode.
-		 */
-		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
-			sync_dirty_buffer(iloc.bh);
-		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-			ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
-					       "IO error syncing inode");
-			err = -EIO;
-		}
-		brelse(iloc.bh);
 	}
 	return err;
 }
@@ -6348,7 +6330,11 @@ int ext4_mark_iloc_dirty(handle_t *handle,
 
 	/* the do_update_inode consumes one bh->b_count */
 	get_bh(iloc->bh);
-
+	if (!ext4_handle_valid(handle)) {
+		if (!EXT4_I(inode)->i_metadata_bhs)
+			ext4_inode_attach_mmb(inode);
+		EXT4_I(inode)->i_metadata_bhs->inode_blk = iloc->bh->b_blocknr;
+	}
 	/* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
 	err = ext4_do_update_inode(handle, inode, iloc);
 	put_bh(iloc->bh);
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 3/9] fs: Writeout inode buffer from mmb_sync()
  2026-05-11 12:13 ` [PATCH 3/9] fs: Writeout inode buffer from mmb_sync() Jan Kara
@ 2026-05-11 13:27   ` Christian Brauner
  0 siblings, 0 replies; 18+ messages in thread
From: Christian Brauner @ 2026-05-11 13:27 UTC (permalink / raw)
  To: Jan Kara
  Cc: linux-fsdevel, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4

On Mon, May 11, 2026 at 02:13:53PM +0200, Jan Kara wrote:
> Currently metadata bh tracking does not track inode buffers because they
> are usually shared by several inodes and so our linked list tracking
> cannot be used. On fsync we call sync_inode_metadata() to write inode
> instead where filesystems' .write_inode methods detect data integrity
> writeback and take care to submit inode buffer to disk and wait for it
> in that case. This is however racy as for example flush worker can
> submit normal (WB_SYNC_NONE) inode writeback first, which makes the
> inode clean and copies the inode to the buffer but doesn't submit the
> buffer for IO. Thus sync_inode_metadata() call does nothing and we fail
> to persist inode buffer to disk on fsync(2).
> 
> Fix the problem by allowing filesystem to set the number of block backing
> the inode in mmb structure and mmb_sync() then takes care to writeout
> corresponding buffer and wait for it.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/buffer.c        | 34 +++++++++++++++++++++++-----------
>  include/linux/fs.h |  1 +
>  2 files changed, 24 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index b0b3792b1496..dba29a45346b 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -477,12 +477,14 @@ EXPORT_SYMBOL(mark_buffer_async_write);
>   * using RCU, grab the lock, verify we didn't race with somebody detaching the
>   * bh / moving it to different inode and only then proceeding.
>   */
> +#define INVALID_BLK (~0ULL)
>  
>  void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping)
>  {
>  	spin_lock_init(&mmb->lock);
>  	INIT_LIST_HEAD(&mmb->list);
>  	mmb->mapping = mapping;
> +	mmb->inode_blk = INVALID_BLK;
>  }
>  EXPORT_SYMBOL(mmb_init);
>  
> @@ -593,8 +595,18 @@ int mmb_sync(struct mapping_metadata_bhs *mmb)
>  			}
>  		}
>  	}
> -
>  	spin_unlock(&mmb->lock);
> +
> +	/* Writeout inode buffer head */
> +	if (mmb->inode_blk != INVALID_BLK) {
> +		bh = sb_find_get_block(mmb->mapping->host->i_sb, mmb->inode_blk);
> +		write_dirty_buffer(bh, REQ_SYNC);
> +		wait_on_buffer(bh);
> +		if (!buffer_uptodate(bh))
> +			err = -EIO;
> +		brelse(bh);
> +	}
> +
>  	blk_finish_plug(&plug);
>  	spin_lock(&mmb->lock);
>  
> @@ -646,18 +658,18 @@ int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb,
>  	if (err)
>  		return err;
>  
> -	if (mmb)
> -		ret = mmb_sync(mmb);
>  	if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
> -		goto out;
> +		goto sync_buffers;
>  	if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
> -		goto out;
> -
> -	err = sync_inode_metadata(inode, 1);
> -	if (ret == 0)
> -		ret = err;
> -
> -out:
> +		goto sync_buffers;
> +
> +	ret = sync_inode_metadata(inode, 1);
> +sync_buffers:
> +	if (mmb) {
> +		err = mmb_sync(mmb);
> +		if (ret == 0)
> +			ret = err;
> +	}
>  	/* check and advance again to catch errors after syncing out buffers */
>  	err = file_check_and_advance_wb_err(file);
>  	if (ret == 0)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 11559c513dfb..435a41e4c90f 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -446,6 +446,7 @@ extern const struct address_space_operations empty_aops;
>  /* Structure for tracking metadata buffer heads associated with the mapping */
>  struct mapping_metadata_bhs {
>  	struct address_space *mapping;	/* Mapping bhs are associated with */
> +	sector_t inode_blk;	/* Number of block containing the inode */

This is great, thanks!

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 9/9] ext4: Use mmb infrastructure for inode buffer writeout
  2026-05-11 12:13 ` [PATCH 9/9] ext4: Use mmb infrastructure for inode buffer writeout Jan Kara
@ 2026-05-11 13:30   ` Christian Brauner
  0 siblings, 0 replies; 18+ messages in thread
From: Christian Brauner @ 2026-05-11 13:30 UTC (permalink / raw)
  To: Jan Kara
  Cc: linux-fsdevel, aivazian.tigran, OGAWA Hirofumi, Ted Tso,
	linux-ext4

On Mon, May 11, 2026 at 02:13:59PM +0200, Jan Kara wrote:
> Use mmb inode buffer writeout infrastructure to reliably write out
> inode's inode table block on fsync(2) in nojournal mode (from
> ext4_sync_parent() and ext4_fsync_nojournal()). This significantly
> simplifies the code as we don't have to explicitly handle inode buffer
> writeback in ext4_write_inode() and thus we can also remove
> sync_inode_metadata() calls from ext4_sync_parent() and
> ext4_write_inode() call from ext4_fsync_nojournal().
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/ext4_jbd2.c |  2 +-
>  fs/ext4/ext4_jbd2.h |  2 ++
>  fs/ext4/fsync.c     | 12 ------------
>  fs/ext4/inode.c     | 24 +++++-------------------
>  4 files changed, 8 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index 74f05bd0cdde..6bbaf72108fd 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -350,7 +350,7 @@ int __ext4_journal_get_create_access(const char *where, unsigned int line,
>  	return 0;
>  }
>  
> -static void ext4_inode_attach_mmb(struct inode *inode)
> +void ext4_inode_attach_mmb(struct inode *inode)
>  {
>  	struct mapping_metadata_bhs *mmb;
>  
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index 63d17c5201b5..2a01b8279c88 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -122,6 +122,8 @@
>  #define EXT4_HT_EXT_CONVERT     11
>  #define EXT4_HT_MAX             12
>  
> +void ext4_inode_attach_mmb(struct inode *inode);
> +
>  int
>  ext4_mark_iloc_dirty(handle_t *handle,
>  		     struct inode *inode,
> diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
> index e25d365e1179..af84489e57c6 100644
> --- a/fs/ext4/fsync.c
> +++ b/fs/ext4/fsync.c
> @@ -75,9 +75,6 @@ static int ext4_sync_parent(struct inode *inode)
>  			if (ret)
>  				break;
>  		}
> -		ret = sync_inode_metadata(inode, 1);
> -		if (ret)
> -			break;
>  	}
>  	dput(dentry);
>  	return ret;
> @@ -87,10 +84,6 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
>  				int datasync, bool *needs_barrier)
>  {
>  	struct inode *inode = file->f_inode;
> -	struct writeback_control wbc = {
> -		.sync_mode = WB_SYNC_ALL,
> -		.nr_to_write = 0,
> -	};
>  	int ret;
>  
>  	ret = mmb_fsync_noflush(file, EXT4_I(inode)->i_metadata_bhs,
> @@ -98,11 +91,6 @@ static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end,
>  	if (ret)
>  		return ret;
>  
> -	/* Force writeout of inode table buffer to disk */
> -	ret = ext4_write_inode(inode, &wbc);
> -	if (ret)
> -		return ret;
> -
>  	ret = ext4_sync_parent(inode);
>  
>  	if (test_opt(inode->i_sb, BARRIER))
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 3e66e9510909..09506b4de1b2 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5786,24 +5786,6 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
>  
>  		err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
>  						EXT4_I(inode)->i_sync_tid);
> -	} else {
> -		struct ext4_iloc iloc;
> -
> -		err = __ext4_get_inode_loc_noinmem(inode, &iloc);
> -		if (err)
> -			return err;
> -		/*
> -		 * sync(2) will flush the whole buffer cache. No need to do
> -		 * it here separately for each inode.
> -		 */
> -		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
> -			sync_dirty_buffer(iloc.bh);
> -		if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
> -			ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
> -					       "IO error syncing inode");
> -			err = -EIO;
> -		}
> -		brelse(iloc.bh);
>  	}
>  	return err;
>  }
> @@ -6348,7 +6330,11 @@ int ext4_mark_iloc_dirty(handle_t *handle,
>  
>  	/* the do_update_inode consumes one bh->b_count */
>  	get_bh(iloc->bh);
> -
> +	if (!ext4_handle_valid(handle)) {
> +		if (!EXT4_I(inode)->i_metadata_bhs)
> +			ext4_inode_attach_mmb(inode);
> +		EXT4_I(inode)->i_metadata_bhs->inode_blk = iloc->bh->b_blocknr;

The series is great overall. The only thing I think we should change is
that we should hide this

EXT4_I(inode)->i_metadata_bhs->inode_blk = iloc->bh->b_blocknr;

behind a dedicated static inline/regular function call instead of
open-coding it everywhere. Can then also be paired with some
VFS_WARN_ON_ONCE() to detect garbage bh->b_blocknr.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/9] fat: Fix possibly missing inode write on fsync(2)
  2026-05-11 12:13 ` [PATCH 6/9] fat: " Jan Kara
@ 2026-05-11 14:32   ` OGAWA Hirofumi
  2026-05-11 17:03     ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: OGAWA Hirofumi @ 2026-05-11 14:32 UTC (permalink / raw)
  To: Jan Kara
  Cc: linux-fsdevel, Christian Brauner, aivazian.tigran, Ted Tso,
	linux-ext4

Jan Kara <jack@suse.cz> writes:

> Use mmb inode buffer writeout infrastructure to reliably write out
> inode's buffer on fsync(2).

> Signed-off-by: Jan Kara <jack@suse.cz>
> ---
>  fs/fat/inode.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/fs/fat/inode.c b/fs/fat/inode.c
> index 28f78df086ef..4ca00b7a618b 100644
> --- a/fs/fat/inode.c
> +++ b/fs/fat/inode.c
> @@ -907,6 +907,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
>  	}
>  	spin_unlock(&sbi->inode_hash_lock);
>  	mark_buffer_dirty(bh);
> +	MSDOS_I(inode)->i_metadata_bhs.inode_blk = bh->b_blocknr;

When the inode position was changed/removed, this will point to the wrong
block. And it may sync and wait on an unrelated block.

>  	err = 0;
>  	if (wait)
>  		err = sync_dirty_buffer(bh);
> @@ -925,7 +926,7 @@ static int fat_write_inode(struct inode *inode, struct writeback_control *wbc)
>  		err = fat_clusters_flush(sb);
>  		mutex_unlock(&MSDOS_SB(sb)->s_lock);
>  	} else
> -		err = __fat_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL);
> +		err = __fat_write_inode(inode, 0);
>  
>  	return err;
>  }

-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/9] fat: Fix possibly missing inode write on fsync(2)
  2026-05-11 14:32   ` OGAWA Hirofumi
@ 2026-05-11 17:03     ` Jan Kara
  2026-05-11 18:02       ` OGAWA Hirofumi
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2026-05-11 17:03 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Jan Kara, linux-fsdevel, Christian Brauner, aivazian.tigran,
	Ted Tso, linux-ext4

On Mon 11-05-26 23:32:45, OGAWA Hirofumi wrote:
> Jan Kara <jack@suse.cz> writes:
> 
> > Use mmb inode buffer writeout infrastructure to reliably write out
> > inode's buffer on fsync(2).
> 
> > Signed-off-by: Jan Kara <jack@suse.cz>
> > ---
> >  fs/fat/inode.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/fat/inode.c b/fs/fat/inode.c
> > index 28f78df086ef..4ca00b7a618b 100644
> > --- a/fs/fat/inode.c
> > +++ b/fs/fat/inode.c
> > @@ -907,6 +907,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
> >  	}
> >  	spin_unlock(&sbi->inode_hash_lock);
> >  	mark_buffer_dirty(bh);
> > +	MSDOS_I(inode)->i_metadata_bhs.inode_blk = bh->b_blocknr;
> 
> When inode position was changed/removed, this will point the wrong
> block. And maybe sync a unrelated block and wait.

So I didn't realize that e.g. rename does change the backing inode block.
But given we set i_metadata_bhs.inode_blk on each inode write, inode_blk
should always contain the current position where the inode was written so
fsync should be syncing the right block. Or am I still missing something?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/9] fat: Fix possibly missing inode write on fsync(2)
  2026-05-11 17:03     ` Jan Kara
@ 2026-05-11 18:02       ` OGAWA Hirofumi
  2026-05-12  7:29         ` Jan Kara
  0 siblings, 1 reply; 18+ messages in thread
From: OGAWA Hirofumi @ 2026-05-11 18:02 UTC (permalink / raw)
  To: Jan Kara
  Cc: linux-fsdevel, Christian Brauner, aivazian.tigran, Ted Tso,
	linux-ext4

Jan Kara <jack@suse.cz> writes:

> On Mon 11-05-26 23:32:45, OGAWA Hirofumi wrote:
>> Jan Kara <jack@suse.cz> writes:
>> 
>> > Use mmb inode buffer writeout infrastructure to reliably write out
>> > inode's buffer on fsync(2).
>> 
>> > Signed-off-by: Jan Kara <jack@suse.cz>
>> > ---
>> >  fs/fat/inode.c | 3 ++-
>> >  1 file changed, 2 insertions(+), 1 deletion(-)
>> >
>> > diff --git a/fs/fat/inode.c b/fs/fat/inode.c
>> > index 28f78df086ef..4ca00b7a618b 100644
>> > --- a/fs/fat/inode.c
>> > +++ b/fs/fat/inode.c
>> > @@ -907,6 +907,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
>> >  	}
>> >  	spin_unlock(&sbi->inode_hash_lock);
>> >  	mark_buffer_dirty(bh);
>> > +	MSDOS_I(inode)->i_metadata_bhs.inode_blk = bh->b_blocknr;
>> 
>> When inode position was changed/removed, this will point the wrong
>> block. And maybe sync a unrelated block and wait.
>
> So I didn't realize that e.g. rename does change the backing inode block.
> But given we set i_metadata_bhs.inode_blk on each inode write, inode_blk
> should always contain the current position where the inode was written so
> fsync should be syncing the right block. Or am I still missing something?

I haven't checked the rename case completely; I just recalled it when I
saw this code, so it needs to be confirmed.  But at least in the remove
case, the stale block number will be left in place even after the block
is reused.
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [syzbot ci] Re: fs: Fix missed inode write during fsync
  2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
                   ` (8 preceding siblings ...)
  2026-05-11 12:13 ` [PATCH 9/9] ext4: Use mmb infrastructure for inode buffer writeout Jan Kara
@ 2026-05-11 20:49 ` syzbot ci
  9 siblings, 0 replies; 18+ messages in thread
From: syzbot ci @ 2026-05-11 20:49 UTC (permalink / raw)
  To: aivazian.tigran, brauner, dsterba, hirofumi, jack, linux-ext4,
	linux-fsdevel, tytso
  Cc: syzbot, syzkaller-bugs

syzbot ci has tested the following series

[v1] fs: Fix missed inode write during fsync
https://lore.kernel.org/all/20260511115725.28441-1-jack@suse.cz
* [PATCH 1/9] affs: Drop support for metadata bh tracking
* [PATCH 2/9] ext4: Allocate mapping_metadata_bhs struct on demand
* [PATCH 3/9] fs: Writeout inode buffer from mmb_sync()
* [PATCH 4/9] ext2: Fix possibly missing inode write on fsync(2)
* [PATCH 5/9] udf: Fix possibly missing inode write on fsync(2)
* [PATCH 6/9] fat: Fix possibly missing inode write on fsync(2)
* [PATCH 7/9] minix: Fix possibly missing inode write on fsync(2)
* [PATCH 8/9] bfs: Fix possibly missing inode write on fsync(2)
* [PATCH 9/9] ext4: Use mmb infrastructure for inode buffer writeout

and found the following issue:
KASAN: null-ptr-deref Write in write_dirty_buffer

Full report is available here:
https://ci.syzbot.org/series/d987d2d8-3775-4aa9-959f-8a045778888c

***

KASAN: null-ptr-deref Write in write_dirty_buffer

tree:      torvalds
URL:       https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base:      5d6919055dec134de3c40167a490f33c74c12581
arch:      amd64
compiler:  Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
config:    https://ci.syzbot.org/builds/567d596c-ca65-43c9-bd7d-1e60cfe9da2a/config
syz repro: https://ci.syzbot.org/findings/1bc13af8-2d91-4fbd-b43e-fbe72f29ca41/syz_repro

EXT4-fs (loop2): unmounting filesystem 00000000-0000-0000-0000-000000000000.
==================================================================
BUG: KASAN: null-ptr-deref in instrument_atomic_read_write include/linux/instrumented.h:112 [inline]
BUG: KASAN: null-ptr-deref in test_and_set_bit_lock include/asm-generic/bitops/instrumented-lock.h:57 [inline]
BUG: KASAN: null-ptr-deref in trylock_buffer include/linux/buffer_head.h:425 [inline]
BUG: KASAN: null-ptr-deref in lock_buffer include/linux/buffer_head.h:431 [inline]
BUG: KASAN: null-ptr-deref in write_dirty_buffer+0x37/0x190 fs/buffer.c:2760
Write of size 8 at addr 0000000000000000 by task syz-executor/5742

CPU: 1 UID: 0 PID: 5742 Comm: syz-executor Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
Call Trace:
 <TASK>
 dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
 kasan_report+0x117/0x150 mm/kasan/report.c:595
 check_region_inline mm/kasan/generic.c:-1 [inline]
 kasan_check_range+0x264/0x2c0 mm/kasan/generic.c:200
 instrument_atomic_read_write include/linux/instrumented.h:112 [inline]
 test_and_set_bit_lock include/asm-generic/bitops/instrumented-lock.h:57 [inline]
 trylock_buffer include/linux/buffer_head.h:425 [inline]
 lock_buffer include/linux/buffer_head.h:431 [inline]
 write_dirty_buffer+0x37/0x190 fs/buffer.c:2760
 mmb_sync+0x74c/0xed0 fs/buffer.c:603
 ext4_evict_inode+0x2fa/0x1040 fs/ext4/inode.c:199
 evict+0x61e/0xb10 fs/inode.c:841
 ext4_quota_off+0x470/0x580 fs/ext4/super.c:7326
 ext4_quotas_off fs/ext4/super.c:1195 [inline]
 ext4_put_super+0xdf/0xd80 fs/ext4/super.c:1306
 generic_shutdown_super+0x13d/0x2d0 fs/super.c:646
 kill_block_super+0x44/0x90 fs/super.c:1725
 ext4_kill_sb+0x68/0xb0 fs/ext4/super.c:7494
 deactivate_locked_super+0xbc/0x130 fs/super.c:476
 cleanup_mnt+0x437/0x4d0 fs/namespace.c:1312
 task_work_run+0x1d9/0x270 kernel/task_work.c:233
 resume_user_mode_work include/linux/resume_user_mode.h:50 [inline]
 __exit_to_user_mode_loop kernel/entry/common.c:67 [inline]
 exit_to_user_mode_loop+0xf3/0x4d0 kernel/entry/common.c:98
 __exit_to_user_mode_prepare include/linux/irq-entry-common.h:207 [inline]
 syscall_exit_to_user_mode_prepare include/linux/irq-entry-common.h:238 [inline]
 syscall_exit_to_user_mode include/linux/entry-common.h:318 [inline]
 do_syscall_64+0x33e/0xf80 arch/x86/entry/syscall_64.c:100
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fd8d1b9e017
Code: a2 c7 05 dc 06 25 00 00 00 00 00 eb 96 e8 e1 12 00 00 90 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 01 c3 48 c7 c2 e8 ff ff ff f7 d8 64 89 02 b8
RSP: 002b:00007ffef04ebf88 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007fd8d1c32120 RCX: 00007fd8d1b9e017
RDX: 0000000000000000 RSI: 0000000000000009 RDI: 00007ffef04ec040
RBP: 00007ffef04ec040 R08: 00007ffef04ed040 R09: 00000000ffffffff
R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffef04ed0d0
R13: 00007fd8d1c32120 R14: 0000000000014595 R15: 00007ffef04ed110
 </TASK>
==================================================================


***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
  Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.

To test a patch for this bug, please reply with `#syz test`
(should be on a separate line).

The patch should be attached to the email.
Note: arguments like custom git repos and branches are not supported.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/9] fat: Fix possibly missing inode write on fsync(2)
  2026-05-11 18:02       ` OGAWA Hirofumi
@ 2026-05-12  7:29         ` Jan Kara
  2026-05-12 14:17           ` OGAWA Hirofumi
  0 siblings, 1 reply; 18+ messages in thread
From: Jan Kara @ 2026-05-12  7:29 UTC (permalink / raw)
  To: OGAWA Hirofumi
  Cc: Jan Kara, linux-fsdevel, Christian Brauner, aivazian.tigran,
	Ted Tso, linux-ext4

On Tue 12-05-26 03:02:13, OGAWA Hirofumi wrote:
> Jan Kara <jack@suse.cz> writes:
> 
> > On Mon 11-05-26 23:32:45, OGAWA Hirofumi wrote:
> >> Jan Kara <jack@suse.cz> writes:
> >> 
> >> > Use mmb inode buffer writeout infrastructure to reliably write out
> >> > inode's buffer on fsync(2).
> >> 
> >> > Signed-off-by: Jan Kara <jack@suse.cz>
> >> > ---
> >> >  fs/fat/inode.c | 3 ++-
> >> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >> >
> >> > diff --git a/fs/fat/inode.c b/fs/fat/inode.c
> >> > index 28f78df086ef..4ca00b7a618b 100644
> >> > --- a/fs/fat/inode.c
> >> > +++ b/fs/fat/inode.c
> >> > @@ -907,6 +907,7 @@ static int __fat_write_inode(struct inode *inode, int wait)
> >> >  	}
> >> >  	spin_unlock(&sbi->inode_hash_lock);
> >> >  	mark_buffer_dirty(bh);
> >> > +	MSDOS_I(inode)->i_metadata_bhs.inode_blk = bh->b_blocknr;
> >> 
> >> When inode position was changed/removed, this will point the wrong
> >> block. And maybe sync a unrelated block and wait.
> >
> > So I didn't realize that e.g. rename does change the backing inode block.
> > But given we set i_metadata_bhs.inode_blk on each inode write, inode_blk
> > should always contain the current position where the inode was written so
> > fsync should be syncing the right block. Or am I still missing something?
> 
> I didn't check the case of rename completely, just recalled it when I
> saw this code, need confirm/check.  But at least, the case of remove
> will leave it even after the block is reused.

Right. fat_detach() should set i_metadata_bhs.inode_blk to INVALID_BLK,
thanks for catching that. I was thinking whether we should set
i_metadata_bhs.inode_blk in fat_attach() instead of during inode dirtying.
It would be somewhat more obviously correct but it could lead to
unnecessary flushing in case the directory block gets dirtied by some other
entry in it while the inode we are fsyncing never got dirtied. IMHO that's
a sensible tradeoff so I'd do that but what is your opinion?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 6/9] fat: Fix possibly missing inode write on fsync(2)
  2026-05-12  7:29         ` Jan Kara
@ 2026-05-12 14:17           ` OGAWA Hirofumi
  0 siblings, 0 replies; 18+ messages in thread
From: OGAWA Hirofumi @ 2026-05-12 14:17 UTC (permalink / raw)
  To: Jan Kara
  Cc: linux-fsdevel, Christian Brauner, aivazian.tigran, Ted Tso,
	linux-ext4

Jan Kara <jack@suse.cz> writes:

>> I didn't check the case of rename completely, just recalled it when I
>> saw this code, need confirm/check.  But at least, the case of remove
>> will leave it even after the block is reused.
>
> Right. fat_detach() should set i_metadata_bhs.inode_blk to INVALID_BLK,
> thanks for catching that. I was thinking whether we should set
> i_metadata_bhs.inode_blk in fat_attach() instead of during inode dirtying.
> It would be somewhat more obviously correct but it could lead to
> unnecessary flushing in case the directory block gets dirtied by some other
> entry in it while the inode we are fsyncing got never dirtied. IMHO that's
> a sensible tradeoff so I'd do that but what is your opinion?

IMO, the marker should be cleared after each sync, like b_assoc_buffers or
the I_DIRTY_* flags. Otherwise, because the block is shared with other
inodes, it would easily sync/wait on unrelated dirty data.

[And for a more serious implementation, it looks like it should be cleared
at points similar to those where b_assoc_buffers is cleared, to minimize
unrelated sync/wait.]
-- 
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2026-05-12 14:17 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-05-11 12:13 [PATCH 0/9] fs: Fix missed inode write during fsync Jan Kara
2026-05-11 12:13 ` [PATCH 1/9] affs: Drop support for metadata bh tracking Jan Kara
2026-05-11 12:13 ` [PATCH 2/9] ext4: Allocate mapping_metadata_bhs struct on demand Jan Kara
2026-05-11 12:13 ` [PATCH 3/9] fs: Writeout inode buffer from mmb_sync() Jan Kara
2026-05-11 13:27   ` Christian Brauner
2026-05-11 12:13 ` [PATCH 4/9] ext2: Fix possibly missing inode write on fsync(2) Jan Kara
2026-05-11 12:13 ` [PATCH 5/9] udf: " Jan Kara
2026-05-11 12:13 ` [PATCH 6/9] fat: " Jan Kara
2026-05-11 14:32   ` OGAWA Hirofumi
2026-05-11 17:03     ` Jan Kara
2026-05-11 18:02       ` OGAWA Hirofumi
2026-05-12  7:29         ` Jan Kara
2026-05-12 14:17           ` OGAWA Hirofumi
2026-05-11 12:13 ` [PATCH 7/9] minix: " Jan Kara
2026-05-11 12:13 ` [PATCH 8/9] bfs: " Jan Kara
2026-05-11 12:13 ` [PATCH 9/9] ext4: Use mmb infrastructure for inode buffer writeout Jan Kara
2026-05-11 13:30   ` Christian Brauner
2026-05-11 20:49 ` [syzbot ci] Re: fs: Fix missed inode write during fsync syzbot ci

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox