From: Jan Kara <jack@suse.cz>
To: <linux-fsdevel@vger.kernel.org>
Cc: Christian Brauner <brauner@kernel.org>,
aivazian.tigran@gmail.com, Ted Tso <tytso@mit.edu>,
<linux-ext4@vger.kernel.org>,
OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>,
Jan Kara <jack@suse.cz>
Subject: [PATCH v2 03/10] fs: Writeout inode buffer from mmb_sync()
Date: Mon, 25 May 2026 10:58:09 +0200 [thread overview]
Message-ID: <20260525085821.769119-13-jack@suse.cz> (raw)
In-Reply-To: <20260525085035.12891-1-jack@suse.cz>
Currently metadata bh tracking does not track inode buffers because they
are usually shared by several inodes and so our linked list tracking
cannot be used. Instead, on fsync we call sync_inode_metadata() to write
the inode; filesystems' .write_inode methods detect data integrity
writeback and in that case take care to submit the inode buffer to disk
and wait for it. This is however racy as, for example, the flush worker
can submit normal (WB_SYNC_NONE) inode writeback first, which makes the
inode clean and copies the inode to the buffer but doesn't submit the
buffer for IO. Thus the sync_inode_metadata() call does nothing and we
fail to persist the inode buffer to disk on fsync(2).
Fix the problem by allowing the filesystem to set the number of the block
backing the inode in the mmb structure; mmb_sync() then takes care to
write out the corresponding buffer and wait for it.
Signed-off-by: Jan Kara <jack@suse.cz>
---
fs/buffer.c | 64 +++++++++++++++++++++++++++++--------
include/linux/buffer_head.h | 14 ++++++++
include/linux/fs.h | 1 +
3 files changed, 66 insertions(+), 13 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index b0b3792b1496..f83fb3cdc6ac 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -477,12 +477,12 @@ EXPORT_SYMBOL(mark_buffer_async_write);
* using RCU, grab the lock, verify we didn't race with somebody detaching the
* bh / moving it to different inode and only then proceeding.
*/
-
void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping)
{
spin_lock_init(&mmb->lock);
INIT_LIST_HEAD(&mmb->list);
mmb->mapping = mapping;
+ mmb->inode_blk = MMB_INVALID_BLK;
}
EXPORT_SYMBOL(mmb_init);
@@ -550,11 +550,13 @@ EXPORT_SYMBOL_GPL(mmb_has_buffers);
int mmb_sync(struct mapping_metadata_bhs *mmb)
{
struct buffer_head *bh;
+ sector_t inode_blk;
int err = 0;
struct blk_plug plug;
LIST_HEAD(tmp);
- if (!mmb_has_buffers(mmb))
+ if (!mmb_has_buffers(mmb) &&
+ data_race(mmb->inode_blk == MMB_INVALID_BLK))
return 0;
blk_start_plug(&plug);
@@ -593,8 +595,22 @@ int mmb_sync(struct mapping_metadata_bhs *mmb)
}
}
}
-
+ inode_blk = mmb->inode_blk;
+ mmb->inode_blk = MMB_INVALID_BLK;
spin_unlock(&mmb->lock);
+
+ /* Writeout inode buffer if it was set and wasn't written out yet */
+ if (inode_blk != MMB_INVALID_BLK) {
+ bh = sb_find_get_block(mmb->mapping->host->i_sb, inode_blk);
+ if (bh) {
+ write_dirty_buffer(bh, REQ_SYNC);
+ wait_on_buffer(bh);
+ if (!buffer_uptodate(bh))
+ err = -EIO;
+ brelse(bh);
+ }
+ }
+
blk_finish_plug(&plug);
spin_lock(&mmb->lock);
@@ -646,18 +662,18 @@ int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb,
if (err)
return err;
- if (mmb)
- ret = mmb_sync(mmb);
if (!(inode_state_read_once(inode) & I_DIRTY_ALL))
- goto out;
+ goto sync_buffers;
if (datasync && !(inode_state_read_once(inode) & I_DIRTY_DATASYNC))
- goto out;
-
- err = sync_inode_metadata(inode, 1);
- if (ret == 0)
- ret = err;
-
-out:
+ goto sync_buffers;
+
+ ret = sync_inode_metadata(inode, 1);
+sync_buffers:
+ if (mmb) {
+ err = mmb_sync(mmb);
+ if (ret == 0)
+ ret = err;
+ }
/* check and advance again to catch errors after syncing out buffers */
err = file_check_and_advance_wb_err(file);
if (ret == 0)
@@ -733,6 +749,28 @@ void mmb_mark_buffer_dirty(struct buffer_head *bh,
}
EXPORT_SYMBOL(mmb_mark_buffer_dirty);
+/**
+ * mmb_mark_inode_buffer_dirty - Mark buffer containing inode as dirty and
+ * track it for fsync.
+ * @bh: The buffer containing the inode.
+ * @mmb: Mmb structure for metadata tracking.
+ *
+ * Mark the buffer containing inode as dirty and track the block number of
+ * the buffer containing the inode in mmb so that it gets written out from
+ * mmb_sync().
+ */
+void mmb_mark_inode_buffer_dirty(struct buffer_head *bh,
+ struct mapping_metadata_bhs *mmb)
+{
+ /* For simplicity we use mmb->lock to synchronize with mmb_sync() */
+ spin_lock(&mmb->lock);
+ mark_buffer_dirty(bh);
+ mmb->inode_blk = bh->b_blocknr;
+ spin_unlock(&mmb->lock);
+}
+EXPORT_SYMBOL(mmb_mark_inode_buffer_dirty);
+
+
/**
* block_dirty_folio - Mark a folio as dirty.
* @mapping: The address space containing this folio.
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index e4939e33b4b5..b77464359028 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -207,6 +207,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate);
/* Things to do with metadata buffers list */
void mmb_mark_buffer_dirty(struct buffer_head *bh, struct mapping_metadata_bhs *mmb);
+void mmb_mark_inode_buffer_dirty(struct buffer_head *bh,
+ struct mapping_metadata_bhs *mmb);
int mmb_fsync_noflush(struct file *file, struct mapping_metadata_bhs *mmb,
loff_t start, loff_t end, bool datasync);
int mmb_fsync(struct file *file, struct mapping_metadata_bhs *mmb,
@@ -513,12 +515,24 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio);
#ifdef CONFIG_BUFFER_HEAD
+#define MMB_INVALID_BLK (~0ULL)
+
void buffer_init(void);
bool try_to_free_buffers(struct folio *folio);
void mmb_init(struct mapping_metadata_bhs *mmb, struct address_space *mapping);
bool mmb_has_buffers(struct mapping_metadata_bhs *mmb);
void mmb_invalidate(struct mapping_metadata_bhs *mmb);
int mmb_sync(struct mapping_metadata_bhs *mmb);
+static inline void mmb_clear_inode_blk(struct mapping_metadata_bhs *mmb)
+{
+ /*
+ * The lock is mostly pointless here but let's keep setting of
+ * inode_blk consistently under it.
+ */
+ spin_lock(&mmb->lock);
+ mmb->inode_blk = MMB_INVALID_BLK;
+ spin_unlock(&mmb->lock);
+}
void invalidate_bh_lrus(void);
void invalidate_bh_lrus_cpu(void);
bool has_bh_in_lru(int cpu, void *dummy);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..435a41e4c90f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -446,6 +446,7 @@ extern const struct address_space_operations empty_aops;
/* Structure for tracking metadata buffer heads associated with the mapping */
struct mapping_metadata_bhs {
struct address_space *mapping; /* Mapping bhs are associated with */
+ sector_t inode_blk; /* Number of block containing the inode */
spinlock_t lock; /* Lock protecting bh list */
struct list_head list; /* The list of bhs (b_assoc_buffers) */
};
--
2.51.0
next prev parent reply other threads:[~2026-05-25 8:58 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-25 8:58 [PATCH v2 0/10] fs: Fix missed inode write during fsync Jan Kara
2026-05-25 8:58 ` [PATCH v2 01/10] affs: Drop support for metadata bh tracking Jan Kara
2026-05-25 10:06 ` David Sterba
2026-05-25 8:58 ` [PATCH v2 02/10] ext4: Allocate mapping_metadata_bhs struct on demand Jan Kara
2026-06-03 13:41 ` Theodore Tso
2026-05-25 8:58 ` Jan Kara [this message]
2026-05-25 8:58 ` [PATCH v2 04/10] ext2: Fix possibly missing inode write on fsync(2) Jan Kara
2026-05-25 8:58 ` [PATCH v2 05/10] udf: " Jan Kara
2026-05-25 8:58 ` [PATCH v2 06/10] fat: " Jan Kara
2026-05-25 8:58 ` [PATCH v2 07/10] minix: " Jan Kara
2026-05-25 8:58 ` [PATCH v2 08/10] bfs: " Jan Kara
2026-05-25 8:58 ` [PATCH v2 09/10] ext4: Use mmb infrastructure for inode buffer writeout Jan Kara
2026-06-03 13:41 ` Theodore Tso
2026-05-25 8:58 ` [PATCH v2 10/10] fs: Fix missed inode writeback when racing with __writeback_single_inode Jan Kara
2026-06-02 7:22 ` [PATCH v2 0/10] fs: Fix missed inode write during fsync Jan Kara
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260525085821.769119-13-jack@suse.cz \
--to=jack@suse.cz \
--cc=aivazian.tigran@gmail.com \
--cc=brauner@kernel.org \
--cc=hirofumi@mail.parknet.co.jp \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox