* [PATCH] btrfs: add btrfs_inode_disk_i_size() helper to prevent torn reads of disk_i_size
@ 2026-03-24 9:01 Cen Zhang
0 siblings, 0 replies; only message in thread
From: Cen Zhang @ 2026-03-24 9:01 UTC (permalink / raw)
To: clm; +Cc: dsterba, linux-btrfs, linux-kernel, baijiaju1990, zzzccc,
Cen Zhang
From: zzzccc <1539412714@qq.com>
btrfs_inode::disk_i_size is a u64 field updated under inode->lock by
btrfs_inode_safe_disk_i_size_write(), but several read sites access it
without holding that lock. On 64-bit platforms this is fine because
aligned u64 loads are architecturally atomic, but on 32-bit platforms a
u64 load is performed as two 32-bit loads, which can tear if a concurrent
write to the field lands between them.
A torn read of disk_i_size is dangerous in the metadata-serialization
paths (fill_inode_item, fill_stack_inode_item) because the torn value
gets persisted to the B-tree on disk. After a crash, fsck / mount would
see a file size that never existed:
- If the torn value is too large, stale data beyond the real EOF is
exposed (information leak).
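- If the torn value is too small (e.g. zero), file data is silently
lost.
Add btrfs_inode_disk_i_size(), which reads disk_i_size under inode->lock
on 32-bit builds and with READ_ONCE() otherwise, and use it at the
unlocked read sites in fill_stack_inode_item(), btrfs_drop_extents(),
inode_should_defrag(), fill_inode_item() and btrfs_setsize(). The
data_race() annotation in btrfs_drop_extents() is dropped because the
helper now performs that read.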
Signed-off-by: Cen Zhang <zzzccc427@gmail.com>
---
fs/btrfs/btrfs_inode.h | 24 ++++++++++++++++++++++++
fs/btrfs/delayed-inode.c | 2 +-
fs/btrfs/file.c | 2 +-
fs/btrfs/inode.c | 6 +++---
4 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 55c272fe5d92..7aff326bedbb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -418,6 +418,30 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
inode->disk_i_size = size;
}
+/*
+ * Return a snapshot of inode->disk_i_size that cannot be a torn read.
+ *
+ * With BITS_PER_LONG == 32 the value is copied under inode->lock, so the
+ * caller must not already hold that lock; this only excludes writers that
+ * store under the same lock (btrfs_inode_safe_disk_i_size_write() is the
+ * one named by this patch - TODO confirm the other store sites). On other
+ * builds a single READ_ONCE() load is used, relying on aligned u64 loads
+ * being atomic.
+ */
+static inline u64 btrfs_inode_disk_i_size(struct btrfs_inode *inode)
+{
+#if BITS_PER_LONG == 32
+ u64 size;
+
+ spin_lock(&inode->lock);
+ size = inode->disk_i_size;
+ spin_unlock(&inode->lock);
+ return size;
+#else
+ return READ_ONCE(inode->disk_i_size);
+#endif
+}
+
static inline bool btrfs_is_free_space_inode(const struct btrfs_inode *inode)
{
return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 56ff8afe9a22..86be9d1bee55 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1841,7 +1841,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode));
btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode));
- btrfs_set_stack_inode_size(inode_item, inode->disk_i_size);
+ btrfs_set_stack_inode_size(inode_item, btrfs_inode_disk_i_size(inode));
btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode);
btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink);
btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode));
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a4cb9d3cfc4e..dcd306f669d8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -178,7 +178,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->drop_cache)
btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
- if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
+ if (args->start >= btrfs_inode_disk_i_size(inode) && !args->replace_extent)
modify_tree = 0;
update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index afc5d75d2dcb..5c75c949e855 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -837,7 +837,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
- (start > 0 || end + 1 < inode->disk_i_size))
+ (start > 0 || end + 1 < btrfs_inode_disk_i_size(inode)))
btrfs_add_inode_defrag(inode, small_write);
}
@@ -4264,7 +4264,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
- btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_inode_size(leaf, item, btrfs_inode_disk_i_size(BTRFS_I(inode)));
btrfs_set_inode_mode(leaf, item, inode->i_mode);
btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
@@ -5455,7 +5455,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret2)
return ret2;
- i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+ i_size_write(inode, btrfs_inode_disk_i_size(BTRFS_I(inode)));
}
}
--
2.34.1
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2026-03-24 9:19 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-03-24 9:01 [PATCH] btrfs: add btrfs_inode_disk_i_size() helper to prevent torn reads of disk_i_size Cen Zhang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox