From: Cen Zhang <zzzccc427@gmail.com>
To: clm@fb.com
Cc: dsterba@suse.com, linux-btrfs@vger.kernel.org,
linux-kernel@vger.kernel.org, baijiaju1990@gmail.com,
zzzccc <1539412714@qq.com>, Cen Zhang <zzzccc427@gmail.com>
Subject: [PATCH] btrfs: add btrfs_inode_disk_i_size() helper to prevent torn reads of disk_i_size
Date: Tue, 24 Mar 2026 17:01:59 +0800 [thread overview]
Message-ID: <20260324090200.3932789-1-zzzccc427@gmail.com> (raw)
From: zzzccc <1539412714@qq.com>
btrfs_inode::disk_i_size is a u64 field updated under inode->lock by
btrfs_inode_safe_disk_i_size_write(), but several read sites access it
without holding that lock. On 64-bit platforms this is fine because
aligned u64 loads are architecturally atomic, but on 32-bit platforms a
u64 load is performed as two 32-bit loads, so a reader can observe one
half from before and one half from after a concurrent write.
A torn read of disk_i_size is dangerous in the metadata-serialization
paths (fill_inode_item, fill_stack_inode_item) because the torn value
gets persisted to the B-tree on disk. After a crash, fsck / mount would
see a file size that never existed:
- If the torn value is too large, stale data beyond the real EOF is
exposed (information leak).
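- If the torn value is too small (e.g. zero), file data is silently
lost.
Add btrfs_inode_disk_i_size(), which reads disk_i_size with READ_ONCE()
on 64-bit and under inode->lock on 32-bit, and use it at the lockless
read sites in fill_inode_item(), fill_stack_inode_item(),
btrfs_drop_extents(), inode_should_defrag() and btrfs_setsize().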
Signed-off-by: Cen Zhang <zzzccc427@gmail.com>
---
fs/btrfs/btrfs_inode.h | 24 ++++++++++++++++++++++++
fs/btrfs/delayed-inode.c | 2 +-
fs/btrfs/file.c | 2 +-
fs/btrfs/inode.c | 6 +++---
4 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 55c272fe5d92..7aff326bedbb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -418,6 +418,30 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
inode->disk_i_size = size;
}
+/*
+ * Return a consistent snapshot of inode->disk_i_size for callers that do
+ * not hold inode->lock.
+ *
+ * Writers update disk_i_size under inode->lock (see
+ * btrfs_inode_safe_disk_i_size_write()). On 64-bit builds one READ_ONCE()
+ * load is used. On 32-bit builds a u64 load can tear, so the load is done
+ * under inode->lock; callers must therefore not already hold that lock,
+ * or the 32-bit path would self-deadlock.
+ */
+static inline u64 btrfs_inode_disk_i_size(struct btrfs_inode *inode)
+{
+#if BITS_PER_LONG == 32
+ u64 size;
+
+ spin_lock(&inode->lock);
+ size = inode->disk_i_size;
+ spin_unlock(&inode->lock);
+ return size;
+#else
+ return READ_ONCE(inode->disk_i_size);
+#endif
+}
+
static inline bool btrfs_is_free_space_inode(const struct btrfs_inode *inode)
{
return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 56ff8afe9a22..86be9d1bee55 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1841,7 +1841,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode));
btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode));
- btrfs_set_stack_inode_size(inode_item, inode->disk_i_size);
+ btrfs_set_stack_inode_size(inode_item, btrfs_inode_disk_i_size(inode));
btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode);
btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink);
btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode));
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a4cb9d3cfc4e..dcd306f669d8 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -178,7 +178,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans,
if (args->drop_cache)
btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
- if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent)
+ if (args->start >= btrfs_inode_disk_i_size(inode) && !args->replace_extent)
modify_tree = 0;
update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index afc5d75d2dcb..5c75c949e855 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -837,7 +837,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode,
{
/* If this is a small write inside eof, kick off a defrag */
if (num_bytes < small_write &&
- (start > 0 || end + 1 < inode->disk_i_size))
+ (start > 0 || end + 1 < btrfs_inode_disk_i_size(inode)))
btrfs_add_inode_defrag(inode, small_write);
}
@@ -4264,7 +4264,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
- btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
+ btrfs_set_inode_size(leaf, item, btrfs_inode_disk_i_size(BTRFS_I(inode)));
btrfs_set_inode_mode(leaf, item, inode->i_mode);
btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
@@ -5455,7 +5455,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
if (ret2)
return ret2;
- i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+ i_size_write(inode, btrfs_inode_disk_i_size(BTRFS_I(inode)));
}
}
--
2.34.1
reply other threads:[~2026-03-24 9:19 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260324090200.3932789-1-zzzccc427@gmail.com \
--to=zzzccc427@gmail.com \
--cc=1539412714@qq.com \
--cc=baijiaju1990@gmail.com \
--cc=clm@fb.com \
--cc=dsterba@suse.com \
--cc=linux-btrfs@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox