* [PATCH 1/2] btrfs: fix data race on transaction->state
2025-11-18 15:59 [PATCH 0/2] Fix data race with transaction->state Josef Bacik
@ 2025-11-18 15:59 ` Josef Bacik
2025-11-19 12:36 ` David Sterba
2025-11-18 15:59 ` [PATCH 2/2] btrfs: remove useless smp_mb in start_transaction Josef Bacik
1 sibling, 1 reply; 4+ messages in thread
From: Josef Bacik @ 2025-11-18 15:59 UTC (permalink / raw)
To: linux-btrfs
Debugging a hang with btrfs on QEMU I discovered a data race with
transaction->state. In wait_current_trans we do
wait_event(fs_info->transaction_wait,
cur_trans->state>=TRANS_STATE_UNBLOCKED ||
TRANS_ABORTED(cur_trans));
however we're doing this outside of the fs_info->trans_lock. This
generally isn't super problematic because we hit
wake_up(fs_info->transaction_wait) quite a bit, but it could lead to
latencies where we miss wakeups, or in the worst case (like the compiler
re-orders the load of the ->state outside of the wait_event loop) we
could hang completely.
Fix this by using WRITE_ONCE whenever we update trans->state, and use
READ_ONCE everywhere we're not holding fs_info->trans_lock.
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
fs/btrfs/disk-io.c | 8 ++++----
fs/btrfs/qgroup.c | 2 +-
fs/btrfs/transaction.c | 28 +++++++++++++++-------------
fs/btrfs/volumes.c | 3 ++-
4 files changed, 22 insertions(+), 19 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0aa7e5d1b05f..3859d934f658 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4795,17 +4795,17 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
btrfs_destroy_delayed_refs(cur_trans);
- cur_trans->state = TRANS_STATE_COMMIT_START;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_COMMIT_START);
wake_up(&fs_info->transaction_blocked_wait);
- cur_trans->state = TRANS_STATE_UNBLOCKED;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_UNBLOCKED);
wake_up(&fs_info->transaction_wait);
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);
- cur_trans->state =TRANS_STATE_COMPLETED;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_COMPLETED);
wake_up(&cur_trans->commit_wait);
}
@@ -4828,7 +4828,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
continue;
}
if (t == fs_info->running_transaction) {
- t->state = TRANS_STATE_COMMIT_DOING;
+ WRITE_ONCE(t->state, TRANS_STATE_COMMIT_DOING);
spin_unlock(&fs_info->trans_lock);
/*
* We wait for 0 num_writers since we don't hold a trans
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1175b8192cd7..658171e96bee 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -3094,7 +3094,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
* are holding the qgroup_ioctl_lock, otherwise we can race with a quota
* disable operation (ioctl) and access a freed quota root.
*/
- if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
+ if (READ_ONCE(trans->transaction->state) != TRANS_STATE_COMMIT_DOING)
lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 89ae0c7a610a..20bacee478d1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -345,7 +345,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
- cur_trans->state = TRANS_STATE_RUNNING;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_RUNNING);
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
@@ -509,6 +509,8 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
+ lockdep_assert_held(&trans->fs_info->trans_lock);
+
return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
!TRANS_ABORTED(trans));
@@ -530,7 +532,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
wait_event(fs_info->transaction_wait,
- cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+ READ_ONCE(cur_trans->state) >= TRANS_STATE_UNBLOCKED ||
TRANS_ABORTED(cur_trans));
btrfs_put_transaction(cur_trans);
} else {
@@ -726,7 +728,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS);
smp_mb();
- if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
+ if (READ_ONCE(cur_trans->state) >= TRANS_STATE_COMMIT_START &&
may_wait_transaction(fs_info, type)) {
current->journal_info = h;
btrfs_commit_transaction(h);
@@ -912,7 +914,7 @@ static noinline void wait_for_commit(struct btrfs_transaction *commit,
btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
while (1) {
- wait_event(commit->commit_wait, commit->state >= min_state);
+ wait_event(commit->commit_wait, READ_ONCE(commit->state) >= min_state);
if (put)
btrfs_put_transaction(commit);
@@ -1008,7 +1010,7 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_transaction *cur_trans = trans->transaction;
- if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
+ if (READ_ONCE(cur_trans->state) >= TRANS_STATE_COMMIT_START ||
test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
return true;
@@ -1986,7 +1988,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
*/
btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
wait_event(fs_info->transaction_blocked_wait,
- cur_trans->state >= TRANS_STATE_COMMIT_START ||
+ READ_ONCE(cur_trans->state) >= TRANS_STATE_COMMIT_START ||
TRANS_ABORTED(cur_trans));
btrfs_put_transaction(cur_trans);
}
@@ -2029,7 +2031,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
BUG_ON(list_empty(&cur_trans->list));
if (cur_trans == fs_info->running_transaction) {
- cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_COMMIT_DOING);
spin_unlock(&fs_info->trans_lock);
/*
@@ -2269,7 +2271,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
- cur_trans->state = TRANS_STATE_COMMIT_PREP;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_COMMIT_PREP);
wake_up(&fs_info->transaction_blocked_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
@@ -2307,7 +2309,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
}
- cur_trans->state = TRANS_STATE_COMMIT_START;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_COMMIT_START);
wake_up(&fs_info->transaction_blocked_wait);
spin_unlock(&fs_info->trans_lock);
@@ -2362,7 +2364,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
spin_lock(&fs_info->trans_lock);
add_pending_snapshot(trans);
- cur_trans->state = TRANS_STATE_COMMIT_DOING;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_COMMIT_DOING);
spin_unlock(&fs_info->trans_lock);
/*
@@ -2517,7 +2519,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
mutex_lock(&fs_info->tree_log_mutex);
spin_lock(&fs_info->trans_lock);
- cur_trans->state = TRANS_STATE_UNBLOCKED;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_UNBLOCKED);
fs_info->running_transaction = NULL;
spin_unlock(&fs_info->trans_lock);
mutex_unlock(&fs_info->reloc_mutex);
@@ -2552,7 +2554,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* We needn't acquire the lock here because there is no other task
* which can change it.
*/
- cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_SUPER_COMMITTED);
wake_up(&cur_trans->commit_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
@@ -2568,7 +2570,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* We needn't acquire the lock here because there is no other task
* which can change it.
*/
- cur_trans->state = TRANS_STATE_COMPLETED;
+ WRITE_ONCE(cur_trans->state, TRANS_STATE_COMPLETED);
wake_up(&cur_trans->commit_wait);
btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 88065e52184c..aac09223eb5d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7889,7 +7889,8 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
{
struct btrfs_device *curr, *next;
- ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d" , trans->state);
+ ASSERT(READ_ONCE(trans->state) == TRANS_STATE_COMMIT_DOING, "state=%d",
+ READ_ONCE(trans->state));
if (list_empty(&trans->dev_update_list))
return;
--
2.51.1
^ permalink raw reply related [flat|nested] 4+ messages in thread