* [PATCH 1/2] btrfs: don't finalize fs roots during qgroup snapshot accounting
2026-04-23 22:43 [PATCH 0/2] btrfs: skip COW for written extent buffers Leo Martins
@ 2026-04-23 22:43 ` Leo Martins
2026-04-24 3:15 ` Sun YangKai
2026-04-23 22:43 ` [PATCH 2/2] btrfs: skip COW for written extent buffers allocated in current transaction Leo Martins
1 sibling, 1 reply; 4+ messages in thread
From: Leo Martins @ 2026-04-23 22:43 UTC (permalink / raw)
To: linux-btrfs, kernel-team
qgroup_account_snapshot() calls commit_fs_roots() to persist root
items for consistent qgroup accounting. However, commit_fs_roots()
is a transaction finalization function that also clears
BTRFS_ROOT_FORCE_COW, frees log trees, clears BTRFS_ROOT_TRANS_TAG,
and frees qgroup metadata reservations. These are all premature
since the transaction is still in progress and create_pending_snapshot()
will modify the source root's tree afterward.
Currently this is harmless because should_cow_block() always COWs
WRITTEN extent buffers regardless of FORCE_COW state. But clearing
FORCE_COW mid-transaction leaves roots in an incorrect state:
blocks may be shared with a snapshot (via btrfs_copy_root) while
FORCE_COW no longer reflects that. Any future optimization that
relies on FORCE_COW to decide whether COW can be skipped would
silently corrupt snapshots.
Extract prepare_root_item() as a helper that updates a root's
root_item and writes it to the tree root. This is the only part of
commit_fs_roots() that qgroup accounting needs. Add
qgroup_commit_fs_roots() which iterates dirty roots calling only
prepare_root_item(), without finalization side effects. It iterates
using an advancing index instead of clearing tags, so roots remain
tagged for the real commit_fs_roots() call later, eliminating the
record_root_in_trans() re-tagging that was previously needed.
Signed-off-by: Leo Martins <loemra.dev@gmail.com>
---
fs/btrfs/transaction.c | 105 +++++++++++++++++++++++++++++------------
1 file changed, 75 insertions(+), 30 deletions(-)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 55791bb100a22..5b362f2680200 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1477,8 +1477,66 @@ void btrfs_add_dead_root(struct btrfs_root *root)
}
/*
- * Update each subvolume root and its relocation root, if it exists, in the tree
- * of tree roots. Also free log roots if they exist.
+ * Update the root_item for @root to point at the current root->node
+ * and write it to the tree root.
+ */
+static int prepare_root_item(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ if (root->commit_root != root->node) {
+ list_add_tail(&root->dirty_list,
+ &trans->transaction->switch_commits);
+ btrfs_set_root_node(&root->root_item, root->node);
+ }
+
+ return btrfs_update_root(trans, trans->fs_info->tree_root,
+ &root->root_key, &root->root_item);
+}
+
+/*
+ * Update root items for all dirty fs roots without the finalization
+ * side effects of commit_fs_roots() (clearing FORCE_COW, freeing
+ * logs, clearing trans tags, etc.). Tags are preserved so
+ * commit_fs_roots() can still find these roots later.
+ */
+static int qgroup_commit_fs_roots(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *gang[8];
+ unsigned long index = 0;
+ int i;
+ int ret;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ while (1) {
+ ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+ (void **)gang, index,
+ ARRAY_SIZE(gang),
+ BTRFS_ROOT_TRANS_TAG);
+ if (ret == 0)
+ break;
+ for (i = 0; i < ret; i++) {
+ struct btrfs_root *root = gang[i];
+ int ret2;
+
+ index = btrfs_root_id(root) + 1;
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+
+ ret2 = prepare_root_item(trans, root);
+ if (unlikely(ret2))
+ return ret2;
+
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ }
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ return 0;
+}
+
+/*
+ * Finalize each dirty subvolume root: free log trees, update relocation
+ * roots, clear BTRFS_ROOT_FORCE_COW, persist root items, and clear
+ * BTRFS_ROOT_TRANS_TAG.
*/
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
@@ -1535,16 +1593,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_mb__after_atomic();
- if (root->commit_root != root->node) {
- list_add_tail(&root->dirty_list,
- &trans->transaction->switch_commits);
- btrfs_set_root_node(&root->root_item,
- root->node);
- }
-
- ret2 = btrfs_update_root(trans, fs_info->tree_root,
- &root->root_key,
- &root->root_item);
+ ret2 = prepare_root_item(trans, root);
if (unlikely(ret2))
return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
@@ -1604,7 +1653,13 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return ret;
}
- ret = commit_fs_roots(trans);
+ /*
+ * Use qgroup_commit_fs_roots() instead of commit_fs_roots()
+ * because the transaction is still in progress and we must not
+ * clear FORCE_COW on roots whose blocks are shared with a
+ * snapshot.
+ */
+ ret = qgroup_commit_fs_roots(trans);
if (ret)
return ret;
ret = btrfs_qgroup_account_extents(trans);
@@ -1618,16 +1673,12 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return ret;
/*
- * Now we do a simplified commit transaction, which will:
- * 1) commit all subvolume and extent tree
- * To ensure all subvolume and extent tree have a valid
- * commit_root to accounting later insert_dir_item()
- * 2) write all btree blocks onto disk
- * This is to make sure later btree modification will be cowed
- * Or commit_root can be populated and cause wrong qgroup numbers
- * In this simplified commit, we don't really care about other trees
- * like chunk and root tree, as they won't affect qgroup.
- * And we don't write super to avoid half committed status.
+ * Simplified commit for qgroup accounting:
+ * 1) commit cow-only roots (extent tree, etc.) for consistent
+ * commit_root pointers
+ * 2) write all btree blocks to disk so subsequent modifications
+ * are properly COW'd
+ * We don't write the super to avoid a half-committed state.
*/
ret = commit_cowonly_roots(trans);
if (ret)
@@ -1640,13 +1691,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
return ret;
}
- /*
- * Force parent root to be updated, as we recorded it before so its
- * last_trans == cur_transid.
- * Or it won't be committed again onto disk after later
- * insert_dir_item()
- */
- return record_root_in_trans(trans, parent, true);
+ return 0;
}
/*
--
2.52.0
^ permalink raw reply related [flat|nested] 4+ messages in thread

* [PATCH 2/2] btrfs: skip COW for written extent buffers allocated in current transaction
2026-04-23 22:43 [PATCH 0/2] btrfs: skip COW for written extent buffers Leo Martins
2026-04-23 22:43 ` [PATCH 1/2] btrfs: don't finalize fs roots during qgroup snapshot accounting Leo Martins
@ 2026-04-23 22:43 ` Leo Martins
1 sibling, 0 replies; 4+ messages in thread
From: Leo Martins @ 2026-04-23 22:43 UTC (permalink / raw)
To: linux-btrfs, kernel-team; +Cc: Filipe Manana, Sun YangKai
When memory pressure causes writeback of a recently COW'd buffer,
btrfs sets BTRFS_HEADER_FLAG_WRITTEN on it. Subsequent
btrfs_search_slot() restarts then see the WRITTEN flag and re-COW
the buffer unnecessarily, causing COW amplification that can exhaust
block reservations and degrade throughput.
Overwriting in place is crash-safe because the committed superblock
does not reference buffers allocated in the current (uncommitted)
transaction, so no on-disk tree points to this block yet.
When should_cow_block() encounters a WRITTEN buffer whose generation
matches the current transaction, instead of requesting a COW, re-dirty
the buffer and re-register its range in the transaction's dirty_pages.
Both are necessary because btrfs tracks dirty metadata through two
independent mechanisms. set_extent_buffer_dirty() sets the
EXTENT_BUFFER_DIRTY flag and the buffer_tree xarray PAGECACHE_TAG_DIRTY
mark, which is what background writeback (btree_write_cache_pages) uses
to find and write dirty buffers. The transaction's dirty_pages io tree
is a separate structure used by btrfs_write_and_wait_transaction() at
commit time to ensure all buffers allocated during the transaction are
persisted. The dirty_pages range was originally registered in
btrfs_init_new_buffer() when the block was first allocated. Normally
dirty_pages is only cleared at commit time by
btrfs_write_and_wait_transaction(), but if qgroups are enabled and
snapshots are being created, qgroup_account_snapshot() may have already
called btrfs_write_and_wait_transaction() and released the range before
the final commit-time call.
Keep BTRFS_HEADER_FLAG_WRITTEN set so that btrfs_free_tree_block()
correctly pins the block if it is freed later.
Relax the lockdep assertion in btrfs_mark_buffer_dirty() from
btrfs_assert_tree_write_locked() to lockdep_assert_held() so that it
accepts either a read or write lock. should_cow_block() may be called
from btrfs_search_slot() when only a read lock is held (nodes above
write_lock_level are read-locked). The write lock assertion previously
documented the caller convention that buffer content was being modified
under exclusive access, but btrfs_mark_buffer_dirty() and
set_extent_buffer_dirty() themselves only perform independently
synchronized operations: atomic bit ops on bflags, folio_mark_dirty()
(kernel-internal folio locking), xarray mark updates (xarray spinlock),
and percpu counter updates. The read lock is sufficient because it
prevents lock_extent_buffer_for_io() from acquiring the write lock and
racing on the dirty state. Since rw_semaphore permits concurrent
readers, multiple threads can enter btrfs_mark_buffer_dirty()
simultaneously for the same buffer; this is safe because
test_and_set_bit(EXTENT_BUFFER_DIRTY) ensures only one thread performs
the full dirty state transition.
Remove the CONFIG_BTRFS_DEBUG assertion in set_extent_buffer_dirty()
that checked folio_test_dirty() after marking the buffer dirty. This
assertion assumed exclusive access (only one thread in
set_extent_buffer_dirty() at a time), which held when the only caller
was btrfs_mark_buffer_dirty() under write lock. With concurrent readers
calling through should_cow_block(), a thread that loses the
test_and_set_bit race sees was_dirty=true and skips the folio dirty
marking, but the winning thread may not have called
btrfs_meta_folio_set_dirty() yet, causing the assertion to fire. This
is a benign race: the winning thread will complete the folio dirty
marking, and no writeback can clear it while readers hold their locks.
Hoist the EXTENT_BUFFER_WRITEBACK, BTRFS_HEADER_FLAG_RELOC, and
BTRFS_ROOT_FORCE_COW checks before the WRITTEN block since they apply
regardless of whether the buffer has been written back. This
consolidates the exclusion logic and simplifies the WRITTEN path to
only handle log trees and zoned devices. Moving the RELOC checks
before the smp_mb__before_atomic() barrier is safe because both
btrfs_root_id() (immutable) and BTRFS_HEADER_FLAG_RELOC (set at COW
time under tree lock) are stable values not subject to concurrent
modification; the barrier is only needed for BTRFS_ROOT_FORCE_COW
which is set concurrently by create_pending_snapshot().
Exclude cases where in-place overwrite is not safe:
- EXTENT_BUFFER_WRITEBACK: buffer is mid-I/O
- Zoned devices: require sequential writes
- Log trees: log blocks are immediately referenced by a committed
superblock via btrfs_sync_log(), so overwriting could corrupt the
committed log
- BTRFS_ROOT_FORCE_COW: snapshot in progress
- BTRFS_HEADER_FLAG_RELOC: block being relocated
Signed-off-by: Leo Martins <loemra.dev@gmail.com>
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Sun YangKai <sunk67188@gmail.com>
---
fs/btrfs/ctree.c | 83 +++++++++++++++++++++++++++++++++++---------
fs/btrfs/disk-io.c | 2 +-
fs/btrfs/extent_io.c | 4 ---
3 files changed, 67 insertions(+), 22 deletions(-)
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 829d8be7f423b..3f2e9e3145c2d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -603,6 +603,25 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
return ret;
}
+/*
+ * Check if @buf needs to be COW'd.
+ *
+ * Returns true if COW is required, false if the block can be reused
+ * in place.
+ *
+ * We do not need to COW a block if:
+ * 1) the block was created or changed in this transaction;
+ * 2) the block does not belong to TREE_RELOC tree;
+ * 3) the root is not forced COW.
+ *
+ * Forced COW happens when we create a snapshot during transaction commit:
+ * after copying the src root, we must COW the shared block to ensure
+ * metadata consistency.
+ *
+ * When returning false for a WRITTEN buffer allocated in the current
+ * transaction, re-dirties the buffer for in-place overwrite instead
+ * of requesting a new COW.
+ */
static inline bool should_cow_block(struct btrfs_trans_handle *trans,
const struct btrfs_root *root,
struct extent_buffer *buf)
@@ -610,22 +629,14 @@ static inline bool should_cow_block(struct btrfs_trans_handle *trans,
if (btrfs_is_testing(root->fs_info))
return false;
- /*
- * We do not need to cow a block if
- * 1) this block is not created or changed in this transaction;
- * 2) this block does not belong to TREE_RELOC tree;
- * 3) the root is not forced COW.
- *
- * What is forced COW:
- * when we create snapshot during committing the transaction,
- * after we've finished copying src root, we must COW the shared
- * block to ensure the metadata consistency.
- */
-
if (btrfs_header_generation(buf) != trans->transid)
return true;
- if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN))
+ if (test_bit(EXTENT_BUFFER_WRITEBACK, &buf->bflags))
+ return true;
+
+ if (btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
+ btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
return true;
/* Ensure we can see the FORCE_COW bit. */
@@ -633,11 +644,49 @@ static inline bool should_cow_block(struct btrfs_trans_handle *trans,
if (test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
return true;
- if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
- return false;
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ /*
+ * The buffer was allocated in this transaction and has been
+ * written back to disk (WRITTEN is set). Normally we'd COW
+ * it again, but since the committed superblock doesn't
+ * reference this buffer (it was allocated in this transaction),
+ * we can safely overwrite it in place.
+ *
+ * We keep BTRFS_HEADER_FLAG_WRITTEN set. The block has been
+ * persisted at this bytenr and will be again after the
+ * in-place update. This is important so that
+ * btrfs_free_tree_block() correctly pins the block if it is
+ * freed later (e.g., during tree rebalancing or FORCE_COW).
+ *
+ * Log trees and zoned devices cannot use this optimization:
+ * - Log trees: log blocks are written and immediately
+ * referenced by a committed superblock via
+ * btrfs_sync_log(), bypassing the normal transaction
+ * commit. Overwriting in place could corrupt the
+ * committed log.
+ * - Zoned devices: require sequential writes.
+ */
+ if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID ||
+ btrfs_is_zoned(root->fs_info))
+ return true;
- if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
- return true;
+ /*
+ * Re-register this block's range in the current transaction's
+ * dirty_pages so that btrfs_write_and_wait_transaction()
+ * writes it. The range was originally registered when the
+ * block was allocated. Normally dirty_pages is only cleared
+ * at commit time by btrfs_write_and_wait_transaction(), but
+ * if qgroups are enabled and snapshots are being created,
+ * qgroup_account_snapshot() may have already called
+ * btrfs_write_and_wait_transaction() and released the range
+ * before the final commit-time call.
+ */
+ btrfs_set_extent_bit(&trans->transaction->dirty_pages,
+ buf->start,
+ buf->start + buf->len - 1,
+ EXTENT_DIRTY, NULL);
+ btrfs_mark_buffer_dirty(trans, buf);
+ }
btrfs_inhibit_eb_writeback(trans, buf);
return false;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8a11be02eeb9b..cc6893f9892ef 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -4479,7 +4479,7 @@ void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
#endif
/* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
ASSERT(trans->transid == fs_info->generation);
- btrfs_assert_tree_write_locked(buf);
+ lockdep_assert_held(&buf->lock);
if (unlikely(transid != fs_info->generation)) {
btrfs_abort_transaction(trans, -EUCLEAN);
btrfs_crit(fs_info,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3411c41993477..dd6304795ace0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3833,10 +3833,6 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
eb->len,
eb->fs_info->dirty_metadata_batch);
}
-#ifdef CONFIG_BTRFS_DEBUG
- for (int i = 0; i < num_extent_folios(eb); i++)
- ASSERT(folio_test_dirty(eb->folios[i]));
-#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
--
2.52.0
^ permalink raw reply related [flat|nested] 4+ messages in thread