From: Boris Burkov <boris@bur.io>
To: Mark Harmstone <mark@harmstone.com>
Cc: linux-btrfs@vger.kernel.org
Subject: Re: [PATCH v5 11/16] btrfs: move existing remaps before relocating block group
Date: Tue, 11 Nov 2025 21:41:35 -0800 [thread overview]
Message-ID: <aRQeDyQVdcWN4kxF@devvm12410.ftw0.facebook.com> (raw)
In-Reply-To: <20251110171511.20900-12-mark@harmstone.com>
On Mon, Nov 10, 2025 at 05:14:35PM +0000, Mark Harmstone wrote:
> If when relocating a block group we find that `remap_bytes` > 0 in its
> block group item, that means that it has been the destination block
> group for another that has been remapped.
>
> We need to search the remap tree for any remap backrefs within this
> range, and move the data to a third block group. This is because
> otherwise btrfs_translate_remap() could end up following an unbounded
> chain of remaps, which would only get worse over time.
>
> We only relocate one block group at a time, so `remap_bytes` will only
> ever go down while we are doing this. Once we're finished we set the
> REMAPPED flag on the block group, which will permanently prevent any
> other data from being moved to within it.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
> ---
> fs/btrfs/bio.c | 3 +-
> fs/btrfs/bio.h | 3 +
> fs/btrfs/extent-tree.c | 6 +-
> fs/btrfs/relocation.c | 487 +++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 496 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
> index a73652b8724a..8e75a369729f 100644
> --- a/fs/btrfs/bio.c
> +++ b/fs/btrfs/bio.c
> @@ -778,7 +778,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
> */
> if (!(inode->flags & BTRFS_INODE_NODATASUM) &&
> !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) &&
> - !btrfs_is_data_reloc_root(inode->root)) {
> + !btrfs_is_data_reloc_root(inode->root) &&
> + !bbio->is_remap) {
> if (should_async_write(bbio) &&
> btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num))
> goto done;
> diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
> index deaeea3becf4..aefb95401499 100644
> --- a/fs/btrfs/bio.h
> +++ b/fs/btrfs/bio.h
> @@ -87,6 +87,9 @@ struct btrfs_bio {
> */
> bool is_scrub;
>
> + /* Whether the bio is coming from copy_remapped_data_io(). */
> + bool is_remap;
> +
> /* Whether the csum generation for data write is async. */
> bool async_csum;
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 4bda12cdf697..a813f441c459 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4544,7 +4544,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
> block_group->cached != BTRFS_CACHE_NO) {
> down_read(&space_info->groups_sem);
> if (list_empty(&block_group->list) ||
> - block_group->ro) {
> + block_group->ro ||
> + block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
> /*
> * someone is removing this block group,
> * we can't jump into the have_block_group
> @@ -4578,7 +4579,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>
> ffe_ctl->hinted = false;
> /* If the block group is read-only, we can skip it entirely. */
> - if (unlikely(block_group->ro)) {
> + if (unlikely(block_group->ro) ||
> + block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
> if (ffe_ctl->for_treelog)
> btrfs_clear_treelog_bg(block_group);
> if (ffe_ctl->for_data_reloc)
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index 535e07cc3719..1fed02f76ed4 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3977,6 +3977,487 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
> btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> }
>
> +struct reloc_io_private {
> + struct completion done;
> + refcount_t pending_refs;
> + blk_status_t status;
> +};
> +
> +static void reloc_endio(struct btrfs_bio *bbio)
> +{
> + struct reloc_io_private *priv = bbio->private;
> +
> + if (bbio->bio.bi_status)
> + WRITE_ONCE(priv->status, bbio->bio.bi_status);
> +
> + if (refcount_dec_and_test(&priv->pending_refs))
> + complete(&priv->done);
> +
> + bio_put(&bbio->bio);
> +}
> +
> +static int copy_remapped_data_io(struct btrfs_fs_info *fs_info,
> + struct reloc_io_private *priv,
> + struct page **pages, u64 addr, u64 length,
> + bool do_write)
> +{
> + struct btrfs_bio *bbio;
> + unsigned long i = 0;
> + blk_opf_t op = do_write ? REQ_OP_WRITE : REQ_OP_READ;
> +
> + init_completion(&priv->done);
> + refcount_set(&priv->pending_refs, 1);
> + priv->status = 0;
> +
> + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, BTRFS_I(fs_info->btree_inode),
> + addr, reloc_endio, priv);
> + bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
> + bbio->is_remap = true;
> +
> + do {
> + size_t bytes = min_t(u64, length, PAGE_SIZE);
> +
> + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
> + refcount_inc(&priv->pending_refs);
> + btrfs_submit_bbio(bbio, 0);
> +
> + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op,
> + BTRFS_I(fs_info->btree_inode),
> + addr, reloc_endio, priv);
> + bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
> + bbio->is_remap = true;
> + continue;
> + }
> +
> + i++;
> + addr += bytes;
> + length -= bytes;
> + } while (length);
> +
> + refcount_inc(&priv->pending_refs);
> + btrfs_submit_bbio(bbio, 0);
> +
> + if (!refcount_dec_and_test(&priv->pending_refs))
> + wait_for_completion_io(&priv->done);
> +
> + return blk_status_to_errno(READ_ONCE(priv->status));
> +}
> +
> +static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr,
> + u64 new_addr, u64 length)
> +{
> + int ret;
> + struct page **pages;
> + unsigned int nr_pages;
> + struct reloc_io_private priv;
> +
> + nr_pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> + if (!pages)
> + return -ENOMEM;
> + ret = btrfs_alloc_page_array(nr_pages, pages, 0);
> + if (ret) {
> + ret = -ENOMEM;
> + goto end;
> + }
> +
> + ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr, length,
> + false);
> + if (ret)
> + goto end;
> +
> + ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr, length,
> + true);
> +
> +end:
> + for (unsigned int i = 0; i < nr_pages; i++) {
> + if (pages[i])
> + __free_page(pages[i]);
> + }
> + kfree(pages);
> +
> + return ret;
> +}
> +
> +static int do_copy(struct btrfs_fs_info *fs_info, u64 old_addr, u64 new_addr,
> + u64 length)
> +{
> + int ret;
> +
> + /* Copy 1MB at a time, to avoid using too much memory. */
> +
> + do {
> + u64 to_copy = min_t(u64, length, SZ_1M);
> +
> + /* Limit to one bio. */
> + to_copy = min_t(u64, to_copy, BIO_MAX_VECS << PAGE_SHIFT);
> +
> + ret = copy_remapped_data(fs_info, old_addr, new_addr,
> + to_copy);
> + if (ret)
> + return ret;
> +
> + if (to_copy == length)
> + break;
> +
> + old_addr += to_copy;
> + new_addr += to_copy;
> + length -= to_copy;
> + } while (true);
> +
> + return 0;
> +}
> +
> +static int add_remap_item(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, u64 new_addr, u64 length,
> + u64 old_addr)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_remap remap;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + int ret;
> +
> + key.objectid = old_addr;
> + key.type = BTRFS_REMAP_KEY;
> + key.offset = length;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
> + &key, sizeof(struct btrfs_remap));
> + if (ret)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + btrfs_set_stack_remap_address(&remap, new_addr);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + return 0;
> +}
> +
> +static int add_remap_backref_item(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, u64 new_addr,
> + u64 length, u64 old_addr)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_remap remap;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + int ret;
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = length;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &key, sizeof(struct btrfs_remap));
> + if (ret)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + btrfs_set_stack_remap_address(&remap, old_addr);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + return 0;
> +}
> +
> +static int move_existing_remap(struct btrfs_fs_info *fs_info,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg, u64 new_addr,
> + u64 length, u64 old_addr)
> +{
> + struct btrfs_trans_handle *trans;
> + struct extent_buffer *leaf;
> + struct btrfs_remap *remap_ptr, remap;
> + struct btrfs_key key, ins;
> + u64 dest_addr, dest_length, min_size;
> + struct btrfs_block_group *dest_bg;
> + int ret;
> + bool is_data = bg->flags & BTRFS_BLOCK_GROUP_DATA;
> + struct btrfs_space_info *sinfo = bg->space_info;
> + bool mutex_taken = false, bg_needs_free_space;
> +
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, length);
> + spin_unlock(&sinfo->lock);
> +
> + if (is_data)
> + min_size = fs_info->sectorsize;
> + else
> + min_size = fs_info->nodesize;
> +
> + ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size,
> + 0, 0, &ins, is_data, false);
> + if (ret) {
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, -length);
> + spin_unlock(&sinfo->lock);
> + return ret;
> + }
> +
> + dest_addr = ins.objectid;
> + dest_length = ins.offset;
> +
> + if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) {
> + u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize);
> +
> + btrfs_free_reserved_extent(fs_info, dest_addr + new_length,
> + dest_length - new_length, 0);
> +
> + dest_length = new_length;
> + }
> +
> + trans = btrfs_join_transaction(fs_info->remap_root);
> + if (IS_ERR(trans)) {
> + ret = PTR_ERR(trans);
> + trans = NULL;
> + goto end;
> + }
> +
> + mutex_lock(&fs_info->remap_mutex);
> + mutex_taken = true;
> +
> + /* Find old remap entry. */
> +
> + key.objectid = old_addr;
> + key.type = BTRFS_REMAP_KEY;
> + key.offset = length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
> + path, 0, 1);
> + if (ret == 1) {
> + /*
> + * Not a problem if the remap entry wasn't found: that means
> + * that another transaction has deallocated the data.
> + * move_existing_remaps() loops until the BG contains no
> + * remaps, so we can just return 0 in this case.
> + */
> + btrfs_release_path(path);
> + ret = 0;
> + goto end;
> + } else if (ret) {
> + goto end;
> + }
> +
> + ret = do_copy(fs_info, new_addr, dest_addr, dest_length);
> + if (ret)
> + goto end;
> +
> + /* Change data of old remap entry. */
> +
> + leaf = path->nodes[0];
> +
> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap);
> + btrfs_set_remap_address(leaf, remap_ptr, dest_addr);
> +
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + if (dest_length != length) {
> + key.offset = dest_length;
> + btrfs_set_item_key_safe(trans, path, &key);
> + }
> +
> + btrfs_release_path(path);
> +
> + if (dest_length != length) {
> + /* Add remap item for remainder. */
> +
> + ret = add_remap_item(trans, path, new_addr + dest_length,
> + length - dest_length,
> + old_addr + dest_length);
> + if (ret)
> + goto end;
> + }
> +
> + /* Change or remove old backref. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
> + path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + leaf = path->nodes[0];
> +
> + if (dest_length == length) {
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> + if (ret) {
> + btrfs_release_path(path);
> + goto end;
> + }
> + } else {
> + key.objectid += dest_length;
> + key.offset -= dest_length;
> + btrfs_set_item_key_safe(trans, path, &key);
> +
> + btrfs_set_stack_remap_address(&remap, old_addr + dest_length);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> + }
> +
> + btrfs_release_path(path);
> +
> + /* Add new backref. */
> +
> + ret = add_remap_backref_item(trans, path, dest_addr, dest_length,
> + old_addr);
> + if (ret)
> + goto end;
> +
> + adjust_block_group_remap_bytes(trans, bg, -dest_length);
> +
> + ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length);
> + if (ret)
> + goto end;
> +
> + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg, dest_length);
> +
> + mutex_lock(&dest_bg->free_space_lock);
> + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
> + &dest_bg->runtime_flags);
> + mutex_unlock(&dest_bg->free_space_lock);
> + btrfs_put_block_group(dest_bg);
> +
> + if (bg_needs_free_space) {
> + ret = btrfs_add_block_group_free_space(trans, dest_bg);
> + if (ret)
> + goto end;
> + }
> +
> + ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length);
> + if (ret) {
> + btrfs_remove_from_free_space_tree(trans, new_addr,
> + dest_length);
> + goto end;
> + }
> +
> + ret = 0;
> +
> +end:
> + if (mutex_taken)
> + mutex_unlock(&fs_info->remap_mutex);
> +
> + btrfs_dec_block_group_reservations(fs_info, dest_addr);
> +
> + if (ret) {
> + btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0);
> +
> + if (trans) {
> + btrfs_abort_transaction(trans, ret);
> + btrfs_end_transaction(trans);
> + }
> + } else {
> + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
> + btrfs_free_reserved_bytes(dest_bg, dest_length, 0);
> + btrfs_put_block_group(dest_bg);
> +
> + ret = btrfs_commit_transaction(trans);
> + }
> +
> + return ret;
> +}
> +
> +static int move_existing_remaps(struct btrfs_fs_info *fs_info,
> + struct btrfs_block_group *bg,
> + struct btrfs_path *path)
> +{
> + int ret;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + struct btrfs_remap *remap;
> + u64 old_addr;
> +
> + /* Look for backrefs in remap tree. */
> +
> + while (bg->remap_bytes > 0) {
> + key.objectid = bg->start;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = 0;
> +
> + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path,
> + 0, 0);
> + if (ret < 0)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(fs_info->remap_root, path);
> + if (ret < 0) {
> + btrfs_release_path(path);
> + return ret;
> + }
> +
> + if (ret) {
> + btrfs_release_path(path);
> + break;
> + }
> +
> + leaf = path->nodes[0];
> + }
> +
> + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
> +
> + if (key.type != BTRFS_REMAP_BACKREF_KEY) {
> + path->slots[0]++;
> +
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(fs_info->remap_root, path);
> + if (ret < 0) {
> + btrfs_release_path(path);
> + return ret;
> + }
> +
> + if (ret) {
> + btrfs_release_path(path);
> + break;
> + }
> +
> + leaf = path->nodes[0];
> + }
> + }
> +
> + remap = btrfs_item_ptr(leaf, path->slots[0],
> + struct btrfs_remap);
> +
> + old_addr = btrfs_remap_address(leaf, remap);
> +
> + btrfs_release_path(path);
> +
> + ret = move_existing_remap(fs_info, path, bg, key.objectid,
> + key.offset, old_addr);
> + if (ret)
> + return ret;
> + }
> +
> + BUG_ON(bg->remap_bytes > 0);
> +
> + return 0;
> +}
> +
> static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
> struct btrfs_path *path,
> struct btrfs_block_group *bg)
> @@ -4550,6 +5031,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> WARN_ON(ret && ret != -EAGAIN);
>
> if (should_relocate_using_remap_tree(bg)) {
> + if (bg->remap_bytes != 0) {
> + ret = move_existing_remaps(fs_info, bg, path);
> + if (ret)
> + goto out;
> + }
> +
> ret = start_block_group_remapping(fs_info, path, bg);
> } else {
> while (1) {
> --
> 2.51.0
>
next prev parent reply other threads:[~2025-11-12 5:41 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-11-10 17:14 [PATCH v5 00/16] Remap tree Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 01/16] btrfs: add definitions and constants for remap-tree Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 02/16] btrfs: add REMAP chunk type Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 03/16] btrfs: allow remapped chunks to have zero stripes Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 04/16] btrfs: remove remapped block groups from the free-space tree Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 05/16] btrfs: don't add metadata items for the remap tree to the extent tree Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 06/16] btrfs: add extended version of struct block_group_item Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 07/16] btrfs: allow mounting filesystems with remap-tree incompat flag Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 08/16] btrfs: redirect I/O for remapped block groups Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 09/16] btrfs: handle deletions from remapped block group Mark Harmstone
2025-11-12 5:14 ` Boris Burkov
2025-11-12 18:51 ` Mark Harmstone
2025-11-13 11:09 ` Mark Harmstone
2025-11-12 6:15 ` Boris Burkov
2025-11-10 17:14 ` [PATCH v5 10/16] btrfs: handle setting up relocation of block group with remap-tree Mark Harmstone
2025-11-12 5:35 ` Boris Burkov
2025-11-12 17:38 ` Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 11/16] btrfs: move existing remaps before relocating block group Mark Harmstone
2025-11-12 5:41 ` Boris Burkov [this message]
2025-11-10 17:14 ` [PATCH v5 12/16] btrfs: replace identity remaps with actual remaps when doing relocations Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 13/16] btrfs: add do_remap param to btrfs_discard_extent() Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 14/16] btrfs: allow balancing remap tree Mark Harmstone
2025-11-10 17:14 ` [PATCH v5 15/16] btrfs: handle discarding fully-remapped block groups Mark Harmstone
2025-11-12 5:55 ` Boris Burkov
2025-11-10 17:14 ` [PATCH v5 16/16] btrfs: populate fully_remapped_bgs_list on mount Mark Harmstone
2025-11-12 5:57 ` Boris Burkov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aRQeDyQVdcWN4kxF@devvm12410.ftw0.facebook.com \
--to=boris@bur.io \
--cc=linux-btrfs@vger.kernel.org \
--cc=mark@harmstone.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox