From: Boris Burkov <boris@bur.io>
To: Mark Harmstone <mark@harmstone.com>
Cc: linux-btrfs@vger.kernel.org
Subject: Re: [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations
Date: Fri, 31 Oct 2025 17:09:05 -0700 [thread overview]
Message-ID: <aQVPoWsIvwLXWePB@devvm12410.ftw0.facebook.com> (raw)
In-Reply-To: <20251024181227.32228-13-mark@harmstone.com>
On Fri, Oct 24, 2025 at 07:12:13PM +0100, Mark Harmstone wrote:
> Add a function do_remap_tree_reloc(), which does the actual work of
> doing a relocation using the remap tree.
>
> In a loop we call do_remap_tree_reloc_trans(), which searches for the
> first identity remap for the block group. We call btrfs_reserve_extent()
> to find space elsewhere for it, and read the data into memory and write
> it to the new location. We then carve out the identity remap and replace
> it with an actual remap, which points to the new location in which to
> look.
>
> Once the last identity remap has been removed we call
> last_identity_remap_gone(), which, as with deletions, removes the
> chunk's stripes and device extents.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
I want to be as clear as possible on the reservation fragmentation
issue, but otherwise LGTM.
Reviewed-by: Boris Burkov <boris@bur.io>
> ---
> fs/btrfs/relocation.c | 317 ++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 317 insertions(+)
>
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index d31817379078..ebbc619be682 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -4640,6 +4640,61 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> +static int find_next_identity_remap(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, u64 bg_end,
> + u64 last_start, u64 *start,
> + u64 *length)
> +{
> + int ret;
> + struct btrfs_key key, found_key;
> + struct btrfs_root *remap_root = trans->fs_info->remap_root;
> + struct extent_buffer *leaf;
> +
> + key.objectid = last_start;
> + key.type = BTRFS_IDENTITY_REMAP_KEY;
> + key.offset = 0;
> +
> + ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0);
> + if (ret < 0)
> + goto out;
> +
> + leaf = path->nodes[0];
> + while (true) {
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(remap_root, path);
> +
> + if (ret != 0) {
> + if (ret == 1)
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + leaf = path->nodes[0];
> + }
> +
> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> +
> + if (found_key.objectid >= bg_end) {
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) {
> + *start = found_key.objectid;
> + *length = found_key.offset;
> + ret = 0;
> + goto out;
> + }
> +
> + path->slots[0]++;
> + }
> +
> +out:
> + btrfs_release_path(path);
> +
> + return ret;
> +}
> +
> static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
> struct btrfs_chunk_map *chunk,
> struct btrfs_path *path)
> @@ -4753,6 +4808,96 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
> btrfs_mark_bg_fully_remapped(bg, trans);
> }
>
> +static int add_remap_entry(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + struct btrfs_block_group *src_bg, u64 old_addr,
> + u64 new_addr, u64 length)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_key key, new_key;
> + int ret;
> + int identity_count_delta = 0;
> +
> + key.objectid = old_addr;
> + key.type = (u8)-1;
> + key.offset = (u64)-1;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
> + if (ret < 0)
> + goto end;
> +
> + if (path->slots[0] == 0) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + path->slots[0]--;
> +
> + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
> +
> + if (key.type != BTRFS_IDENTITY_REMAP_KEY ||
> + key.objectid > old_addr ||
> + key.objectid + key.offset <= old_addr) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + /* Shorten or delete identity mapping entry. */
> +
> + if (key.objectid == old_addr) {
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> + if (ret)
> + goto end;
> +
> + identity_count_delta--;
> + } else {
> + new_key.objectid = key.objectid;
> + new_key.type = BTRFS_IDENTITY_REMAP_KEY;
> + new_key.offset = old_addr - key.objectid;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + }
> +
> + btrfs_release_path(path);
> +
> + /* Create new remap entry. */
> +
> + ret = add_remap_item(trans, path, new_addr, length, old_addr);
> + if (ret)
> + goto end;
> +
> + /* Add entry for remainder of identity mapping, if necessary. */
> +
> + if (key.objectid + key.offset != old_addr + length) {
> + new_key.objectid = old_addr + length;
> + new_key.type = BTRFS_IDENTITY_REMAP_KEY;
> + new_key.offset = key.objectid + key.offset - old_addr - length;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &new_key, 0);
> + if (ret)
> + goto end;
> +
> + btrfs_release_path(path);
> +
> + identity_count_delta++;
> + }
> +
> + /* Add backref. */
> +
> + ret = add_remap_backref_item(trans, path, new_addr, length, old_addr);
> + if (ret)
> + goto end;
> +
> + if (identity_count_delta != 0)
> + adjust_identity_remap_count(trans, src_bg, identity_count_delta);
> +
> +end:
> + btrfs_release_path(path);
> +
> + return ret;
> +}
> +
> static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
> struct btrfs_path *path, uint64_t start)
> {
> @@ -4802,6 +4947,169 @@ static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> +static int do_remap_tree_reloc_trans(struct btrfs_fs_info *fs_info,
> + struct btrfs_block_group *src_bg,
> + struct btrfs_path *path, u64 *last_start)
> +{
> + struct btrfs_trans_handle *trans;
> + struct btrfs_root *extent_root;
> + struct btrfs_key ins;
> + struct btrfs_block_group *dest_bg = NULL;
> + u64 start, remap_length, length, new_addr, min_size;
> + int ret;
> + bool no_more = false;
> + bool is_data = src_bg->flags & BTRFS_BLOCK_GROUP_DATA;
> + bool made_reservation = false, bg_needs_free_space;
> + struct btrfs_space_info *sinfo = src_bg->space_info;
> +
> + extent_root = btrfs_extent_root(fs_info, src_bg->start);
> +
> + trans = btrfs_start_transaction(extent_root, 0);
> + if (IS_ERR(trans))
> + return PTR_ERR(trans);
> +
> + mutex_lock(&fs_info->remap_mutex);
> +
> + ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length,
> + *last_start, &start, &remap_length);
> + if (ret == -ENOENT) {
> + no_more = true;
> + goto next;
> + } else if (ret) {
> + mutex_unlock(&fs_info->remap_mutex);
> + btrfs_end_transaction(trans);
> + return ret;
> + }
> +
> + /* Try to reserve enough space for block. */
> +
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, remap_length);
> + spin_unlock(&sinfo->lock);
> +
> + if (is_data)
> + min_size = fs_info->sectorsize;
> + else
> + min_size = fs_info->nodesize;
As Qu mentioned, I think it makes sense not to change too much at once,
and not to bake the extra fragmentation factor into the remap tree.
This isn't an on-disk format change, so we can revisit it later if we
gather data showing lots of failing relocations that would justify it.
On the other hand, since we are relocating one contiguous non-free
region at a time rather than one extent at a time, maybe this is
actually quite necessary. If that is the reasoning, let's document /
highlight it.
> +
> + ret = btrfs_reserve_extent(fs_info->fs_root, remap_length,
> + remap_length, min_size,
> + 0, 0, &ins, is_data, false);
> + if (ret) {
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, -remap_length);
> + spin_unlock(&sinfo->lock);
> +
> + mutex_unlock(&fs_info->remap_mutex);
> + btrfs_end_transaction(trans);
> + return ret;
> + }
> +
> + made_reservation = true;
> +
> + new_addr = ins.objectid;
> + length = ins.offset;
> +
> + if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) {
> + u64 new_length = ALIGN_DOWN(length, fs_info->nodesize);
> +
> + btrfs_free_reserved_extent(fs_info, new_addr + new_length,
> + length - new_length, 0);
> +
> + length = new_length;
> + }
> +
> + dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
> +
> + mutex_lock(&dest_bg->free_space_lock);
> + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
> + &dest_bg->runtime_flags);
> + mutex_unlock(&dest_bg->free_space_lock);
> +
> + if (bg_needs_free_space) {
> + ret = btrfs_add_block_group_free_space(trans, dest_bg);
> + if (ret)
> + goto fail;
> + }
> +
> + ret = do_copy(fs_info, start, new_addr, length);
> + if (ret)
> + goto fail;
> +
> + ret = btrfs_remove_from_free_space_tree(trans, new_addr, length);
> + if (ret)
> + goto fail;
> +
> + ret = add_remap_entry(trans, path, src_bg, start, new_addr, length);
> + if (ret) {
> + btrfs_add_to_free_space_tree(trans, new_addr, length);
> + goto fail;
> + }
> +
> + adjust_block_group_remap_bytes(trans, dest_bg, length);
> + btrfs_free_reserved_bytes(dest_bg, length, 0);
> +
> + spin_lock(&sinfo->lock);
> + sinfo->bytes_readonly += length;
> + spin_unlock(&sinfo->lock);
> +
> +next:
> + if (dest_bg)
> + btrfs_put_block_group(dest_bg);
> +
> + if (made_reservation)
> + btrfs_dec_block_group_reservations(fs_info, new_addr);
> +
> + mutex_unlock(&fs_info->remap_mutex);
> +
> + if (src_bg->identity_remap_count == 0)
> + btrfs_mark_bg_fully_remapped(src_bg, trans);
> +
> + ret = btrfs_end_transaction(trans);
> + if (ret)
> + return ret;
> +
> + if (no_more)
> + return 1;
> +
> + *last_start = start;
> +
> + return 0;
> +
> +fail:
> + if (dest_bg)
> + btrfs_put_block_group(dest_bg);
> +
> + btrfs_free_reserved_extent(fs_info, new_addr, length, 0);
> +
> + mutex_unlock(&fs_info->remap_mutex);
> + btrfs_end_transaction(trans);
> +
> + return ret;
> +}
> +
> +static int do_remap_tree_reloc(struct btrfs_fs_info *fs_info,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg)
> +{
> + u64 last_start;
> + int ret;
> +
> + last_start = bg->start;
> +
> + while (true) {
> + ret = do_remap_tree_reloc_trans(fs_info, bg, path,
> + &last_start);
> + if (ret) {
> + if (ret == 1)
> + ret = 0;
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> u64 *length)
> {
> @@ -5046,6 +5354,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> }
>
> ret = start_block_group_remapping(fs_info, path, bg);
> + if (ret)
> + goto out;
> +
> + ret = do_remap_tree_reloc(fs_info, path, rc->block_group);
> + if (ret)
> + goto out;
> +
> + btrfs_delete_unused_bgs(fs_info);
> +
> goto out;
> }
>
> --
> 2.49.1
>
next prev parent reply other threads:[~2025-11-01 0:09 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 01/16] btrfs: add definitions and constants for remap-tree Mark Harmstone
2025-10-31 22:50 ` Boris Burkov
2025-11-03 12:18 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 02/16] btrfs: add REMAP chunk type Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 03/16] btrfs: allow remapped chunks to have zero stripes Mark Harmstone
2025-10-31 21:39 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 04/16] btrfs: remove remapped block groups from the free-space tree Mark Harmstone
2025-10-31 21:44 ` Boris Burkov
2025-11-03 12:39 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 05/16] btrfs: don't add metadata items for the remap tree to the extent tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 06/16] btrfs: add extended version of struct block_group_item Mark Harmstone
2025-10-31 21:47 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 07/16] btrfs: allow mounting filesystems with remap-tree incompat flag Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 08/16] btrfs: redirect I/O for remapped block groups Mark Harmstone
2025-10-31 22:03 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 09/16] btrfs: handle deletions from remapped block group Mark Harmstone
2025-10-31 23:05 ` Boris Burkov
2025-11-03 15:51 ` Mark Harmstone
2025-10-31 23:30 ` Boris Burkov
2025-11-04 12:30 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 10/16] btrfs: handle setting up relocation of block group with remap-tree Mark Harmstone
2025-10-31 23:43 ` Boris Burkov
2025-11-03 18:45 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 11/16] btrfs: move existing remaps before relocating block group Mark Harmstone
2025-11-01 0:02 ` Boris Burkov
2025-11-04 13:00 ` Mark Harmstone
2025-11-01 0:10 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations Mark Harmstone
2025-11-01 0:09 ` Boris Burkov [this message]
2025-11-04 14:31 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 13/16] btrfs: add do_remap param to btrfs_discard_extent() Mark Harmstone
2025-11-01 0:12 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 14/16] btrfs: allow balancing remap tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups Mark Harmstone
2025-10-27 16:04 ` kernel test robot
2025-10-31 22:12 ` Boris Burkov
2025-11-03 16:49 ` Mark Harmstone
2025-11-09 8:42 ` Philip Li
2025-10-31 22:11 ` Boris Burkov
2025-11-03 17:01 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 16/16] btrfs: add stripe removal pending flag Mark Harmstone
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aQVPoWsIvwLXWePB@devvm12410.ftw0.facebook.com \
--to=boris@bur.io \
--cc=linux-btrfs@vger.kernel.org \
--cc=mark@harmstone.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox