public inbox for linux-btrfs@vger.kernel.org
 help / color / mirror / Atom feed
From: Boris Burkov <boris@bur.io>
To: Mark Harmstone <mark@harmstone.com>
Cc: linux-btrfs@vger.kernel.org
Subject: Re: [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations
Date: Fri, 31 Oct 2025 17:09:05 -0700	[thread overview]
Message-ID: <aQVPoWsIvwLXWePB@devvm12410.ftw0.facebook.com> (raw)
In-Reply-To: <20251024181227.32228-13-mark@harmstone.com>

On Fri, Oct 24, 2025 at 07:12:13PM +0100, Mark Harmstone wrote:
> Add a function do_remap_tree_reloc(), which does the actual work of
> doing a relocation using the remap tree.
> 
> In a loop we call do_remap_tree_reloc_trans(), which searches for the
> first identity remap for the block group. We call btrfs_reserve_extent()
> to find space elsewhere for it, and read the data into memory and write
> it to the new location. We then carve out the identity remap and replace
> it with an actual remap, which points to the new location in which to
> look.
> 
> Once the last identity remap has been removed we call
> last_identity_remap_gone(), which, as with deletions, removes the
> chunk's stripes and device extents.
> 
> Signed-off-by: Mark Harmstone <mark@harmstone.com>

Want to be as clear as possible on the reservation fragmentation
stuff, but otherwise LGTM.

Reviewed-by: Boris Burkov <boris@bur.io>
> ---
>  fs/btrfs/relocation.c | 317 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 317 insertions(+)
> 
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index d31817379078..ebbc619be682 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -4640,6 +4640,61 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
>  	return ret;
>  }
>  
> +static int find_next_identity_remap(struct btrfs_trans_handle *trans,
> +				    struct btrfs_path *path, u64 bg_end,
> +				    u64 last_start, u64 *start,
> +				    u64 *length)
> +{
> +	int ret;
> +	struct btrfs_key key, found_key;
> +	struct btrfs_root *remap_root = trans->fs_info->remap_root;
> +	struct extent_buffer *leaf;
> +
> +	key.objectid = last_start;
> +	key.type = BTRFS_IDENTITY_REMAP_KEY;
> +	key.offset = 0;
> +
> +	ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0);
> +	if (ret < 0)
> +		goto out;
> +
> +	leaf = path->nodes[0];
> +	while (true) {
> +		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> +			ret = btrfs_next_leaf(remap_root, path);
> +
> +			if (ret != 0) {
> +				if (ret == 1)
> +					ret = -ENOENT;
> +				goto out;
> +			}
> +
> +			leaf = path->nodes[0];
> +		}
> +
> +		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> +
> +		if (found_key.objectid >= bg_end) {
> +			ret = -ENOENT;
> +			goto out;
> +		}
> +
> +		if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) {
> +			*start = found_key.objectid;
> +			*length = found_key.offset;
> +			ret = 0;
> +			goto out;
> +		}
> +
> +		path->slots[0]++;
> +	}
> +
> +out:
> +	btrfs_release_path(path);
> +
> +	return ret;
> +}
> +
>  static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
>  				struct btrfs_chunk_map *chunk,
>  				struct btrfs_path *path)
> @@ -4753,6 +4808,96 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
>  		btrfs_mark_bg_fully_remapped(bg, trans);
>  }
>  
> +static int add_remap_entry(struct btrfs_trans_handle *trans,
> +			   struct btrfs_path *path,
> +			   struct btrfs_block_group *src_bg, u64 old_addr,
> +			   u64 new_addr, u64 length)
> +{
> +	struct btrfs_fs_info *fs_info = trans->fs_info;
> +	struct btrfs_key key, new_key;
> +	int ret;
> +	int identity_count_delta = 0;
> +
> +	key.objectid = old_addr;
> +	key.type = (u8)-1;
> +	key.offset = (u64)-1;
> +
> +	ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
> +	if (ret < 0)
> +		goto end;
> +
> +	if (path->slots[0] == 0) {
> +		ret = -ENOENT;
> +		goto end;
> +	}
> +
> +	path->slots[0]--;
> +
> +	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
> +
> +	if (key.type != BTRFS_IDENTITY_REMAP_KEY ||
> +	    key.objectid > old_addr ||
> +	    key.objectid + key.offset <= old_addr) {
> +		ret = -ENOENT;
> +		goto end;
> +	}
> +
> +	/* Shorten or delete identity mapping entry. */
> +
> +	if (key.objectid == old_addr) {
> +		ret = btrfs_del_item(trans, fs_info->remap_root, path);
> +		if (ret)
> +			goto end;
> +
> +		identity_count_delta--;
> +	} else {
> +		new_key.objectid = key.objectid;
> +		new_key.type = BTRFS_IDENTITY_REMAP_KEY;
> +		new_key.offset = old_addr - key.objectid;
> +
> +		btrfs_set_item_key_safe(trans, path, &new_key);
> +	}
> +
> +	btrfs_release_path(path);
> +
> +	/* Create new remap entry. */
> +
> +	ret = add_remap_item(trans, path, new_addr, length, old_addr);
> +	if (ret)
> +		goto end;
> +
> +	/* Add entry for remainder of identity mapping, if necessary. */
> +
> +	if (key.objectid + key.offset != old_addr + length) {
> +		new_key.objectid = old_addr + length;
> +		new_key.type = BTRFS_IDENTITY_REMAP_KEY;
> +		new_key.offset = key.objectid + key.offset - old_addr - length;
> +
> +		ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> +					      path, &new_key, 0);
> +		if (ret)
> +			goto end;
> +
> +		btrfs_release_path(path);
> +
> +		identity_count_delta++;
> +	}
> +
> +	/* Add backref. */
> +
> +	ret = add_remap_backref_item(trans, path, new_addr, length, old_addr);
> +	if (ret)
> +		goto end;
> +
> +	if (identity_count_delta != 0)
> +		adjust_identity_remap_count(trans, src_bg, identity_count_delta);
> +
> +end:
> +	btrfs_release_path(path);
> +
> +	return ret;
> +}
> +
>  static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
>  			       struct btrfs_path *path, uint64_t start)
>  {
> @@ -4802,6 +4947,169 @@ static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
>  	return ret;
>  }
>  
> +static int do_remap_tree_reloc_trans(struct btrfs_fs_info *fs_info,
> +				     struct btrfs_block_group *src_bg,
> +				     struct btrfs_path *path, u64 *last_start)
> +{
> +	struct btrfs_trans_handle *trans;
> +	struct btrfs_root *extent_root;
> +	struct btrfs_key ins;
> +	struct btrfs_block_group *dest_bg = NULL;
> +	u64 start, remap_length, length, new_addr, min_size;
> +	int ret;
> +	bool no_more = false;
> +	bool is_data = src_bg->flags & BTRFS_BLOCK_GROUP_DATA;
> +	bool made_reservation = false, bg_needs_free_space;
> +	struct btrfs_space_info *sinfo = src_bg->space_info;
> +
> +	extent_root = btrfs_extent_root(fs_info, src_bg->start);
> +
> +	trans = btrfs_start_transaction(extent_root, 0);
> +	if (IS_ERR(trans))
> +		return PTR_ERR(trans);
> +
> +	mutex_lock(&fs_info->remap_mutex);
> +
> +	ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length,
> +				       *last_start, &start, &remap_length);
> +	if (ret == -ENOENT) {
> +		no_more = true;
> +		goto next;
> +	} else if (ret) {
> +		mutex_unlock(&fs_info->remap_mutex);
> +		btrfs_end_transaction(trans);
> +		return ret;
> +	}
> +
> +	/* Try to reserve enough space for block. */
> +
> +	spin_lock(&sinfo->lock);
> +	btrfs_space_info_update_bytes_may_use(sinfo, remap_length);
> +	spin_unlock(&sinfo->lock);
> +
> +	if (is_data)
> +		min_size = fs_info->sectorsize;
> +	else
> +		min_size = fs_info->nodesize;

As Qu mentioned, I think it makes sense to not change too much at once
and not add the extra fragmentation factor baked in with remap tree.
This isn't a format change so we can change it later if we have data
about lots of failing relocations to support that.

On the other hand, we are going contiguous non-free at a time rather
than extent at a time, so maybe this is actually quite necessary.

Let's document / highlight that if it's the reasoning.

> +
> +	ret = btrfs_reserve_extent(fs_info->fs_root, remap_length,
> +				   remap_length, min_size,
> +				   0, 0, &ins, is_data, false);
> +	if (ret) {
> +		spin_lock(&sinfo->lock);
> +		btrfs_space_info_update_bytes_may_use(sinfo, -remap_length);
> +		spin_unlock(&sinfo->lock);
> +
> +		mutex_unlock(&fs_info->remap_mutex);
> +		btrfs_end_transaction(trans);
> +		return ret;
> +	}
> +
> +	made_reservation = true;
> +
> +	new_addr = ins.objectid;
> +	length = ins.offset;
> +
> +	if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) {
> +		u64 new_length = ALIGN_DOWN(length, fs_info->nodesize);
> +
> +		btrfs_free_reserved_extent(fs_info, new_addr + new_length,
> +					   length - new_length, 0);
> +
> +		length = new_length;
> +	}
> +
> +	dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
> +
> +	mutex_lock(&dest_bg->free_space_lock);
> +	bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
> +				       &dest_bg->runtime_flags);
> +	mutex_unlock(&dest_bg->free_space_lock);
> +
> +	if (bg_needs_free_space) {
> +		ret = btrfs_add_block_group_free_space(trans, dest_bg);
> +		if (ret)
> +			goto fail;
> +	}
> +
> +	ret = do_copy(fs_info, start, new_addr, length);
> +	if (ret)
> +		goto fail;
> +
> +	ret = btrfs_remove_from_free_space_tree(trans, new_addr, length);
> +	if (ret)
> +		goto fail;
> +
> +	ret = add_remap_entry(trans, path, src_bg, start, new_addr, length);
> +	if (ret) {
> +		btrfs_add_to_free_space_tree(trans, new_addr, length);
> +		goto fail;
> +	}
> +
> +	adjust_block_group_remap_bytes(trans, dest_bg, length);
> +	btrfs_free_reserved_bytes(dest_bg, length, 0);
> +
> +	spin_lock(&sinfo->lock);
> +	sinfo->bytes_readonly += length;
> +	spin_unlock(&sinfo->lock);
> +
> +next:
> +	if (dest_bg)
> +		btrfs_put_block_group(dest_bg);
> +
> +	if (made_reservation)
> +		btrfs_dec_block_group_reservations(fs_info, new_addr);
> +
> +	mutex_unlock(&fs_info->remap_mutex);
> +
> +	if (src_bg->identity_remap_count == 0)
> +		btrfs_mark_bg_fully_remapped(src_bg, trans);
> +
> +	ret = btrfs_end_transaction(trans);
> +	if (ret)
> +		return ret;
> +
> +	if (no_more)
> +		return 1;
> +
> +	*last_start = start;
> +
> +	return 0;
> +
> +fail:
> +	if (dest_bg)
> +		btrfs_put_block_group(dest_bg);
> +
> +	btrfs_free_reserved_extent(fs_info, new_addr, length, 0);
> +
> +	mutex_unlock(&fs_info->remap_mutex);
> +	btrfs_end_transaction(trans);
> +
> +	return ret;
> +}
> +
> +static int do_remap_tree_reloc(struct btrfs_fs_info *fs_info,
> +			       struct btrfs_path *path,
> +			       struct btrfs_block_group *bg)
> +{
> +	u64 last_start;
> +	int ret;
> +
> +	last_start = bg->start;
> +
> +	while (true) {
> +		ret = do_remap_tree_reloc_trans(fs_info, bg, path,
> +						&last_start);
> +		if (ret) {
> +			if (ret == 1)
> +				ret = 0;
> +			break;
> +		}
> +	}
> +
> +	return ret;
> +}
> +
>  int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
>  			  u64 *length)
>  {
> @@ -5046,6 +5354,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>  		}
>  
>  		ret = start_block_group_remapping(fs_info, path, bg);
> +		if (ret)
> +			goto out;
> +
> +		ret = do_remap_tree_reloc(fs_info, path, rc->block_group);
> +		if (ret)
> +			goto out;
> +
> +		btrfs_delete_unused_bgs(fs_info);
> +
>  		goto out;
>  	}
>  
> -- 
> 2.49.1
> 

  reply	other threads:[~2025-11-01  0:09 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 01/16] btrfs: add definitions and constants for remap-tree Mark Harmstone
2025-10-31 22:50   ` Boris Burkov
2025-11-03 12:18     ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 02/16] btrfs: add REMAP chunk type Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 03/16] btrfs: allow remapped chunks to have zero stripes Mark Harmstone
2025-10-31 21:39   ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 04/16] btrfs: remove remapped block groups from the free-space tree Mark Harmstone
2025-10-31 21:44   ` Boris Burkov
2025-11-03 12:39     ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 05/16] btrfs: don't add metadata items for the remap tree to the extent tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 06/16] btrfs: add extended version of struct block_group_item Mark Harmstone
2025-10-31 21:47   ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 07/16] btrfs: allow mounting filesystems with remap-tree incompat flag Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 08/16] btrfs: redirect I/O for remapped block groups Mark Harmstone
2025-10-31 22:03   ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 09/16] btrfs: handle deletions from remapped block group Mark Harmstone
2025-10-31 23:05   ` Boris Burkov
2025-11-03 15:51     ` Mark Harmstone
2025-10-31 23:30   ` Boris Burkov
2025-11-04 12:30     ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 10/16] btrfs: handle setting up relocation of block group with remap-tree Mark Harmstone
2025-10-31 23:43   ` Boris Burkov
2025-11-03 18:45     ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 11/16] btrfs: move existing remaps before relocating block group Mark Harmstone
2025-11-01  0:02   ` Boris Burkov
2025-11-04 13:00     ` Mark Harmstone
2025-11-01  0:10   ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations Mark Harmstone
2025-11-01  0:09   ` Boris Burkov [this message]
2025-11-04 14:31     ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 13/16] btrfs: add do_remap param to btrfs_discard_extent() Mark Harmstone
2025-11-01  0:12   ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 14/16] btrfs: allow balancing remap tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups Mark Harmstone
2025-10-27 16:04   ` kernel test robot
2025-10-31 22:12     ` Boris Burkov
2025-11-03 16:49       ` Mark Harmstone
2025-11-09  8:42         ` Philip Li
2025-10-31 22:11   ` Boris Burkov
2025-11-03 17:01     ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 16/16] btrfs: add stripe removal pending flag Mark Harmstone

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aQVPoWsIvwLXWePB@devvm12410.ftw0.facebook.com \
    --to=boris@bur.io \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=mark@harmstone.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox