* [PATCH v4 01/16] btrfs: add definitions and constants for remap-tree
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-31 22:50 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 02/16] btrfs: add REMAP chunk type Mark Harmstone
` (14 subsequent siblings)
15 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone, Boris Burkov
Add an incompat flag for the new remap-tree feature, and the constants
and definitions needed to support it.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
---
fs/btrfs/accessors.h | 3 +++
fs/btrfs/locking.c | 1 +
fs/btrfs/sysfs.c | 2 ++
fs/btrfs/tree-checker.c | 6 ++----
fs/btrfs/tree-checker.h | 5 +++++
fs/btrfs/volumes.c | 1 +
include/uapi/linux/btrfs.h | 1 +
include/uapi/linux/btrfs_tree.h | 12 ++++++++++++
8 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 99b3ced12805..95a1ca8c099b 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -1009,6 +1009,9 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
struct btrfs_verity_descriptor_item, size, 64);
+BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap, address, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap, address, 64);
+
/* Cast into the data area of the leaf. */
#define btrfs_item_ptr(leaf, slot, type) \
((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 0035851d72b0..726e4d70f37c 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset {
{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
+ { .id = BTRFS_REMAP_TREE_OBJECTID, DEFINE_NAME("remap-tree") },
{ .id = 0, DEFINE_NAME("tree") },
};
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index d66681ce2b3d..9e1ba524d26a 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -291,6 +291,7 @@ BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
+BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE);
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
#endif
@@ -325,6 +326,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
BTRFS_FEAT_ATTR_PTR(raid1c34),
BTRFS_FEAT_ATTR_PTR(block_group_tree),
BTRFS_FEAT_ATTR_PTR(simple_quota),
+ BTRFS_FEAT_ATTR_PTR(remap_tree),
#ifdef CONFIG_BLK_DEV_ZONED
BTRFS_FEAT_ATTR_PTR(zoned),
#endif
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 5684750ca7a6..af9a26844113 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -913,12 +913,10 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
length, btrfs_stripe_nr_to_offset(U32_MAX));
return -EUCLEAN;
}
- if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
- BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
+ if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) {
chunk_err(fs_info, leaf, chunk, logical,
"unrecognized chunk type: 0x%llx",
- ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
- BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);
+ type & ~BTRFS_BLOCK_GROUP_VALID);
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index eb201f4ec3c7..833e2fd989eb 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -57,6 +57,11 @@ enum btrfs_tree_block_status {
BTRFS_TREE_BLOCK_WRITTEN_NOT_SET,
};
+
+#define BTRFS_BLOCK_GROUP_VALID (BTRFS_BLOCK_GROUP_TYPE_MASK | \
+ BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+ BTRFS_BLOCK_GROUP_REMAPPED)
+
/*
* Exported simply for btrfs-progs which wants to have the
* btrfs_tree_block_status return codes.
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 45d89b12025b..63e5a17f96f9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -231,6 +231,7 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped");
DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index f40b300bd664..0763a23aeebc 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args {
#define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13)
#define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14)
#define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16)
+#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17)
struct btrfs_ioctl_feature_flags {
__u64 compat_flags;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index fc29d273845d..4439d77a7252 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -76,6 +76,9 @@
/* Tracks RAID stripes in block groups. */
#define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
+/* Holds details of remapped addresses after relocation. */
+#define BTRFS_REMAP_TREE_OBJECTID 13ULL
+
/* device stats in the device tree */
#define BTRFS_DEV_STATS_OBJECTID 0ULL
@@ -282,6 +285,10 @@
#define BTRFS_RAID_STRIPE_KEY 230
+#define BTRFS_IDENTITY_REMAP_KEY 234
+#define BTRFS_REMAP_KEY 235
+#define BTRFS_REMAP_BACKREF_KEY 236
+
/*
* Records the overall state of the qgroups.
* There's only one instance of this key present,
@@ -1161,6 +1168,7 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
#define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9)
#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10)
+#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11)
#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
BTRFS_SPACE_INFO_GLOBAL_RSV)
@@ -1323,4 +1331,8 @@ struct btrfs_verity_descriptor_item {
__u8 encryption;
} __attribute__ ((__packed__));
+struct btrfs_remap {
+ __le64 address;
+} __attribute__ ((__packed__));
+
#endif /* _BTRFS_CTREE_H_ */
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread

* Re: [PATCH v4 01/16] btrfs: add definitions and constants for remap-tree
2025-10-24 18:12 ` [PATCH v4 01/16] btrfs: add definitions and constants for remap-tree Mark Harmstone
@ 2025-10-31 22:50 ` Boris Burkov
2025-11-03 12:18 ` Mark Harmstone
0 siblings, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 22:50 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:02PM +0100, Mark Harmstone wrote:
> Add an incompat flag for the new remap-tree feature, and the constants
> and definitions needed to support it.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> Reviewed-by: Boris Burkov <boris@bur.io>
> ---
> fs/btrfs/accessors.h | 3 +++
> fs/btrfs/locking.c | 1 +
> fs/btrfs/sysfs.c | 2 ++
> fs/btrfs/tree-checker.c | 6 ++----
> fs/btrfs/tree-checker.h | 5 +++++
> fs/btrfs/volumes.c | 1 +
> include/uapi/linux/btrfs.h | 1 +
> include/uapi/linux/btrfs_tree.h | 12 ++++++++++++
> 8 files changed, 27 insertions(+), 4 deletions(-)
>
> diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
> index 99b3ced12805..95a1ca8c099b 100644
> --- a/fs/btrfs/accessors.h
> +++ b/fs/btrfs/accessors.h
> @@ -1009,6 +1009,9 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
> BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
> struct btrfs_verity_descriptor_item, size, 64);
>
> +BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap, address, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap, address, 64);
> +
> /* Cast into the data area of the leaf. */
> #define btrfs_item_ptr(leaf, slot, type) \
> ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
> index 0035851d72b0..726e4d70f37c 100644
> --- a/fs/btrfs/locking.c
> +++ b/fs/btrfs/locking.c
> @@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset {
> { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
> { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
> { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
> + { .id = BTRFS_REMAP_TREE_OBJECTID, DEFINE_NAME("remap-tree") },
> { .id = 0, DEFINE_NAME("tree") },
> };
>
> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
> index d66681ce2b3d..9e1ba524d26a 100644
> --- a/fs/btrfs/sysfs.c
> +++ b/fs/btrfs/sysfs.c
> @@ -291,6 +291,7 @@ BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
> BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
> BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
> BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
> +BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE);
> #ifdef CONFIG_BLK_DEV_ZONED
> BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
> #endif
> @@ -325,6 +326,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
> BTRFS_FEAT_ATTR_PTR(raid1c34),
> BTRFS_FEAT_ATTR_PTR(block_group_tree),
> BTRFS_FEAT_ATTR_PTR(simple_quota),
> + BTRFS_FEAT_ATTR_PTR(remap_tree),
> #ifdef CONFIG_BLK_DEV_ZONED
> BTRFS_FEAT_ATTR_PTR(zoned),
> #endif
> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
> index 5684750ca7a6..af9a26844113 100644
> --- a/fs/btrfs/tree-checker.c
> +++ b/fs/btrfs/tree-checker.c
> @@ -913,12 +913,10 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
> length, btrfs_stripe_nr_to_offset(U32_MAX));
> return -EUCLEAN;
> }
> - if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
> - BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
> + if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) {
> chunk_err(fs_info, leaf, chunk, logical,
> "unrecognized chunk type: 0x%llx",
> - ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
> - BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);
> + type & ~BTRFS_BLOCK_GROUP_VALID);
> return -EUCLEAN;
> }
>
> diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
> index eb201f4ec3c7..833e2fd989eb 100644
> --- a/fs/btrfs/tree-checker.h
> +++ b/fs/btrfs/tree-checker.h
> @@ -57,6 +57,11 @@ enum btrfs_tree_block_status {
> BTRFS_TREE_BLOCK_WRITTEN_NOT_SET,
> };
>
> +
> +#define BTRFS_BLOCK_GROUP_VALID (BTRFS_BLOCK_GROUP_TYPE_MASK | \
> + BTRFS_BLOCK_GROUP_PROFILE_MASK | \
> + BTRFS_BLOCK_GROUP_REMAPPED)
> +
> /*
> * Exported simply for btrfs-progs which wants to have the
> * btrfs_tree_block_status return codes.
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 45d89b12025b..63e5a17f96f9 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -231,6 +231,7 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
> DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
> DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
> DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
> + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped");
>
> DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
> for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index f40b300bd664..0763a23aeebc 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args {
> #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13)
> #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14)
> #define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16)
> +#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17)
>
> struct btrfs_ioctl_feature_flags {
> __u64 compat_flags;
> diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
> index fc29d273845d..4439d77a7252 100644
> --- a/include/uapi/linux/btrfs_tree.h
> +++ b/include/uapi/linux/btrfs_tree.h
> @@ -76,6 +76,9 @@
> /* Tracks RAID stripes in block groups. */
> #define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
>
> +/* Holds details of remapped addresses after relocation. */
> +#define BTRFS_REMAP_TREE_OBJECTID 13ULL
> +
> /* device stats in the device tree */
> #define BTRFS_DEV_STATS_OBJECTID 0ULL
>
> @@ -282,6 +285,10 @@
>
> #define BTRFS_RAID_STRIPE_KEY 230
>
> +#define BTRFS_IDENTITY_REMAP_KEY 234
> +#define BTRFS_REMAP_KEY 235
> +#define BTRFS_REMAP_BACKREF_KEY 236
> +
> /*
> * Records the overall state of the qgroups.
> * There's only one instance of this key present,
> @@ -1161,6 +1168,7 @@ struct btrfs_dev_replace_item {
> #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
> #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9)
> #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10)
> +#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11)
> #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
> BTRFS_SPACE_INFO_GLOBAL_RSV)
>
> @@ -1323,4 +1331,8 @@ struct btrfs_verity_descriptor_item {
> __u8 encryption;
> } __attribute__ ((__packed__));
>
> +struct btrfs_remap {
> + __le64 address;
> +} __attribute__ ((__packed__));
> +
nit: I think this should probably be btrfs_remap_item not btrfs_remap,
though I don't think that convention is totally universal.
> #endif /* _BTRFS_CTREE_H_ */
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread

* Re: [PATCH v4 01/16] btrfs: add definitions and constants for remap-tree
2025-10-31 22:50 ` Boris Burkov
@ 2025-11-03 12:18 ` Mark Harmstone
0 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-11-03 12:18 UTC (permalink / raw)
To: Boris Burkov; +Cc: linux-btrfs
On 31/10/2025 10.50 pm, Boris Burkov wrote:
> On Fri, Oct 24, 2025 at 07:12:02PM +0100, Mark Harmstone wrote:
>> Add an incompat flag for the new remap-tree feature, and the constants
>> and definitions needed to support it.
>>
>> Signed-off-by: Mark Harmstone <mark@harmstone.com>
>> Reviewed-by: Boris Burkov <boris@bur.io>
>> ---
>> fs/btrfs/accessors.h | 3 +++
>> fs/btrfs/locking.c | 1 +
>> fs/btrfs/sysfs.c | 2 ++
>> fs/btrfs/tree-checker.c | 6 ++----
>> fs/btrfs/tree-checker.h | 5 +++++
>> fs/btrfs/volumes.c | 1 +
>> include/uapi/linux/btrfs.h | 1 +
>> include/uapi/linux/btrfs_tree.h | 12 ++++++++++++
>> 8 files changed, 27 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
>> index 99b3ced12805..95a1ca8c099b 100644
>> --- a/fs/btrfs/accessors.h
>> +++ b/fs/btrfs/accessors.h
>> @@ -1009,6 +1009,9 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption,
>> BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size,
>> struct btrfs_verity_descriptor_item, size, 64);
>>
>> +BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap, address, 64);
>> +BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap, address, 64);
>> +
>> /* Cast into the data area of the leaf. */
>> #define btrfs_item_ptr(leaf, slot, type) \
>> ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot)))
>> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
>> index 0035851d72b0..726e4d70f37c 100644
>> --- a/fs/btrfs/locking.c
>> +++ b/fs/btrfs/locking.c
>> @@ -73,6 +73,7 @@ static struct btrfs_lockdep_keyset {
>> { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") },
>> { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
>> { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
>> + { .id = BTRFS_REMAP_TREE_OBJECTID, DEFINE_NAME("remap-tree") },
>> { .id = 0, DEFINE_NAME("tree") },
>> };
>>
>> diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
>> index d66681ce2b3d..9e1ba524d26a 100644
>> --- a/fs/btrfs/sysfs.c
>> +++ b/fs/btrfs/sysfs.c
>> @@ -291,6 +291,7 @@ BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
>> BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE);
>> BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34);
>> BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA);
>> +BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE);
>> #ifdef CONFIG_BLK_DEV_ZONED
>> BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED);
>> #endif
>> @@ -325,6 +326,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
>> BTRFS_FEAT_ATTR_PTR(raid1c34),
>> BTRFS_FEAT_ATTR_PTR(block_group_tree),
>> BTRFS_FEAT_ATTR_PTR(simple_quota),
>> + BTRFS_FEAT_ATTR_PTR(remap_tree),
>> #ifdef CONFIG_BLK_DEV_ZONED
>> BTRFS_FEAT_ATTR_PTR(zoned),
>> #endif
>> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
>> index 5684750ca7a6..af9a26844113 100644
>> --- a/fs/btrfs/tree-checker.c
>> +++ b/fs/btrfs/tree-checker.c
>> @@ -913,12 +913,10 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
>> length, btrfs_stripe_nr_to_offset(U32_MAX));
>> return -EUCLEAN;
>> }
>> - if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
>> - BTRFS_BLOCK_GROUP_PROFILE_MASK))) {
>> + if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) {
>> chunk_err(fs_info, leaf, chunk, logical,
>> "unrecognized chunk type: 0x%llx",
>> - ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
>> - BTRFS_BLOCK_GROUP_PROFILE_MASK) & type);
>> + type & ~BTRFS_BLOCK_GROUP_VALID);
>> return -EUCLEAN;
>> }
>>
>> diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
>> index eb201f4ec3c7..833e2fd989eb 100644
>> --- a/fs/btrfs/tree-checker.h
>> +++ b/fs/btrfs/tree-checker.h
>> @@ -57,6 +57,11 @@ enum btrfs_tree_block_status {
>> BTRFS_TREE_BLOCK_WRITTEN_NOT_SET,
>> };
>>
>> +
>> +#define BTRFS_BLOCK_GROUP_VALID (BTRFS_BLOCK_GROUP_TYPE_MASK | \
>> + BTRFS_BLOCK_GROUP_PROFILE_MASK | \
>> + BTRFS_BLOCK_GROUP_REMAPPED)
>> +
>> /*
>> * Exported simply for btrfs-progs which wants to have the
>> * btrfs_tree_block_status return codes.
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 45d89b12025b..63e5a17f96f9 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -231,6 +231,7 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
>> DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
>> DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
>> DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
>> + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped");
>>
>> DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
>> for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
>> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
>> index f40b300bd664..0763a23aeebc 100644
>> --- a/include/uapi/linux/btrfs.h
>> +++ b/include/uapi/linux/btrfs.h
>> @@ -336,6 +336,7 @@ struct btrfs_ioctl_fs_info_args {
>> #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13)
>> #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14)
>> #define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16)
>> +#define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17)
>>
>> struct btrfs_ioctl_feature_flags {
>> __u64 compat_flags;
>> diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
>> index fc29d273845d..4439d77a7252 100644
>> --- a/include/uapi/linux/btrfs_tree.h
>> +++ b/include/uapi/linux/btrfs_tree.h
>> @@ -76,6 +76,9 @@
>> /* Tracks RAID stripes in block groups. */
>> #define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL
>>
>> +/* Holds details of remapped addresses after relocation. */
>> +#define BTRFS_REMAP_TREE_OBJECTID 13ULL
>> +
>> /* device stats in the device tree */
>> #define BTRFS_DEV_STATS_OBJECTID 0ULL
>>
>> @@ -282,6 +285,10 @@
>>
>> #define BTRFS_RAID_STRIPE_KEY 230
>>
>> +#define BTRFS_IDENTITY_REMAP_KEY 234
>> +#define BTRFS_REMAP_KEY 235
>> +#define BTRFS_REMAP_BACKREF_KEY 236
>> +
>> /*
>> * Records the overall state of the qgroups.
>> * There's only one instance of this key present,
>> @@ -1161,6 +1168,7 @@ struct btrfs_dev_replace_item {
>> #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
>> #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9)
>> #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10)
>> +#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11)
>> #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
>> BTRFS_SPACE_INFO_GLOBAL_RSV)
>>
>> @@ -1323,4 +1331,8 @@ struct btrfs_verity_descriptor_item {
>> __u8 encryption;
>> } __attribute__ ((__packed__));
>>
>> +struct btrfs_remap {
>> + __le64 address;
>> +} __attribute__ ((__packed__));
>> +
>
> nit: I think this should probably be btrfs_remap_item not btrfs_remap,
> though I don't think that convention is totally universal.
I've no strong opinions either way, but my reasoning was that struct btrfs_remap_item
might imply that it's only for remap items, whereas the same structure is shared for
remaps and remap backrefs.
For what it's worth, the RAID stripe tree items, which I think are the newest, don't
have the _item suffix.
>> #endif /* _BTRFS_CTREE_H_ */
>> --
>> 2.49.1
>>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 02/16] btrfs: add REMAP chunk type
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 01/16] btrfs: add definitions and constants for remap-tree Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 03/16] btrfs: allow remapped chunks to have zero stripes Mark Harmstone
` (13 subsequent siblings)
15 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone, Boris Burkov
Add a new REMAP chunk type, which is a metadata chunk that holds the
remap tree.
This is needed for bootstrapping purposes: the remap tree can't itself
be remapped, and must be relocated the existing way, by COWing every
leaf. The remap tree can't go in the SYSTEM chunk as space there is
limited, because a copy of the chunk item gets placed in the superblock.
The changes in fs/btrfs/volumes.h are because we're adding a new block
group type bit after the profile bits, and so can no longer rely on the
const_ilog2 trick.
The sizing to 32MB per chunk, matching the SYSTEM chunk, is an estimate
here, we can adjust it later if it proves to be too big or too small.
This works out to be ~500,000 remap items, which for a 4KB block size
covers ~2GB of remapped data in the worst case and ~500TB in the best case.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
---
fs/btrfs/block-rsv.c | 8 ++++++++
fs/btrfs/block-rsv.h | 1 +
fs/btrfs/disk-io.c | 1 +
fs/btrfs/fs.h | 2 ++
fs/btrfs/space-info.c | 13 ++++++++++++-
fs/btrfs/sysfs.c | 2 ++
fs/btrfs/tree-checker.c | 13 +++++++++++--
fs/btrfs/volumes.c | 3 +++
fs/btrfs/volumes.h | 11 +++++++++--
include/uapi/linux/btrfs_tree.h | 4 +++-
10 files changed, 52 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 96cf7a162987..71bcaa6fa7ee 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -419,6 +419,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_TREE_LOG_OBJECTID:
root->block_rsv = &fs_info->treelog_rsv;
break;
+ case BTRFS_REMAP_TREE_OBJECTID:
+ root->block_rsv = &fs_info->remap_block_rsv;
+ break;
default:
root->block_rsv = NULL;
break;
@@ -432,6 +435,9 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
fs_info->chunk_block_rsv.space_info = space_info;
+ space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_REMAP);
+ fs_info->remap_block_rsv.space_info = space_info;
+
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
fs_info->global_block_rsv.space_info = space_info;
fs_info->trans_block_rsv.space_info = space_info;
@@ -458,6 +464,8 @@ void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
WARN_ON(fs_info->trans_block_rsv.reserved > 0);
WARN_ON(fs_info->chunk_block_rsv.size > 0);
WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+ WARN_ON(fs_info->remap_block_rsv.size > 0);
+ WARN_ON(fs_info->remap_block_rsv.reserved > 0);
WARN_ON(fs_info->delayed_block_rsv.size > 0);
WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 79ae9d05cd91..8359fb96bc3c 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -22,6 +22,7 @@ enum btrfs_rsv_type {
BTRFS_BLOCK_RSV_DELALLOC,
BTRFS_BLOCK_RSV_TRANS,
BTRFS_BLOCK_RSV_CHUNK,
+ BTRFS_BLOCK_RSV_REMAP,
BTRFS_BLOCK_RSV_DELOPS,
BTRFS_BLOCK_RSV_DELREFS,
BTRFS_BLOCK_RSV_TREELOG,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0aa7e5d1b05f..0bf78ae2060d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2816,6 +2816,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+ btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP);
btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG);
btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index ad389fb1c01a..40a25f9f617e 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -495,6 +495,8 @@ struct btrfs_fs_info {
struct btrfs_block_rsv trans_block_rsv;
/* Block reservation for chunk tree */
struct btrfs_block_rsv chunk_block_rsv;
+ /* Block reservation for remap tree */
+ struct btrfs_block_rsv remap_block_rsv;
/* Block reservation for delayed operations */
struct btrfs_block_rsv delayed_block_rsv;
/* Block reservation for delayed refs */
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 69237f5d6078..a2ce72d3e873 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -215,7 +215,7 @@ static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
if (flags & BTRFS_BLOCK_GROUP_DATA)
return BTRFS_MAX_DATA_CHUNK_SIZE;
- else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+ else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_REMAP))
return SZ_32M;
/* Handle BTRFS_BLOCK_GROUP_METADATA */
@@ -343,6 +343,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
ret = create_space_info(fs_info, flags);
+ if (ret)
+ goto out;
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
ret = create_space_info(fs_info, flags);
@@ -351,7 +353,15 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
flags = BTRFS_BLOCK_GROUP_DATA;
ret = create_space_info(fs_info, flags);
+ if (ret)
+ goto out;
+ }
+
+ if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) {
+ flags = BTRFS_BLOCK_GROUP_REMAP;
+ ret = create_space_info(fs_info, flags);
}
+
out:
return ret;
}
@@ -587,6 +597,7 @@ static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
DUMP_BLOCK_RSV(fs_info, global_block_rsv);
DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
+ DUMP_BLOCK_RSV(fs_info, remap_block_rsv);
DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
}
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 9e1ba524d26a..4ac68dc7e0c0 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1973,6 +1973,8 @@ static const char *alloc_name(struct btrfs_space_info *space_info)
case BTRFS_BLOCK_GROUP_SYSTEM:
ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY);
return "system";
+ case BTRFS_BLOCK_GROUP_REMAP:
+ return "remap";
default:
WARN_ON(1);
return "invalid-combination";
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index af9a26844113..681c5c7fae35 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -748,17 +748,26 @@ static int check_block_group_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
+ if (flags & BTRFS_BLOCK_GROUP_REMAP &&
+ !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ block_group_err(leaf, slot,
+"invalid flags, have 0x%llx (REMAP flag set) but no remap-tree incompat flag",
+ flags);
+ return -EUCLEAN;
+ }
+
type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
if (unlikely(type != BTRFS_BLOCK_GROUP_DATA &&
type != BTRFS_BLOCK_GROUP_METADATA &&
type != BTRFS_BLOCK_GROUP_SYSTEM &&
+ type != BTRFS_BLOCK_GROUP_REMAP &&
type != (BTRFS_BLOCK_GROUP_METADATA |
BTRFS_BLOCK_GROUP_DATA))) {
block_group_err(leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx",
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx, 0x%llx or 0x%llx",
type, hweight64(type),
BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
- BTRFS_BLOCK_GROUP_SYSTEM,
+ BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_REMAP,
BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA);
return -EUCLEAN;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 63e5a17f96f9..82b8189f3e81 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -231,6 +231,9 @@ void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
+ /* block groups containing the remap tree */
+ DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAP, "remap");
+ /* block group that has been remapped */
DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped");
DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index adbd9e6c09ff..7cf76bffcab6 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -58,8 +58,6 @@ static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT);
*/
static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
-static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
- ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
/* ilog2() can handle both constants and variables */
#define BTRFS_BG_FLAG_TO_INDEX(profile) \
@@ -81,6 +79,15 @@ enum btrfs_raid_types {
BTRFS_NR_RAID_TYPES
};
+static_assert(BTRFS_RAID_RAID0 == 1);
+static_assert(BTRFS_RAID_RAID1 == 2);
+static_assert(BTRFS_RAID_DUP == 3);
+static_assert(BTRFS_RAID_RAID10 == 4);
+static_assert(BTRFS_RAID_RAID5 == 5);
+static_assert(BTRFS_RAID_RAID6 == 6);
+static_assert(BTRFS_RAID_RAID1C3 == 7);
+static_assert(BTRFS_RAID_RAID1C4 == 8);
+
/*
* Use sequence counter to get consistent device stat data on
* 32-bit processors.
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 4439d77a7252..9a36f0206d90 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -1169,12 +1169,14 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9)
#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10)
#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11)
+#define BTRFS_BLOCK_GROUP_REMAP (1ULL << 12)
#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
BTRFS_SPACE_INFO_GLOBAL_RSV)
#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
BTRFS_BLOCK_GROUP_SYSTEM | \
- BTRFS_BLOCK_GROUP_METADATA)
+ BTRFS_BLOCK_GROUP_METADATA | \
+ BTRFS_BLOCK_GROUP_REMAP)
#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
BTRFS_BLOCK_GROUP_RAID1 | \
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread

* [PATCH v4 03/16] btrfs: allow remapped chunks to have zero stripes
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 01/16] btrfs: add definitions and constants for remap-tree Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 02/16] btrfs: add REMAP chunk type Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-31 21:39 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 04/16] btrfs: remove remapped block groups from the free-space tree Mark Harmstone
` (12 subsequent siblings)
15 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
When a chunk has been fully remapped, we are going to set its
num_stripes to 0, as it will no longer represent a physical location on
disk.
Change tree-checker to allow for this, and fix read_one_chunk() to avoid
a divide by zero.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/tree-checker.c | 65 ++++++++++++++++++++++++++++-------------
fs/btrfs/volumes.c | 7 ++++-
2 files changed, 51 insertions(+), 21 deletions(-)
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 681c5c7fae35..b6827c2a7815 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -816,6 +816,41 @@ static void chunk_err(const struct btrfs_fs_info *fs_info,
va_end(args);
}
+static bool valid_stripe_count(u64 profile, u16 num_stripes,
+ u16 sub_stripes)
+{
+ switch (profile) {
+ case BTRFS_BLOCK_GROUP_RAID0:
+ return true;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ return sub_stripes ==
+ btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ return num_stripes ==
+ btrfs_raid_array[BTRFS_RAID_RAID1].devs_min;
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ return num_stripes ==
+ btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min;
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ return num_stripes ==
+ btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ return num_stripes >=
+ btrfs_raid_array[BTRFS_RAID_RAID5].devs_min;
+ case BTRFS_BLOCK_GROUP_RAID6:
+ return num_stripes >=
+ btrfs_raid_array[BTRFS_RAID_RAID6].devs_min;
+ case BTRFS_BLOCK_GROUP_DUP:
+ return num_stripes ==
+ btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes;
+ case 0: /* SINGLE */
+ return num_stripes ==
+ btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes;
+ default:
+ BUG();
+ }
+}
+
/*
* The common chunk check which could also work on super block sys chunk array.
*
@@ -839,6 +874,7 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
u64 features;
u32 chunk_sector_size;
bool mixed = false;
+ bool remapped;
int raid_index;
int nparity;
int ncopies;
@@ -862,12 +898,14 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
ncopies = btrfs_raid_array[raid_index].ncopies;
nparity = btrfs_raid_array[raid_index].nparity;
- if (unlikely(!num_stripes)) {
+ remapped = type & BTRFS_BLOCK_GROUP_REMAPPED;
+
+ if (unlikely(!remapped && !num_stripes)) {
chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes, have %u", num_stripes);
return -EUCLEAN;
}
- if (unlikely(num_stripes < ncopies)) {
+ if (unlikely(num_stripes != 0 && num_stripes < ncopies)) {
chunk_err(fs_info, leaf, chunk, logical,
"invalid chunk num_stripes < ncopies, have %u < %d",
num_stripes, ncopies);
@@ -965,22 +1003,9 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
}
}
- if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
- sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
- (type & BTRFS_BLOCK_GROUP_RAID1 &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_RAID5 &&
- num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_RAID6 &&
- num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
- (type & BTRFS_BLOCK_GROUP_DUP &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
- ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
- num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
+ if (!remapped &&
+ !valid_stripe_count(type & BTRFS_BLOCK_GROUP_PROFILE_MASK,
+ num_stripes, sub_stripes)) {
chunk_err(fs_info, leaf, chunk, logical,
"invalid num_stripes:sub_stripes %u:%u for profile %llu",
num_stripes, sub_stripes,
@@ -1004,11 +1029,11 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
struct btrfs_fs_info *fs_info = leaf->fs_info;
int num_stripes;
- if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
+ if (unlikely(btrfs_item_size(leaf, slot) < offsetof(struct btrfs_chunk, stripe))) {
chunk_err(fs_info, leaf, chunk, key->offset,
"invalid chunk item size: have %u expect [%zu, %u)",
btrfs_item_size(leaf, slot),
- sizeof(struct btrfs_chunk),
+ offsetof(struct btrfs_chunk, stripe),
BTRFS_LEAF_DATA_SIZE(fs_info));
return -EUCLEAN;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 82b8189f3e81..8a9bff0426ae 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -7059,7 +7059,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
*/
map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- map->stripe_size = btrfs_calc_stripe_length(map);
+
+ if (num_stripes > 0)
+ map->stripe_size = btrfs_calc_stripe_length(map);
+ else
+ map->stripe_size = 0;
+
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread

* Re: [PATCH v4 03/16] btrfs: allow remapped chunks to have zero stripes
2025-10-24 18:12 ` [PATCH v4 03/16] btrfs: allow remapped chunks to have zero stripes Mark Harmstone
@ 2025-10-31 21:39 ` Boris Burkov
0 siblings, 0 replies; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 21:39 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:04PM +0100, Mark Harmstone wrote:
> When a chunk has been fully remapped, we are going to set its
> num_stripes to 0, as it will no longer represent a physical location on
> disk.
>
> Change tree-checker to allow for this, and fix read_one_chunk() to avoid
> a divide by zero.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
> ---
> fs/btrfs/tree-checker.c | 65 ++++++++++++++++++++++++++++-------------
> fs/btrfs/volumes.c | 7 ++++-
> 2 files changed, 51 insertions(+), 21 deletions(-)
>
> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
> index 681c5c7fae35..b6827c2a7815 100644
> --- a/fs/btrfs/tree-checker.c
> +++ b/fs/btrfs/tree-checker.c
> @@ -816,6 +816,41 @@ static void chunk_err(const struct btrfs_fs_info *fs_info,
> va_end(args);
> }
>
> +static bool valid_stripe_count(u64 profile, u16 num_stripes,
> + u16 sub_stripes)
> +{
> + switch (profile) {
> + case BTRFS_BLOCK_GROUP_RAID0:
> + return true;
> + case BTRFS_BLOCK_GROUP_RAID10:
> + return sub_stripes ==
> + btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes;
> + case BTRFS_BLOCK_GROUP_RAID1:
> + return num_stripes ==
> + btrfs_raid_array[BTRFS_RAID_RAID1].devs_min;
> + case BTRFS_BLOCK_GROUP_RAID1C3:
> + return num_stripes ==
> + btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min;
> + case BTRFS_BLOCK_GROUP_RAID1C4:
> + return num_stripes ==
> + btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min;
> + case BTRFS_BLOCK_GROUP_RAID5:
> + return num_stripes >=
> + btrfs_raid_array[BTRFS_RAID_RAID5].devs_min;
> + case BTRFS_BLOCK_GROUP_RAID6:
> + return num_stripes >=
> + btrfs_raid_array[BTRFS_RAID_RAID6].devs_min;
> + case BTRFS_BLOCK_GROUP_DUP:
> + return num_stripes ==
> + btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes;
> + case 0: /* SINGLE */
> + return num_stripes ==
> + btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes;
> + default:
> + BUG();
> + }
> +}
> +
> /*
> * The common chunk check which could also work on super block sys chunk array.
> *
> @@ -839,6 +874,7 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
> u64 features;
> u32 chunk_sector_size;
> bool mixed = false;
> + bool remapped;
> int raid_index;
> int nparity;
> int ncopies;
> @@ -862,12 +898,14 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
> ncopies = btrfs_raid_array[raid_index].ncopies;
> nparity = btrfs_raid_array[raid_index].nparity;
>
> - if (unlikely(!num_stripes)) {
> + remapped = type & BTRFS_BLOCK_GROUP_REMAPPED;
> +
> + if (unlikely(!remapped && !num_stripes)) {
> chunk_err(fs_info, leaf, chunk, logical,
> "invalid chunk num_stripes, have %u", num_stripes);
> return -EUCLEAN;
> }
> - if (unlikely(num_stripes < ncopies)) {
> + if (unlikely(num_stripes != 0 && num_stripes < ncopies)) {
> chunk_err(fs_info, leaf, chunk, logical,
> "invalid chunk num_stripes < ncopies, have %u < %d",
> num_stripes, ncopies);
> @@ -965,22 +1003,9 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info,
> }
> }
>
> - if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 &&
> - sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) ||
> - (type & BTRFS_BLOCK_GROUP_RAID1 &&
> - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) ||
> - (type & BTRFS_BLOCK_GROUP_RAID1C3 &&
> - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) ||
> - (type & BTRFS_BLOCK_GROUP_RAID1C4 &&
> - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) ||
> - (type & BTRFS_BLOCK_GROUP_RAID5 &&
> - num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) ||
> - (type & BTRFS_BLOCK_GROUP_RAID6 &&
> - num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) ||
> - (type & BTRFS_BLOCK_GROUP_DUP &&
> - num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) ||
> - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
> - num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) {
> + if (!remapped &&
> + !valid_stripe_count(type & BTRFS_BLOCK_GROUP_PROFILE_MASK,
> + num_stripes, sub_stripes)) {
> chunk_err(fs_info, leaf, chunk, logical,
> "invalid num_stripes:sub_stripes %u:%u for profile %llu",
> num_stripes, sub_stripes,
> @@ -1004,11 +1029,11 @@ static int check_leaf_chunk_item(struct extent_buffer *leaf,
> struct btrfs_fs_info *fs_info = leaf->fs_info;
> int num_stripes;
>
> - if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) {
> + if (unlikely(btrfs_item_size(leaf, slot) < offsetof(struct btrfs_chunk, stripe))) {
> chunk_err(fs_info, leaf, chunk, key->offset,
> "invalid chunk item size: have %u expect [%zu, %u)",
> btrfs_item_size(leaf, slot),
> - sizeof(struct btrfs_chunk),
> + offsetof(struct btrfs_chunk, stripe),
> BTRFS_LEAF_DATA_SIZE(fs_info));
> return -EUCLEAN;
> }
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 82b8189f3e81..8a9bff0426ae 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -7059,7 +7059,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
> */
> map->sub_stripes = btrfs_raid_array[index].sub_stripes;
> map->verified_stripes = 0;
> - map->stripe_size = btrfs_calc_stripe_length(map);
> +
> + if (num_stripes > 0)
> + map->stripe_size = btrfs_calc_stripe_length(map);
> + else
> + map->stripe_size = 0;
> +
> for (i = 0; i < num_stripes; i++) {
> map->stripes[i].physical =
> btrfs_stripe_offset_nr(leaf, chunk, i);
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 04/16] btrfs: remove remapped block groups from the free-space tree
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (2 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 03/16] btrfs: allow remapped chunks to have zero stripes Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-31 21:44 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 05/16] btrfs: don't add metadata items for the remap tree to the extent tree Mark Harmstone
` (11 subsequent siblings)
15 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
No new allocations can be done from block groups that have the REMAPPED flag
set, so there's no value in their having entries in the free-space tree.
Prevent a search through the free-space tree being scheduled for such a
block group, and prevent any additions to the in-memory free-space tree.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/block-group.c | 15 ++++++++++++---
fs/btrfs/free-space-cache.c | 3 +++
2 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index ec1e4fc0cd51..b5f2ec8d013f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -933,6 +933,13 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
if (btrfs_is_zoned(fs_info))
return 0;
+ /*
+ * No allocations can be done from remapped block groups, so they have
+ * no entries in the free-space tree.
+ */
+ if (cache->flags & BTRFS_BLOCK_GROUP_REMAPPED)
+ return 0;
+
caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
if (!caching_ctl)
return -ENOMEM;
@@ -1248,9 +1255,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* another task to attempt to create another block group with the same
* item key (and failing with -EEXIST and a transaction abort).
*/
- ret = btrfs_remove_block_group_free_space(trans, block_group);
- if (ret)
- goto out;
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
+ ret = btrfs_remove_block_group_free_space(trans, block_group);
+ if (ret)
+ goto out;
+ }
ret = remove_block_group_item(trans, path, block_group);
if (ret < 0)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ab873bd67192..ec9a97d75d10 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2756,6 +2756,9 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
{
enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)
+ return 0;
+
if (btrfs_is_zoned(block_group->fs_info))
return __btrfs_add_free_space_zoned(block_group, bytenr, size,
true);
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 04/16] btrfs: remove remapped block groups from the free-space tree
2025-10-24 18:12 ` [PATCH v4 04/16] btrfs: remove remapped block groups from the free-space tree Mark Harmstone
@ 2025-10-31 21:44 ` Boris Burkov
2025-11-03 12:39 ` Mark Harmstone
0 siblings, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 21:44 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:05PM +0100, Mark Harmstone wrote:
> No new allocations can be done from block groups that have the REMAPPED flag
> set, so there's no value in their having entries in the free-space tree.
>
> Prevent a search through the free-space tree being scheduled for such a
> block group, and prevent any additions to the in-memory free-space tree.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
> fs/btrfs/block-group.c | 15 ++++++++++++---
> fs/btrfs/free-space-cache.c | 3 +++
> 2 files changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index ec1e4fc0cd51..b5f2ec8d013f 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -933,6 +933,13 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
> if (btrfs_is_zoned(fs_info))
> return 0;
>
> + /*
> + * No allocations can be done from remapped block groups, so they have
> + * no entries in the free-space tree.
> + */
> + if (cache->flags & BTRFS_BLOCK_GROUP_REMAPPED)
> + return 0;
> +
> caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
> if (!caching_ctl)
> return -ENOMEM;
> @@ -1248,9 +1255,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> * another task to attempt to create another block group with the same
> * item key (and failing with -EEXIST and a transaction abort).
> */
> - ret = btrfs_remove_block_group_free_space(trans, block_group);
> - if (ret)
> - goto out;
> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
> + ret = btrfs_remove_block_group_free_space(trans, block_group);
> + if (ret)
> + goto out;
> + }
I feel like a comment or the commit message could explain the change to
the btrfs_remove_block_group bit more clearly. Like "remapped has no free
space so removing it is a no-op". If it is in fact a no-op, is there any
problem with calling it?
With that extra bit of doc/explanation, feel free to add
Reviewed-by: Boris Burkov <boris@bur.io>
>
> ret = remove_block_group_item(trans, path, block_group);
> if (ret < 0)
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index ab873bd67192..ec9a97d75d10 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -2756,6 +2756,9 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
> {
> enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
>
> + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)
> + return 0;
> +
> if (btrfs_is_zoned(block_group->fs_info))
> return __btrfs_add_free_space_zoned(block_group, bytenr, size,
> true);
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 04/16] btrfs: remove remapped block groups from the free-space tree
2025-10-31 21:44 ` Boris Burkov
@ 2025-11-03 12:39 ` Mark Harmstone
0 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-11-03 12:39 UTC (permalink / raw)
To: Boris Burkov; +Cc: linux-btrfs
On 31/10/2025 9.44 pm, Boris Burkov wrote:
> On Fri, Oct 24, 2025 at 07:12:05PM +0100, Mark Harmstone wrote:
>> No new allocations can be done from block groups that have the REMAPPED flag
>> set, so there's no value in their having entries in the free-space tree.
>>
>> Prevent a search through the free-space tree being scheduled for such a
>> block group, and prevent any additions to the in-memory free-space tree.
>>
>> Signed-off-by: Mark Harmstone <mark@harmstone.com>
>> ---
>> fs/btrfs/block-group.c | 15 ++++++++++++---
>> fs/btrfs/free-space-cache.c | 3 +++
>> 2 files changed, 15 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index ec1e4fc0cd51..b5f2ec8d013f 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -933,6 +933,13 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
>> if (btrfs_is_zoned(fs_info))
>> return 0;
>>
>> + /*
>> + * No allocations can be done from remapped block groups, so they have
>> + * no entries in the free-space tree.
>> + */
>> + if (cache->flags & BTRFS_BLOCK_GROUP_REMAPPED)
>> + return 0;
>> +
>> caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
>> if (!caching_ctl)
>> return -ENOMEM;
>> @@ -1248,9 +1255,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> * another task to attempt to create another block group with the same
>> * item key (and failing with -EEXIST and a transaction abort).
>> */
>> - ret = btrfs_remove_block_group_free_space(trans, block_group);
>> - if (ret)
>> - goto out;
>> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
>> + ret = btrfs_remove_block_group_free_space(trans, block_group);
>> + if (ret)
>> + goto out;
>> + }
>
> I feel like a comment or the commit message could explain the change to
> the btrfs_remove_block_group bit more clearly. Like "remapped has no free
> space so removing it is a no-op". If it is in fact a no-op, is there any
> problem with calling it?
Makes sense. I think in theory it's a no-op, but in practice it would ASSERT
because the free space info it's expecting isn't there.
> With that extra bit of doc/explanation, feel free to add
> Reviewed-by: Boris Burkov <boris@bur.io>
>
>>
>> ret = remove_block_group_item(trans, path, block_group);
>> if (ret < 0)
>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>> index ab873bd67192..ec9a97d75d10 100644
>> --- a/fs/btrfs/free-space-cache.c
>> +++ b/fs/btrfs/free-space-cache.c
>> @@ -2756,6 +2756,9 @@ int btrfs_add_free_space(struct btrfs_block_group *block_group,
>> {
>> enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED;
>>
>> + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)
>> + return 0;
>> +
>> if (btrfs_is_zoned(block_group->fs_info))
>> return __btrfs_add_free_space_zoned(block_group, bytenr, size,
>> true);
>> --
>> 2.49.1
>>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 05/16] btrfs: don't add metadata items for the remap tree to the extent tree
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (3 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 04/16] btrfs: remove remapped block groups from the free-space tree Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 06/16] btrfs: add extended version of struct block_group_item Mark Harmstone
` (10 subsequent siblings)
15 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone, Boris Burkov
There is the following potential problem with the remap tree and delayed refs:
* Remapped extent freed in a delayed ref, which removes an entry from the
remap tree
* Remap tree now small enough to fit in a single leaf
* Corruption as we now have a level-0 block with a level-1 metadata item
in the extent tree
One solution to this would be to rework the remap tree code so that it operates
via delayed refs. But as we're hoping to remove cow-only metadata items in the
future anyway, change things so that the remap tree doesn't have any entries in
the extent tree. This also has the benefit of reducing write amplification.
We also make the clear_cache mount option a no-op, as with extent tree v2,
because the free-space tree can no longer be recreated from the extent
tree.
Finally, disable relocating the remap tree itself; this is added back in
a later patch. Without this we would get corruption, because the
traditional relocation method walks the extent tree and we are removing
the remap tree's metadata items from it.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
---
fs/btrfs/disk-io.c | 3 +++
fs/btrfs/extent-tree.c | 31 ++++++++++++++++++++++++++++++-
fs/btrfs/volumes.c | 3 +++
3 files changed, 36 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0bf78ae2060d..13d9a9ece3ca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3050,6 +3050,9 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
btrfs_warn(fs_info,
"'clear_cache' option is ignored with extent tree v2");
+ else if (btrfs_fs_incompat(fs_info, REMAP_TREE))
+ btrfs_warn(fs_info,
+ "'clear_cache' option is ignored with remap tree");
else
rebuild_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ae2c3dc9957e..871a63799311 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1552,6 +1552,28 @@ static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info,
BTRFS_QGROUP_RSV_DATA);
}
+static int drop_remap_tree_ref(struct btrfs_trans_handle *trans,
+ const struct btrfs_delayed_ref_node *node)
+{
+ u64 bytenr = node->bytenr;
+ u64 num_bytes = node->num_bytes;
+ int ret;
+
+ ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
+
+ return 0;
+}
+
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *href,
const struct btrfs_delayed_ref_node *node,
@@ -1746,7 +1768,10 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, node, extent_op);
} else if (node->action == BTRFS_DROP_DELAYED_REF) {
- ret = __btrfs_free_extent(trans, href, node, extent_op);
+ if (node->ref_root == BTRFS_REMAP_TREE_OBJECTID)
+ ret = drop_remap_tree_ref(trans, node);
+ else
+ ret = __btrfs_free_extent(trans, href, node, extent_op);
} else {
BUG();
}
@@ -4895,6 +4920,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
int level = btrfs_delayed_ref_owner(node);
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+ if (unlikely(node->ref_root == BTRFS_REMAP_TREE_OBJECTID))
+ goto skip;
+
extent_key.objectid = node->bytenr;
if (skinny_metadata) {
/* The owner of a tree block is the level. */
@@ -4947,6 +4975,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
+skip:
return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize);
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8a9bff0426ae..7b2bec28dbd7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3972,6 +3972,9 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+ if (chunk_type & BTRFS_BLOCK_GROUP_REMAP)
+ return false;
+
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
(bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* [PATCH v4 06/16] btrfs: add extended version of struct block_group_item
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (4 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 05/16] btrfs: don't add metadata items for the remap tree to the extent tree Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-31 21:47 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 07/16] btrfs: allow mounting filesystems with remap-tree incompat flag Mark Harmstone
` (9 subsequent siblings)
15 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
Add a struct btrfs_block_group_item_v2, which is used in the block group
tree if the remap-tree incompat flag is set.
This adds two new fields to the block group item: `remap_bytes` and
`identity_remap_count`.
`remap_bytes` records the amount of data that's physically within this
block group, but nominally in another, remapped block group. This is
necessary because this data will need to be moved first if this block
group is itself relocated. If `remap_bytes` > 0, this is an indicator to
the relocation thread that it will need to search the remap-tree for
backrefs. A block group must also have `remap_bytes` == 0 before it can
be dropped.
`identity_remap_count` records how many identity remap items are located
in the remap tree for this block group. When relocation is begun for
this block group, this is set to the number of holes in the free-space
tree for this range. As identity remaps are converted into actual remaps
by the relocation process, this number is decreased. Once it reaches 0,
either because of relocation or because extents have been deleted, the
block group has been fully remapped and its chunk's device extents are
removed.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/accessors.h | 20 +++++++
fs/btrfs/block-group.c | 100 ++++++++++++++++++++++++--------
fs/btrfs/block-group.h | 14 ++++-
fs/btrfs/discard.c | 2 +-
fs/btrfs/tree-checker.c | 10 +++-
include/uapi/linux/btrfs_tree.h | 8 +++
6 files changed, 126 insertions(+), 28 deletions(-)
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 95a1ca8c099b..0dd161ee6863 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -239,6 +239,26 @@ BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64);
BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
struct btrfs_block_group_item, flags, 64);
+/* struct btrfs_block_group_item_v2 */
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2,
+ used, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid,
+ struct btrfs_block_group_item_v2, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid,
+ struct btrfs_block_group_item_v2, chunk_objectid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags,
+ struct btrfs_block_group_item_v2, flags, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes,
+ struct btrfs_block_group_item_v2, remap_bytes, 64);
+BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2,
+ remap_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count,
+ struct btrfs_block_group_item_v2, identity_remap_count, 32);
+BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2,
+ identity_remap_count, 32);
+
/* struct btrfs_free_space_info */
BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
extent_count, 32);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index b5f2ec8d013f..27173aca6fc1 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2374,7 +2374,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
}
static int read_one_block_group(struct btrfs_fs_info *info,
- struct btrfs_block_group_item *bgi,
+ struct btrfs_block_group_item_v2 *bgi,
const struct btrfs_key *key,
int need_clear)
{
@@ -2389,11 +2389,16 @@ static int read_one_block_group(struct btrfs_fs_info *info,
return -ENOMEM;
cache->length = key->offset;
- cache->used = btrfs_stack_block_group_used(bgi);
+ cache->used = btrfs_stack_block_group_v2_used(bgi);
cache->commit_used = cache->used;
- cache->flags = btrfs_stack_block_group_flags(bgi);
- cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
+ cache->flags = btrfs_stack_block_group_v2_flags(bgi);
+ cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi);
cache->space_info = btrfs_find_space_info(info, cache->flags);
+ cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi);
+ cache->commit_remap_bytes = cache->remap_bytes;
+ cache->identity_remap_count =
+ btrfs_stack_block_group_v2_identity_remap_count(bgi);
+ cache->commit_identity_remap_count = cache->identity_remap_count;
btrfs_set_free_space_tree_thresholds(cache);
@@ -2458,7 +2463,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
} else if (cache->length == cache->used) {
cache->cached = BTRFS_CACHE_FINISHED;
btrfs_free_excluded_extents(cache);
- } else if (cache->used == 0) {
+ } else if (cache->used == 0 && cache->remap_bytes == 0) {
cache->cached = BTRFS_CACHE_FINISHED;
ret = btrfs_add_new_free_space(cache, cache->start,
cache->start + cache->length, NULL);
@@ -2478,7 +2483,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
set_avail_alloc_bits(info, cache->flags);
if (btrfs_chunk_writeable(info, cache->start)) {
- if (cache->used == 0) {
+ if (cache->used == 0 && cache->remap_bytes == 0) {
ASSERT(list_empty(&cache->bg_list));
if (btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_discard_queue_work(&info->discard_ctl, cache);
@@ -2582,9 +2587,10 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
need_clear = 1;
while (1) {
- struct btrfs_block_group_item bgi;
+ struct btrfs_block_group_item_v2 bgi;
struct extent_buffer *leaf;
int slot;
+ size_t size;
ret = find_first_block_group(info, path, &key);
if (ret > 0)
@@ -2595,8 +2601,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
leaf = path->nodes[0];
slot = path->slots[0];
+ if (btrfs_fs_incompat(info, REMAP_TREE)) {
+ size = sizeof(struct btrfs_block_group_item_v2);
+ } else {
+ size = sizeof(struct btrfs_block_group_item);
+ btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0);
+ btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0);
+ }
+
read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
- sizeof(bgi));
+ size);
btrfs_item_key_to_cpu(leaf, &key, slot);
btrfs_release_path(path);
@@ -2666,25 +2680,38 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_item bgi;
+ struct btrfs_block_group_item_v2 bgi;
struct btrfs_root *root = btrfs_block_group_root(fs_info);
struct btrfs_key key;
u64 old_commit_used;
+ size_t size;
int ret;
spin_lock(&block_group->lock);
- btrfs_set_stack_block_group_used(&bgi, block_group->used);
- btrfs_set_stack_block_group_chunk_objectid(&bgi,
- block_group->global_root_id);
- btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
+ btrfs_set_stack_block_group_v2_used(&bgi, block_group->used);
+ btrfs_set_stack_block_group_v2_chunk_objectid(&bgi,
+ block_group->global_root_id);
+ btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags);
+ btrfs_set_stack_block_group_v2_remap_bytes(&bgi,
+ block_group->remap_bytes);
+ btrfs_set_stack_block_group_v2_identity_remap_count(&bgi,
+ block_group->identity_remap_count);
old_commit_used = block_group->commit_used;
block_group->commit_used = block_group->used;
+ block_group->commit_remap_bytes = block_group->remap_bytes;
+ block_group->commit_identity_remap_count =
+ block_group->identity_remap_count;
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
spin_unlock(&block_group->lock);
- ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE))
+ size = sizeof(struct btrfs_block_group_item_v2);
+ else
+ size = sizeof(struct btrfs_block_group_item);
+
+ ret = btrfs_insert_item(trans, root, &key, &bgi, size);
if (ret < 0) {
spin_lock(&block_group->lock);
block_group->commit_used = old_commit_used;
@@ -3139,10 +3166,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root = btrfs_block_group_root(fs_info);
unsigned long bi;
struct extent_buffer *leaf;
- struct btrfs_block_group_item bgi;
+ struct btrfs_block_group_item_v2 bgi;
struct btrfs_key key;
- u64 old_commit_used;
- u64 used;
+ u64 old_commit_used, old_commit_remap_bytes;
+ u32 old_commit_identity_remap_count;
+ u64 used, remap_bytes;
+ u32 identity_remap_count;
/*
* Block group items update can be triggered out of commit transaction
@@ -3152,13 +3181,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
*/
spin_lock(&cache->lock);
old_commit_used = cache->commit_used;
+ old_commit_remap_bytes = cache->commit_remap_bytes;
+ old_commit_identity_remap_count = cache->commit_identity_remap_count;
used = cache->used;
- /* No change in used bytes, can safely skip it. */
- if (cache->commit_used == used) {
+ remap_bytes = cache->remap_bytes;
+ identity_remap_count = cache->identity_remap_count;
+ /* No change in values, can safely skip it. */
+ if (cache->commit_used == used &&
+ cache->commit_remap_bytes == remap_bytes &&
+ cache->commit_identity_remap_count == identity_remap_count) {
spin_unlock(&cache->lock);
return 0;
}
cache->commit_used = used;
+ cache->commit_remap_bytes = remap_bytes;
+ cache->commit_identity_remap_count = identity_remap_count;
spin_unlock(&cache->lock);
key.objectid = cache->start;
@@ -3174,11 +3211,23 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
- btrfs_set_stack_block_group_used(&bgi, used);
- btrfs_set_stack_block_group_chunk_objectid(&bgi,
- cache->global_root_id);
- btrfs_set_stack_block_group_flags(&bgi, cache->flags);
- write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
+ btrfs_set_stack_block_group_v2_used(&bgi, used);
+ btrfs_set_stack_block_group_v2_chunk_objectid(&bgi,
+ cache->global_root_id);
+ btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags);
+
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ btrfs_set_stack_block_group_v2_remap_bytes(&bgi,
+ cache->remap_bytes);
+ btrfs_set_stack_block_group_v2_identity_remap_count(&bgi,
+ cache->identity_remap_count);
+ write_extent_buffer(leaf, &bgi, bi,
+ sizeof(struct btrfs_block_group_item_v2));
+ } else {
+ write_extent_buffer(leaf, &bgi, bi,
+ sizeof(struct btrfs_block_group_item));
+ }
+
fail:
btrfs_release_path(path);
/*
@@ -3193,6 +3242,9 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
if (ret < 0 && ret != -ENOENT) {
spin_lock(&cache->lock);
cache->commit_used = old_commit_used;
+ cache->commit_remap_bytes = old_commit_remap_bytes;
+ cache->commit_identity_remap_count =
+ old_commit_identity_remap_count;
spin_unlock(&cache->lock);
}
return ret;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 9172104a5889..af23fdb3cf4d 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -129,6 +129,8 @@ struct btrfs_block_group {
u64 flags;
u64 cache_generation;
u64 global_root_id;
+ u64 remap_bytes;
+ u32 identity_remap_count;
/*
* The last committed used bytes of this block group, if the above @used
@@ -136,6 +138,15 @@ struct btrfs_block_group {
* group item of this block group.
*/
u64 commit_used;
+ /*
+ * The last committed remap_bytes value of this block group.
+ */
+ u64 commit_remap_bytes;
+ /*
+ * The last committed identity_remap_count value of this block group.
+ */
+ u32 commit_identity_remap_count;
+
/*
* If the free space extent count exceeds this number, convert the block
* group to bitmaps.
@@ -282,7 +293,8 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
{
lockdep_assert_held(&bg->lock);
- return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 ||
+ bg->remap_bytes > 0);
}
static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group)
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index 89fe85778115..ee5f5b2788e1 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -373,7 +373,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
return;
- if (block_group->used == 0)
+ if (block_group->used == 0 && block_group->remap_bytes == 0)
add_to_discard_unused_list(discard_ctl, block_group);
else
add_to_discard_list(discard_ctl, block_group);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index b6827c2a7815..08b1bcfc7db7 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -688,6 +688,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
u64 chunk_objectid;
u64 flags;
u64 type;
+ size_t exp_size;
/*
* Here we don't really care about alignment since extent allocator can
@@ -699,10 +700,15 @@ static int check_block_group_item(struct extent_buffer *leaf,
return -EUCLEAN;
}
- if (unlikely(item_size != sizeof(bgi))) {
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE))
+ exp_size = sizeof(struct btrfs_block_group_item_v2);
+ else
+ exp_size = sizeof(struct btrfs_block_group_item);
+
+ if (unlikely(item_size != exp_size)) {
block_group_err(leaf, slot,
"invalid item size, have %u expect %zu",
- item_size, sizeof(bgi));
+ item_size, exp_size);
return -EUCLEAN;
}
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 9a36f0206d90..500e3a7df90b 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -1229,6 +1229,14 @@ struct btrfs_block_group_item {
__le64 flags;
} __attribute__ ((__packed__));
+struct btrfs_block_group_item_v2 {
+ __le64 used;
+ __le64 chunk_objectid;
+ __le64 flags;
+ __le64 remap_bytes;
+ __le32 identity_remap_count;
+} __attribute__ ((__packed__));
+
struct btrfs_free_space_info {
__le32 extent_count;
__le32 flags;
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 06/16] btrfs: add extended version of struct block_group_item
2025-10-24 18:12 ` [PATCH v4 06/16] btrfs: add extended version of struct block_group_item Mark Harmstone
@ 2025-10-31 21:47 ` Boris Burkov
0 siblings, 0 replies; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 21:47 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:07PM +0100, Mark Harmstone wrote:
> Add a struct btrfs_block_group_item_v2, which is used in the block group
> tree if the remap-tree incompat flag is set.
>
> This adds two new fields to the block group item: `remap_bytes` and
> `identity_remap_count`.
>
> `remap_bytes` records the amount of data that's physically within this
> block group, but nominally in another, remapped block group. This is
> necessary because this data will need to be moved first if this block
> group is itself relocated. If `remap_bytes` > 0, this is an indicator to
> the relocation thread that it will need to search the remap-tree for
> backrefs. A block group must also have `remap_bytes` == 0 before it can
> be dropped.
>
> `identity_remap_count` records how many identity remap items are located
> in the remap tree for this block group. When relocation is begun for
> this block group, this is set to the number of holes in the free-space
> tree for this range. As identity remaps are converted into actual remaps
> by the relocation process, this number is decreased. Once it reaches 0,
> either because of relocation or because extents have been deleted, the
> block group has been fully remapped and its chunk's device extents are
> removed.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
> ---
> fs/btrfs/accessors.h | 20 +++++++
> fs/btrfs/block-group.c | 100 ++++++++++++++++++++++++--------
> fs/btrfs/block-group.h | 14 ++++-
> fs/btrfs/discard.c | 2 +-
> fs/btrfs/tree-checker.c | 10 +++-
> include/uapi/linux/btrfs_tree.h | 8 +++
> 6 files changed, 126 insertions(+), 28 deletions(-)
>
> diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
> index 95a1ca8c099b..0dd161ee6863 100644
> --- a/fs/btrfs/accessors.h
> +++ b/fs/btrfs/accessors.h
> @@ -239,6 +239,26 @@ BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64);
> BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags,
> struct btrfs_block_group_item, flags, 64);
>
> +/* struct btrfs_block_group_item_v2 */
> +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2,
> + used, 64);
> +BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid,
> + struct btrfs_block_group_item_v2, chunk_objectid, 64);
> +BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid,
> + struct btrfs_block_group_item_v2, chunk_objectid, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags,
> + struct btrfs_block_group_item_v2, flags, 64);
> +BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes,
> + struct btrfs_block_group_item_v2, remap_bytes, 64);
> +BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2,
> + remap_bytes, 64);
> +BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count,
> + struct btrfs_block_group_item_v2, identity_remap_count, 32);
> +BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2,
> + identity_remap_count, 32);
> +
> /* struct btrfs_free_space_info */
> BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
> extent_count, 32);
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index b5f2ec8d013f..27173aca6fc1 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -2374,7 +2374,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
> }
>
> static int read_one_block_group(struct btrfs_fs_info *info,
> - struct btrfs_block_group_item *bgi,
> + struct btrfs_block_group_item_v2 *bgi,
> const struct btrfs_key *key,
> int need_clear)
> {
> @@ -2389,11 +2389,16 @@ static int read_one_block_group(struct btrfs_fs_info *info,
> return -ENOMEM;
>
> cache->length = key->offset;
> - cache->used = btrfs_stack_block_group_used(bgi);
> + cache->used = btrfs_stack_block_group_v2_used(bgi);
> cache->commit_used = cache->used;
> - cache->flags = btrfs_stack_block_group_flags(bgi);
> - cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
> + cache->flags = btrfs_stack_block_group_v2_flags(bgi);
> + cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi);
> cache->space_info = btrfs_find_space_info(info, cache->flags);
> + cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi);
> + cache->commit_remap_bytes = cache->remap_bytes;
> + cache->identity_remap_count =
> + btrfs_stack_block_group_v2_identity_remap_count(bgi);
> + cache->commit_identity_remap_count = cache->identity_remap_count;
>
> btrfs_set_free_space_tree_thresholds(cache);
>
> @@ -2458,7 +2463,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
> } else if (cache->length == cache->used) {
> cache->cached = BTRFS_CACHE_FINISHED;
> btrfs_free_excluded_extents(cache);
> - } else if (cache->used == 0) {
> + } else if (cache->used == 0 && cache->remap_bytes == 0) {
> cache->cached = BTRFS_CACHE_FINISHED;
> ret = btrfs_add_new_free_space(cache, cache->start,
> cache->start + cache->length, NULL);
> @@ -2478,7 +2483,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
>
> set_avail_alloc_bits(info, cache->flags);
> if (btrfs_chunk_writeable(info, cache->start)) {
> - if (cache->used == 0) {
> + if (cache->used == 0 && cache->remap_bytes == 0) {
> ASSERT(list_empty(&cache->bg_list));
> if (btrfs_test_opt(info, DISCARD_ASYNC))
> btrfs_discard_queue_work(&info->discard_ctl, cache);
> @@ -2582,9 +2587,10 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
> need_clear = 1;
>
> while (1) {
> - struct btrfs_block_group_item bgi;
> + struct btrfs_block_group_item_v2 bgi;
> struct extent_buffer *leaf;
> int slot;
> + size_t size;
>
> ret = find_first_block_group(info, path, &key);
> if (ret > 0)
> @@ -2595,8 +2601,16 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
> leaf = path->nodes[0];
> slot = path->slots[0];
>
> + if (btrfs_fs_incompat(info, REMAP_TREE)) {
> + size = sizeof(struct btrfs_block_group_item_v2);
> + } else {
> + size = sizeof(struct btrfs_block_group_item);
> + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0);
> + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0);
> + }
> +
> read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
> - sizeof(bgi));
> + size);
>
> btrfs_item_key_to_cpu(leaf, &key, slot);
> btrfs_release_path(path);
> @@ -2666,25 +2680,38 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
> struct btrfs_block_group *block_group)
> {
> struct btrfs_fs_info *fs_info = trans->fs_info;
> - struct btrfs_block_group_item bgi;
> + struct btrfs_block_group_item_v2 bgi;
> struct btrfs_root *root = btrfs_block_group_root(fs_info);
> struct btrfs_key key;
> u64 old_commit_used;
> + size_t size;
> int ret;
>
> spin_lock(&block_group->lock);
> - btrfs_set_stack_block_group_used(&bgi, block_group->used);
> - btrfs_set_stack_block_group_chunk_objectid(&bgi,
> - block_group->global_root_id);
> - btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
> + btrfs_set_stack_block_group_v2_used(&bgi, block_group->used);
> + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi,
> + block_group->global_root_id);
> + btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags);
> + btrfs_set_stack_block_group_v2_remap_bytes(&bgi,
> + block_group->remap_bytes);
> + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi,
> + block_group->identity_remap_count);
> old_commit_used = block_group->commit_used;
> block_group->commit_used = block_group->used;
> + block_group->commit_remap_bytes = block_group->remap_bytes;
> + block_group->commit_identity_remap_count =
> + block_group->identity_remap_count;
> key.objectid = block_group->start;
> key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
> key.offset = block_group->length;
> spin_unlock(&block_group->lock);
>
> - ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
> + if (btrfs_fs_incompat(fs_info, REMAP_TREE))
> + size = sizeof(struct btrfs_block_group_item_v2);
> + else
> + size = sizeof(struct btrfs_block_group_item);
> +
> + ret = btrfs_insert_item(trans, root, &key, &bgi, size);
> if (ret < 0) {
> spin_lock(&block_group->lock);
> block_group->commit_used = old_commit_used;
> @@ -3139,10 +3166,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
> struct btrfs_root *root = btrfs_block_group_root(fs_info);
> unsigned long bi;
> struct extent_buffer *leaf;
> - struct btrfs_block_group_item bgi;
> + struct btrfs_block_group_item_v2 bgi;
> struct btrfs_key key;
> - u64 old_commit_used;
> - u64 used;
> + u64 old_commit_used, old_commit_remap_bytes;
> + u32 old_commit_identity_remap_count;
> + u64 used, remap_bytes;
> + u32 identity_remap_count;
>
> /*
> * Block group items update can be triggered out of commit transaction
> @@ -3152,13 +3181,21 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
> */
> spin_lock(&cache->lock);
> old_commit_used = cache->commit_used;
> + old_commit_remap_bytes = cache->commit_remap_bytes;
> + old_commit_identity_remap_count = cache->commit_identity_remap_count;
> used = cache->used;
> - /* No change in used bytes, can safely skip it. */
> - if (cache->commit_used == used) {
> + remap_bytes = cache->remap_bytes;
> + identity_remap_count = cache->identity_remap_count;
> + /* No change in values, can safely skip it. */
> + if (cache->commit_used == used &&
> + cache->commit_remap_bytes == remap_bytes &&
> + cache->commit_identity_remap_count == identity_remap_count) {
> spin_unlock(&cache->lock);
> return 0;
> }
> cache->commit_used = used;
> + cache->commit_remap_bytes = remap_bytes;
> + cache->commit_identity_remap_count = identity_remap_count;
> spin_unlock(&cache->lock);
>
> key.objectid = cache->start;
> @@ -3174,11 +3211,23 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
>
> leaf = path->nodes[0];
> bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
> - btrfs_set_stack_block_group_used(&bgi, used);
> - btrfs_set_stack_block_group_chunk_objectid(&bgi,
> - cache->global_root_id);
> - btrfs_set_stack_block_group_flags(&bgi, cache->flags);
> - write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
> + btrfs_set_stack_block_group_v2_used(&bgi, used);
> + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi,
> + cache->global_root_id);
> + btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags);
> +
> + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
> + btrfs_set_stack_block_group_v2_remap_bytes(&bgi,
> + cache->remap_bytes);
> + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi,
> + cache->identity_remap_count);
> + write_extent_buffer(leaf, &bgi, bi,
> + sizeof(struct btrfs_block_group_item_v2));
> + } else {
> + write_extent_buffer(leaf, &bgi, bi,
> + sizeof(struct btrfs_block_group_item));
> + }
> +
> fail:
> btrfs_release_path(path);
> /*
> @@ -3193,6 +3242,9 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
> if (ret < 0 && ret != -ENOENT) {
> spin_lock(&cache->lock);
> cache->commit_used = old_commit_used;
> + cache->commit_remap_bytes = old_commit_remap_bytes;
> + cache->commit_identity_remap_count =
> + old_commit_identity_remap_count;
> spin_unlock(&cache->lock);
> }
> return ret;
> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
> index 9172104a5889..af23fdb3cf4d 100644
> --- a/fs/btrfs/block-group.h
> +++ b/fs/btrfs/block-group.h
> @@ -129,6 +129,8 @@ struct btrfs_block_group {
> u64 flags;
> u64 cache_generation;
> u64 global_root_id;
> + u64 remap_bytes;
> + u32 identity_remap_count;
>
> /*
> * The last committed used bytes of this block group, if the above @used
> @@ -136,6 +138,15 @@ struct btrfs_block_group {
> * group item of this block group.
> */
> u64 commit_used;
> + /*
> + * The last committed remap_bytes value of this block group.
> + */
> + u64 commit_remap_bytes;
> + /*
> + * The last committed identity_remap_count value of this block group.
> + */
> + u32 commit_identity_remap_count;
> +
> /*
> * If the free space extent count exceeds this number, convert the block
> * group to bitmaps.
> @@ -282,7 +293,8 @@ static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
> {
> lockdep_assert_held(&bg->lock);
>
> - return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
> + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 ||
> + bg->remap_bytes > 0);
> }
>
> static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group)
> diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
> index 89fe85778115..ee5f5b2788e1 100644
> --- a/fs/btrfs/discard.c
> +++ b/fs/btrfs/discard.c
> @@ -373,7 +373,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
> if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
> return;
>
> - if (block_group->used == 0)
> + if (block_group->used == 0 && block_group->remap_bytes == 0)
> add_to_discard_unused_list(discard_ctl, block_group);
> else
> add_to_discard_list(discard_ctl, block_group);
> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
> index b6827c2a7815..08b1bcfc7db7 100644
> --- a/fs/btrfs/tree-checker.c
> +++ b/fs/btrfs/tree-checker.c
> @@ -688,6 +688,7 @@ static int check_block_group_item(struct extent_buffer *leaf,
> u64 chunk_objectid;
> u64 flags;
> u64 type;
> + size_t exp_size;
>
> /*
> * Here we don't really care about alignment since extent allocator can
> @@ -699,10 +700,15 @@ static int check_block_group_item(struct extent_buffer *leaf,
> return -EUCLEAN;
> }
>
> - if (unlikely(item_size != sizeof(bgi))) {
> + if (btrfs_fs_incompat(fs_info, REMAP_TREE))
> + exp_size = sizeof(struct btrfs_block_group_item_v2);
> + else
> + exp_size = sizeof(struct btrfs_block_group_item);
> +
> + if (unlikely(item_size != exp_size)) {
> block_group_err(leaf, slot,
> "invalid item size, have %u expect %zu",
> - item_size, sizeof(bgi));
> + item_size, exp_size);
> return -EUCLEAN;
> }
>
> diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
> index 9a36f0206d90..500e3a7df90b 100644
> --- a/include/uapi/linux/btrfs_tree.h
> +++ b/include/uapi/linux/btrfs_tree.h
> @@ -1229,6 +1229,14 @@ struct btrfs_block_group_item {
> __le64 flags;
> } __attribute__ ((__packed__));
>
> +struct btrfs_block_group_item_v2 {
> + __le64 used;
> + __le64 chunk_objectid;
> + __le64 flags;
> + __le64 remap_bytes;
> + __le32 identity_remap_count;
> +} __attribute__ ((__packed__));
> +
> struct btrfs_free_space_info {
> __le32 extent_count;
> __le32 flags;
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 07/16] btrfs: allow mounting filesystems with remap-tree incompat flag
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (5 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 06/16] btrfs: add extended version of struct block_group_item Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 08/16] btrfs: redirect I/O for remapped block groups Mark Harmstone
` (8 subsequent siblings)
15 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone, Boris Burkov
If we encounter a filesystem with the remap-tree incompat flag set,
validate its compatibility with the other flags, and load the remap tree
using the values that have been added to the superblock.
The remap-tree feature depends on the free space tree, but no-holes and
block-group-tree have been made dependencies to reduce the testing
matrix. Similarly I'm not aware of any reason why mixed-bg and zoned would be
incompatible with remap-tree, but this is blocked for the time being
until it can be fully tested.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
---
fs/btrfs/Kconfig | 2 +
fs/btrfs/accessors.h | 6 ++
fs/btrfs/disk-io.c | 101 ++++++++++++++++++++++++++++----
fs/btrfs/extent-tree.c | 2 +
fs/btrfs/fs.h | 4 +-
fs/btrfs/transaction.c | 7 +++
include/uapi/linux/btrfs_tree.h | 5 +-
7 files changed, 113 insertions(+), 14 deletions(-)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 4438637c8900..77b5a9f27840 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -117,4 +117,6 @@ config BTRFS_EXPERIMENTAL
- large folio support
+ - remap-tree - logical address remapping tree
+
If unsure, say N.
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index 0dd161ee6863..392eaad75e72 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -882,6 +882,12 @@ BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
uuid_tree_generation, 64);
BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block,
nr_global_roots, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root, struct btrfs_super_block,
+ remap_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root_generation, struct btrfs_super_block,
+ remap_root_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_remap_root_level, struct btrfs_super_block,
+ remap_root_level, 8);
/* struct btrfs_file_extent_item */
BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 13d9a9ece3ca..d3ff148311d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1180,6 +1180,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
return btrfs_grab_root(btrfs_global_root(fs_info, &key));
case BTRFS_RAID_STRIPE_TREE_OBJECTID:
return btrfs_grab_root(fs_info->stripe_root);
+ case BTRFS_REMAP_TREE_OBJECTID:
+ return btrfs_grab_root(fs_info->remap_root);
default:
return NULL;
}
@@ -1271,6 +1273,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
btrfs_put_root(fs_info->data_reloc_root);
btrfs_put_root(fs_info->block_group_root);
btrfs_put_root(fs_info->stripe_root);
+ btrfs_put_root(fs_info->remap_root);
btrfs_check_leaked_roots(fs_info);
btrfs_extent_buffer_leak_debug_check(fs_info);
kfree(fs_info->super_copy);
@@ -1825,6 +1828,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
free_root_extent_buffers(info->data_reloc_root);
free_root_extent_buffers(info->block_group_root);
free_root_extent_buffers(info->stripe_root);
+ free_root_extent_buffers(info->remap_root);
if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
}
@@ -2257,20 +2261,45 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
if (ret)
goto out;
- /*
- * This tree can share blocks with some other fs tree during relocation
- * and we need a proper setup by btrfs_get_fs_root
- */
- root = btrfs_get_fs_root(tree_root->fs_info,
- BTRFS_DATA_RELOC_TREE_OBJECTID, true);
- if (IS_ERR(root)) {
- if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
- ret = PTR_ERR(root);
- goto out;
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ /* remap_root already loaded in load_important_roots() */
+ root = fs_info->remap_root;
+
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+
+ root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID;
+ root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+ root->root_key.offset = 0;
+
+ /* Check that data reloc tree doesn't also exist */
+ location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
+ root = btrfs_read_tree_root(fs_info->tree_root, &location);
+ if (!IS_ERR(root)) {
+ btrfs_err(fs_info,
+ "data reloc tree exists when remap-tree enabled");
+ btrfs_put_root(root);
+ return -EIO;
+ } else if (PTR_ERR(root) != -ENOENT) {
+ btrfs_warn(fs_info,
+ "error %ld when checking for data reloc tree",
+ PTR_ERR(root));
}
} else {
- set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
- fs_info->data_reloc_root = root;
+ /*
+ * This tree can share blocks with some other fs tree during
+ * relocation and we need a proper setup by btrfs_get_fs_root
+ */
+ root = btrfs_get_fs_root(tree_root->fs_info,
+ BTRFS_DATA_RELOC_TREE_OBJECTID, true);
+ if (IS_ERR(root)) {
+ if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ } else {
+ set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+ fs_info->data_reloc_root = root;
+ }
}
location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
@@ -2510,6 +2539,31 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
+ /*
+ * Reduce test matrix for remap tree by requiring block-group-tree
+ * and no-holes. Free-space-tree is a hard requirement.
+ */
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE) &&
+ (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
+ !btrfs_fs_incompat(fs_info, NO_HOLES) ||
+ !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))) {
+ btrfs_err(fs_info,
+"remap-tree feature requires free-space-tree, no-holes, and block-group-tree");
+ ret = -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE) &&
+ btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
+ btrfs_err(fs_info, "remap-tree not supported with mixed-bg");
+ ret = -EINVAL;
+ }
+
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE) &&
+ btrfs_fs_incompat(fs_info, ZONED)) {
+ btrfs_err(fs_info, "remap-tree not supported with zoned devices");
+ ret = -EINVAL;
+ }
+
/*
* Hint to catch really bogus numbers, bitflips or so, more exact checks are
* done later
@@ -2668,6 +2722,18 @@ static int load_important_roots(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info, "couldn't read tree root");
return ret;
}
+
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ bytenr = btrfs_super_remap_root(sb);
+ gen = btrfs_super_remap_root_generation(sb);
+ level = btrfs_super_remap_root_level(sb);
+ ret = load_super_root(fs_info->remap_root, bytenr, gen, level);
+ if (ret) {
+ btrfs_warn(fs_info, "couldn't read remap root");
+ return ret;
+ }
+ }
+
return 0;
}
@@ -3285,6 +3351,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
+ struct btrfs_root *remap_root;
int ret;
int level;
@@ -3418,6 +3485,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (ret < 0)
goto fail_alloc;
+ if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) {
+ remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID,
+ GFP_KERNEL);
+ fs_info->remap_root = remap_root;
+ if (!remap_root) {
+ ret = -ENOMEM;
+ goto fail_alloc;
+ }
+ }
+
/*
* At this point our mount options are validated, if we set ->max_inline
* to something non-standard make sure we truncate it to sectorsize.
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 871a63799311..d3ca8105ffc7 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2589,6 +2589,8 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
flags = BTRFS_BLOCK_GROUP_DATA;
else if (root == fs_info->chunk_root)
flags = BTRFS_BLOCK_GROUP_SYSTEM;
+ else if (root == fs_info->remap_root)
+ flags = BTRFS_BLOCK_GROUP_REMAP;
else
flags = BTRFS_BLOCK_GROUP_METADATA;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 40a25f9f617e..62057e8006a9 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -301,7 +301,8 @@ enum {
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \
- BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
+ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \
+ BTRFS_FEATURE_INCOMPAT_REMAP_TREE)
#else
@@ -461,6 +462,7 @@ struct btrfs_fs_info {
struct btrfs_root *data_reloc_root;
struct btrfs_root *block_group_root;
struct btrfs_root *stripe_root;
+ struct btrfs_root *remap_root;
/* The log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 907f2d047b44..de3eeb37408a 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1949,6 +1949,13 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
super->cache_generation = 0;
if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
super->uuid_tree_generation = root_item->generation;
+
+ if (btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ root_item = &fs_info->remap_root->root_item;
+ super->remap_root = root_item->bytenr;
+ super->remap_root_generation = root_item->generation;
+ super->remap_root_level = root_item->level;
+ }
}
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 500e3a7df90b..89bcb80081a6 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -721,9 +721,12 @@ struct btrfs_super_block {
__u8 metadata_uuid[BTRFS_FSID_SIZE];
__u64 nr_global_roots;
+ __le64 remap_root;
+ __le64 remap_root_generation;
+ __u8 remap_root_level;
/* Future expansion */
- __le64 reserved[27];
+ __u8 reserved[199];
__u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* [PATCH v4 08/16] btrfs: redirect I/O for remapped block groups
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (6 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 07/16] btrfs: allow mounting filesystems with remap-tree incompat flag Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-31 22:03 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 09/16] btrfs: handle deletions from remapped block group Mark Harmstone
` (7 subsequent siblings)
15 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
Change btrfs_map_block() so that if the block group has the REMAPPED
flag set, we call btrfs_translate_remap() to obtain a new address.
btrfs_translate_remap() searches the remap tree for a range
corresponding to the logical address passed to btrfs_map_block(). If it
is within an identity remap, this part of the block group hasn't yet
been relocated, and so we use the existing address.
If it is within an actual remap, we subtract the start of the remap
range and add the address of its destination, contained in the item's
payload.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/relocation.c | 54 +++++++++++++++++++++++++++++++++++++++++++
fs/btrfs/relocation.h | 2 ++
fs/btrfs/volumes.c | 19 +++++++++++++++
3 files changed, 75 insertions(+)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 96539e8b7b4b..a8abe24de8d7 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3870,6 +3870,60 @@ static const char *stage_to_string(enum reloc_stage stage)
return "unknown";
}
+int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
+ u64 *length)
+{
+ int ret;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ struct btrfs_remap *remap;
+ BTRFS_PATH_AUTO_FREE(path);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = *logical;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ return ret;
+
+ leaf = path->nodes[0];
+
+ if (path->slots[0] == 0)
+ return -ENOENT;
+
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.type != BTRFS_REMAP_KEY &&
+ found_key.type != BTRFS_IDENTITY_REMAP_KEY) {
+ return -ENOENT;
+ }
+
+ if (found_key.objectid > *logical ||
+ found_key.objectid + found_key.offset <= *logical) {
+ return -ENOENT;
+ }
+
+ if (*logical + *length > found_key.objectid + found_key.offset)
+ *length = found_key.objectid + found_key.offset - *logical;
+
+ if (found_key.type == BTRFS_IDENTITY_REMAP_KEY)
+ return 0;
+
+ remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap);
+
+ *logical += btrfs_remap_address(leaf, remap) - found_key.objectid;
+
+ return 0;
+}
+
/*
* function to relocate all extents in a block group.
*/
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 5c36b3f84b57..b2ba83966650 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -31,5 +31,7 @@ int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info);
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
+int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
+ u64 *length);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 7b2bec28dbd7..d117f74e08c1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -6598,6 +6598,25 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (IS_ERR(map))
return PTR_ERR(map);
+ if (map->type & BTRFS_BLOCK_GROUP_REMAPPED) {
+ u64 new_logical = logical;
+
+ ret = btrfs_translate_remap(fs_info, &new_logical, length);
+ if (ret)
+ return ret;
+
+ if (new_logical != logical) {
+ btrfs_free_chunk_map(map);
+
+ map = btrfs_get_chunk_map(fs_info, new_logical,
+ *length);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ logical = new_logical;
+ }
+ }
+
num_copies = btrfs_chunk_map_num_copies(map);
if (io_geom.mirror_num > num_copies)
return -EINVAL;
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 08/16] btrfs: redirect I/O for remapped block groups
2025-10-24 18:12 ` [PATCH v4 08/16] btrfs: redirect I/O for remapped block groups Mark Harmstone
@ 2025-10-31 22:03 ` Boris Burkov
0 siblings, 0 replies; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 22:03 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:09PM +0100, Mark Harmstone wrote:
> Change btrfs_map_block() so that if the block group has the REMAPPED
> flag set, we call btrfs_translate_remap() to obtain a new address.
>
> btrfs_translate_remap() searches the remap tree for a range
> corresponding to the logical address passed to btrfs_map_block(). If it
> is within an identity remap, this part of the block group hasn't yet
> been relocated, and so we use the existing address.
>
> If it is within an actual remap, we subtract the start of the remap
> range and add the address of its destination, contained in the item's
> payload.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
> ---
> fs/btrfs/relocation.c | 54 +++++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/relocation.h | 2 ++
> fs/btrfs/volumes.c | 19 +++++++++++++++
> 3 files changed, 75 insertions(+)
>
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index 96539e8b7b4b..a8abe24de8d7 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3870,6 +3870,60 @@ static const char *stage_to_string(enum reloc_stage stage)
> return "unknown";
> }
>
> +int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> + u64 *length)
> +{
> + int ret;
> + struct btrfs_key key, found_key;
> + struct extent_buffer *leaf;
> + struct btrfs_remap *remap;
> + BTRFS_PATH_AUTO_FREE(path);
> +
> + path = btrfs_alloc_path();
> + if (!path)
> + return -ENOMEM;
> +
> + key.objectid = *logical;
> + key.type = (u8)-1;
> + key.offset = (u64)-1;
> +
> + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path,
> + 0, 0);
> + if (ret < 0)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + if (path->slots[0] == 0)
> + return -ENOENT;
> +
> + path->slots[0]--;
> +
> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> +
> + if (found_key.type != BTRFS_REMAP_KEY &&
> + found_key.type != BTRFS_IDENTITY_REMAP_KEY) {
> + return -ENOENT;
> + }
> +
> + if (found_key.objectid > *logical ||
> + found_key.objectid + found_key.offset <= *logical) {
> + return -ENOENT;
> + }
> +
> + if (*logical + *length > found_key.objectid + found_key.offset)
> + *length = found_key.objectid + found_key.offset - *logical;
> +
> + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY)
> + return 0;
> +
> + remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap);
> +
> + *logical += btrfs_remap_address(leaf, remap) - found_key.objectid;
> +
> + return 0;
> +}
> +
> /*
> * function to relocate all extents in a block group.
> */
> diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
> index 5c36b3f84b57..b2ba83966650 100644
> --- a/fs/btrfs/relocation.h
> +++ b/fs/btrfs/relocation.h
> @@ -31,5 +31,7 @@ int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info);
> struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
> bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
> u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
> +int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> + u64 *length);
>
> #endif
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 7b2bec28dbd7..d117f74e08c1 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -6598,6 +6598,25 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
> if (IS_ERR(map))
> return PTR_ERR(map);
>
> + if (map->type & BTRFS_BLOCK_GROUP_REMAPPED) {
> + u64 new_logical = logical;
> +
> + ret = btrfs_translate_remap(fs_info, &new_logical, length);
> + if (ret)
> + return ret;
> +
> + if (new_logical != logical) {
> + btrfs_free_chunk_map(map);
> +
> + map = btrfs_get_chunk_map(fs_info, new_logical,
> + *length);
> + if (IS_ERR(map))
> + return PTR_ERR(map);
> +
> + logical = new_logical;
> + }
> + }
> +
> num_copies = btrfs_chunk_map_num_copies(map);
> if (io_geom.mirror_num > num_copies)
> return -EINVAL;
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 09/16] btrfs: handle deletions from remapped block group
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (7 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 08/16] btrfs: redirect I/O for remapped block groups Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-31 23:05 ` Boris Burkov
2025-10-31 23:30 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 10/16] btrfs: handle setting up relocation of block group with remap-tree Mark Harmstone
` (6 subsequent siblings)
15 siblings, 2 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
Handle the case where we free an extent from a block group that has the
REMAPPED flag set. Because the remap tree is orthogonal to the extent
tree, for data this may be within any number of identity remaps or
actual remaps. If we're freeing a metadata node, this will be wholly
inside one or the other.
btrfs_remove_extent_from_remap_tree() searches the remap tree for the
remaps that cover the range in question, then calls
remove_range_from_remap_tree() for each one, to punch a hole in the
remap and adjust the free-space tree.
For an identity remap, remove_range_from_remap_tree() will adjust the
block group's `identity_remap_count` whenever the number of identity
remaps changes. If the count reaches zero we call
last_identity_remap_gone(), which removes the chunk's
stripes and device extents - it is now fully remapped.
The changes involving the block group's ro flag are needed because the
REMAPPED flag itself already prevents a block group from receiving any
new allocations, so we no longer need to account for that separately
via the ro flag.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/block-group.c | 118 +++++++---
fs/btrfs/block-group.h | 4 +
fs/btrfs/disk-io.c | 2 +
fs/btrfs/extent-tree.c | 77 ++++++-
fs/btrfs/extent-tree.h | 1 +
fs/btrfs/fs.h | 4 +-
fs/btrfs/relocation.c | 509 +++++++++++++++++++++++++++++++++++++++++
fs/btrfs/relocation.h | 6 +
fs/btrfs/transaction.c | 4 +
fs/btrfs/volumes.c | 56 +++--
fs/btrfs/volumes.h | 6 +
11 files changed, 728 insertions(+), 59 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 27173aca6fc1..3bf5f20d90ec 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1068,6 +1068,32 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
return ret;
}
+void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group)
+{
+ int factor = btrfs_bg_type_to_factor(block_group->flags);
+
+ spin_lock(&block_group->space_info->lock);
+
+ if (btrfs_test_opt(block_group->fs_info, ENOSPC_DEBUG)) {
+ WARN_ON(block_group->space_info->total_bytes
+ < block_group->length);
+ WARN_ON(block_group->space_info->bytes_readonly
+ < block_group->length - block_group->zone_unusable);
+ WARN_ON(block_group->space_info->bytes_zone_unusable
+ < block_group->zone_unusable);
+ WARN_ON(block_group->space_info->disk_total
+ < block_group->length * factor);
+ }
+ block_group->space_info->total_bytes -= block_group->length;
+ block_group->space_info->bytes_readonly -=
+ (block_group->length - block_group->zone_unusable);
+ btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
+ -block_group->zone_unusable);
+ block_group->space_info->disk_total -= block_group->length * factor;
+
+ spin_unlock(&block_group->space_info->lock);
+}
+
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_chunk_map *map)
{
@@ -1079,7 +1105,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct kobject *kobj = NULL;
int ret;
int index;
- int factor;
struct btrfs_caching_control *caching_ctl = NULL;
bool remove_map;
bool remove_rsv = false;
@@ -1088,7 +1113,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (!block_group)
return -ENOENT;
- BUG_ON(!block_group->ro);
+ BUG_ON(!block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED));
trace_btrfs_remove_block_group(block_group);
/*
@@ -1100,7 +1125,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
block_group->length);
index = btrfs_bg_flags_to_raid_index(block_group->flags);
- factor = btrfs_bg_type_to_factor(block_group->flags);
/* make sure this block group isn't part of an allocation cluster */
cluster = &fs_info->data_alloc_cluster;
@@ -1224,26 +1248,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&block_group->space_info->lock);
list_del_init(&block_group->ro_list);
-
- if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
- WARN_ON(block_group->space_info->total_bytes
- < block_group->length);
- WARN_ON(block_group->space_info->bytes_readonly
- < block_group->length - block_group->zone_unusable);
- WARN_ON(block_group->space_info->bytes_zone_unusable
- < block_group->zone_unusable);
- WARN_ON(block_group->space_info->disk_total
- < block_group->length * factor);
- }
- block_group->space_info->total_bytes -= block_group->length;
- block_group->space_info->bytes_readonly -=
- (block_group->length - block_group->zone_unusable);
- btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
- -block_group->zone_unusable);
- block_group->space_info->disk_total -= block_group->length * factor;
-
spin_unlock(&block_group->space_info->lock);
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))
+ btrfs_remove_bg_from_sinfo(block_group);
+
/*
* Remove the free space for the block group from the free space tree
* and the block group's item from the extent tree before marking the
@@ -1538,6 +1547,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
while (!list_empty(&fs_info->unused_bgs)) {
u64 used;
int trimming;
+ bool made_ro = false;
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group,
@@ -1574,7 +1584,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (btrfs_is_block_group_used(block_group) || block_group->ro ||
+ if (btrfs_is_block_group_used(block_group) ||
+ (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
@@ -1616,9 +1627,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* needing to allocate extents from the block group.
*/
used = btrfs_space_info_used(space_info, true);
- if ((space_info->total_bytes - block_group->length < used &&
+ if (((space_info->total_bytes - block_group->length < used &&
block_group->zone_unusable < block_group->length) ||
- has_unwritten_metadata(block_group)) {
+ has_unwritten_metadata(block_group)) &&
+ !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
/*
* Add a reference for the list, compensate for the ref
* drop under the "next" label for the
@@ -1636,8 +1648,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
- /* We don't want to force the issue, only flip if it's ok. */
- ret = inc_block_group_ro(block_group, 0);
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = inc_block_group_ro(block_group, 0);
+ made_ro = true;
+ } else {
+ ret = 0;
+ }
+
up_write(&space_info->groups_sem);
if (ret < 0) {
ret = 0;
@@ -1646,7 +1664,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
ret = btrfs_zone_finish(block_group);
if (ret < 0) {
- btrfs_dec_block_group_ro(block_group);
+ if (made_ro)
+ btrfs_dec_block_group_ro(block_group);
if (ret == -EAGAIN) {
btrfs_link_bg_list(block_group, &retry_list);
ret = 0;
@@ -1661,7 +1680,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
trans = btrfs_start_trans_remove_block_group(fs_info,
block_group->start);
if (IS_ERR(trans)) {
- btrfs_dec_block_group_ro(block_group);
+ if (made_ro)
+ btrfs_dec_block_group_ro(block_group);
ret = PTR_ERR(trans);
goto next;
}
@@ -1671,7 +1691,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* just delete them, we don't care about them anymore.
*/
if (!clean_pinned_extents(trans, block_group)) {
- btrfs_dec_block_group_ro(block_group);
+ if (made_ro)
+ btrfs_dec_block_group_ro(block_group);
goto end_trans;
}
@@ -1685,7 +1706,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->discard_ctl.lock);
if (!list_empty(&block_group->discard_list)) {
spin_unlock(&fs_info->discard_ctl.lock);
- btrfs_dec_block_group_ro(block_group);
+ if (made_ro)
+ btrfs_dec_block_group_ro(block_group);
btrfs_discard_queue_work(&fs_info->discard_ctl,
block_group);
goto end_trans;
@@ -1779,6 +1801,15 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
struct btrfs_fs_info *fs_info = bg->fs_info;
spin_lock(&fs_info->unused_bgs_lock);
+
+ /* Leave fully remapped block groups on the fully_remapped_bgs list. */
+ if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
+ bg->identity_remap_count == 0 &&
+ !list_empty(&bg->bg_list)) {
+ spin_unlock(&fs_info->unused_bgs_lock);
+ return;
+ }
+
if (list_empty(&bg->bg_list)) {
btrfs_get_block_group(bg);
trace_btrfs_add_unused_block_group(bg);
@@ -4772,3 +4803,30 @@ bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
return false;
return true;
}
+
+void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ bool already_done;
+
+ spin_lock(&bg->lock);
+ already_done = bg->fully_remapped;
+ bg->fully_remapped = true;
+ spin_unlock(&bg->lock);
+
+ if (already_done)
+ return;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+
+ if (!list_empty(&bg->bg_list))
+ list_del(&bg->bg_list);
+ else
+ btrfs_get_block_group(bg);
+
+ list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs);
+
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index af23fdb3cf4d..d85f3c2546d0 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -282,6 +282,7 @@ struct btrfs_block_group {
struct extent_buffer *last_eb;
enum btrfs_block_group_size_class size_class;
u64 reclaim_mark;
+ bool fully_remapped;
};
static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
@@ -336,6 +337,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
+void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
@@ -407,5 +409,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
enum btrfs_block_group_size_class size_class,
bool force_wrong_size_class);
bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
+void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
+ struct btrfs_trans_handle *trans);
#endif /* BTRFS_BLOCK_GROUP_H */
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d3ff148311d8..1a3e525f3d1a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2870,6 +2870,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
INIT_LIST_HEAD(&fs_info->reclaim_bgs);
+ INIT_LIST_HEAD(&fs_info->fully_remapped_bgs);
INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
@@ -2925,6 +2926,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
+ mutex_init(&fs_info->remap_mutex);
mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d3ca8105ffc7..1c14e0c82c03 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -40,6 +40,7 @@
#include "orphan.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"
+#include "relocation.h"
#undef SCRAMBLE_DELAYED_REFS
@@ -2847,6 +2848,52 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
return ret;
}
+int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_block_group *block_group, *tmp;
+ struct list_head *fully_remapped_bgs;
+ int ret;
+
+ fully_remapped_bgs = &fs_info->fully_remapped_bgs;
+ list_for_each_entry_safe(block_group, tmp, fully_remapped_bgs, bg_list) {
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_get_chunk_map(fs_info, block_group->start, 1);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ ret = btrfs_last_identity_remap_gone(trans, map, block_group);
+ if (ret) {
+ btrfs_free_chunk_map(map);
+ return ret;
+ }
+
+ /*
+ * Set num_stripes to 0, so that btrfs_remove_dev_extents()
+ * won't run a second time.
+ */
+ map->num_stripes = 0;
+
+ btrfs_free_chunk_map(map);
+
+ if (block_group->used == 0) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_move_tail(&block_group->bg_list,
+ &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ } else {
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_del_init(&block_group->bg_list);
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ btrfs_put_block_group(block_group);
+ }
+ }
+
+ return 0;
+}
+
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -2999,11 +3046,23 @@ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
}
static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
- u64 bytenr, struct btrfs_squota_delta *delta)
+ u64 bytenr, struct btrfs_squota_delta *delta,
+ struct btrfs_path *path)
{
int ret;
+ bool remapped = false;
u64 num_bytes = delta->num_bytes;
+ /* returns 1 on success and 0 on no-op */
+ ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr,
+ num_bytes);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ } else if (ret == 1) {
+ remapped = true;
+ }
+
if (delta->is_data) {
struct btrfs_root *csum_root;
@@ -3027,10 +3086,16 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
return ret;
}
- ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
- if (unlikely(ret)) {
- btrfs_abort_transaction(trans, ret);
- return ret;
+ /*
+ * If remapped, FST has already been taken care of in
+ * remove_range_from_remap_tree().
+ */
+ if (!remapped) {
+ ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
+ if (unlikely(ret)) {
+ btrfs_abort_transaction(trans, ret);
+ return ret;
+ }
}
ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
@@ -3396,7 +3461,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = do_free_extent_accounting(trans, bytenr, &delta);
+ ret = do_free_extent_accounting(trans, bytenr, &delta, path);
}
btrfs_release_path(path);
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index e970ac42a871..6b67a4e528da 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -164,5 +164,6 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
+int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans);
#endif
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 62057e8006a9..c3dacbfe118c 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -573,6 +573,7 @@ struct btrfs_fs_info {
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
+ struct mutex remap_mutex;
/*
* This is taken to make sure we don't set block groups ro after the
@@ -827,10 +828,11 @@ struct btrfs_fs_info {
struct list_head reclaim_bgs;
int bg_reclaim_threshold;
- /* Protects the lists unused_bgs and reclaim_bgs. */
+ /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */
spinlock_t unused_bgs_lock;
/* Protected by unused_bgs_lock. */
struct list_head unused_bgs;
+ struct list_head fully_remapped_bgs;
struct mutex unused_bg_unpin_mutex;
/* Protect block groups that are going to be deleted */
struct mutex reclaim_bgs_lock;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index a8abe24de8d7..9f3ce3395d6a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -37,6 +37,7 @@
#include "super.h"
#include "tree-checker.h"
#include "raid-stripe-tree.h"
+#include "free-space-tree.h"
/*
* Relocation overview
@@ -3870,6 +3871,151 @@ static const char *stage_to_string(enum reloc_stage stage)
return "unknown";
}
+static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg,
+ s64 diff)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ bool bg_already_dirty = true, mark_unused = false;
+
+ spin_lock(&bg->lock);
+
+ bg->remap_bytes += diff;
+
+ if (bg->used == 0 && bg->remap_bytes == 0)
+ mark_unused = true;
+
+ spin_unlock(&bg->lock);
+
+ if (mark_unused)
+ btrfs_mark_bg_unused(bg);
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&bg->dirty_list)) {
+ list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(bg);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
+}
+
+static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
+ struct btrfs_chunk_map *chunk,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_chunk *c;
+ int ret;
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = chunk->start;
+
+ ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path,
+ 0, 1);
+ if (ret) {
+ if (ret == 1) {
+ btrfs_release_path(path);
+ ret = -ENOENT;
+ }
+ return ret;
+ }
+
+ leaf = path->nodes[0];
+
+ c = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
+ btrfs_set_chunk_num_stripes(leaf, c, 0);
+ btrfs_set_chunk_sub_stripes(leaf, c, 0);
+
+ btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe),
+ 1);
+
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
+ struct btrfs_chunk_map *chunk,
+ struct btrfs_block_group *bg)
+{
+ int ret;
+ BTRFS_PATH_AUTO_FREE(path);
+
+ ret = btrfs_remove_dev_extents(trans, chunk);
+ if (ret)
+ return ret;
+
+ mutex_lock(&trans->fs_info->chunk_mutex);
+
+ for (unsigned int i = 0; i < chunk->num_stripes; i++) {
+ ret = btrfs_update_device(trans, chunk->stripes[i].dev);
+ if (ret) {
+ mutex_unlock(&trans->fs_info->chunk_mutex);
+ return ret;
+ }
+ }
+
+ mutex_unlock(&trans->fs_info->chunk_mutex);
+
+ write_lock(&trans->fs_info->mapping_tree_lock);
+ btrfs_chunk_map_device_clear_bits(chunk, CHUNK_ALLOCATED);
+ write_unlock(&trans->fs_info->mapping_tree_lock);
+
+ btrfs_remove_bg_from_sinfo(bg);
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ ret = remove_chunk_stripes(trans, chunk, path);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group *bg, int delta)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ bool bg_already_dirty = true, mark_fully_remapped = false;
+
+ WARN_ON(delta < 0 && -delta > bg->identity_remap_count);
+
+ spin_lock(&bg->lock);
+
+ bg->identity_remap_count += delta;
+
+ if (!bg->fully_remapped && bg->identity_remap_count == 0)
+ mark_fully_remapped = true;
+
+ spin_unlock(&bg->lock);
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&bg->dirty_list)) {
+ list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(bg);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
+
+ if (mark_fully_remapped)
+ btrfs_mark_bg_fully_remapped(bg, trans);
+}
+
int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
u64 *length)
{
@@ -4478,3 +4624,366 @@ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
logical = fs_info->reloc_ctl->block_group->start;
return logical;
}
+
+static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *bg,
+ u64 bytenr, u64 num_bytes)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_key key, new_key;
+ struct btrfs_remap *remap_ptr = NULL, remap;
+ struct btrfs_block_group *dest_bg = NULL;
+ u64 end, new_addr = 0, remap_start, remap_length, overlap_length;
+ bool is_identity_remap;
+
+ end = bytenr + num_bytes;
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ is_identity_remap = key.type == BTRFS_IDENTITY_REMAP_KEY;
+
+ remap_start = key.objectid;
+ remap_length = key.offset;
+
+ if (!is_identity_remap) {
+ remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_remap);
+ new_addr = btrfs_remap_address(leaf, remap_ptr);
+
+ dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
+ }
+
+ if (bytenr == remap_start && num_bytes >= remap_length) {
+ /* Remove entirely. */
+
+ ret = btrfs_del_item(trans, fs_info->remap_root, path);
+ if (ret)
+ goto end;
+
+ btrfs_release_path(path);
+
+ overlap_length = remap_length;
+
+ if (!is_identity_remap) {
+ /* Remove backref. */
+
+ key.objectid = new_addr;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = remap_length;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root,
+ &key, path, -1, 1);
+ if (ret) {
+ if (ret == 1) {
+ btrfs_release_path(path);
+ ret = -ENOENT;
+ }
+ goto end;
+ }
+
+ ret = btrfs_del_item(trans, fs_info->remap_root, path);
+
+ btrfs_release_path(path);
+
+ if (ret)
+ goto end;
+
+ adjust_block_group_remap_bytes(trans, dest_bg,
+ -remap_length);
+ } else {
+ adjust_identity_remap_count(trans, bg, -1);
+ }
+ } else if (bytenr == remap_start) {
+ /* Remove beginning. */
+
+ new_key.objectid = end;
+ new_key.type = key.type;
+ new_key.offset = remap_length + remap_start - end;
+
+ btrfs_set_item_key_safe(trans, path, &new_key);
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ overlap_length = num_bytes;
+
+ if (!is_identity_remap) {
+ btrfs_set_remap_address(leaf, remap_ptr,
+ new_addr + end - remap_start);
+ btrfs_release_path(path);
+
+ /* Adjust backref. */
+
+ key.objectid = new_addr;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = remap_length;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root,
+ &key, path, -1, 1);
+ if (ret) {
+ if (ret == 1) {
+ btrfs_release_path(path);
+ ret = -ENOENT;
+ }
+ goto end;
+ }
+
+ leaf = path->nodes[0];
+
+ new_key.objectid = new_addr + end - remap_start;
+ new_key.type = BTRFS_REMAP_BACKREF_KEY;
+ new_key.offset = remap_length + remap_start - end;
+
+ btrfs_set_item_key_safe(trans, path, &new_key);
+
+ remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_remap);
+ btrfs_set_remap_address(leaf, remap_ptr, end);
+
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
+
+ btrfs_release_path(path);
+
+ adjust_block_group_remap_bytes(trans, dest_bg,
+ -num_bytes);
+ }
+ } else if (bytenr + num_bytes < remap_start + remap_length) {
+ /* Remove middle. */
+
+ new_key.objectid = remap_start;
+ new_key.type = key.type;
+ new_key.offset = bytenr - remap_start;
+
+ btrfs_set_item_key_safe(trans, path, &new_key);
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ new_key.objectid = end;
+ new_key.offset = remap_start + remap_length - end;
+
+ btrfs_release_path(path);
+
+ overlap_length = num_bytes;
+
+ if (!is_identity_remap) {
+ /* Add second remap entry. */
+
+ ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
+ path, &new_key,
+ sizeof(struct btrfs_remap));
+ if (ret)
+ goto end;
+
+ btrfs_set_stack_remap_address(&remap,
+ new_addr + end - remap_start);
+
+ write_extent_buffer(path->nodes[0], &remap,
+ btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
+ sizeof(struct btrfs_remap));
+
+ btrfs_release_path(path);
+
+ /* Shorten backref entry. */
+
+ key.objectid = new_addr;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = remap_length;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root,
+ &key, path, -1, 1);
+ if (ret) {
+ if (ret == 1) {
+ btrfs_release_path(path);
+ ret = -ENOENT;
+ }
+ goto end;
+ }
+
+ new_key.objectid = new_addr;
+ new_key.type = BTRFS_REMAP_BACKREF_KEY;
+ new_key.offset = bytenr - remap_start;
+
+ btrfs_set_item_key_safe(trans, path, &new_key);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
+
+ btrfs_release_path(path);
+
+ /* Add second backref entry. */
+
+ new_key.objectid = new_addr + end - remap_start;
+ new_key.type = BTRFS_REMAP_BACKREF_KEY;
+ new_key.offset = remap_start + remap_length - end;
+
+ ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
+ path, &new_key,
+ sizeof(struct btrfs_remap));
+ if (ret)
+ goto end;
+
+ btrfs_set_stack_remap_address(&remap, end);
+
+ write_extent_buffer(path->nodes[0], &remap,
+ btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
+ sizeof(struct btrfs_remap));
+
+ btrfs_release_path(path);
+
+ adjust_block_group_remap_bytes(trans, dest_bg,
+ -num_bytes);
+ } else {
+ /* Add second identity remap entry. */
+
+ ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
+ path, &new_key, 0);
+ if (ret)
+ goto end;
+
+ btrfs_release_path(path);
+
+ adjust_identity_remap_count(trans, bg, 1);
+ }
+ } else {
+ /* Remove end. */
+
+ new_key.objectid = remap_start;
+ new_key.type = key.type;
+ new_key.offset = bytenr - remap_start;
+
+ btrfs_set_item_key_safe(trans, path, &new_key);
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ btrfs_release_path(path);
+
+ overlap_length = remap_start + remap_length - bytenr;
+
+ if (!is_identity_remap) {
+ /* Shorten backref entry. */
+
+ key.objectid = new_addr;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = remap_length;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root,
+ &key, path, -1, 1);
+ if (ret) {
+ if (ret == 1) {
+ btrfs_release_path(path);
+ ret = -ENOENT;
+ }
+ goto end;
+ }
+
+ new_key.objectid = new_addr;
+ new_key.type = BTRFS_REMAP_BACKREF_KEY;
+ new_key.offset = bytenr - remap_start;
+
+ btrfs_set_item_key_safe(trans, path, &new_key);
+ btrfs_mark_buffer_dirty(trans, path->nodes[0]);
+
+ btrfs_release_path(path);
+
+ adjust_block_group_remap_bytes(trans, dest_bg,
+ bytenr - remap_start - remap_length);
+ }
+ }
+
+ if (!is_identity_remap) {
+ ret = btrfs_add_to_free_space_tree(trans,
+ bytenr - remap_start + new_addr,
+ overlap_length);
+ if (ret)
+ goto end;
+ }
+
+ ret = overlap_length;
+
+end:
+ if (dest_bg)
+ btrfs_put_block_group(dest_bg);
+
+ return ret;
+}
+
+/*
+ * Returns 1 if remove_range_from_remap_tree() has been called successfully,
+ * 0 if block group wasn't remapped, and a negative number on error.
+ */
+int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ struct btrfs_block_group *bg;
+ int ret, length;
+
+ if (!(btrfs_super_incompat_flags(fs_info->super_copy) &
+ BTRFS_FEATURE_INCOMPAT_REMAP_TREE))
+ return 0;
+
+ bg = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!bg)
+ return 0;
+
+ mutex_lock(&fs_info->remap_mutex);
+
+ if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
+ mutex_unlock(&fs_info->remap_mutex);
+ btrfs_put_block_group(bg);
+ return 0;
+ }
+
+ do {
+ key.objectid = bytenr;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path,
+ -1, 1);
+ if (ret < 0)
+ goto end;
+
+ leaf = path->nodes[0];
+
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.type != BTRFS_IDENTITY_REMAP_KEY &&
+ found_key.type != BTRFS_REMAP_KEY) {
+ ret = -ENOENT;
+ goto end;
+ }
+
+ if (bytenr < found_key.objectid ||
+ bytenr >= found_key.objectid + found_key.offset) {
+ ret = -ENOENT;
+ goto end;
+ }
+
+ length = remove_range_from_remap_tree(trans, path, bg, bytenr,
+ num_bytes);
+ if (length < 0) {
+ ret = length;
+ goto end;
+ }
+
+ bytenr += length;
+ num_bytes -= length;
+ } while (num_bytes > 0);
+
+ ret = 1;
+
+end:
+ mutex_unlock(&fs_info->remap_mutex);
+
+ btrfs_put_block_group(bg);
+ btrfs_release_path(path);
+ return ret;
+}
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index b2ba83966650..7cfe91971cab 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -33,5 +33,11 @@ bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
u64 *length);
+int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes);
+int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
+ struct btrfs_chunk_map *chunk,
+ struct btrfs_block_group *bg);
#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index de3eeb37408a..ffee6c285182 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2437,6 +2437,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
+ ret = btrfs_handle_fully_remapped_bgs(trans);
+ if (ret)
+ goto unlock_reloc;
+
/*
* make sure none of the code above managed to slip in a
* delayed item
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index d117f74e08c1..99ad95e1c300 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2929,8 +2929,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
return ret;
}
-static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
- struct btrfs_device *device)
+int btrfs_update_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device)
{
int ret;
BTRFS_PATH_AUTO_FREE(path);
@@ -3228,25 +3228,13 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
return btrfs_free_chunk(trans, chunk_offset);
}
-int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
+int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_chunk_map *map;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
u64 dev_extent_len = 0;
int i, ret = 0;
- struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
-
- map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(map)) {
- /*
- * This is a logic error, but we don't want to just rely on the
- * user having built with ASSERT enabled, so if ASSERT doesn't
- * do anything we still error out.
- */
- DEBUG_WARN("errr %ld reading chunk map at offset %llu",
- PTR_ERR(map), chunk_offset);
- return PTR_ERR(map);
- }
/*
* First delete the device extent items from the devices btree.
@@ -3267,7 +3255,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
if (unlikely(ret)) {
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_abort_transaction(trans, ret);
- goto out;
+ return ret;
}
if (device->bytes_used > 0) {
@@ -3287,6 +3275,30 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
}
mutex_unlock(&fs_devices->device_list_mutex);
+ return 0;
+}
+
+int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_chunk_map *map;
+ int ret;
+
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map)) {
+ /*
+ * This is a logic error, but we don't want to just rely on the
+ * user having built with ASSERT enabled, so if ASSERT doesn't
+ * do anything we still error out.
+ */
+ ASSERT(0);
+ return PTR_ERR(map);
+ }
+
+ ret = btrfs_remove_dev_extents(trans, map);
+ if (ret)
+ goto out;
+
/*
* We acquire fs_info->chunk_mutex for 2 reasons:
*
@@ -5422,7 +5434,7 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
}
}
-static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
{
for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_io_stripe *stripe = &map->stripes[i];
@@ -5439,7 +5451,7 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma
write_lock(&fs_info->mapping_tree_lock);
rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
RB_CLEAR_NODE(&map->rb_node);
- chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
write_unlock(&fs_info->mapping_tree_lock);
/* Once for the tree reference. */
@@ -5475,7 +5487,7 @@ int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *m
return -EEXIST;
}
chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
- chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+ btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
write_unlock(&fs_info->mapping_tree_lock);
return 0;
@@ -5840,7 +5852,7 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
map = rb_entry(node, struct btrfs_chunk_map, rb_node);
rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
RB_CLEAR_NODE(&map->rb_node);
- chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
/* Once for the tree ref. */
btrfs_free_chunk_map(map);
cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7cf76bffcab6..0c64cae59f1c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -794,6 +794,8 @@ u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
+int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
+ struct btrfs_chunk_map *map);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -905,6 +907,10 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
+int btrfs_update_device(struct btrfs_trans_handle *trans,
+ struct btrfs_device *device);
+void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map,
+ unsigned int bits);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 09/16] btrfs: handle deletions from remapped block group
2025-10-24 18:12 ` [PATCH v4 09/16] btrfs: handle deletions from remapped block group Mark Harmstone
@ 2025-10-31 23:05 ` Boris Burkov
2025-11-03 15:51 ` Mark Harmstone
2025-10-31 23:30 ` Boris Burkov
1 sibling, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 23:05 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:10PM +0100, Mark Harmstone wrote:
> Handle the case where we free an extent from a block group that has the
> REMAPPED flag set. Because the remap tree is orthogonal to the extent
> tree, for data this may be within any number of identity remaps or
> actual remaps. If we're freeing a metadata node, this will be wholly
> inside one or the other.
>
> btrfs_remove_extent_from_remap_tree() searches the remap tree for the
> remaps that cover the range in question, then calls
> remove_range_from_remap_tree() for each one, to punch a hole in the
> remap and adjust the free-space tree.
>
> For an identity remap, remove_range_from_remap_tree() will adjust the
> block group's `identity_remap_count` if this changes. If it reaches
> zero we call last_identity_remap_gone(), which removes the chunk's
> stripes and device extents - it is now fully remapped.
>
> The changes which involve the block group's ro flag are because the
> REMAPPED flag itself prevents a block group from having any new
> allocations within it, and so we don't need to account for this
> separately.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
> fs/btrfs/block-group.c | 118 +++++++---
> fs/btrfs/block-group.h | 4 +
> fs/btrfs/disk-io.c | 2 +
> fs/btrfs/extent-tree.c | 77 ++++++-
> fs/btrfs/extent-tree.h | 1 +
> fs/btrfs/fs.h | 4 +-
> fs/btrfs/relocation.c | 509 +++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/relocation.h | 6 +
> fs/btrfs/transaction.c | 4 +
> fs/btrfs/volumes.c | 56 +++--
> fs/btrfs/volumes.h | 6 +
> 11 files changed, 728 insertions(+), 59 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 27173aca6fc1..3bf5f20d90ec 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1068,6 +1068,32 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group)
> +{
> + int factor = btrfs_bg_type_to_factor(block_group->flags);
> +
> + spin_lock(&block_group->space_info->lock);
> +
> + if (btrfs_test_opt(block_group->fs_info, ENOSPC_DEBUG)) {
> + WARN_ON(block_group->space_info->total_bytes
> + < block_group->length);
> + WARN_ON(block_group->space_info->bytes_readonly
> + < block_group->length - block_group->zone_unusable);
> + WARN_ON(block_group->space_info->bytes_zone_unusable
> + < block_group->zone_unusable);
> + WARN_ON(block_group->space_info->disk_total
> + < block_group->length * factor);
> + }
> + block_group->space_info->total_bytes -= block_group->length;
> + block_group->space_info->bytes_readonly -=
> + (block_group->length - block_group->zone_unusable);
> + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
> + -block_group->zone_unusable);
> + block_group->space_info->disk_total -= block_group->length * factor;
> +
> + spin_unlock(&block_group->space_info->lock);
> +}
> +
> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> struct btrfs_chunk_map *map)
> {
> @@ -1079,7 +1105,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> struct kobject *kobj = NULL;
> int ret;
> int index;
> - int factor;
> struct btrfs_caching_control *caching_ctl = NULL;
> bool remove_map;
> bool remove_rsv = false;
> @@ -1088,7 +1113,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> if (!block_group)
> return -ENOENT;
>
> - BUG_ON(!block_group->ro);
> + BUG_ON(!block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED));
>
> trace_btrfs_remove_block_group(block_group);
> /*
> @@ -1100,7 +1125,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> block_group->length);
>
> index = btrfs_bg_flags_to_raid_index(block_group->flags);
> - factor = btrfs_bg_type_to_factor(block_group->flags);
>
> /* make sure this block group isn't part of an allocation cluster */
> cluster = &fs_info->data_alloc_cluster;
> @@ -1224,26 +1248,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>
> spin_lock(&block_group->space_info->lock);
> list_del_init(&block_group->ro_list);
> -
> - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
> - WARN_ON(block_group->space_info->total_bytes
> - < block_group->length);
> - WARN_ON(block_group->space_info->bytes_readonly
> - < block_group->length - block_group->zone_unusable);
> - WARN_ON(block_group->space_info->bytes_zone_unusable
> - < block_group->zone_unusable);
> - WARN_ON(block_group->space_info->disk_total
> - < block_group->length * factor);
> - }
> - block_group->space_info->total_bytes -= block_group->length;
> - block_group->space_info->bytes_readonly -=
> - (block_group->length - block_group->zone_unusable);
> - btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
> - -block_group->zone_unusable);
> - block_group->space_info->disk_total -= block_group->length * factor;
> -
> spin_unlock(&block_group->space_info->lock);
>
> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))
> + btrfs_remove_bg_from_sinfo(block_group);
> +
> /*
> * Remove the free space for the block group from the free space tree
> * and the block group's item from the extent tree before marking the
> @@ -1538,6 +1547,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> while (!list_empty(&fs_info->unused_bgs)) {
> u64 used;
> int trimming;
> + bool made_ro = false;
>
> block_group = list_first_entry(&fs_info->unused_bgs,
> struct btrfs_block_group,
> @@ -1574,7 +1584,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>
> spin_lock(&space_info->lock);
> spin_lock(&block_group->lock);
> - if (btrfs_is_block_group_used(block_group) || block_group->ro ||
> + if (btrfs_is_block_group_used(block_group) ||
> + (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) ||
> list_is_singular(&block_group->list)) {
> /*
> * We want to bail if we made new allocations or have
> @@ -1616,9 +1627,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> * needing to allocate extents from the block group.
> */
> used = btrfs_space_info_used(space_info, true);
> - if ((space_info->total_bytes - block_group->length < used &&
> + if (((space_info->total_bytes - block_group->length < used &&
> block_group->zone_unusable < block_group->length) ||
> - has_unwritten_metadata(block_group)) {
> + has_unwritten_metadata(block_group)) &&
> + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
> /*
> * Add a reference for the list, compensate for the ref
> * drop under the "next" label for the
> @@ -1636,8 +1648,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> spin_unlock(&block_group->lock);
> spin_unlock(&space_info->lock);
>
> - /* We don't want to force the issue, only flip if it's ok. */
> - ret = inc_block_group_ro(block_group, 0);
If we are deleting an unused bg, what is the harm in marking it ro even
if it is remapped and it's redundant for new allocations?
> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
> + /* We don't want to force the issue, only flip if it's ok. */
> + ret = inc_block_group_ro(block_group, 0);
> + made_ro = true;
> + } else {
> + ret = 0;
> + }
> +
> up_write(&space_info->groups_sem);
> if (ret < 0) {
> ret = 0;
> @@ -1646,7 +1664,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>
> ret = btrfs_zone_finish(block_group);
> if (ret < 0) {
> - btrfs_dec_block_group_ro(block_group);
> + if (made_ro)
> + btrfs_dec_block_group_ro(block_group);
> if (ret == -EAGAIN) {
> btrfs_link_bg_list(block_group, &retry_list);
> ret = 0;
> @@ -1661,7 +1680,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> trans = btrfs_start_trans_remove_block_group(fs_info,
> block_group->start);
> if (IS_ERR(trans)) {
> - btrfs_dec_block_group_ro(block_group);
> + if (made_ro)
> + btrfs_dec_block_group_ro(block_group);
> ret = PTR_ERR(trans);
> goto next;
> }
> @@ -1671,7 +1691,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> * just delete them, we don't care about them anymore.
> */
> if (!clean_pinned_extents(trans, block_group)) {
> - btrfs_dec_block_group_ro(block_group);
> + if (made_ro)
> + btrfs_dec_block_group_ro(block_group);
> goto end_trans;
> }
>
> @@ -1685,7 +1706,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> spin_lock(&fs_info->discard_ctl.lock);
> if (!list_empty(&block_group->discard_list)) {
> spin_unlock(&fs_info->discard_ctl.lock);
> - btrfs_dec_block_group_ro(block_group);
> + if (made_ro)
> + btrfs_dec_block_group_ro(block_group);
> btrfs_discard_queue_work(&fs_info->discard_ctl,
> block_group);
> goto end_trans;
> @@ -1779,6 +1801,15 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
> struct btrfs_fs_info *fs_info = bg->fs_info;
>
> spin_lock(&fs_info->unused_bgs_lock);
> +
> + /* Leave fully remapped block groups on the fully_remapped_bgs list. */
> + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
> + bg->identity_remap_count == 0 &&
> + !list_empty(&bg->bg_list)) {
> + spin_unlock(&fs_info->unused_bgs_lock);
> + return;
> + }
> +
> if (list_empty(&bg->bg_list)) {
> btrfs_get_block_group(bg);
> trace_btrfs_add_unused_block_group(bg);
> @@ -4772,3 +4803,30 @@ bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
> return false;
> return true;
> }
> +
> +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
> + struct btrfs_trans_handle *trans)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + bool already_done;
> +
> + spin_lock(&bg->lock);
> + already_done = bg->fully_remapped;
> + bg->fully_remapped = true;
> + spin_unlock(&bg->lock);
> +
> + if (already_done)
> + return;
> +
> + spin_lock(&fs_info->unused_bgs_lock);
> +
> + if (!list_empty(&bg->bg_list))
> + list_del(&bg->bg_list);
> + else
> + btrfs_get_block_group(bg);
> +
> + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs);
> +
> + spin_unlock(&fs_info->unused_bgs_lock);
> +
> +}
> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
> index af23fdb3cf4d..d85f3c2546d0 100644
> --- a/fs/btrfs/block-group.h
> +++ b/fs/btrfs/block-group.h
> @@ -282,6 +282,7 @@ struct btrfs_block_group {
> struct extent_buffer *last_eb;
> enum btrfs_block_group_size_class size_class;
> u64 reclaim_mark;
> + bool fully_remapped;
> };
>
> static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
> @@ -336,6 +337,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
> struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
> struct btrfs_fs_info *fs_info,
> const u64 chunk_offset);
> +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group);
> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> struct btrfs_chunk_map *map);
> void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
> @@ -407,5 +409,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
> enum btrfs_block_group_size_class size_class,
> bool force_wrong_size_class);
> bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
> +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
> + struct btrfs_trans_handle *trans);
>
> #endif /* BTRFS_BLOCK_GROUP_H */
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index d3ff148311d8..1a3e525f3d1a 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2870,6 +2870,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
> INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
> INIT_LIST_HEAD(&fs_info->unused_bgs);
> INIT_LIST_HEAD(&fs_info->reclaim_bgs);
> + INIT_LIST_HEAD(&fs_info->fully_remapped_bgs);
> INIT_LIST_HEAD(&fs_info->zone_active_bgs);
> #ifdef CONFIG_BTRFS_DEBUG
> INIT_LIST_HEAD(&fs_info->allocated_roots);
> @@ -2925,6 +2926,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
> mutex_init(&fs_info->chunk_mutex);
> mutex_init(&fs_info->transaction_kthread_mutex);
> mutex_init(&fs_info->cleaner_mutex);
> + mutex_init(&fs_info->remap_mutex);
> mutex_init(&fs_info->ro_block_group_mutex);
> init_rwsem(&fs_info->commit_root_sem);
> init_rwsem(&fs_info->cleanup_work_sem);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index d3ca8105ffc7..1c14e0c82c03 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -40,6 +40,7 @@
> #include "orphan.h"
> #include "tree-checker.h"
> #include "raid-stripe-tree.h"
> +#include "relocation.h"
>
> #undef SCRAMBLE_DELAYED_REFS
>
> @@ -2847,6 +2848,52 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
> return ret;
> }
>
> +int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_block_group *block_group, *tmp;
> + struct list_head *fully_remapped_bgs;
> + int ret;
> +
> + fully_remapped_bgs = &fs_info->fully_remapped_bgs;
> + list_for_each_entry_safe(block_group, tmp, fully_remapped_bgs, bg_list) {
> + struct btrfs_chunk_map *map;
> +
> + map = btrfs_get_chunk_map(fs_info, block_group->start, 1);
> + if (IS_ERR(map))
> + return PTR_ERR(map);
> +
> + ret = btrfs_last_identity_remap_gone(trans, map, block_group);
> + if (ret) {
> + btrfs_free_chunk_map(map);
> + return ret;
> + }
> +
> + /*
> + * Set num_stripes to 0, so that btrfs_remove_dev_extents()
> + * won't run a second time.
> + */
> + map->num_stripes = 0;
> +
> + btrfs_free_chunk_map(map);
> +
> + if (block_group->used == 0) {
> + spin_lock(&fs_info->unused_bgs_lock);
> + list_move_tail(&block_group->bg_list,
> + &fs_info->unused_bgs);
> + spin_unlock(&fs_info->unused_bgs_lock);
> + } else {
> + spin_lock(&fs_info->unused_bgs_lock);
> + list_del_init(&block_group->bg_list);
> + spin_unlock(&fs_info->unused_bgs_lock);
> +
> + btrfs_put_block_group(block_group);
> + }
> + }
> +
> + return 0;
> +}
> +
> int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
> {
> struct btrfs_fs_info *fs_info = trans->fs_info;
> @@ -2999,11 +3046,23 @@ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
> }
>
> static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
> - u64 bytenr, struct btrfs_squota_delta *delta)
> + u64 bytenr, struct btrfs_squota_delta *delta,
> + struct btrfs_path *path)
> {
> int ret;
> + bool remapped = false;
> u64 num_bytes = delta->num_bytes;
>
> + /* returns 1 on success and 0 on no-op */
> + ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr,
> + num_bytes);
> + if (ret < 0) {
> + btrfs_abort_transaction(trans, ret);
> + return ret;
> + } else if (ret == 1) {
> + remapped = true;
> + }
> +
> if (delta->is_data) {
> struct btrfs_root *csum_root;
>
> @@ -3027,10 +3086,16 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> - ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
> - if (unlikely(ret)) {
> - btrfs_abort_transaction(trans, ret);
> - return ret;
> + /*
> + * If remapped, FST has already been taken care of in
> + * remove_range_from_remap_tree().
> + */
> + if (!remapped) {
> + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
> + if (unlikely(ret)) {
> + btrfs_abort_transaction(trans, ret);
> + return ret;
> + }
> }
>
> ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
> @@ -3396,7 +3461,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
> }
> btrfs_release_path(path);
>
> - ret = do_free_extent_accounting(trans, bytenr, &delta);
> + ret = do_free_extent_accounting(trans, bytenr, &delta, path);
> }
> btrfs_release_path(path);
>
> diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
> index e970ac42a871..6b67a4e528da 100644
> --- a/fs/btrfs/extent-tree.h
> +++ b/fs/btrfs/extent-tree.h
> @@ -164,5 +164,6 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
> int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
> u64 num_bytes, u64 *actual_bytes);
> int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
> +int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans);
>
> #endif
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index 62057e8006a9..c3dacbfe118c 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -573,6 +573,7 @@ struct btrfs_fs_info {
> struct mutex transaction_kthread_mutex;
> struct mutex cleaner_mutex;
> struct mutex chunk_mutex;
> + struct mutex remap_mutex;
>
> /*
> * This is taken to make sure we don't set block groups ro after the
> @@ -827,10 +828,11 @@ struct btrfs_fs_info {
> struct list_head reclaim_bgs;
> int bg_reclaim_threshold;
>
> - /* Protects the lists unused_bgs and reclaim_bgs. */
> + /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */
> spinlock_t unused_bgs_lock;
> /* Protected by unused_bgs_lock. */
> struct list_head unused_bgs;
> + struct list_head fully_remapped_bgs;
> struct mutex unused_bg_unpin_mutex;
> /* Protect block groups that are going to be deleted */
> struct mutex reclaim_bgs_lock;
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index a8abe24de8d7..9f3ce3395d6a 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -37,6 +37,7 @@
> #include "super.h"
> #include "tree-checker.h"
> #include "raid-stripe-tree.h"
> +#include "free-space-tree.h"
>
> /*
> * Relocation overview
> @@ -3870,6 +3871,151 @@ static const char *stage_to_string(enum reloc_stage stage)
> return "unknown";
> }
>
> +static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
> + struct btrfs_block_group *bg,
> + s64 diff)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + bool bg_already_dirty = true, mark_unused = false;
> +
> + spin_lock(&bg->lock);
> +
> + bg->remap_bytes += diff;
> +
> + if (bg->used == 0 && bg->remap_bytes == 0)
> + mark_unused = true;
> +
> + spin_unlock(&bg->lock);
> +
> + if (mark_unused)
> + btrfs_mark_bg_unused(bg);
> +
> + spin_lock(&trans->transaction->dirty_bgs_lock);
> + if (list_empty(&bg->dirty_list)) {
> + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
> + bg_already_dirty = false;
> + btrfs_get_block_group(bg);
> + }
> + spin_unlock(&trans->transaction->dirty_bgs_lock);
> +
> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
> + if (!bg_already_dirty)
> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> +}
> +
> +static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *chunk,
> + struct btrfs_path *path)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + struct btrfs_chunk *c;
> + int ret;
> +
> + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> + key.type = BTRFS_CHUNK_ITEM_KEY;
> + key.offset = chunk->start;
> +
> + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path,
> + 0, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + return ret;
> + }
> +
> + leaf = path->nodes[0];
> +
> + c = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
> + btrfs_set_chunk_num_stripes(leaf, c, 0);
> + btrfs_set_chunk_sub_stripes(leaf, c, 0);
> +
> + btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe),
> + 1);
> +
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + btrfs_release_path(path);
> +
> + return 0;
> +}
> +
> +int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *chunk,
> + struct btrfs_block_group *bg)
> +{
> + int ret;
> + BTRFS_PATH_AUTO_FREE(path);
> +
> + ret = btrfs_remove_dev_extents(trans, chunk);
> + if (ret)
> + return ret;
> +
> + mutex_lock(&trans->fs_info->chunk_mutex);
> +
> + for (unsigned int i = 0; i < chunk->num_stripes; i++) {
> + ret = btrfs_update_device(trans, chunk->stripes[i].dev);
> + if (ret) {
> + mutex_unlock(&trans->fs_info->chunk_mutex);
> + return ret;
> + }
> + }
> +
> + mutex_unlock(&trans->fs_info->chunk_mutex);
> +
> + write_lock(&trans->fs_info->mapping_tree_lock);
> + btrfs_chunk_map_device_clear_bits(chunk, CHUNK_ALLOCATED);
> + write_unlock(&trans->fs_info->mapping_tree_lock);
> +
> + btrfs_remove_bg_from_sinfo(bg);
> +
> + path = btrfs_alloc_path();
> + if (!path)
> + return -ENOMEM;
> +
> + ret = remove_chunk_stripes(trans, chunk, path);
> + if (ret)
> + return ret;
> +
> + return 0;
> +}
> +
> +static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
> + struct btrfs_block_group *bg, int delta)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + bool bg_already_dirty = true, mark_fully_remapped = false;
> +
> + WARN_ON(delta < 0 && -delta > bg->identity_remap_count);
> +
> + spin_lock(&bg->lock);
> +
> + bg->identity_remap_count += delta;
> +
> + if (!bg->fully_remapped && bg->identity_remap_count == 0)
> + mark_fully_remapped = true;
> +
> + spin_unlock(&bg->lock);
> +
> + spin_lock(&trans->transaction->dirty_bgs_lock);
> + if (list_empty(&bg->dirty_list)) {
> + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
> + bg_already_dirty = false;
> + btrfs_get_block_group(bg);
> + }
> + spin_unlock(&trans->transaction->dirty_bgs_lock);
> +
> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
> + if (!bg_already_dirty)
> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> +
> + if (mark_fully_remapped)
> + btrfs_mark_bg_fully_remapped(bg, trans);
> +}
> +
> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> u64 *length)
> {
> @@ -4478,3 +4624,366 @@ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
> logical = fs_info->reloc_ctl->block_group->start;
> return logical;
> }
> +
> +static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg,
> + u64 bytenr, u64 num_bytes)
> +{
> + int ret;
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct extent_buffer *leaf = path->nodes[0];
> + struct btrfs_key key, new_key;
> + struct btrfs_remap *remap_ptr = NULL, remap;
> + struct btrfs_block_group *dest_bg = NULL;
> + u64 end, new_addr = 0, remap_start, remap_length, overlap_length;
> + bool is_identity_remap;
> +
> + end = bytenr + num_bytes;
> +
> + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
> +
> + is_identity_remap = key.type == BTRFS_IDENTITY_REMAP_KEY;
> +
> + remap_start = key.objectid;
> + remap_length = key.offset;
> +
> + if (!is_identity_remap) {
> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
> + struct btrfs_remap);
> + new_addr = btrfs_remap_address(leaf, remap_ptr);
> +
> + dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
> + }
> +
> + if (bytenr == remap_start && num_bytes >= remap_length) {
> + /* Remove entirely. */
> +
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> + if (ret)
> + goto end;
> +
> + btrfs_release_path(path);
> +
> + overlap_length = remap_length;
> +
> + if (!is_identity_remap) {
> + /* Remove backref. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = remap_length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root,
> + &key, path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> +
> + btrfs_release_path(path);
> +
> + if (ret)
> + goto end;
> +
> + adjust_block_group_remap_bytes(trans, dest_bg,
> + -remap_length);
> + } else {
> + adjust_identity_remap_count(trans, bg, -1);
> + }
> + } else if (bytenr == remap_start) {
> + /* Remove beginning. */
> +
> + new_key.objectid = end;
> + new_key.type = key.type;
> + new_key.offset = remap_length + remap_start - end;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + overlap_length = num_bytes;
> +
> + if (!is_identity_remap) {
> + btrfs_set_remap_address(leaf, remap_ptr,
> + new_addr + end - remap_start);
> + btrfs_release_path(path);
> +
> + /* Adjust backref. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = remap_length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root,
> + &key, path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + leaf = path->nodes[0];
> +
> + new_key.objectid = new_addr + end - remap_start;
> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
> + new_key.offset = remap_length + remap_start - end;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> +
> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
> + struct btrfs_remap);
> + btrfs_set_remap_address(leaf, remap_ptr, end);
> +
> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
> +
> + btrfs_release_path(path);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg,
> + -num_bytes);
> + }
> + } else if (bytenr + num_bytes < remap_start + remap_length) {
> + /* Remove middle. */
> +
> + new_key.objectid = remap_start;
> + new_key.type = key.type;
> + new_key.offset = bytenr - remap_start;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + new_key.objectid = end;
> + new_key.offset = remap_start + remap_length - end;
> +
> + btrfs_release_path(path);
> +
> + overlap_length = num_bytes;
> +
> + if (!is_identity_remap) {
> + /* Add second remap entry. */
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &new_key,
> + sizeof(struct btrfs_remap));
> + if (ret)
> + goto end;
> +
> + btrfs_set_stack_remap_address(&remap,
> + new_addr + end - remap_start);
> +
> + write_extent_buffer(path->nodes[0], &remap,
> + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + /* Shorten backref entry. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = remap_length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root,
> + &key, path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + new_key.objectid = new_addr;
> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
> + new_key.offset = bytenr - remap_start;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
> +
> + btrfs_release_path(path);
> +
> + /* Add second backref entry. */
> +
> + new_key.objectid = new_addr + end - remap_start;
> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
> + new_key.offset = remap_start + remap_length - end;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &new_key,
> + sizeof(struct btrfs_remap));
> + if (ret)
> + goto end;
> +
> + btrfs_set_stack_remap_address(&remap, end);
> +
> + write_extent_buffer(path->nodes[0], &remap,
> + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg,
> + -num_bytes);
> + } else {
> + /* Add second identity remap entry. */
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &new_key, 0);
> + if (ret)
> + goto end;
> +
> + btrfs_release_path(path);
> +
> + adjust_identity_remap_count(trans, bg, 1);
> + }
> + } else {
> + /* Remove end. */
> +
> + new_key.objectid = remap_start;
> + new_key.type = key.type;
> + new_key.offset = bytenr - remap_start;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + btrfs_release_path(path);
> +
> + overlap_length = remap_start + remap_length - bytenr;
> +
> + if (!is_identity_remap) {
> + /* Shorten backref entry. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = remap_length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root,
> + &key, path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + new_key.objectid = new_addr;
> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
> + new_key.offset = bytenr - remap_start;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
> +
> + btrfs_release_path(path);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg,
> + bytenr - remap_start - remap_length);
> + }
> + }
> +
> + if (!is_identity_remap) {
> + ret = btrfs_add_to_free_space_tree(trans,
> + bytenr - remap_start + new_addr,
> + overlap_length);
> + if (ret)
> + goto end;
> + }
> +
> + ret = overlap_length;
> +
> +end:
> + if (dest_bg)
> + btrfs_put_block_group(dest_bg);
> +
> + return ret;
> +}
> +
> +/*
> + * Returns 1 if remove_range_from_remap_tree() has been called successfully,
> + * 0 if block group wasn't remapped, and a negative number on error.
> + */
> +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + u64 bytenr, u64 num_bytes)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_key key, found_key;
> + struct extent_buffer *leaf;
> + struct btrfs_block_group *bg;
> + int ret, length;
> +
> + if (!(btrfs_super_incompat_flags(fs_info->super_copy) &
> + BTRFS_FEATURE_INCOMPAT_REMAP_TREE))
> + return 0;
> +
> + bg = btrfs_lookup_block_group(fs_info, bytenr);
> + if (!bg)
> + return 0;
> +
> + mutex_lock(&fs_info->remap_mutex);
> +
> + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
> + mutex_unlock(&fs_info->remap_mutex);
> + btrfs_put_block_group(bg);
> + return 0;
> + }
> +
> + do {
> + key.objectid = bytenr;
> + key.type = (u8)-1;
> + key.offset = (u64)-1;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path,
> + -1, 1);
> + if (ret < 0)
> + goto end;
> +
> + leaf = path->nodes[0];
> +
> + if (path->slots[0] == 0) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + path->slots[0]--;
> +
> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> +
> + if (found_key.type != BTRFS_IDENTITY_REMAP_KEY &&
> + found_key.type != BTRFS_REMAP_KEY) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + if (bytenr < found_key.objectid ||
> + bytenr >= found_key.objectid + found_key.offset) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + length = remove_range_from_remap_tree(trans, path, bg, bytenr,
> + num_bytes);
> + if (length < 0) {
> + ret = length;
> + goto end;
> + }
> +
> + bytenr += length;
> + num_bytes -= length;
> + } while (num_bytes > 0);
> +
> + ret = 1;
> +
> +end:
> + mutex_unlock(&fs_info->remap_mutex);
> +
> + btrfs_put_block_group(bg);
> + btrfs_release_path(path);
> + return ret;
> +}
> diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
> index b2ba83966650..7cfe91971cab 100644
> --- a/fs/btrfs/relocation.h
> +++ b/fs/btrfs/relocation.h
> @@ -33,5 +33,11 @@ bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
> u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> u64 *length);
> +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + u64 bytenr, u64 num_bytes);
> +int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *chunk,
> + struct btrfs_block_group *bg);
>
> #endif
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index de3eeb37408a..ffee6c285182 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -2437,6 +2437,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
> if (ret)
> goto unlock_reloc;
>
> + ret = btrfs_handle_fully_remapped_bgs(trans);
> + if (ret)
> + goto unlock_reloc;
> +
> /*
> * make sure none of the code above managed to slip in a
> * delayed item
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index d117f74e08c1..99ad95e1c300 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -2929,8 +2929,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
> return ret;
> }
>
> -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
> - struct btrfs_device *device)
> +int btrfs_update_device(struct btrfs_trans_handle *trans,
> + struct btrfs_device *device)
> {
> int ret;
> BTRFS_PATH_AUTO_FREE(path);
> @@ -3228,25 +3228,13 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
> return btrfs_free_chunk(trans, chunk_offset);
> }
>
> -int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
> +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *map)
> {
> struct btrfs_fs_info *fs_info = trans->fs_info;
> - struct btrfs_chunk_map *map;
> + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> u64 dev_extent_len = 0;
> int i, ret = 0;
> - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> -
> - map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
> - if (IS_ERR(map)) {
> - /*
> - * This is a logic error, but we don't want to just rely on the
> - * user having built with ASSERT enabled, so if ASSERT doesn't
> - * do anything we still error out.
> - */
> - DEBUG_WARN("errr %ld reading chunk map at offset %llu",
> - PTR_ERR(map), chunk_offset);
> - return PTR_ERR(map);
> - }
>
> /*
> * First delete the device extent items from the devices btree.
> @@ -3267,7 +3255,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
> if (unlikely(ret)) {
> mutex_unlock(&fs_devices->device_list_mutex);
> btrfs_abort_transaction(trans, ret);
> - goto out;
> + return ret;
> }
>
> if (device->bytes_used > 0) {
> @@ -3287,6 +3275,30 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
> }
> mutex_unlock(&fs_devices->device_list_mutex);
>
> + return 0;
> +}
> +
> +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_chunk_map *map;
> + int ret;
> +
> + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
> + if (IS_ERR(map)) {
> + /*
> + * This is a logic error, but we don't want to just rely on the
> + * user having built with ASSERT enabled, so if ASSERT doesn't
> + * do anything we still error out.
> + */
> + ASSERT(0);
> + return PTR_ERR(map);
> + }
> +
> + ret = btrfs_remove_dev_extents(trans, map);
> + if (ret)
> + goto out;
> +
> /*
> * We acquire fs_info->chunk_mutex for 2 reasons:
> *
> @@ -5422,7 +5434,7 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
> }
> }
>
> -static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
> +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
> {
> for (int i = 0; i < map->num_stripes; i++) {
> struct btrfs_io_stripe *stripe = &map->stripes[i];
> @@ -5439,7 +5451,7 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma
> write_lock(&fs_info->mapping_tree_lock);
> rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
> RB_CLEAR_NODE(&map->rb_node);
> - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
> + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
> write_unlock(&fs_info->mapping_tree_lock);
>
> /* Once for the tree reference. */
> @@ -5475,7 +5487,7 @@ int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *m
> return -EEXIST;
> }
> chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
> - chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
> + btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
> write_unlock(&fs_info->mapping_tree_lock);
>
> return 0;
> @@ -5840,7 +5852,7 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
> map = rb_entry(node, struct btrfs_chunk_map, rb_node);
> rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
> RB_CLEAR_NODE(&map->rb_node);
> - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
> + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
> /* Once for the tree ref. */
> btrfs_free_chunk_map(map);
> cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 7cf76bffcab6..0c64cae59f1c 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -794,6 +794,8 @@ u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
> int btrfs_nr_parity_stripes(u64 type);
> int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
> struct btrfs_block_group *bg);
> +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *map);
> int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
>
> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> @@ -905,6 +907,10 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
>
> bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
> const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
> +int btrfs_update_device(struct btrfs_trans_handle *trans,
> + struct btrfs_device *device);
> +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map,
> + unsigned int bits);
>
> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 09/16] btrfs: handle deletions from remapped block group
2025-10-31 23:05 ` Boris Burkov
@ 2025-11-03 15:51 ` Mark Harmstone
0 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-11-03 15:51 UTC (permalink / raw)
To: Boris Burkov; +Cc: linux-btrfs
On 31/10/2025 11.05 pm, Boris Burkov wrote:
> On Fri, Oct 24, 2025 at 07:12:10PM +0100, Mark Harmstone wrote:
>> Handle the case where we free an extent from a block group that has the
>> REMAPPED flag set. Because the remap tree is orthogonal to the extent
>> tree, for data this may be within any number of identity remaps or
>> actual remaps. If we're freeing a metadata node, this will be wholly
>> inside one or the other.
>>
>> btrfs_remove_extent_from_remap_tree() searches the remap tree for the
>> remaps that cover the range in question, then calls
>> remove_range_from_remap_tree() for each one, to punch a hole in the
>> remap and adjust the free-space tree.
>>
>> For an identity remap, remove_range_from_remap_tree() will adjust the
>> block group's `identity_remap_count` if this changes. If it reaches
>> zero we call last_identity_remap_gone(), which removes the chunk's
>> stripes and device extents - it is now fully remapped.
>>
>> The changes which involve the block group's ro flag are because the
>> REMAPPED flag itself prevents a block group from having any new
>> allocations within it, and so we don't need to account for this
>> separately.
>>
>> Signed-off-by: Mark Harmstone <mark@harmstone.com>
>> ---
>> fs/btrfs/block-group.c | 118 +++++++---
>> fs/btrfs/block-group.h | 4 +
>> fs/btrfs/disk-io.c | 2 +
>> fs/btrfs/extent-tree.c | 77 ++++++-
>> fs/btrfs/extent-tree.h | 1 +
>> fs/btrfs/fs.h | 4 +-
>> fs/btrfs/relocation.c | 509 +++++++++++++++++++++++++++++++++++++++++
>> fs/btrfs/relocation.h | 6 +
>> fs/btrfs/transaction.c | 4 +
>> fs/btrfs/volumes.c | 56 +++--
>> fs/btrfs/volumes.h | 6 +
>> 11 files changed, 728 insertions(+), 59 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index 27173aca6fc1..3bf5f20d90ec 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -1068,6 +1068,32 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
>> return ret;
>> }
>>
>> +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group)
>> +{
>> + int factor = btrfs_bg_type_to_factor(block_group->flags);
>> +
>> + spin_lock(&block_group->space_info->lock);
>> +
>> + if (btrfs_test_opt(block_group->fs_info, ENOSPC_DEBUG)) {
>> + WARN_ON(block_group->space_info->total_bytes
>> + < block_group->length);
>> + WARN_ON(block_group->space_info->bytes_readonly
>> + < block_group->length - block_group->zone_unusable);
>> + WARN_ON(block_group->space_info->bytes_zone_unusable
>> + < block_group->zone_unusable);
>> + WARN_ON(block_group->space_info->disk_total
>> + < block_group->length * factor);
>> + }
>> + block_group->space_info->total_bytes -= block_group->length;
>> + block_group->space_info->bytes_readonly -=
>> + (block_group->length - block_group->zone_unusable);
>> + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
>> + -block_group->zone_unusable);
>> + block_group->space_info->disk_total -= block_group->length * factor;
>> +
>> + spin_unlock(&block_group->space_info->lock);
>> +}
>> +
>> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> struct btrfs_chunk_map *map)
>> {
>> @@ -1079,7 +1105,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> struct kobject *kobj = NULL;
>> int ret;
>> int index;
>> - int factor;
>> struct btrfs_caching_control *caching_ctl = NULL;
>> bool remove_map;
>> bool remove_rsv = false;
>> @@ -1088,7 +1113,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> if (!block_group)
>> return -ENOENT;
>>
>> - BUG_ON(!block_group->ro);
>> + BUG_ON(!block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED));
>>
>> trace_btrfs_remove_block_group(block_group);
>> /*
>> @@ -1100,7 +1125,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> block_group->length);
>>
>> index = btrfs_bg_flags_to_raid_index(block_group->flags);
>> - factor = btrfs_bg_type_to_factor(block_group->flags);
>>
>> /* make sure this block group isn't part of an allocation cluster */
>> cluster = &fs_info->data_alloc_cluster;
>> @@ -1224,26 +1248,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>>
>> spin_lock(&block_group->space_info->lock);
>> list_del_init(&block_group->ro_list);
>> -
>> - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
>> - WARN_ON(block_group->space_info->total_bytes
>> - < block_group->length);
>> - WARN_ON(block_group->space_info->bytes_readonly
>> - < block_group->length - block_group->zone_unusable);
>> - WARN_ON(block_group->space_info->bytes_zone_unusable
>> - < block_group->zone_unusable);
>> - WARN_ON(block_group->space_info->disk_total
>> - < block_group->length * factor);
>> - }
>> - block_group->space_info->total_bytes -= block_group->length;
>> - block_group->space_info->bytes_readonly -=
>> - (block_group->length - block_group->zone_unusable);
>> - btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
>> - -block_group->zone_unusable);
>> - block_group->space_info->disk_total -= block_group->length * factor;
>> -
>> spin_unlock(&block_group->space_info->lock);
>>
>> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))
>> + btrfs_remove_bg_from_sinfo(block_group);
>> +
>> /*
>> * Remove the free space for the block group from the free space tree
>> * and the block group's item from the extent tree before marking the
>> @@ -1538,6 +1547,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> while (!list_empty(&fs_info->unused_bgs)) {
>> u64 used;
>> int trimming;
>> + bool made_ro = false;
>>
>> block_group = list_first_entry(&fs_info->unused_bgs,
>> struct btrfs_block_group,
>> @@ -1574,7 +1584,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>>
>> spin_lock(&space_info->lock);
>> spin_lock(&block_group->lock);
>> - if (btrfs_is_block_group_used(block_group) || block_group->ro ||
>> + if (btrfs_is_block_group_used(block_group) ||
>> + (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) ||
>> list_is_singular(&block_group->list)) {
>> /*
>> * We want to bail if we made new allocations or have
>> @@ -1616,9 +1627,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> * needing to allocate extents from the block group.
>> */
>> used = btrfs_space_info_used(space_info, true);
>> - if ((space_info->total_bytes - block_group->length < used &&
>> + if (((space_info->total_bytes - block_group->length < used &&
>> block_group->zone_unusable < block_group->length) ||
>> - has_unwritten_metadata(block_group)) {
>> + has_unwritten_metadata(block_group)) &&
>> + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
>> /*
>> * Add a reference for the list, compensate for the ref
>> * drop under the "next" label for the
>> @@ -1636,8 +1648,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> spin_unlock(&block_group->lock);
>> spin_unlock(&space_info->lock);
>>
>> - /* We don't want to force the issue, only flip if it's ok. */
>> - ret = inc_block_group_ro(block_group, 0);
>
> If we are deleting an unused bg, what is the harm in marking it ro even
> if it is remapped and it's redundant for new allocations?
That is a good question. I'll take this out — I can't reproduce the bug that originally
made this necessary, nor can I remember precisely what it was. Quite possibly it was
fixed in the process of tightening up the code elsewhere.
>
>> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
>> + /* We don't want to force the issue, only flip if it's ok. */
>> + ret = inc_block_group_ro(block_group, 0);
>> + made_ro = true;
>> + } else {
>> + ret = 0;
>> + }
>> +
>> up_write(&space_info->groups_sem);
>> if (ret < 0) {
>> ret = 0;
>> @@ -1646,7 +1664,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>>
>> ret = btrfs_zone_finish(block_group);
>> if (ret < 0) {
>> - btrfs_dec_block_group_ro(block_group);
>> + if (made_ro)
>> + btrfs_dec_block_group_ro(block_group);
>> if (ret == -EAGAIN) {
>> btrfs_link_bg_list(block_group, &retry_list);
>> ret = 0;
>> @@ -1661,7 +1680,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> trans = btrfs_start_trans_remove_block_group(fs_info,
>> block_group->start);
>> if (IS_ERR(trans)) {
>> - btrfs_dec_block_group_ro(block_group);
>> + if (made_ro)
>> + btrfs_dec_block_group_ro(block_group);
>> ret = PTR_ERR(trans);
>> goto next;
>> }
>> @@ -1671,7 +1691,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> * just delete them, we don't care about them anymore.
>> */
>> if (!clean_pinned_extents(trans, block_group)) {
>> - btrfs_dec_block_group_ro(block_group);
>> + if (made_ro)
>> + btrfs_dec_block_group_ro(block_group);
>> goto end_trans;
>> }
>>
>> @@ -1685,7 +1706,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> spin_lock(&fs_info->discard_ctl.lock);
>> if (!list_empty(&block_group->discard_list)) {
>> spin_unlock(&fs_info->discard_ctl.lock);
>> - btrfs_dec_block_group_ro(block_group);
>> + if (made_ro)
>> + btrfs_dec_block_group_ro(block_group);
>> btrfs_discard_queue_work(&fs_info->discard_ctl,
>> block_group);
>> goto end_trans;
>> @@ -1779,6 +1801,15 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
>> struct btrfs_fs_info *fs_info = bg->fs_info;
>>
>> spin_lock(&fs_info->unused_bgs_lock);
>> +
>> + /* Leave fully remapped block groups on the fully_remapped_bgs list. */
>> + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
>> + bg->identity_remap_count == 0 &&
>> + !list_empty(&bg->bg_list)) {
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> + return;
>> + }
>> +
>> if (list_empty(&bg->bg_list)) {
>> btrfs_get_block_group(bg);
>> trace_btrfs_add_unused_block_group(bg);
>> @@ -4772,3 +4803,30 @@ bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
>> return false;
>> return true;
>> }
>> +
>> +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
>> + struct btrfs_trans_handle *trans)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + bool already_done;
>> +
>> + spin_lock(&bg->lock);
>> + already_done = bg->fully_remapped;
>> + bg->fully_remapped = true;
>> + spin_unlock(&bg->lock);
>> +
>> + if (already_done)
>> + return;
>> +
>> + spin_lock(&fs_info->unused_bgs_lock);
>> +
>> + if (!list_empty(&bg->bg_list))
>> + list_del(&bg->bg_list);
>> + else
>> + btrfs_get_block_group(bg);
>> +
>> + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs);
>> +
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> +
>> +}
>> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
>> index af23fdb3cf4d..d85f3c2546d0 100644
>> --- a/fs/btrfs/block-group.h
>> +++ b/fs/btrfs/block-group.h
>> @@ -282,6 +282,7 @@ struct btrfs_block_group {
>> struct extent_buffer *last_eb;
>> enum btrfs_block_group_size_class size_class;
>> u64 reclaim_mark;
>> + bool fully_remapped;
>> };
>>
>> static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
>> @@ -336,6 +337,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
>> struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
>> struct btrfs_fs_info *fs_info,
>> const u64 chunk_offset);
>> +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group);
>> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> struct btrfs_chunk_map *map);
>> void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
>> @@ -407,5 +409,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
>> enum btrfs_block_group_size_class size_class,
>> bool force_wrong_size_class);
>> bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
>> +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
>> + struct btrfs_trans_handle *trans);
>>
>> #endif /* BTRFS_BLOCK_GROUP_H */
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index d3ff148311d8..1a3e525f3d1a 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -2870,6 +2870,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>> INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
>> INIT_LIST_HEAD(&fs_info->unused_bgs);
>> INIT_LIST_HEAD(&fs_info->reclaim_bgs);
>> + INIT_LIST_HEAD(&fs_info->fully_remapped_bgs);
>> INIT_LIST_HEAD(&fs_info->zone_active_bgs);
>> #ifdef CONFIG_BTRFS_DEBUG
>> INIT_LIST_HEAD(&fs_info->allocated_roots);
>> @@ -2925,6 +2926,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>> mutex_init(&fs_info->chunk_mutex);
>> mutex_init(&fs_info->transaction_kthread_mutex);
>> mutex_init(&fs_info->cleaner_mutex);
>> + mutex_init(&fs_info->remap_mutex);
>> mutex_init(&fs_info->ro_block_group_mutex);
>> init_rwsem(&fs_info->commit_root_sem);
>> init_rwsem(&fs_info->cleanup_work_sem);
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index d3ca8105ffc7..1c14e0c82c03 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -40,6 +40,7 @@
>> #include "orphan.h"
>> #include "tree-checker.h"
>> #include "raid-stripe-tree.h"
>> +#include "relocation.h"
>>
>> #undef SCRAMBLE_DELAYED_REFS
>>
>> @@ -2847,6 +2848,52 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
>> return ret;
>> }
>>
>> +int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_block_group *block_group, *tmp;
>> + struct list_head *fully_remapped_bgs;
>> + int ret;
>> +
>> + fully_remapped_bgs = &fs_info->fully_remapped_bgs;
>> + list_for_each_entry_safe(block_group, tmp, fully_remapped_bgs, bg_list) {
>> + struct btrfs_chunk_map *map;
>> +
>> + map = btrfs_get_chunk_map(fs_info, block_group->start, 1);
>> + if (IS_ERR(map))
>> + return PTR_ERR(map);
>> +
>> + ret = btrfs_last_identity_remap_gone(trans, map, block_group);
>> + if (ret) {
>> + btrfs_free_chunk_map(map);
>> + return ret;
>> + }
>> +
>> + /*
>> + * Set num_stripes to 0, so that btrfs_remove_dev_extents()
>> + * won't run a second time.
>> + */
>> + map->num_stripes = 0;
>> +
>> + btrfs_free_chunk_map(map);
>> +
>> + if (block_group->used == 0) {
>> + spin_lock(&fs_info->unused_bgs_lock);
>> + list_move_tail(&block_group->bg_list,
>> + &fs_info->unused_bgs);
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> + } else {
>> + spin_lock(&fs_info->unused_bgs_lock);
>> + list_del_init(&block_group->bg_list);
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> +
>> + btrfs_put_block_group(block_group);
>> + }
>> + }
>> +
>> + return 0;
>> +}
>> +
>> int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
>> {
>> struct btrfs_fs_info *fs_info = trans->fs_info;
>> @@ -2999,11 +3046,23 @@ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
>> }
>>
>> static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
>> - u64 bytenr, struct btrfs_squota_delta *delta)
>> + u64 bytenr, struct btrfs_squota_delta *delta,
>> + struct btrfs_path *path)
>> {
>> int ret;
>> + bool remapped = false;
>> u64 num_bytes = delta->num_bytes;
>>
>> + /* returns 1 on success and 0 on no-op */
>> + ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr,
>> + num_bytes);
>> + if (ret < 0) {
>> + btrfs_abort_transaction(trans, ret);
>> + return ret;
>> + } else if (ret == 1) {
>> + remapped = true;
>> + }
>> +
>> if (delta->is_data) {
>> struct btrfs_root *csum_root;
>>
>> @@ -3027,10 +3086,16 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
>> return ret;
>> }
>>
>> - ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
>> - if (unlikely(ret)) {
>> - btrfs_abort_transaction(trans, ret);
>> - return ret;
>> + /*
>> + * If remapped, FST has already been taken care of in
>> + * remove_range_from_remap_tree().
>> + */
>> + if (!remapped) {
>> + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
>> + if (unlikely(ret)) {
>> + btrfs_abort_transaction(trans, ret);
>> + return ret;
>> + }
>> }
>>
>> ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
>> @@ -3396,7 +3461,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
>> }
>> btrfs_release_path(path);
>>
>> - ret = do_free_extent_accounting(trans, bytenr, &delta);
>> + ret = do_free_extent_accounting(trans, bytenr, &delta, path);
>> }
>> btrfs_release_path(path);
>>
>> diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
>> index e970ac42a871..6b67a4e528da 100644
>> --- a/fs/btrfs/extent-tree.h
>> +++ b/fs/btrfs/extent-tree.h
>> @@ -164,5 +164,6 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
>> int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
>> u64 num_bytes, u64 *actual_bytes);
>> int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
>> +int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans);
>>
>> #endif
>> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
>> index 62057e8006a9..c3dacbfe118c 100644
>> --- a/fs/btrfs/fs.h
>> +++ b/fs/btrfs/fs.h
>> @@ -573,6 +573,7 @@ struct btrfs_fs_info {
>> struct mutex transaction_kthread_mutex;
>> struct mutex cleaner_mutex;
>> struct mutex chunk_mutex;
>> + struct mutex remap_mutex;
>>
>> /*
>> * This is taken to make sure we don't set block groups ro after the
>> @@ -827,10 +828,11 @@ struct btrfs_fs_info {
>> struct list_head reclaim_bgs;
>> int bg_reclaim_threshold;
>>
>> - /* Protects the lists unused_bgs and reclaim_bgs. */
>> + /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */
>> spinlock_t unused_bgs_lock;
>> /* Protected by unused_bgs_lock. */
>> struct list_head unused_bgs;
>> + struct list_head fully_remapped_bgs;
>> struct mutex unused_bg_unpin_mutex;
>> /* Protect block groups that are going to be deleted */
>> struct mutex reclaim_bgs_lock;
>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>> index a8abe24de8d7..9f3ce3395d6a 100644
>> --- a/fs/btrfs/relocation.c
>> +++ b/fs/btrfs/relocation.c
>> @@ -37,6 +37,7 @@
>> #include "super.h"
>> #include "tree-checker.h"
>> #include "raid-stripe-tree.h"
>> +#include "free-space-tree.h"
>>
>> /*
>> * Relocation overview
>> @@ -3870,6 +3871,151 @@ static const char *stage_to_string(enum reloc_stage stage)
>> return "unknown";
>> }
>>
>> +static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
>> + struct btrfs_block_group *bg,
>> + s64 diff)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + bool bg_already_dirty = true, mark_unused = false;
>> +
>> + spin_lock(&bg->lock);
>> +
>> + bg->remap_bytes += diff;
>> +
>> + if (bg->used == 0 && bg->remap_bytes == 0)
>> + mark_unused = true;
>> +
>> + spin_unlock(&bg->lock);
>> +
>> + if (mark_unused)
>> + btrfs_mark_bg_unused(bg);
>> +
>> + spin_lock(&trans->transaction->dirty_bgs_lock);
>> + if (list_empty(&bg->dirty_list)) {
>> + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
>> + bg_already_dirty = false;
>> + btrfs_get_block_group(bg);
>> + }
>> + spin_unlock(&trans->transaction->dirty_bgs_lock);
>> +
>> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
>> + if (!bg_already_dirty)
>> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
>> +}
>> +
>> +static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *chunk,
>> + struct btrfs_path *path)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_key key;
>> + struct extent_buffer *leaf;
>> + struct btrfs_chunk *c;
>> + int ret;
>> +
>> + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
>> + key.type = BTRFS_CHUNK_ITEM_KEY;
>> + key.offset = chunk->start;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path,
>> + 0, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + return ret;
>> + }
>> +
>> + leaf = path->nodes[0];
>> +
>> + c = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
>> + btrfs_set_chunk_num_stripes(leaf, c, 0);
>> + btrfs_set_chunk_sub_stripes(leaf, c, 0);
>> +
>> + btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe),
>> + 1);
>> +
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + btrfs_release_path(path);
>> +
>> + return 0;
>> +}
>> +
>> +int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *chunk,
>> + struct btrfs_block_group *bg)
>> +{
>> + int ret;
>> + BTRFS_PATH_AUTO_FREE(path);
>> +
>> + ret = btrfs_remove_dev_extents(trans, chunk);
>> + if (ret)
>> + return ret;
>> +
>> + mutex_lock(&trans->fs_info->chunk_mutex);
>> +
>> + for (unsigned int i = 0; i < chunk->num_stripes; i++) {
>> + ret = btrfs_update_device(trans, chunk->stripes[i].dev);
>> + if (ret) {
>> + mutex_unlock(&trans->fs_info->chunk_mutex);
>> + return ret;
>> + }
>> + }
>> +
>> + mutex_unlock(&trans->fs_info->chunk_mutex);
>> +
>> + write_lock(&trans->fs_info->mapping_tree_lock);
>> + btrfs_chunk_map_device_clear_bits(chunk, CHUNK_ALLOCATED);
>> + write_unlock(&trans->fs_info->mapping_tree_lock);
>> +
>> + btrfs_remove_bg_from_sinfo(bg);
>> +
>> + path = btrfs_alloc_path();
>> + if (!path)
>> + return -ENOMEM;
>> +
>> + ret = remove_chunk_stripes(trans, chunk, path);
>> + if (ret)
>> + return ret;
>> +
>> + return 0;
>> +}
>> +
>> +static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
>> + struct btrfs_block_group *bg, int delta)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + bool bg_already_dirty = true, mark_fully_remapped = false;
>> +
>> + WARN_ON(delta < 0 && -delta > bg->identity_remap_count);
>> +
>> + spin_lock(&bg->lock);
>> +
>> + bg->identity_remap_count += delta;
>> +
>> + if (!bg->fully_remapped && bg->identity_remap_count == 0)
>> + mark_fully_remapped = true;
>> +
>> + spin_unlock(&bg->lock);
>> +
>> + spin_lock(&trans->transaction->dirty_bgs_lock);
>> + if (list_empty(&bg->dirty_list)) {
>> + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
>> + bg_already_dirty = false;
>> + btrfs_get_block_group(bg);
>> + }
>> + spin_unlock(&trans->transaction->dirty_bgs_lock);
>> +
>> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
>> + if (!bg_already_dirty)
>> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
>> +
>> + if (mark_fully_remapped)
>> + btrfs_mark_bg_fully_remapped(bg, trans);
>> +}
>> +
>> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
>> u64 *length)
>> {
>> @@ -4478,3 +4624,366 @@ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
>> logical = fs_info->reloc_ctl->block_group->start;
>> return logical;
>> }
>> +
>> +static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + struct btrfs_block_group *bg,
>> + u64 bytenr, u64 num_bytes)
>> +{
>> + int ret;
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct extent_buffer *leaf = path->nodes[0];
>> + struct btrfs_key key, new_key;
>> + struct btrfs_remap *remap_ptr = NULL, remap;
>> + struct btrfs_block_group *dest_bg = NULL;
>> + u64 end, new_addr = 0, remap_start, remap_length, overlap_length;
>> + bool is_identity_remap;
>> +
>> + end = bytenr + num_bytes;
>> +
>> + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
>> +
>> + is_identity_remap = key.type == BTRFS_IDENTITY_REMAP_KEY;
>> +
>> + remap_start = key.objectid;
>> + remap_length = key.offset;
>> +
>> + if (!is_identity_remap) {
>> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
>> + struct btrfs_remap);
>> + new_addr = btrfs_remap_address(leaf, remap_ptr);
>> +
>> + dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
>> + }
>> +
>> + if (bytenr == remap_start && num_bytes >= remap_length) {
>> + /* Remove entirely. */
>> +
>> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_release_path(path);
>> +
>> + overlap_length = remap_length;
>> +
>> + if (!is_identity_remap) {
>> + /* Remove backref. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = remap_length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root,
>> + &key, path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
>> +
>> + btrfs_release_path(path);
>> +
>> + if (ret)
>> + goto end;
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg,
>> + -remap_length);
>> + } else {
>> + adjust_identity_remap_count(trans, bg, -1);
>> + }
>> + } else if (bytenr == remap_start) {
>> + /* Remove beginning. */
>> +
>> + new_key.objectid = end;
>> + new_key.type = key.type;
>> + new_key.offset = remap_length + remap_start - end;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + overlap_length = num_bytes;
>> +
>> + if (!is_identity_remap) {
>> + btrfs_set_remap_address(leaf, remap_ptr,
>> + new_addr + end - remap_start);
>> + btrfs_release_path(path);
>> +
>> + /* Adjust backref. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = remap_length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root,
>> + &key, path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + leaf = path->nodes[0];
>> +
>> + new_key.objectid = new_addr + end - remap_start;
>> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
>> + new_key.offset = remap_length + remap_start - end;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> +
>> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
>> + struct btrfs_remap);
>> + btrfs_set_remap_address(leaf, remap_ptr, end);
>> +
>> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
>> +
>> + btrfs_release_path(path);
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg,
>> + -num_bytes);
>> + }
>> + } else if (bytenr + num_bytes < remap_start + remap_length) {
>> + /* Remove middle. */
>> +
>> + new_key.objectid = remap_start;
>> + new_key.type = key.type;
>> + new_key.offset = bytenr - remap_start;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + new_key.objectid = end;
>> + new_key.offset = remap_start + remap_length - end;
>> +
>> + btrfs_release_path(path);
>> +
>> + overlap_length = num_bytes;
>> +
>> + if (!is_identity_remap) {
>> + /* Add second remap entry. */
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
>> + path, &new_key,
>> + sizeof(struct btrfs_remap));
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_set_stack_remap_address(&remap,
>> + new_addr + end - remap_start);
>> +
>> + write_extent_buffer(path->nodes[0], &remap,
>> + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
>> + sizeof(struct btrfs_remap));
>> +
>> + btrfs_release_path(path);
>> +
>> + /* Shorten backref entry. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = remap_length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root,
>> + &key, path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + new_key.objectid = new_addr;
>> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
>> + new_key.offset = bytenr - remap_start;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
>> +
>> + btrfs_release_path(path);
>> +
>> + /* Add second backref entry. */
>> +
>> + new_key.objectid = new_addr + end - remap_start;
>> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
>> + new_key.offset = remap_start + remap_length - end;
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
>> + path, &new_key,
>> + sizeof(struct btrfs_remap));
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_set_stack_remap_address(&remap, end);
>> +
>> + write_extent_buffer(path->nodes[0], &remap,
>> + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
>> + sizeof(struct btrfs_remap));
>> +
>> + btrfs_release_path(path);
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg,
>> + -num_bytes);
>> + } else {
>> + /* Add second identity remap entry. */
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
>> + path, &new_key, 0);
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_release_path(path);
>> +
>> + adjust_identity_remap_count(trans, bg, 1);
>> + }
>> + } else {
>> + /* Remove end. */
>> +
>> + new_key.objectid = remap_start;
>> + new_key.type = key.type;
>> + new_key.offset = bytenr - remap_start;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + btrfs_release_path(path);
>> +
>> + overlap_length = remap_start + remap_length - bytenr;
>> +
>> + if (!is_identity_remap) {
>> + /* Shorten backref entry. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = remap_length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root,
>> + &key, path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + new_key.objectid = new_addr;
>> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
>> + new_key.offset = bytenr - remap_start;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
>> +
>> + btrfs_release_path(path);
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg,
>> + bytenr - remap_start - remap_length);
>> + }
>> + }
>> +
>> + if (!is_identity_remap) {
>> + ret = btrfs_add_to_free_space_tree(trans,
>> + bytenr - remap_start + new_addr,
>> + overlap_length);
>> + if (ret)
>> + goto end;
>> + }
>> +
>> + ret = overlap_length;
>> +
>> +end:
>> + if (dest_bg)
>> + btrfs_put_block_group(dest_bg);
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * Returns 1 if remove_range_from_remap_tree() has been called successfully,
>> + * 0 if block group wasn't remapped, and a negative number on error.
>> + */
>> +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + u64 bytenr, u64 num_bytes)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_key key, found_key;
>> + struct extent_buffer *leaf;
>> + struct btrfs_block_group *bg;
>> + int ret, length;
>> +
>> + if (!(btrfs_super_incompat_flags(fs_info->super_copy) &
>> + BTRFS_FEATURE_INCOMPAT_REMAP_TREE))
>> + return 0;
>> +
>> + bg = btrfs_lookup_block_group(fs_info, bytenr);
>> + if (!bg)
>> + return 0;
>> +
>> + mutex_lock(&fs_info->remap_mutex);
>> +
>> + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
>> + mutex_unlock(&fs_info->remap_mutex);
>> + btrfs_put_block_group(bg);
>> + return 0;
>> + }
>> +
>> + do {
>> + key.objectid = bytenr;
>> + key.type = (u8)-1;
>> + key.offset = (u64)-1;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path,
>> + -1, 1);
>> + if (ret < 0)
>> + goto end;
>> +
>> + leaf = path->nodes[0];
>> +
>> + if (path->slots[0] == 0) {
>> + ret = -ENOENT;
>> + goto end;
>> + }
>> +
>> + path->slots[0]--;
>> +
>> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
>> +
>> + if (found_key.type != BTRFS_IDENTITY_REMAP_KEY &&
>> + found_key.type != BTRFS_REMAP_KEY) {
>> + ret = -ENOENT;
>> + goto end;
>> + }
>> +
>> + if (bytenr < found_key.objectid ||
>> + bytenr >= found_key.objectid + found_key.offset) {
>> + ret = -ENOENT;
>> + goto end;
>> + }
>> +
>> + length = remove_range_from_remap_tree(trans, path, bg, bytenr,
>> + num_bytes);
>> + if (length < 0) {
>> + ret = length;
>> + goto end;
>> + }
>> +
>> + bytenr += length;
>> + num_bytes -= length;
>> + } while (num_bytes > 0);
>> +
>> + ret = 1;
>> +
>> +end:
>> + mutex_unlock(&fs_info->remap_mutex);
>> +
>> + btrfs_put_block_group(bg);
>> + btrfs_release_path(path);
>> + return ret;
>> +}
>> diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
>> index b2ba83966650..7cfe91971cab 100644
>> --- a/fs/btrfs/relocation.h
>> +++ b/fs/btrfs/relocation.h
>> @@ -33,5 +33,11 @@ bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
>> u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
>> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
>> u64 *length);
>> +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + u64 bytenr, u64 num_bytes);
>> +int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *chunk,
>> + struct btrfs_block_group *bg);
>>
>> #endif
>> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
>> index de3eeb37408a..ffee6c285182 100644
>> --- a/fs/btrfs/transaction.c
>> +++ b/fs/btrfs/transaction.c
>> @@ -2437,6 +2437,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
>> if (ret)
>> goto unlock_reloc;
>>
>> + ret = btrfs_handle_fully_remapped_bgs(trans);
>> + if (ret)
>> + goto unlock_reloc;
>> +
>> /*
>> * make sure none of the code above managed to slip in a
>> * delayed item
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index d117f74e08c1..99ad95e1c300 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -2929,8 +2929,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>> return ret;
>> }
>>
>> -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
>> - struct btrfs_device *device)
>> +int btrfs_update_device(struct btrfs_trans_handle *trans,
>> + struct btrfs_device *device)
>> {
>> int ret;
>> BTRFS_PATH_AUTO_FREE(path);
>> @@ -3228,25 +3228,13 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
>> return btrfs_free_chunk(trans, chunk_offset);
>> }
>>
>> -int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>> +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *map)
>> {
>> struct btrfs_fs_info *fs_info = trans->fs_info;
>> - struct btrfs_chunk_map *map;
>> + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>> u64 dev_extent_len = 0;
>> int i, ret = 0;
>> - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>> -
>> - map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
>> - if (IS_ERR(map)) {
>> - /*
>> - * This is a logic error, but we don't want to just rely on the
>> - * user having built with ASSERT enabled, so if ASSERT doesn't
>> - * do anything we still error out.
>> - */
>> - DEBUG_WARN("errr %ld reading chunk map at offset %llu",
>> - PTR_ERR(map), chunk_offset);
>> - return PTR_ERR(map);
>> - }
>>
>> /*
>> * First delete the device extent items from the devices btree.
>> @@ -3267,7 +3255,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>> if (unlikely(ret)) {
>> mutex_unlock(&fs_devices->device_list_mutex);
>> btrfs_abort_transaction(trans, ret);
>> - goto out;
>> + return ret;
>> }
>>
>> if (device->bytes_used > 0) {
>> @@ -3287,6 +3275,30 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>> }
>> mutex_unlock(&fs_devices->device_list_mutex);
>>
>> + return 0;
>> +}
>> +
>> +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_chunk_map *map;
>> + int ret;
>> +
>> + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
>> + if (IS_ERR(map)) {
>> + /*
>> + * This is a logic error, but we don't want to just rely on the
>> + * user having built with ASSERT enabled, so if ASSERT doesn't
>> + * do anything we still error out.
>> + */
>> + ASSERT(0);
>> + return PTR_ERR(map);
>> + }
>> +
>> + ret = btrfs_remove_dev_extents(trans, map);
>> + if (ret)
>> + goto out;
>> +
>> /*
>> * We acquire fs_info->chunk_mutex for 2 reasons:
>> *
>> @@ -5422,7 +5434,7 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
>> }
>> }
>>
>> -static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
>> +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
>> {
>> for (int i = 0; i < map->num_stripes; i++) {
>> struct btrfs_io_stripe *stripe = &map->stripes[i];
>> @@ -5439,7 +5451,7 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma
>> write_lock(&fs_info->mapping_tree_lock);
>> rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
>> RB_CLEAR_NODE(&map->rb_node);
>> - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
>> + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
>> write_unlock(&fs_info->mapping_tree_lock);
>>
>> /* Once for the tree reference. */
>> @@ -5475,7 +5487,7 @@ int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *m
>> return -EEXIST;
>> }
>> chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
>> - chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
>> + btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
>> write_unlock(&fs_info->mapping_tree_lock);
>>
>> return 0;
>> @@ -5840,7 +5852,7 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
>> map = rb_entry(node, struct btrfs_chunk_map, rb_node);
>> rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
>> RB_CLEAR_NODE(&map->rb_node);
>> - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
>> + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
>> /* Once for the tree ref. */
>> btrfs_free_chunk_map(map);
>> cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 7cf76bffcab6..0c64cae59f1c 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -794,6 +794,8 @@ u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
>> int btrfs_nr_parity_stripes(u64 type);
>> int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
>> struct btrfs_block_group *bg);
>> +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *map);
>> int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
>>
>> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>> @@ -905,6 +907,10 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
>>
>> bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
>> const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
>> +int btrfs_update_device(struct btrfs_trans_handle *trans,
>> + struct btrfs_device *device);
>> +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map,
>> + unsigned int bits);
>>
>> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>> struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
>> --
>> 2.49.1
>>
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v4 09/16] btrfs: handle deletions from remapped block group
2025-10-24 18:12 ` [PATCH v4 09/16] btrfs: handle deletions from remapped block group Mark Harmstone
2025-10-31 23:05 ` Boris Burkov
@ 2025-10-31 23:30 ` Boris Burkov
2025-11-04 12:30 ` Mark Harmstone
1 sibling, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 23:30 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:10PM +0100, Mark Harmstone wrote:
> Handle the case where we free an extent from a block group that has the
> REMAPPED flag set. Because the remap tree is orthogonal to the extent
> tree, for data this may be within any number of identity remaps or
> actual remaps. If we're freeing a metadata node, this will be wholly
> inside one or the other.
>
> btrfs_remove_extent_from_remap_tree() searches the remap tree for the
> remaps that cover the range in question, then calls
> remove_range_from_remap_tree() for each one, to punch a hole in the
> remap and adjust the free-space tree.
>
> For an identity remap, remove_range_from_remap_tree() will adjust the
> block group's `identity_remap_count` if this changes. If it reaches
> zero we call last_identity_remap_gone(), which removes the chunk's
> stripes and device extents - it is now fully remapped.
>
> The changes which involve the block group's ro flag are because the
> REMAPPED flag itself prevents a block group from having any new
> allocations within it, and so we don't need to account for this
> separately.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
> fs/btrfs/block-group.c | 118 +++++++---
> fs/btrfs/block-group.h | 4 +
> fs/btrfs/disk-io.c | 2 +
> fs/btrfs/extent-tree.c | 77 ++++++-
> fs/btrfs/extent-tree.h | 1 +
> fs/btrfs/fs.h | 4 +-
> fs/btrfs/relocation.c | 509 +++++++++++++++++++++++++++++++++++++++++
> fs/btrfs/relocation.h | 6 +
> fs/btrfs/transaction.c | 4 +
> fs/btrfs/volumes.c | 56 +++--
> fs/btrfs/volumes.h | 6 +
> 11 files changed, 728 insertions(+), 59 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 27173aca6fc1..3bf5f20d90ec 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -1068,6 +1068,32 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group)
> +{
> + int factor = btrfs_bg_type_to_factor(block_group->flags);
> +
> + spin_lock(&block_group->space_info->lock);
> +
> + if (btrfs_test_opt(block_group->fs_info, ENOSPC_DEBUG)) {
> + WARN_ON(block_group->space_info->total_bytes
> + < block_group->length);
> + WARN_ON(block_group->space_info->bytes_readonly
> + < block_group->length - block_group->zone_unusable);
> + WARN_ON(block_group->space_info->bytes_zone_unusable
> + < block_group->zone_unusable);
> + WARN_ON(block_group->space_info->disk_total
> + < block_group->length * factor);
> + }
> + block_group->space_info->total_bytes -= block_group->length;
> + block_group->space_info->bytes_readonly -=
> + (block_group->length - block_group->zone_unusable);
> + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
> + -block_group->zone_unusable);
> + block_group->space_info->disk_total -= block_group->length * factor;
> +
> + spin_unlock(&block_group->space_info->lock);
> +}
> +
> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> struct btrfs_chunk_map *map)
> {
> @@ -1079,7 +1105,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> struct kobject *kobj = NULL;
> int ret;
> int index;
> - int factor;
> struct btrfs_caching_control *caching_ctl = NULL;
> bool remove_map;
> bool remove_rsv = false;
> @@ -1088,7 +1113,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> if (!block_group)
> return -ENOENT;
>
> - BUG_ON(!block_group->ro);
> + BUG_ON(!block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED));
>
> trace_btrfs_remove_block_group(block_group);
> /*
> @@ -1100,7 +1125,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> block_group->length);
>
> index = btrfs_bg_flags_to_raid_index(block_group->flags);
> - factor = btrfs_bg_type_to_factor(block_group->flags);
>
> /* make sure this block group isn't part of an allocation cluster */
> cluster = &fs_info->data_alloc_cluster;
> @@ -1224,26 +1248,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>
> spin_lock(&block_group->space_info->lock);
> list_del_init(&block_group->ro_list);
> -
> - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
> - WARN_ON(block_group->space_info->total_bytes
> - < block_group->length);
> - WARN_ON(block_group->space_info->bytes_readonly
> - < block_group->length - block_group->zone_unusable);
> - WARN_ON(block_group->space_info->bytes_zone_unusable
> - < block_group->zone_unusable);
> - WARN_ON(block_group->space_info->disk_total
> - < block_group->length * factor);
> - }
> - block_group->space_info->total_bytes -= block_group->length;
> - block_group->space_info->bytes_readonly -=
> - (block_group->length - block_group->zone_unusable);
> - btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
> - -block_group->zone_unusable);
> - block_group->space_info->disk_total -= block_group->length * factor;
> -
> spin_unlock(&block_group->space_info->lock);
>
> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))
> + btrfs_remove_bg_from_sinfo(block_group);
> +
> /*
> * Remove the free space for the block group from the free space tree
> * and the block group's item from the extent tree before marking the
> @@ -1538,6 +1547,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> while (!list_empty(&fs_info->unused_bgs)) {
> u64 used;
> int trimming;
> + bool made_ro = false;
>
> block_group = list_first_entry(&fs_info->unused_bgs,
> struct btrfs_block_group,
> @@ -1574,7 +1584,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>
> spin_lock(&space_info->lock);
> spin_lock(&block_group->lock);
> - if (btrfs_is_block_group_used(block_group) || block_group->ro ||
> + if (btrfs_is_block_group_used(block_group) ||
> + (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) ||
> list_is_singular(&block_group->list)) {
> /*
> * We want to bail if we made new allocations or have
> @@ -1616,9 +1627,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> * needing to allocate extents from the block group.
> */
> used = btrfs_space_info_used(space_info, true);
> - if ((space_info->total_bytes - block_group->length < used &&
> + if (((space_info->total_bytes - block_group->length < used &&
> block_group->zone_unusable < block_group->length) ||
> - has_unwritten_metadata(block_group)) {
> + has_unwritten_metadata(block_group)) &&
> + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
> /*
> * Add a reference for the list, compensate for the ref
> * drop under the "next" label for the
> @@ -1636,8 +1648,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> spin_unlock(&block_group->lock);
> spin_unlock(&space_info->lock);
>
> - /* We don't want to force the issue, only flip if it's ok. */
> - ret = inc_block_group_ro(block_group, 0);
> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
> + /* We don't want to force the issue, only flip if it's ok. */
> + ret = inc_block_group_ro(block_group, 0);
> + made_ro = true;
> + } else {
> + ret = 0;
> + }
> +
> up_write(&space_info->groups_sem);
> if (ret < 0) {
> ret = 0;
> @@ -1646,7 +1664,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>
> ret = btrfs_zone_finish(block_group);
> if (ret < 0) {
> - btrfs_dec_block_group_ro(block_group);
> + if (made_ro)
> + btrfs_dec_block_group_ro(block_group);
> if (ret == -EAGAIN) {
> btrfs_link_bg_list(block_group, &retry_list);
> ret = 0;
> @@ -1661,7 +1680,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> trans = btrfs_start_trans_remove_block_group(fs_info,
> block_group->start);
> if (IS_ERR(trans)) {
> - btrfs_dec_block_group_ro(block_group);
> + if (made_ro)
> + btrfs_dec_block_group_ro(block_group);
> ret = PTR_ERR(trans);
> goto next;
> }
> @@ -1671,7 +1691,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> * just delete them, we don't care about them anymore.
> */
> if (!clean_pinned_extents(trans, block_group)) {
> - btrfs_dec_block_group_ro(block_group);
> + if (made_ro)
> + btrfs_dec_block_group_ro(block_group);
> goto end_trans;
> }
>
> @@ -1685,7 +1706,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
> spin_lock(&fs_info->discard_ctl.lock);
> if (!list_empty(&block_group->discard_list)) {
> spin_unlock(&fs_info->discard_ctl.lock);
> - btrfs_dec_block_group_ro(block_group);
> + if (made_ro)
> + btrfs_dec_block_group_ro(block_group);
> btrfs_discard_queue_work(&fs_info->discard_ctl,
> block_group);
> goto end_trans;
> @@ -1779,6 +1801,15 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
> struct btrfs_fs_info *fs_info = bg->fs_info;
>
> spin_lock(&fs_info->unused_bgs_lock);
> +
> + /* Leave fully remapped block groups on the fully_remapped_bgs list. */
> + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
> + bg->identity_remap_count == 0 &&
> + !list_empty(&bg->bg_list)) {
> + spin_unlock(&fs_info->unused_bgs_lock);
> + return;
> + }
> +
> if (list_empty(&bg->bg_list)) {
> btrfs_get_block_group(bg);
> trace_btrfs_add_unused_block_group(bg);
> @@ -4772,3 +4803,30 @@ bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
> return false;
> return true;
> }
> +
> +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
> + struct btrfs_trans_handle *trans)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + bool already_done;
> +
> + spin_lock(&bg->lock);
> + already_done = bg->fully_remapped;
> + bg->fully_remapped = true;
> + spin_unlock(&bg->lock);
> +
> + if (already_done)
> + return;
> +
> + spin_lock(&fs_info->unused_bgs_lock);
> +
Which list could it be on in this case? The reclaim list? Don't we take
block groups off that list before reclaiming them?
> + if (!list_empty(&bg->bg_list))
> + list_del(&bg->bg_list);
> + else
> + btrfs_get_block_group(bg);
> +
> + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs);
> +
> + spin_unlock(&fs_info->unused_bgs_lock);
> +
> +}
> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
> index af23fdb3cf4d..d85f3c2546d0 100644
> --- a/fs/btrfs/block-group.h
> +++ b/fs/btrfs/block-group.h
> @@ -282,6 +282,7 @@ struct btrfs_block_group {
> struct extent_buffer *last_eb;
> enum btrfs_block_group_size_class size_class;
> u64 reclaim_mark;
> + bool fully_remapped;
> };
>
> static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
> @@ -336,6 +337,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
> struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
> struct btrfs_fs_info *fs_info,
> const u64 chunk_offset);
> +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group);
> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
> struct btrfs_chunk_map *map);
> void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
> @@ -407,5 +409,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
> enum btrfs_block_group_size_class size_class,
> bool force_wrong_size_class);
> bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
> +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
> + struct btrfs_trans_handle *trans);
>
> #endif /* BTRFS_BLOCK_GROUP_H */
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index d3ff148311d8..1a3e525f3d1a 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2870,6 +2870,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
> INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
> INIT_LIST_HEAD(&fs_info->unused_bgs);
> INIT_LIST_HEAD(&fs_info->reclaim_bgs);
> + INIT_LIST_HEAD(&fs_info->fully_remapped_bgs);
> INIT_LIST_HEAD(&fs_info->zone_active_bgs);
> #ifdef CONFIG_BTRFS_DEBUG
> INIT_LIST_HEAD(&fs_info->allocated_roots);
> @@ -2925,6 +2926,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
> mutex_init(&fs_info->chunk_mutex);
> mutex_init(&fs_info->transaction_kthread_mutex);
> mutex_init(&fs_info->cleaner_mutex);
> + mutex_init(&fs_info->remap_mutex);
> mutex_init(&fs_info->ro_block_group_mutex);
> init_rwsem(&fs_info->commit_root_sem);
> init_rwsem(&fs_info->cleanup_work_sem);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index d3ca8105ffc7..1c14e0c82c03 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -40,6 +40,7 @@
> #include "orphan.h"
> #include "tree-checker.h"
> #include "raid-stripe-tree.h"
> +#include "relocation.h"
>
> #undef SCRAMBLE_DELAYED_REFS
>
> @@ -2847,6 +2848,52 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
> return ret;
> }
>
> +int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_block_group *block_group, *tmp;
> + struct list_head *fully_remapped_bgs;
> + int ret;
> +
> + fully_remapped_bgs = &fs_info->fully_remapped_bgs;
> + list_for_each_entry_safe(block_group, tmp, fully_remapped_bgs, bg_list) {
> + struct btrfs_chunk_map *map;
> +
> + map = btrfs_get_chunk_map(fs_info, block_group->start, 1);
> + if (IS_ERR(map))
> + return PTR_ERR(map);
> +
> + ret = btrfs_last_identity_remap_gone(trans, map, block_group);
> + if (ret) {
> + btrfs_free_chunk_map(map);
> + return ret;
> + }
> +
> + /*
> + * Set num_stripes to 0, so that btrfs_remove_dev_extents()
> + * won't run a second time.
> + */
> + map->num_stripes = 0;
> +
> + btrfs_free_chunk_map(map);
> +
> + if (block_group->used == 0) {
> + spin_lock(&fs_info->unused_bgs_lock);
> + list_move_tail(&block_group->bg_list,
> + &fs_info->unused_bgs);
> + spin_unlock(&fs_info->unused_bgs_lock);
> + } else {
> + spin_lock(&fs_info->unused_bgs_lock);
> + list_del_init(&block_group->bg_list);
> + spin_unlock(&fs_info->unused_bgs_lock);
> +
> + btrfs_put_block_group(block_group);
> + }
> + }
> +
> + return 0;
> +}
> +
> int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
> {
> struct btrfs_fs_info *fs_info = trans->fs_info;
> @@ -2999,11 +3046,23 @@ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
> }
>
> static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
> - u64 bytenr, struct btrfs_squota_delta *delta)
> + u64 bytenr, struct btrfs_squota_delta *delta,
> + struct btrfs_path *path)
> {
> int ret;
> + bool remapped = false;
> u64 num_bytes = delta->num_bytes;
>
> + /* returns 1 on success and 0 on no-op */
> + ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr,
> + num_bytes);
> + if (ret < 0) {
> + btrfs_abort_transaction(trans, ret);
> + return ret;
> + } else if (ret == 1) {
> + remapped = true;
> + }
> +
> if (delta->is_data) {
> struct btrfs_root *csum_root;
>
> @@ -3027,10 +3086,16 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> - ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
> - if (unlikely(ret)) {
> - btrfs_abort_transaction(trans, ret);
> - return ret;
> + /*
> + * If remapped, FST has already been taken care of in
> + * remove_range_from_remap_tree().
> + */
> + if (!remapped) {
> + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
> + if (unlikely(ret)) {
> + btrfs_abort_transaction(trans, ret);
> + return ret;
> + }
> }
>
> ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
> @@ -3396,7 +3461,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
> }
> btrfs_release_path(path);
>
> - ret = do_free_extent_accounting(trans, bytenr, &delta);
> + ret = do_free_extent_accounting(trans, bytenr, &delta, path);
> }
> btrfs_release_path(path);
>
> diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
> index e970ac42a871..6b67a4e528da 100644
> --- a/fs/btrfs/extent-tree.h
> +++ b/fs/btrfs/extent-tree.h
> @@ -164,5 +164,6 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
> int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
> u64 num_bytes, u64 *actual_bytes);
> int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
> +int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans);
>
> #endif
> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
> index 62057e8006a9..c3dacbfe118c 100644
> --- a/fs/btrfs/fs.h
> +++ b/fs/btrfs/fs.h
> @@ -573,6 +573,7 @@ struct btrfs_fs_info {
> struct mutex transaction_kthread_mutex;
> struct mutex cleaner_mutex;
> struct mutex chunk_mutex;
> + struct mutex remap_mutex;
>
> /*
> * This is taken to make sure we don't set block groups ro after the
> @@ -827,10 +828,11 @@ struct btrfs_fs_info {
> struct list_head reclaim_bgs;
> int bg_reclaim_threshold;
>
> - /* Protects the lists unused_bgs and reclaim_bgs. */
> + /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */
> spinlock_t unused_bgs_lock;
> /* Protected by unused_bgs_lock. */
> struct list_head unused_bgs;
> + struct list_head fully_remapped_bgs;
> struct mutex unused_bg_unpin_mutex;
> /* Protect block groups that are going to be deleted */
> struct mutex reclaim_bgs_lock;
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index a8abe24de8d7..9f3ce3395d6a 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -37,6 +37,7 @@
> #include "super.h"
> #include "tree-checker.h"
> #include "raid-stripe-tree.h"
> +#include "free-space-tree.h"
>
> /*
> * Relocation overview
> @@ -3870,6 +3871,151 @@ static const char *stage_to_string(enum reloc_stage stage)
> return "unknown";
> }
>
> +static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
> + struct btrfs_block_group *bg,
> + s64 diff)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + bool bg_already_dirty = true, mark_unused = false;
> +
> + spin_lock(&bg->lock);
> +
> + bg->remap_bytes += diff;
> +
> + if (bg->used == 0 && bg->remap_bytes == 0)
> + mark_unused = true;
> +
> + spin_unlock(&bg->lock);
> +
> + if (mark_unused)
> + btrfs_mark_bg_unused(bg);
> +
> + spin_lock(&trans->transaction->dirty_bgs_lock);
> + if (list_empty(&bg->dirty_list)) {
> + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
> + bg_already_dirty = false;
> + btrfs_get_block_group(bg);
> + }
> + spin_unlock(&trans->transaction->dirty_bgs_lock);
> +
> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
> + if (!bg_already_dirty)
> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> +}
> +
> +static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *chunk,
> + struct btrfs_path *path)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + struct btrfs_chunk *c;
> + int ret;
> +
> + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> + key.type = BTRFS_CHUNK_ITEM_KEY;
> + key.offset = chunk->start;
> +
> + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path,
> + 0, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + return ret;
> + }
> +
> + leaf = path->nodes[0];
> +
> + c = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
> + btrfs_set_chunk_num_stripes(leaf, c, 0);
> + btrfs_set_chunk_sub_stripes(leaf, c, 0);
> +
> + btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe),
> + 1);
> +
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + btrfs_release_path(path);
> +
> + return 0;
> +}
> +
> +int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *chunk,
> + struct btrfs_block_group *bg)
> +{
> + int ret;
> + BTRFS_PATH_AUTO_FREE(path);
> +
> + ret = btrfs_remove_dev_extents(trans, chunk);
> + if (ret)
> + return ret;
> +
> + mutex_lock(&trans->fs_info->chunk_mutex);
> +
> + for (unsigned int i = 0; i < chunk->num_stripes; i++) {
> + ret = btrfs_update_device(trans, chunk->stripes[i].dev);
> + if (ret) {
> + mutex_unlock(&trans->fs_info->chunk_mutex);
> + return ret;
> + }
> + }
> +
> + mutex_unlock(&trans->fs_info->chunk_mutex);
> +
> + write_lock(&trans->fs_info->mapping_tree_lock);
> + btrfs_chunk_map_device_clear_bits(chunk, CHUNK_ALLOCATED);
> + write_unlock(&trans->fs_info->mapping_tree_lock);
> +
> + btrfs_remove_bg_from_sinfo(bg);
> +
> + path = btrfs_alloc_path();
> + if (!path)
> + return -ENOMEM;
> +
> + ret = remove_chunk_stripes(trans, chunk, path);
> + if (ret)
> + return ret;
> +
> + return 0;
> +}
> +
> +static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
> + struct btrfs_block_group *bg, int delta)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + bool bg_already_dirty = true, mark_fully_remapped = false;
> +
> + WARN_ON(delta < 0 && -delta > bg->identity_remap_count);
> +
> + spin_lock(&bg->lock);
> +
> + bg->identity_remap_count += delta;
> +
> + if (!bg->fully_remapped && bg->identity_remap_count == 0)
> + mark_fully_remapped = true;
> +
> + spin_unlock(&bg->lock);
> +
> + spin_lock(&trans->transaction->dirty_bgs_lock);
> + if (list_empty(&bg->dirty_list)) {
> + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
> + bg_already_dirty = false;
> + btrfs_get_block_group(bg);
> + }
> + spin_unlock(&trans->transaction->dirty_bgs_lock);
> +
> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
> + if (!bg_already_dirty)
> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> +
> + if (mark_fully_remapped)
> + btrfs_mark_bg_fully_remapped(bg, trans);
> +}
> +
> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> u64 *length)
> {
> @@ -4478,3 +4624,366 @@ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
> logical = fs_info->reloc_ctl->block_group->start;
> return logical;
> }
> +
Please document the expectations on the path passed in (and on the other parameters).
> +static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg,
> + u64 bytenr, u64 num_bytes)
> +{
> + int ret;
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct extent_buffer *leaf = path->nodes[0];
> + struct btrfs_key key, new_key;
> + struct btrfs_remap *remap_ptr = NULL, remap;
> + struct btrfs_block_group *dest_bg = NULL;
> + u64 end, new_addr = 0, remap_start, remap_length, overlap_length;
> + bool is_identity_remap;
> +
> + end = bytenr + num_bytes;
> +
> + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
> +
> + is_identity_remap = key.type == BTRFS_IDENTITY_REMAP_KEY;
> +
> + remap_start = key.objectid;
> + remap_length = key.offset;
> +
> + if (!is_identity_remap) {
> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
> + struct btrfs_remap);
> + new_addr = btrfs_remap_address(leaf, remap_ptr);
> +
> + dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
> + }
> +
These open-coded cases are all quite large and, I suspect, partly redundant
(I'll try to pin down concretely how), so it would be more readable to break
them out into static functions (remove_remapping(), trim_remapping(), etc.).
The most readable approach might be to simply always delete the entire
remapping item and insert the needed new ones, rather than modifying it in place.
> + if (bytenr == remap_start && num_bytes >= remap_length) {
> + /* Remove entirely. */
> +
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> + if (ret)
> + goto end;
> +
> + btrfs_release_path(path);
> +
> + overlap_length = remap_length;
> +
> + if (!is_identity_remap) {
> + /* Remove backref. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = remap_length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root,
> + &key, path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> +
> + btrfs_release_path(path);
> +
> + if (ret)
> + goto end;
> +
> + adjust_block_group_remap_bytes(trans, dest_bg,
> + -remap_length);
> + } else {
> + adjust_identity_remap_count(trans, bg, -1);
> + }
> + } else if (bytenr == remap_start) {
> + /* Remove beginning. */
> +
> + new_key.objectid = end;
> + new_key.type = key.type;
> + new_key.offset = remap_length + remap_start - end;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + overlap_length = num_bytes;
> +
> + if (!is_identity_remap) {
> + btrfs_set_remap_address(leaf, remap_ptr,
> + new_addr + end - remap_start);
> + btrfs_release_path(path);
> +
> + /* Adjust backref. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = remap_length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root,
> + &key, path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + leaf = path->nodes[0];
> +
> + new_key.objectid = new_addr + end - remap_start;
> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
> + new_key.offset = remap_length + remap_start - end;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> +
> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
> + struct btrfs_remap);
> + btrfs_set_remap_address(leaf, remap_ptr, end);
> +
> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
> +
> + btrfs_release_path(path);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg,
> + -num_bytes);
> + }
> + } else if (bytenr + num_bytes < remap_start + remap_length) {
> + /* Remove middle. */
> +
> + new_key.objectid = remap_start;
> + new_key.type = key.type;
> + new_key.offset = bytenr - remap_start;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + new_key.objectid = end;
> + new_key.offset = remap_start + remap_length - end;
> +
> + btrfs_release_path(path);
> +
> + overlap_length = num_bytes;
> +
> + if (!is_identity_remap) {
> + /* Add second remap entry. */
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &new_key,
> + sizeof(struct btrfs_remap));
> + if (ret)
> + goto end;
> +
> + btrfs_set_stack_remap_address(&remap,
> + new_addr + end - remap_start);
> +
> + write_extent_buffer(path->nodes[0], &remap,
> + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + /* Shorten backref entry. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = remap_length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root,
> + &key, path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + new_key.objectid = new_addr;
> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
> + new_key.offset = bytenr - remap_start;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
> +
> + btrfs_release_path(path);
> +
> + /* Add second backref entry. */
> +
> + new_key.objectid = new_addr + end - remap_start;
> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
> + new_key.offset = remap_start + remap_length - end;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &new_key,
> + sizeof(struct btrfs_remap));
> + if (ret)
> + goto end;
> +
> + btrfs_set_stack_remap_address(&remap, end);
> +
> + write_extent_buffer(path->nodes[0], &remap,
> + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg,
> + -num_bytes);
> + } else {
> + /* Add second identity remap entry. */
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &new_key, 0);
> + if (ret)
> + goto end;
> +
> + btrfs_release_path(path);
> +
> + adjust_identity_remap_count(trans, bg, 1);
> + }
> + } else {
> + /* Remove end. */
> +
> + new_key.objectid = remap_start;
> + new_key.type = key.type;
> + new_key.offset = bytenr - remap_start;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + btrfs_release_path(path);
> +
> + overlap_length = remap_start + remap_length - bytenr;
> +
> + if (!is_identity_remap) {
> + /* Shorten backref entry. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = remap_length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root,
> + &key, path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + new_key.objectid = new_addr;
> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
> + new_key.offset = bytenr - remap_start;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
> +
> + btrfs_release_path(path);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg,
> + bytenr - remap_start - remap_length);
> + }
> + }
> +
> + if (!is_identity_remap) {
> + ret = btrfs_add_to_free_space_tree(trans,
> + bytenr - remap_start + new_addr,
> + overlap_length);
> + if (ret)
> + goto end;
> + }
Why do this here instead of just letting it happen normally in
do_free_extent_accounting() (where you added the new skipping logic)?
> +
> + ret = overlap_length;
> +
> +end:
> + if (dest_bg)
> + btrfs_put_block_group(dest_bg);
> +
> + return ret;
> +}
> +
> +/*
> + * Returns 1 if remove_range_from_remap_tree() has been called successfully,
> + * 0 if block group wasn't remapped, and a negative number on error.
> + */
> +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + u64 bytenr, u64 num_bytes)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_key key, found_key;
> + struct extent_buffer *leaf;
> + struct btrfs_block_group *bg;
> + int ret, length;
> +
> + if (!(btrfs_super_incompat_flags(fs_info->super_copy) &
> + BTRFS_FEATURE_INCOMPAT_REMAP_TREE))
> + return 0;
> +
> + bg = btrfs_lookup_block_group(fs_info, bytenr);
> + if (!bg)
> + return 0;
> +
> + mutex_lock(&fs_info->remap_mutex);
> +
> + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
> + mutex_unlock(&fs_info->remap_mutex);
> + btrfs_put_block_group(bg);
> + return 0;
> + }
> +
> + do {
> + key.objectid = bytenr;
> + key.type = (u8)-1;
> + key.offset = (u64)-1;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path,
> + -1, 1);
> + if (ret < 0)
> + goto end;
> +
> + leaf = path->nodes[0];
> +
> + if (path->slots[0] == 0) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + path->slots[0]--;
> +
> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> +
> + if (found_key.type != BTRFS_IDENTITY_REMAP_KEY &&
> + found_key.type != BTRFS_REMAP_KEY) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + if (bytenr < found_key.objectid ||
> + bytenr >= found_key.objectid + found_key.offset) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + length = remove_range_from_remap_tree(trans, path, bg, bytenr,
> + num_bytes);
> + if (length < 0) {
> + ret = length;
> + goto end;
> + }
> +
> + bytenr += length;
> + num_bytes -= length;
> + } while (num_bytes > 0);
> +
> + ret = 1;
> +
> +end:
> + mutex_unlock(&fs_info->remap_mutex);
> +
> + btrfs_put_block_group(bg);
> + btrfs_release_path(path);
> + return ret;
> +}
> diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
> index b2ba83966650..7cfe91971cab 100644
> --- a/fs/btrfs/relocation.h
> +++ b/fs/btrfs/relocation.h
> @@ -33,5 +33,11 @@ bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
> u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> u64 *length);
> +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + u64 bytenr, u64 num_bytes);
> +int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *chunk,
> + struct btrfs_block_group *bg);
>
> #endif
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index de3eeb37408a..ffee6c285182 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -2437,6 +2437,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
> if (ret)
> goto unlock_reloc;
>
Does this need to be in the transaction critical section, or can it
happen asynchronously like other bg cleanup tasks?
If it does need to run inside the critical section, that should be
justified in the commit message.
> + ret = btrfs_handle_fully_remapped_bgs(trans);
> + if (ret)
> + goto unlock_reloc;
> +
> /*
> * make sure none of the code above managed to slip in a
> * delayed item
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index d117f74e08c1..99ad95e1c300 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -2929,8 +2929,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
> return ret;
> }
>
> -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
> - struct btrfs_device *device)
> +int btrfs_update_device(struct btrfs_trans_handle *trans,
> + struct btrfs_device *device)
> {
> int ret;
> BTRFS_PATH_AUTO_FREE(path);
> @@ -3228,25 +3228,13 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
> return btrfs_free_chunk(trans, chunk_offset);
> }
>
> -int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
> +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *map)
> {
> struct btrfs_fs_info *fs_info = trans->fs_info;
> - struct btrfs_chunk_map *map;
> + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> u64 dev_extent_len = 0;
> int i, ret = 0;
> - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
> -
> - map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
> - if (IS_ERR(map)) {
> - /*
> - * This is a logic error, but we don't want to just rely on the
> - * user having built with ASSERT enabled, so if ASSERT doesn't
> - * do anything we still error out.
> - */
> - DEBUG_WARN("errr %ld reading chunk map at offset %llu",
> - PTR_ERR(map), chunk_offset);
> - return PTR_ERR(map);
> - }
>
> /*
> * First delete the device extent items from the devices btree.
> @@ -3267,7 +3255,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
> if (unlikely(ret)) {
> mutex_unlock(&fs_devices->device_list_mutex);
> btrfs_abort_transaction(trans, ret);
> - goto out;
> + return ret;
> }
>
> if (device->bytes_used > 0) {
> @@ -3287,6 +3275,30 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
> }
> mutex_unlock(&fs_devices->device_list_mutex);
>
> + return 0;
> +}
> +
> +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_chunk_map *map;
> + int ret;
> +
> + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
> + if (IS_ERR(map)) {
> + /*
> + * This is a logic error, but we don't want to just rely on the
> + * user having built with ASSERT enabled, so if ASSERT doesn't
> + * do anything we still error out.
> + */
> + ASSERT(0);
> + return PTR_ERR(map);
> + }
> +
> + ret = btrfs_remove_dev_extents(trans, map);
> + if (ret)
> + goto out;
> +
> /*
> * We acquire fs_info->chunk_mutex for 2 reasons:
> *
> @@ -5422,7 +5434,7 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
> }
> }
>
> -static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
> +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
> {
> for (int i = 0; i < map->num_stripes; i++) {
> struct btrfs_io_stripe *stripe = &map->stripes[i];
> @@ -5439,7 +5451,7 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma
> write_lock(&fs_info->mapping_tree_lock);
> rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
> RB_CLEAR_NODE(&map->rb_node);
> - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
> + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
> write_unlock(&fs_info->mapping_tree_lock);
>
> /* Once for the tree reference. */
> @@ -5475,7 +5487,7 @@ int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *m
> return -EEXIST;
> }
> chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
> - chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
> + btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
> write_unlock(&fs_info->mapping_tree_lock);
>
> return 0;
> @@ -5840,7 +5852,7 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
> map = rb_entry(node, struct btrfs_chunk_map, rb_node);
> rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
> RB_CLEAR_NODE(&map->rb_node);
> - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
> + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
> /* Once for the tree ref. */
> btrfs_free_chunk_map(map);
> cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 7cf76bffcab6..0c64cae59f1c 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -794,6 +794,8 @@ u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
> int btrfs_nr_parity_stripes(u64 type);
> int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
> struct btrfs_block_group *bg);
> +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
> + struct btrfs_chunk_map *map);
> int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
>
> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> @@ -905,6 +907,10 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
>
> bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
> const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
> +int btrfs_update_device(struct btrfs_trans_handle *trans,
> + struct btrfs_device *device);
> +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map,
> + unsigned int bits);
>
> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 09/16] btrfs: handle deletions from remapped block group
2025-10-31 23:30 ` Boris Burkov
@ 2025-11-04 12:30 ` Mark Harmstone
0 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-11-04 12:30 UTC (permalink / raw)
To: Boris Burkov; +Cc: linux-btrfs
On 31/10/2025 11.30 pm, Boris Burkov wrote:
> On Fri, Oct 24, 2025 at 07:12:10PM +0100, Mark Harmstone wrote:
>> Handle the case where we free an extent from a block group that has the
>> REMAPPED flag set. Because the remap tree is orthogonal to the extent
>> tree, for data this may be within any number of identity remaps or
>> actual remaps. If we're freeing a metadata node, this will be wholly
>> inside one or the other.
>>
>> btrfs_remove_extent_from_remap_tree() searches the remap tree for the
>> remaps that cover the range in question, then calls
>> remove_range_from_remap_tree() for each one, to punch a hole in the
>> remap and adjust the free-space tree.
>>
>> For an identity remap, remove_range_from_remap_tree() will adjust the
>> block group's `identity_remap_count` if this changes. If it reaches
>> zero we call last_identity_remap_gone(), which removes the chunk's
>> stripes and device extents - it is now fully remapped.
>>
>> The changes which involve the block group's ro flag are because the
>> REMAPPED flag itself prevents a block group from having any new
>> allocations within it, and so we don't need to account for this
>> separately.
>>
>> Signed-off-by: Mark Harmstone <mark@harmstone.com>
>> ---
>> fs/btrfs/block-group.c | 118 +++++++---
>> fs/btrfs/block-group.h | 4 +
>> fs/btrfs/disk-io.c | 2 +
>> fs/btrfs/extent-tree.c | 77 ++++++-
>> fs/btrfs/extent-tree.h | 1 +
>> fs/btrfs/fs.h | 4 +-
>> fs/btrfs/relocation.c | 509 +++++++++++++++++++++++++++++++++++++++++
>> fs/btrfs/relocation.h | 6 +
>> fs/btrfs/transaction.c | 4 +
>> fs/btrfs/volumes.c | 56 +++--
>> fs/btrfs/volumes.h | 6 +
>> 11 files changed, 728 insertions(+), 59 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index 27173aca6fc1..3bf5f20d90ec 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -1068,6 +1068,32 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
>> return ret;
>> }
>>
>> +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group)
>> +{
>> + int factor = btrfs_bg_type_to_factor(block_group->flags);
>> +
>> + spin_lock(&block_group->space_info->lock);
>> +
>> + if (btrfs_test_opt(block_group->fs_info, ENOSPC_DEBUG)) {
>> + WARN_ON(block_group->space_info->total_bytes
>> + < block_group->length);
>> + WARN_ON(block_group->space_info->bytes_readonly
>> + < block_group->length - block_group->zone_unusable);
>> + WARN_ON(block_group->space_info->bytes_zone_unusable
>> + < block_group->zone_unusable);
>> + WARN_ON(block_group->space_info->disk_total
>> + < block_group->length * factor);
>> + }
>> + block_group->space_info->total_bytes -= block_group->length;
>> + block_group->space_info->bytes_readonly -=
>> + (block_group->length - block_group->zone_unusable);
>> + btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
>> + -block_group->zone_unusable);
>> + block_group->space_info->disk_total -= block_group->length * factor;
>> +
>> + spin_unlock(&block_group->space_info->lock);
>> +}
>> +
>> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> struct btrfs_chunk_map *map)
>> {
>> @@ -1079,7 +1105,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> struct kobject *kobj = NULL;
>> int ret;
>> int index;
>> - int factor;
>> struct btrfs_caching_control *caching_ctl = NULL;
>> bool remove_map;
>> bool remove_rsv = false;
>> @@ -1088,7 +1113,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> if (!block_group)
>> return -ENOENT;
>>
>> - BUG_ON(!block_group->ro);
>> + BUG_ON(!block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED));
>>
>> trace_btrfs_remove_block_group(block_group);
>> /*
>> @@ -1100,7 +1125,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> block_group->length);
>>
>> index = btrfs_bg_flags_to_raid_index(block_group->flags);
>> - factor = btrfs_bg_type_to_factor(block_group->flags);
>>
>> /* make sure this block group isn't part of an allocation cluster */
>> cluster = &fs_info->data_alloc_cluster;
>> @@ -1224,26 +1248,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>>
>> spin_lock(&block_group->space_info->lock);
>> list_del_init(&block_group->ro_list);
>> -
>> - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
>> - WARN_ON(block_group->space_info->total_bytes
>> - < block_group->length);
>> - WARN_ON(block_group->space_info->bytes_readonly
>> - < block_group->length - block_group->zone_unusable);
>> - WARN_ON(block_group->space_info->bytes_zone_unusable
>> - < block_group->zone_unusable);
>> - WARN_ON(block_group->space_info->disk_total
>> - < block_group->length * factor);
>> - }
>> - block_group->space_info->total_bytes -= block_group->length;
>> - block_group->space_info->bytes_readonly -=
>> - (block_group->length - block_group->zone_unusable);
>> - btrfs_space_info_update_bytes_zone_unusable(block_group->space_info,
>> - -block_group->zone_unusable);
>> - block_group->space_info->disk_total -= block_group->length * factor;
>> -
>> spin_unlock(&block_group->space_info->lock);
>>
>> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))
>> + btrfs_remove_bg_from_sinfo(block_group);
>> +
>> /*
>> * Remove the free space for the block group from the free space tree
>> * and the block group's item from the extent tree before marking the
>> @@ -1538,6 +1547,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> while (!list_empty(&fs_info->unused_bgs)) {
>> u64 used;
>> int trimming;
>> + bool made_ro = false;
>>
>> block_group = list_first_entry(&fs_info->unused_bgs,
>> struct btrfs_block_group,
>> @@ -1574,7 +1584,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>>
>> spin_lock(&space_info->lock);
>> spin_lock(&block_group->lock);
>> - if (btrfs_is_block_group_used(block_group) || block_group->ro ||
>> + if (btrfs_is_block_group_used(block_group) ||
>> + (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) ||
>> list_is_singular(&block_group->list)) {
>> /*
>> * We want to bail if we made new allocations or have
>> @@ -1616,9 +1627,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> * needing to allocate extents from the block group.
>> */
>> used = btrfs_space_info_used(space_info, true);
>> - if ((space_info->total_bytes - block_group->length < used &&
>> + if (((space_info->total_bytes - block_group->length < used &&
>> block_group->zone_unusable < block_group->length) ||
>> - has_unwritten_metadata(block_group)) {
>> + has_unwritten_metadata(block_group)) &&
>> + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
>> /*
>> * Add a reference for the list, compensate for the ref
>> * drop under the "next" label for the
>> @@ -1636,8 +1648,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> spin_unlock(&block_group->lock);
>> spin_unlock(&space_info->lock);
>>
>> - /* We don't want to force the issue, only flip if it's ok. */
>> - ret = inc_block_group_ro(block_group, 0);
>> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
>> + /* We don't want to force the issue, only flip if it's ok. */
>> + ret = inc_block_group_ro(block_group, 0);
>> + made_ro = true;
>> + } else {
>> + ret = 0;
>> + }
>> +
>> up_write(&space_info->groups_sem);
>> if (ret < 0) {
>> ret = 0;
>> @@ -1646,7 +1664,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>>
>> ret = btrfs_zone_finish(block_group);
>> if (ret < 0) {
>> - btrfs_dec_block_group_ro(block_group);
>> + if (made_ro)
>> + btrfs_dec_block_group_ro(block_group);
>> if (ret == -EAGAIN) {
>> btrfs_link_bg_list(block_group, &retry_list);
>> ret = 0;
>> @@ -1661,7 +1680,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> trans = btrfs_start_trans_remove_block_group(fs_info,
>> block_group->start);
>> if (IS_ERR(trans)) {
>> - btrfs_dec_block_group_ro(block_group);
>> + if (made_ro)
>> + btrfs_dec_block_group_ro(block_group);
>> ret = PTR_ERR(trans);
>> goto next;
>> }
>> @@ -1671,7 +1691,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> * just delete them, we don't care about them anymore.
>> */
>> if (!clean_pinned_extents(trans, block_group)) {
>> - btrfs_dec_block_group_ro(block_group);
>> + if (made_ro)
>> + btrfs_dec_block_group_ro(block_group);
>> goto end_trans;
>> }
>>
>> @@ -1685,7 +1706,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
>> spin_lock(&fs_info->discard_ctl.lock);
>> if (!list_empty(&block_group->discard_list)) {
>> spin_unlock(&fs_info->discard_ctl.lock);
>> - btrfs_dec_block_group_ro(block_group);
>> + if (made_ro)
>> + btrfs_dec_block_group_ro(block_group);
>> btrfs_discard_queue_work(&fs_info->discard_ctl,
>> block_group);
>> goto end_trans;
>> @@ -1779,6 +1801,15 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
>> struct btrfs_fs_info *fs_info = bg->fs_info;
>>
>> spin_lock(&fs_info->unused_bgs_lock);
>> +
>> + /* Leave fully remapped block groups on the fully_remapped_bgs list. */
>> + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
>> + bg->identity_remap_count == 0 &&
>> + !list_empty(&bg->bg_list)) {
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> + return;
>> + }
>> +
>> if (list_empty(&bg->bg_list)) {
>> btrfs_get_block_group(bg);
>> trace_btrfs_add_unused_block_group(bg);
>> @@ -4772,3 +4803,30 @@ bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg)
>> return false;
>> return true;
>> }
>> +
>> +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
>> + struct btrfs_trans_handle *trans)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + bool already_done;
>> +
>> + spin_lock(&bg->lock);
>> + already_done = bg->fully_remapped;
>> + bg->fully_remapped = true;
>> + spin_unlock(&bg->lock);
>> +
>> + if (already_done)
>> + return;
>> +
>> + spin_lock(&fs_info->unused_bgs_lock);
>> +
>
> which list could it be on in this case? reclaim? do we not take those
> off before reclaiming them?
Probably none, I think — I've just copied what we do elsewhere. I'll
replace the check with ASSERT(list_empty(&bg->bg_list)), so we know if
something does go wrong.
>> + if (!list_empty(&bg->bg_list))
>> + list_del(&bg->bg_list);
>> + else
>> + btrfs_get_block_group(bg);
>> +
>> + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs);
>> +
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> +
>> +}
>> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
>> index af23fdb3cf4d..d85f3c2546d0 100644
>> --- a/fs/btrfs/block-group.h
>> +++ b/fs/btrfs/block-group.h
>> @@ -282,6 +282,7 @@ struct btrfs_block_group {
>> struct extent_buffer *last_eb;
>> enum btrfs_block_group_size_class size_class;
>> u64 reclaim_mark;
>> + bool fully_remapped;
>> };
>>
>> static inline u64 btrfs_block_group_end(const struct btrfs_block_group *block_group)
>> @@ -336,6 +337,7 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group,
>> struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
>> struct btrfs_fs_info *fs_info,
>> const u64 chunk_offset);
>> +void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *block_group);
>> int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>> struct btrfs_chunk_map *map);
>> void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
>> @@ -407,5 +409,7 @@ int btrfs_use_block_group_size_class(struct btrfs_block_group *bg,
>> enum btrfs_block_group_size_class size_class,
>> bool force_wrong_size_class);
>> bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg);
>> +void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
>> + struct btrfs_trans_handle *trans);
>>
>> #endif /* BTRFS_BLOCK_GROUP_H */
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index d3ff148311d8..1a3e525f3d1a 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -2870,6 +2870,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>> INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
>> INIT_LIST_HEAD(&fs_info->unused_bgs);
>> INIT_LIST_HEAD(&fs_info->reclaim_bgs);
>> + INIT_LIST_HEAD(&fs_info->fully_remapped_bgs);
>> INIT_LIST_HEAD(&fs_info->zone_active_bgs);
>> #ifdef CONFIG_BTRFS_DEBUG
>> INIT_LIST_HEAD(&fs_info->allocated_roots);
>> @@ -2925,6 +2926,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
>> mutex_init(&fs_info->chunk_mutex);
>> mutex_init(&fs_info->transaction_kthread_mutex);
>> mutex_init(&fs_info->cleaner_mutex);
>> + mutex_init(&fs_info->remap_mutex);
>> mutex_init(&fs_info->ro_block_group_mutex);
>> init_rwsem(&fs_info->commit_root_sem);
>> init_rwsem(&fs_info->cleanup_work_sem);
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index d3ca8105ffc7..1c14e0c82c03 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -40,6 +40,7 @@
>> #include "orphan.h"
>> #include "tree-checker.h"
>> #include "raid-stripe-tree.h"
>> +#include "relocation.h"
>>
>> #undef SCRAMBLE_DELAYED_REFS
>>
>> @@ -2847,6 +2848,52 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
>> return ret;
>> }
>>
>> +int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_block_group *block_group, *tmp;
>> + struct list_head *fully_remapped_bgs;
>> + int ret;
>> +
>> + fully_remapped_bgs = &fs_info->fully_remapped_bgs;
>> + list_for_each_entry_safe(block_group, tmp, fully_remapped_bgs, bg_list) {
>> + struct btrfs_chunk_map *map;
>> +
>> + map = btrfs_get_chunk_map(fs_info, block_group->start, 1);
>> + if (IS_ERR(map))
>> + return PTR_ERR(map);
>> +
>> + ret = btrfs_last_identity_remap_gone(trans, map, block_group);
>> + if (ret) {
>> + btrfs_free_chunk_map(map);
>> + return ret;
>> + }
>> +
>> + /*
>> + * Set num_stripes to 0, so that btrfs_remove_dev_extents()
>> + * won't run a second time.
>> + */
>> + map->num_stripes = 0;
>> +
>> + btrfs_free_chunk_map(map);
>> +
>> + if (block_group->used == 0) {
>> + spin_lock(&fs_info->unused_bgs_lock);
>> + list_move_tail(&block_group->bg_list,
>> + &fs_info->unused_bgs);
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> + } else {
>> + spin_lock(&fs_info->unused_bgs_lock);
>> + list_del_init(&block_group->bg_list);
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> +
>> + btrfs_put_block_group(block_group);
>> + }
>> + }
>> +
>> + return 0;
>> +}
>> +
>> int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
>> {
>> struct btrfs_fs_info *fs_info = trans->fs_info;
>> @@ -2999,11 +3046,23 @@ u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info,
>> }
>>
>> static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
>> - u64 bytenr, struct btrfs_squota_delta *delta)
>> + u64 bytenr, struct btrfs_squota_delta *delta,
>> + struct btrfs_path *path)
>> {
>> int ret;
>> + bool remapped = false;
>> u64 num_bytes = delta->num_bytes;
>>
>> + /* returns 1 on success and 0 on no-op */
>> + ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr,
>> + num_bytes);
>> + if (ret < 0) {
>> + btrfs_abort_transaction(trans, ret);
>> + return ret;
>> + } else if (ret == 1) {
>> + remapped = true;
>> + }
>> +
>> if (delta->is_data) {
>> struct btrfs_root *csum_root;
>>
>> @@ -3027,10 +3086,16 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans,
>> return ret;
>> }
>>
>> - ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
>> - if (unlikely(ret)) {
>> - btrfs_abort_transaction(trans, ret);
>> - return ret;
>> + /*
>> + * If remapped, FST has already been taken care of in
>> + * remove_range_from_remap_tree().
>> + */
>> + if (!remapped) {
>> + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes);
>> + if (unlikely(ret)) {
>> + btrfs_abort_transaction(trans, ret);
>> + return ret;
>> + }
>> }
>>
>> ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
>> @@ -3396,7 +3461,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
>> }
>> btrfs_release_path(path);
>>
>> - ret = do_free_extent_accounting(trans, bytenr, &delta);
>> + ret = do_free_extent_accounting(trans, bytenr, &delta, path);
>> }
>> btrfs_release_path(path);
>>
>> diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
>> index e970ac42a871..6b67a4e528da 100644
>> --- a/fs/btrfs/extent-tree.h
>> +++ b/fs/btrfs/extent-tree.h
>> @@ -164,5 +164,6 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6
>> int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
>> u64 num_bytes, u64 *actual_bytes);
>> int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
>> +int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans);
>>
>> #endif
>> diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
>> index 62057e8006a9..c3dacbfe118c 100644
>> --- a/fs/btrfs/fs.h
>> +++ b/fs/btrfs/fs.h
>> @@ -573,6 +573,7 @@ struct btrfs_fs_info {
>> struct mutex transaction_kthread_mutex;
>> struct mutex cleaner_mutex;
>> struct mutex chunk_mutex;
>> + struct mutex remap_mutex;
>>
>> /*
>> * This is taken to make sure we don't set block groups ro after the
>> @@ -827,10 +828,11 @@ struct btrfs_fs_info {
>> struct list_head reclaim_bgs;
>> int bg_reclaim_threshold;
>>
>> - /* Protects the lists unused_bgs and reclaim_bgs. */
>> + /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */
>> spinlock_t unused_bgs_lock;
>> /* Protected by unused_bgs_lock. */
>> struct list_head unused_bgs;
>> + struct list_head fully_remapped_bgs;
>> struct mutex unused_bg_unpin_mutex;
>> /* Protect block groups that are going to be deleted */
>> struct mutex reclaim_bgs_lock;
>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>> index a8abe24de8d7..9f3ce3395d6a 100644
>> --- a/fs/btrfs/relocation.c
>> +++ b/fs/btrfs/relocation.c
>> @@ -37,6 +37,7 @@
>> #include "super.h"
>> #include "tree-checker.h"
>> #include "raid-stripe-tree.h"
>> +#include "free-space-tree.h"
>>
>> /*
>> * Relocation overview
>> @@ -3870,6 +3871,151 @@ static const char *stage_to_string(enum reloc_stage stage)
>> return "unknown";
>> }
>>
>> +static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
>> + struct btrfs_block_group *bg,
>> + s64 diff)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + bool bg_already_dirty = true, mark_unused = false;
>> +
>> + spin_lock(&bg->lock);
>> +
>> + bg->remap_bytes += diff;
>> +
>> + if (bg->used == 0 && bg->remap_bytes == 0)
>> + mark_unused = true;
>> +
>> + spin_unlock(&bg->lock);
>> +
>> + if (mark_unused)
>> + btrfs_mark_bg_unused(bg);
>> +
>> + spin_lock(&trans->transaction->dirty_bgs_lock);
>> + if (list_empty(&bg->dirty_list)) {
>> + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
>> + bg_already_dirty = false;
>> + btrfs_get_block_group(bg);
>> + }
>> + spin_unlock(&trans->transaction->dirty_bgs_lock);
>> +
>> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
>> + if (!bg_already_dirty)
>> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
>> +}
>> +
>> +static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *chunk,
>> + struct btrfs_path *path)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_key key;
>> + struct extent_buffer *leaf;
>> + struct btrfs_chunk *c;
>> + int ret;
>> +
>> + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
>> + key.type = BTRFS_CHUNK_ITEM_KEY;
>> + key.offset = chunk->start;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path,
>> + 0, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + return ret;
>> + }
>> +
>> + leaf = path->nodes[0];
>> +
>> + c = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
>> + btrfs_set_chunk_num_stripes(leaf, c, 0);
>> + btrfs_set_chunk_sub_stripes(leaf, c, 0);
>> +
>> + btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe),
>> + 1);
>> +
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + btrfs_release_path(path);
>> +
>> + return 0;
>> +}
>> +
>> +int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *chunk,
>> + struct btrfs_block_group *bg)
>> +{
>> + int ret;
>> + BTRFS_PATH_AUTO_FREE(path);
>> +
>> + ret = btrfs_remove_dev_extents(trans, chunk);
>> + if (ret)
>> + return ret;
>> +
>> + mutex_lock(&trans->fs_info->chunk_mutex);
>> +
>> + for (unsigned int i = 0; i < chunk->num_stripes; i++) {
>> + ret = btrfs_update_device(trans, chunk->stripes[i].dev);
>> + if (ret) {
>> + mutex_unlock(&trans->fs_info->chunk_mutex);
>> + return ret;
>> + }
>> + }
>> +
>> + mutex_unlock(&trans->fs_info->chunk_mutex);
>> +
>> + write_lock(&trans->fs_info->mapping_tree_lock);
>> + btrfs_chunk_map_device_clear_bits(chunk, CHUNK_ALLOCATED);
>> + write_unlock(&trans->fs_info->mapping_tree_lock);
>> +
>> + btrfs_remove_bg_from_sinfo(bg);
>> +
>> + path = btrfs_alloc_path();
>> + if (!path)
>> + return -ENOMEM;
>> +
>> + ret = remove_chunk_stripes(trans, chunk, path);
>> + if (ret)
>> + return ret;
>> +
>> + return 0;
>> +}
>> +
>> +static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
>> + struct btrfs_block_group *bg, int delta)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + bool bg_already_dirty = true, mark_fully_remapped = false;
>> +
>> + WARN_ON(delta < 0 && -delta > bg->identity_remap_count);
>> +
>> + spin_lock(&bg->lock);
>> +
>> + bg->identity_remap_count += delta;
>> +
>> + if (!bg->fully_remapped && bg->identity_remap_count == 0)
>> + mark_fully_remapped = true;
>> +
>> + spin_unlock(&bg->lock);
>> +
>> + spin_lock(&trans->transaction->dirty_bgs_lock);
>> + if (list_empty(&bg->dirty_list)) {
>> + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
>> + bg_already_dirty = false;
>> + btrfs_get_block_group(bg);
>> + }
>> + spin_unlock(&trans->transaction->dirty_bgs_lock);
>> +
>> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
>> + if (!bg_already_dirty)
>> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
>> +
>> + if (mark_fully_remapped)
>> + btrfs_mark_bg_fully_remapped(bg, trans);
>> +}
>> +
>> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
>> u64 *length)
>> {
>> @@ -4478,3 +4624,366 @@ u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
>> logical = fs_info->reloc_ctl->block_group->start;
>> return logical;
>> }
>> +
>
> Please document the expectation on the path passed (and other params)
>
>> +static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + struct btrfs_block_group *bg,
>> + u64 bytenr, u64 num_bytes)
>> +{
>> + int ret;
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct extent_buffer *leaf = path->nodes[0];
>> + struct btrfs_key key, new_key;
>> + struct btrfs_remap *remap_ptr = NULL, remap;
>> + struct btrfs_block_group *dest_bg = NULL;
>> + u64 end, new_addr = 0, remap_start, remap_length, overlap_length;
>> + bool is_identity_remap;
>> +
>> + end = bytenr + num_bytes;
>> +
>> + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
>> +
>> + is_identity_remap = key.type == BTRFS_IDENTITY_REMAP_KEY;
>> +
>> + remap_start = key.objectid;
>> + remap_length = key.offset;
>> +
>> + if (!is_identity_remap) {
>> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
>> + struct btrfs_remap);
>> + new_addr = btrfs_remap_address(leaf, remap_ptr);
>> +
>> + dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
>> + }
>> +
>
> These open-coded cases are all quite large and I suspect redundant (I'll
> try to catch concretely how) so it would make it more readable to break
> them out into static functions (remove_remapping, trim_remapping, etc.)
>
> The most readable might be to simply always delete the entire remapping
> item and replace it with the needed new ones rather than modifying in-place.
That feels clumsier to me... but it is the approach followed by
remove_free_space_extent(). I suppose clear code is better than being
clever.
>> + if (bytenr == remap_start && num_bytes >= remap_length) {
>> + /* Remove entirely. */
>> +
>> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_release_path(path);
>> +
>> + overlap_length = remap_length;
>> +
>> + if (!is_identity_remap) {
>> + /* Remove backref. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = remap_length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root,
>> + &key, path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
>> +
>> + btrfs_release_path(path);
>> +
>> + if (ret)
>> + goto end;
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg,
>> + -remap_length);
>> + } else {
>> + adjust_identity_remap_count(trans, bg, -1);
>> + }
>> + } else if (bytenr == remap_start) {
>> + /* Remove beginning. */
>> +
>> + new_key.objectid = end;
>> + new_key.type = key.type;
>> + new_key.offset = remap_length + remap_start - end;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + overlap_length = num_bytes;
>> +
>> + if (!is_identity_remap) {
>> + btrfs_set_remap_address(leaf, remap_ptr,
>> + new_addr + end - remap_start);
>> + btrfs_release_path(path);
>> +
>> + /* Adjust backref. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = remap_length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root,
>> + &key, path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + leaf = path->nodes[0];
>> +
>> + new_key.objectid = new_addr + end - remap_start;
>> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
>> + new_key.offset = remap_length + remap_start - end;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> +
>> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0],
>> + struct btrfs_remap);
>> + btrfs_set_remap_address(leaf, remap_ptr, end);
>> +
>> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
>> +
>> + btrfs_release_path(path);
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg,
>> + -num_bytes);
>> + }
>> + } else if (bytenr + num_bytes < remap_start + remap_length) {
>> + /* Remove middle. */
>> +
>> + new_key.objectid = remap_start;
>> + new_key.type = key.type;
>> + new_key.offset = bytenr - remap_start;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + new_key.objectid = end;
>> + new_key.offset = remap_start + remap_length - end;
>> +
>> + btrfs_release_path(path);
>> +
>> + overlap_length = num_bytes;
>> +
>> + if (!is_identity_remap) {
>> + /* Add second remap entry. */
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
>> + path, &new_key,
>> + sizeof(struct btrfs_remap));
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_set_stack_remap_address(&remap,
>> + new_addr + end - remap_start);
>> +
>> + write_extent_buffer(path->nodes[0], &remap,
>> + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
>> + sizeof(struct btrfs_remap));
>> +
>> + btrfs_release_path(path);
>> +
>> + /* Shorten backref entry. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = remap_length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root,
>> + &key, path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + new_key.objectid = new_addr;
>> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
>> + new_key.offset = bytenr - remap_start;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
>> +
>> + btrfs_release_path(path);
>> +
>> + /* Add second backref entry. */
>> +
>> + new_key.objectid = new_addr + end - remap_start;
>> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
>> + new_key.offset = remap_start + remap_length - end;
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
>> + path, &new_key,
>> + sizeof(struct btrfs_remap));
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_set_stack_remap_address(&remap, end);
>> +
>> + write_extent_buffer(path->nodes[0], &remap,
>> + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]),
>> + sizeof(struct btrfs_remap));
>> +
>> + btrfs_release_path(path);
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg,
>> + -num_bytes);
>> + } else {
>> + /* Add second identity remap entry. */
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
>> + path, &new_key, 0);
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_release_path(path);
>> +
>> + adjust_identity_remap_count(trans, bg, 1);
>> + }
>> + } else {
>> + /* Remove end. */
>> +
>> + new_key.objectid = remap_start;
>> + new_key.type = key.type;
>> + new_key.offset = bytenr - remap_start;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + btrfs_release_path(path);
>> +
>> + overlap_length = remap_start + remap_length - bytenr;
>> +
>> + if (!is_identity_remap) {
>> + /* Shorten backref entry. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = remap_length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root,
>> + &key, path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + new_key.objectid = new_addr;
>> + new_key.type = BTRFS_REMAP_BACKREF_KEY;
>> + new_key.offset = bytenr - remap_start;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + btrfs_mark_buffer_dirty(trans, path->nodes[0]);
>> +
>> + btrfs_release_path(path);
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg,
>> + bytenr - remap_start - remap_length);
>> + }
>> + }
>> +
>> + if (!is_identity_remap) {
>> + ret = btrfs_add_to_free_space_tree(trans,
>> + bytenr - remap_start + new_addr,
>> + overlap_length);
>> + if (ret)
>> + goto end;
>> + }
>
> why do this here instead of just letting it happen normally in
> do_free_extent_accounting (where you added the new skipping logic)?
Because the addresses in the free-space tree are all after translation,
whereas the bytenr passed to do_free_extent_accounting() is before
translation. And you can't even call btrfs_translate_remap() in
do_free_extent_accounting(), because you've just deleted the remap items.
>> +
>> + ret = overlap_length;
>> +
>> +end:
>> + if (dest_bg)
>> + btrfs_put_block_group(dest_bg);
>> +
>> + return ret;
>> +}
>> +
>> +/*
>> + * Returns 1 if remove_range_from_remap_tree() has been called successfully,
>> + * 0 if block group wasn't remapped, and a negative number on error.
>> + */
>> +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + u64 bytenr, u64 num_bytes)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_key key, found_key;
>> + struct extent_buffer *leaf;
>> + struct btrfs_block_group *bg;
>> + int ret, length;
>> +
>> + if (!(btrfs_super_incompat_flags(fs_info->super_copy) &
>> + BTRFS_FEATURE_INCOMPAT_REMAP_TREE))
>> + return 0;
>> +
>> + bg = btrfs_lookup_block_group(fs_info, bytenr);
>> + if (!bg)
>> + return 0;
>> +
>> + mutex_lock(&fs_info->remap_mutex);
>> +
>> + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) {
>> + mutex_unlock(&fs_info->remap_mutex);
>> + btrfs_put_block_group(bg);
>> + return 0;
>> + }
>> +
>> + do {
>> + key.objectid = bytenr;
>> + key.type = (u8)-1;
>> + key.offset = (u64)-1;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path,
>> + -1, 1);
>> + if (ret < 0)
>> + goto end;
>> +
>> + leaf = path->nodes[0];
>> +
>> + if (path->slots[0] == 0) {
>> + ret = -ENOENT;
>> + goto end;
>> + }
>> +
>> + path->slots[0]--;
>> +
>> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
>> +
>> + if (found_key.type != BTRFS_IDENTITY_REMAP_KEY &&
>> + found_key.type != BTRFS_REMAP_KEY) {
>> + ret = -ENOENT;
>> + goto end;
>> + }
>> +
>> + if (bytenr < found_key.objectid ||
>> + bytenr >= found_key.objectid + found_key.offset) {
>> + ret = -ENOENT;
>> + goto end;
>> + }
>> +
>> + length = remove_range_from_remap_tree(trans, path, bg, bytenr,
>> + num_bytes);
>> + if (length < 0) {
>> + ret = length;
>> + goto end;
>> + }
>> +
>> + bytenr += length;
>> + num_bytes -= length;
>> + } while (num_bytes > 0);
>> +
>> + ret = 1;
>> +
>> +end:
>> + mutex_unlock(&fs_info->remap_mutex);
>> +
>> + btrfs_put_block_group(bg);
>> + btrfs_release_path(path);
>> + return ret;
>> +}
>> diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
>> index b2ba83966650..7cfe91971cab 100644
>> --- a/fs/btrfs/relocation.h
>> +++ b/fs/btrfs/relocation.h
>> @@ -33,5 +33,11 @@ bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root);
>> u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info);
>> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
>> u64 *length);
>> +int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + u64 bytenr, u64 num_bytes);
>> +int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *chunk,
>> + struct btrfs_block_group *bg);
>>
>> #endif
>> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
>> index de3eeb37408a..ffee6c285182 100644
>> --- a/fs/btrfs/transaction.c
>> +++ b/fs/btrfs/transaction.c
>> @@ -2437,6 +2437,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
>> if (ret)
>> goto unlock_reloc;
>>
>
> Does this need to be in the transaction critical section or can it
> happen asynchronously like other bg cleanup tasks?
>
> If so, that should be justified in the commit message.
btrfs_handle_fully_remapped_bgs() does the following:
* For async discard, it queues the BG up for the worker thread to finish
off (and, as of the later patch, sets the stripe removal pending flag)
* For nodiscard, it removes the chunk stripes and device extents,
freeing the BG's space so it can be used again by the chunk allocator
* For sync discard, it does the same but also discards the whole range
So it's only in the case of sync discard that it does anything
particularly heavy.
We can e.g. delay deleting unused BGs because they can be used again,
but here it's dead space until the stripes and device extents get
removed: delaying it would make the chunk allocator more likely to
ENOSPC. (Which doesn't help us with async discard, but we've discussed
off-list how a later improvement would be to make the chunk allocator
flush the pending fully remapped discard list before it ENOSPCs.)
>> + ret = btrfs_handle_fully_remapped_bgs(trans);
>> + if (ret)
>> + goto unlock_reloc;
>> +
>> /*
>> * make sure none of the code above managed to slip in a
>> * delayed item
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index d117f74e08c1..99ad95e1c300 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -2929,8 +2929,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
>> return ret;
>> }
>>
>> -static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
>> - struct btrfs_device *device)
>> +int btrfs_update_device(struct btrfs_trans_handle *trans,
>> + struct btrfs_device *device)
>> {
>> int ret;
>> BTRFS_PATH_AUTO_FREE(path);
>> @@ -3228,25 +3228,13 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
>> return btrfs_free_chunk(trans, chunk_offset);
>> }
>>
>> -int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>> +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *map)
>> {
>> struct btrfs_fs_info *fs_info = trans->fs_info;
>> - struct btrfs_chunk_map *map;
>> + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>> u64 dev_extent_len = 0;
>> int i, ret = 0;
>> - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
>> -
>> - map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
>> - if (IS_ERR(map)) {
>> - /*
>> - * This is a logic error, but we don't want to just rely on the
>> - * user having built with ASSERT enabled, so if ASSERT doesn't
>> - * do anything we still error out.
>> - */
>> - DEBUG_WARN("errr %ld reading chunk map at offset %llu",
>> - PTR_ERR(map), chunk_offset);
>> - return PTR_ERR(map);
>> - }
>>
>> /*
>> * First delete the device extent items from the devices btree.
>> @@ -3267,7 +3255,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>> if (unlikely(ret)) {
>> mutex_unlock(&fs_devices->device_list_mutex);
>> btrfs_abort_transaction(trans, ret);
>> - goto out;
>> + return ret;
>> }
>>
>> if (device->bytes_used > 0) {
>> @@ -3287,6 +3275,30 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>> }
>> mutex_unlock(&fs_devices->device_list_mutex);
>>
>> + return 0;
>> +}
>> +
>> +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_chunk_map *map;
>> + int ret;
>> +
>> + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
>> + if (IS_ERR(map)) {
>> + /*
>> + * This is a logic error, but we don't want to just rely on the
>> + * user having built with ASSERT enabled, so if ASSERT doesn't
>> + * do anything we still error out.
>> + */
>> + ASSERT(0);
>> + return PTR_ERR(map);
>> + }
>> +
>> + ret = btrfs_remove_dev_extents(trans, map);
>> + if (ret)
>> + goto out;
>> +
>> /*
>> * We acquire fs_info->chunk_mutex for 2 reasons:
>> *
>> @@ -5422,7 +5434,7 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int
>> }
>> }
>>
>> -static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
>> +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
>> {
>> for (int i = 0; i < map->num_stripes; i++) {
>> struct btrfs_io_stripe *stripe = &map->stripes[i];
>> @@ -5439,7 +5451,7 @@ void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_ma
>> write_lock(&fs_info->mapping_tree_lock);
>> rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
>> RB_CLEAR_NODE(&map->rb_node);
>> - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
>> + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
>> write_unlock(&fs_info->mapping_tree_lock);
>>
>> /* Once for the tree reference. */
>> @@ -5475,7 +5487,7 @@ int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *m
>> return -EEXIST;
>> }
>> chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
>> - chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
>> + btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
>> write_unlock(&fs_info->mapping_tree_lock);
>>
>> return 0;
>> @@ -5840,7 +5852,7 @@ void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
>> map = rb_entry(node, struct btrfs_chunk_map, rb_node);
>> rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
>> RB_CLEAR_NODE(&map->rb_node);
>> - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
>> + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
>> /* Once for the tree ref. */
>> btrfs_free_chunk_map(map);
>> cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
>> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
>> index 7cf76bffcab6..0c64cae59f1c 100644
>> --- a/fs/btrfs/volumes.h
>> +++ b/fs/btrfs/volumes.h
>> @@ -794,6 +794,8 @@ u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
>> int btrfs_nr_parity_stripes(u64 type);
>> int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
>> struct btrfs_block_group *bg);
>> +int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans,
>> + struct btrfs_chunk_map *map);
>> int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
>>
>> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>> @@ -905,6 +907,10 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
>>
>> bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
>> const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
>> +int btrfs_update_device(struct btrfs_trans_handle *trans,
>> + struct btrfs_device *device);
>> +void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map,
>> + unsigned int bits);
>>
>> #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>> struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
>> --
>> 2.49.1
>>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 10/16] btrfs: handle setting up relocation of block group with remap-tree
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (8 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 09/16] btrfs: handle deletions from remapped block group Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-31 23:43 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 11/16] btrfs: move existing remaps before relocating block group Mark Harmstone
` (5 subsequent siblings)
15 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
Handle the preliminary work for relocating a block group in a filesystem
with the remap-tree flag set.
If the block group is SYSTEM, btrfs_relocate_block_group() proceeds as it
does already, as bootstrapping issues mean that these block groups have
to be processed the existing way. Similarly with REMAP blocks, which are
dealt with in a later patch.
Otherwise we walk the free-space tree for the block group in question,
recording any holes. These get converted into identity remaps and placed
in the remap tree, and the block group's REMAPPED flag is set. From now
on no new allocations are possible within this block group, and any I/O
to it will be funnelled through btrfs_translate_remap(). We store the
number of identity remaps in `identity_remap_count`, so that we know
when we've removed the last one and the block group is fully remapped.
The change in btrfs_read_roots() is because data relocations no longer
rely on the data reloc tree as a hidden subvolume in which to do
snapshots.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/block-group.c | 6 +-
fs/btrfs/block-group.h | 4 +
fs/btrfs/free-space-tree.c | 4 +-
fs/btrfs/free-space-tree.h | 5 +-
fs/btrfs/relocation.c | 423 ++++++++++++++++++++++++++++++++++++-
fs/btrfs/relocation.h | 2 +-
fs/btrfs/space-info.c | 9 +-
fs/btrfs/volumes.c | 15 +-
8 files changed, 447 insertions(+), 21 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 3bf5f20d90ec..8feddb472882 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2423,6 +2423,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->used = btrfs_stack_block_group_v2_used(bgi);
cache->commit_used = cache->used;
cache->flags = btrfs_stack_block_group_v2_flags(bgi);
+ cache->commit_flags = cache->flags;
cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi);
cache->space_info = btrfs_find_space_info(info, cache->flags);
cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi);
@@ -2732,6 +2733,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
block_group->commit_remap_bytes = block_group->remap_bytes;
block_group->commit_identity_remap_count =
block_group->identity_remap_count;
+ block_group->commit_flags = block_group->flags;
key.objectid = block_group->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
key.offset = block_group->length;
@@ -3220,13 +3222,15 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
/* No change in values, can safely skip it. */
if (cache->commit_used == used &&
cache->commit_remap_bytes == remap_bytes &&
- cache->commit_identity_remap_count == identity_remap_count) {
+ cache->commit_identity_remap_count == identity_remap_count &&
+ cache->commit_flags == cache->flags) {
spin_unlock(&cache->lock);
return 0;
}
cache->commit_used = used;
cache->commit_remap_bytes = remap_bytes;
cache->commit_identity_remap_count = identity_remap_count;
+ cache->commit_flags = cache->flags;
spin_unlock(&cache->lock);
key.objectid = cache->start;
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index d85f3c2546d0..4522074a45c2 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -146,6 +146,10 @@ struct btrfs_block_group {
* The last committed identity_remap_count value of this block group.
*/
u32 commit_identity_remap_count;
+ /*
+ * The last committed flags value for this block group.
+ */
+ u64 commit_flags;
/*
* If the free space extent count exceeds this number, convert the block
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 26eae347739f..e46b1fa86f80 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -21,8 +21,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path);
-static struct btrfs_root *btrfs_free_space_root(
- struct btrfs_block_group *block_group)
+struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group)
{
struct btrfs_key key = {
.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
@@ -93,7 +92,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
return 0;
}
-EXPORT_FOR_TESTS
struct btrfs_free_space_info *btrfs_search_free_space_info(
struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
index 3d9a5d4477fc..89d2ff7e5c18 100644
--- a/fs/btrfs/free-space-tree.h
+++ b/fs/btrfs/free-space-tree.h
@@ -35,12 +35,13 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size);
int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u64 start, u64 size);
-
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_free_space_info *
btrfs_search_free_space_info(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, int cow);
+struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
struct btrfs_path *path, u64 start, u64 size);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 9f3ce3395d6a..cd53509c2fda 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3627,7 +3627,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
btrfs_btree_balance_dirty(fs_info);
}
- if (!err) {
+ if (!err && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
ret = relocate_file_extent_cluster(rc);
if (ret < 0)
err = ret;
@@ -3871,6 +3871,90 @@ static const char *stage_to_string(enum reloc_stage stage)
return "unknown";
}
+static int add_remap_tree_entries(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_key *entries,
+ unsigned int num_entries)
+{
+ int ret;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_item_batch batch;
+ u32 *data_sizes;
+ u32 max_items;
+
+ max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item);
+
+ data_sizes = kzalloc(sizeof(u32) * min_t(u32, num_entries, max_items),
+ GFP_NOFS);
+ if (!data_sizes)
+ return -ENOMEM;
+
+ while (true) {
+ batch.keys = entries;
+ batch.data_sizes = data_sizes;
+ batch.total_data_size = 0;
+ batch.nr = min_t(u32, num_entries, max_items);
+
+ ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path,
+ &batch);
+ btrfs_release_path(path);
+
+ if (num_entries <= max_items)
+ break;
+
+ num_entries -= max_items;
+ entries += max_items;
+ }
+
+ kfree(data_sizes);
+
+ return ret;
+}
+
+struct space_run {
+ u64 start;
+ u64 end;
+};
+
+static void parse_bitmap(u64 block_size, const unsigned long *bitmap,
+ unsigned long size, u64 address,
+ struct space_run *space_runs,
+ unsigned int *num_space_runs)
+{
+ unsigned long pos, end;
+ u64 run_start, run_length;
+
+ pos = find_first_bit(bitmap, size);
+
+ if (pos == size)
+ return;
+
+ while (true) {
+ end = find_next_zero_bit(bitmap, size, pos);
+
+ run_start = address + (pos * block_size);
+ run_length = (end - pos) * block_size;
+
+ if (*num_space_runs != 0 &&
+ space_runs[*num_space_runs - 1].end == run_start) {
+ space_runs[*num_space_runs - 1].end += run_length;
+ } else {
+ space_runs[*num_space_runs].start = run_start;
+ space_runs[*num_space_runs].end = run_start + run_length;
+
+ (*num_space_runs)++;
+ }
+
+ if (end == size)
+ break;
+
+ pos = find_next_bit(bitmap, size, end + 1);
+
+ if (pos == size)
+ break;
+ }
+}
+
static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg,
s64 diff)
@@ -3903,6 +3987,184 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
}
+static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_free_space_info *fsi;
+ struct btrfs_key key, found_key;
+ struct extent_buffer *leaf;
+ struct btrfs_root *space_root;
+ u32 extent_count;
+ struct space_run *space_runs = NULL;
+ unsigned int num_space_runs = 0;
+ struct btrfs_key *entries = NULL;
+ unsigned int max_entries, num_entries;
+ int ret;
+
+ mutex_lock(&bg->free_space_lock);
+
+ if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &bg->runtime_flags)) {
+ mutex_unlock(&bg->free_space_lock);
+
+ ret = btrfs_add_block_group_free_space(trans, bg);
+ if (ret)
+ return ret;
+
+ mutex_lock(&bg->free_space_lock);
+ }
+
+ fsi = btrfs_search_free_space_info(trans, bg, path, 0);
+ if (IS_ERR(fsi)) {
+ mutex_unlock(&bg->free_space_lock);
+ return PTR_ERR(fsi);
+ }
+
+ extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi);
+
+ btrfs_release_path(path);
+
+ space_runs = kmalloc(sizeof(*space_runs) * extent_count, GFP_NOFS);
+ if (!space_runs) {
+ mutex_unlock(&bg->free_space_lock);
+ return -ENOMEM;
+ }
+
+ key.objectid = bg->start;
+ key.type = 0;
+ key.offset = 0;
+
+ space_root = btrfs_free_space_root(bg);
+
+ ret = btrfs_search_slot(trans, space_root, &key, path, 0, 0);
+ if (ret < 0) {
+ mutex_unlock(&bg->free_space_lock);
+ goto out;
+ }
+
+ ret = 0;
+
+ while (true) {
+ leaf = path->nodes[0];
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid >= bg->start + bg->length)
+ break;
+
+ if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
+ if (num_space_runs != 0 &&
+ space_runs[num_space_runs - 1].end == found_key.objectid) {
+ space_runs[num_space_runs - 1].end =
+ found_key.objectid + found_key.offset;
+ } else {
+ BUG_ON(num_space_runs >= extent_count);
+
+ space_runs[num_space_runs].start = found_key.objectid;
+ space_runs[num_space_runs].end =
+ found_key.objectid + found_key.offset;
+
+ num_space_runs++;
+ }
+ } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+ void *bitmap;
+ unsigned long offset;
+ u32 data_size;
+
+ offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ data_size = btrfs_item_size(leaf, path->slots[0]);
+
+ if (data_size != 0) {
+ bitmap = kmalloc(data_size, GFP_NOFS);
+ if (!bitmap) {
+ mutex_unlock(&bg->free_space_lock);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ read_extent_buffer(leaf, bitmap, offset,
+ data_size);
+
+ parse_bitmap(fs_info->sectorsize, bitmap,
+ data_size * BITS_PER_BYTE,
+ found_key.objectid, space_runs,
+ &num_space_runs);
+
+ BUG_ON(num_space_runs > extent_count);
+
+ kfree(bitmap);
+ }
+ }
+
+ path->slots[0]++;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(space_root, path);
+ if (ret != 0) {
+ if (ret == 1)
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ }
+ }
+
+ btrfs_release_path(path);
+
+ mutex_unlock(&bg->free_space_lock);
+
+ max_entries = extent_count + 2;
+ entries = kmalloc(sizeof(*entries) * max_entries, GFP_NOFS);
+ if (!entries) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ num_entries = 0;
+
+ if (num_space_runs > 0 && space_runs[0].start > bg->start) {
+ entries[num_entries].objectid = bg->start;
+ entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
+ entries[num_entries].offset = space_runs[0].start - bg->start;
+ num_entries++;
+ }
+
+ for (unsigned int i = 1; i < num_space_runs; i++) {
+ entries[num_entries].objectid = space_runs[i - 1].end;
+ entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
+ entries[num_entries].offset =
+ space_runs[i].start - space_runs[i - 1].end;
+ num_entries++;
+ }
+
+ if (num_space_runs == 0) {
+ entries[num_entries].objectid = bg->start;
+ entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
+ entries[num_entries].offset = bg->length;
+ num_entries++;
+ } else if (space_runs[num_space_runs - 1].end < bg->start + bg->length) {
+ entries[num_entries].objectid = space_runs[num_space_runs - 1].end;
+ entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
+ entries[num_entries].offset =
+ bg->start + bg->length - space_runs[num_space_runs - 1].end;
+ num_entries++;
+ }
+
+ if (num_entries == 0)
+ goto out;
+
+ bg->identity_remap_count = num_entries;
+
+ ret = add_remap_tree_entries(trans, path, entries, num_entries);
+
+out:
+ kfree(entries);
+ kfree(space_runs);
+
+ return ret;
+}
+
static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
struct btrfs_chunk_map *chunk,
struct btrfs_path *path)
@@ -4016,6 +4278,55 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
btrfs_mark_bg_fully_remapped(bg, trans);
}
+static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, uint64_t start)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_chunk_map *chunk;
+ struct btrfs_key key;
+ u64 type;
+ int ret;
+ struct extent_buffer *leaf;
+ struct btrfs_chunk *c;
+
+ read_lock(&fs_info->mapping_tree_lock);
+
+ chunk = btrfs_find_chunk_map_nolock(fs_info, start, 1);
+ if (!chunk) {
+ read_unlock(&fs_info->mapping_tree_lock);
+ return -ENOENT;
+ }
+
+ chunk->type |= BTRFS_BLOCK_GROUP_REMAPPED;
+ type = chunk->type;
+
+ read_unlock(&fs_info->mapping_tree_lock);
+
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = start;
+
+ ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path,
+ 0, 1);
+ if (ret == 1) {
+ ret = -ENOENT;
+ goto end;
+ } else if (ret < 0)
+ goto end;
+
+ leaf = path->nodes[0];
+
+ c = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
+ btrfs_set_chunk_type(leaf, c, type);
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ ret = 0;
+end:
+ btrfs_free_chunk_map(chunk);
+ btrfs_release_path(path);
+ return ret;
+}
+
int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
u64 *length)
{
@@ -4070,17 +4381,94 @@ int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
return 0;
}
+static int start_block_group_remapping(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_block_group *bg)
+{
+ struct btrfs_trans_handle *trans;
+ bool bg_already_dirty = true;
+ int ret, ret2;
+
+ ret = btrfs_cache_block_group(bg, true);
+ if (ret)
+ return ret;
+
+ trans = btrfs_start_transaction(fs_info->remap_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ /* We need to run delayed refs, to make sure FST is up to date. */
+ ret = btrfs_run_delayed_refs(trans, U64_MAX);
+ if (ret) {
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ mutex_lock(&fs_info->remap_mutex);
+
+ if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
+ ret = 0;
+ goto end;
+ }
+
+ ret = create_remap_tree_entries(trans, path, bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto end;
+ }
+
+ spin_lock(&bg->lock);
+ bg->flags |= BTRFS_BLOCK_GROUP_REMAPPED;
+ spin_unlock(&bg->lock);
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&bg->dirty_list)) {
+ list_add_tail(&bg->dirty_list,
+ &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(bg);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
+
+ ret = mark_chunk_remapped(trans, path, bg->start);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto end;
+ }
+
+ ret = btrfs_remove_block_group_free_space(trans, bg);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto end;
+ }
+
+ btrfs_remove_free_space_cache(bg);
+
+end:
+ mutex_unlock(&fs_info->remap_mutex);
+
+ ret2 = btrfs_end_transaction(trans);
+ if (!ret)
+ ret = ret2;
+
+ return ret;
+}
+
/*
* function to relocate all extents in a block group.
*/
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
- bool verbose)
+ bool verbose, bool *using_remap_tree)
{
struct btrfs_block_group *bg;
struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
struct reloc_control *rc;
struct inode *inode;
- struct btrfs_path *path;
+ struct btrfs_path *path = NULL;
int ret;
bool bg_is_ro = false;
@@ -4142,7 +4530,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
}
inode = lookup_free_space_inode(rc->block_group, path);
- btrfs_free_path(path);
+ btrfs_release_path(path);
if (!IS_ERR(inode))
ret = delete_block_group_cache(rc->block_group, inode, 0);
@@ -4152,11 +4540,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
if (ret && ret != -ENOENT)
goto out;
- rc->data_inode = create_reloc_inode(rc->block_group);
- if (IS_ERR(rc->data_inode)) {
- ret = PTR_ERR(rc->data_inode);
- rc->data_inode = NULL;
- goto out;
+ *using_remap_tree = btrfs_fs_incompat(fs_info, REMAP_TREE) &&
+ !(bg->flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ !(bg->flags & BTRFS_BLOCK_GROUP_REMAP);
+
+ if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) {
+ rc->data_inode = create_reloc_inode(rc->block_group);
+ if (IS_ERR(rc->data_inode)) {
+ ret = PTR_ERR(rc->data_inode);
+ rc->data_inode = NULL;
+ goto out;
+ }
}
if (verbose)
@@ -4169,6 +4563,11 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
ret = btrfs_zone_finish(rc->block_group);
WARN_ON(ret && ret != -EAGAIN);
+ if (*using_remap_tree) {
+ ret = start_block_group_remapping(fs_info, path, bg);
+ goto out;
+ }
+
while (1) {
enum reloc_stage finishes_stage;
@@ -4216,7 +4615,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
out:
if (ret && bg_is_ro)
btrfs_dec_block_group_ro(rc->block_group);
- iput(rc->data_inode);
+ if (!btrfs_fs_incompat(fs_info, REMAP_TREE))
+ iput(rc->data_inode);
+ btrfs_free_path(path);
reloc_chunk_end(fs_info);
out_put_bg:
btrfs_put_block_group(bg);
@@ -4410,7 +4811,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
btrfs_free_path(path);
- if (ret == 0) {
+ if (ret == 0 && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
/* cleanup orphan inode in data relocation tree */
fs_root = btrfs_grab_root(fs_info->data_reloc_root);
ASSERT(fs_root);
diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
index 7cfe91971cab..fbe191ff5d08 100644
--- a/fs/btrfs/relocation.h
+++ b/fs/btrfs/relocation.h
@@ -13,7 +13,7 @@ struct btrfs_ordered_extent;
struct btrfs_pending_snapshot;
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
- bool verbose);
+ bool verbose, bool *using_remap_tree);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index a2ce72d3e873..752d098d1a6a 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -375,8 +375,13 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
factor = btrfs_bg_type_to_factor(block_group->flags);
spin_lock(&space_info->lock);
- space_info->total_bytes += block_group->length;
- space_info->disk_total += block_group->length * factor;
+
+ if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) ||
+ block_group->identity_remap_count != 0) {
+ space_info->total_bytes += block_group->length;
+ space_info->disk_total += block_group->length * factor;
+ }
+
space_info->bytes_used += block_group->used;
space_info->disk_used += block_group->used * factor;
space_info->bytes_readonly += block_group->bytes_super;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 99ad95e1c300..cda94c6f5239 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3418,6 +3418,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_block_group *block_group;
u64 length;
int ret;
+ bool using_remap_tree;
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
btrfs_err(fs_info,
@@ -3441,7 +3442,8 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
- ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);
+ ret = btrfs_relocate_block_group(fs_info, chunk_offset, true,
+ &using_remap_tree);
btrfs_scrub_continue(fs_info);
if (ret) {
/*
@@ -3453,6 +3455,9 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
return ret;
}
+ if (using_remap_tree)
+ return 0;
+
block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!block_group)
return -ENOENT;
@@ -4156,6 +4161,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
chunk_type = btrfs_chunk_type(leaf, chunk);
+ /* Check if chunk has already been fully relocated. */
+ if (chunk_type & BTRFS_BLOCK_GROUP_REMAPPED &&
+ btrfs_chunk_num_stripes(leaf, chunk) == 0) {
+ btrfs_release_path(path);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ goto loop;
+ }
+
if (!counting) {
spin_lock(&fs_info->balance_lock);
bctl->stat.considered++;
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 10/16] btrfs: handle setting up relocation of block group with remap-tree
2025-10-24 18:12 ` [PATCH v4 10/16] btrfs: handle setting up relocation of block group with remap-tree Mark Harmstone
@ 2025-10-31 23:43 ` Boris Burkov
2025-11-03 18:45 ` Mark Harmstone
0 siblings, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 23:43 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:11PM +0100, Mark Harmstone wrote:
> Handle the preliminary work for relocating a block group in a filesystem
> with the remap-tree flag set.
>
> If the block group is SYSTEM btrfs_relocate_block_group() proceeds as it
> does already, as bootstrapping issues mean that these block groups have
> to be processed the existing way. Similarly with REMAP blocks, which are
> dealt with in a later patch.
>
> Otherwise we walk the free-space tree for the block group in question,
> recording any holes. These get converted into identity remaps and placed
> in the remap tree, and the block group's REMAPPED flag is set. From now
> on no new allocations are possible within this block group, and any I/O
> to it will be funnelled through btrfs_translate_remap(). We store the
> number of identity remaps in `identity_remap_count`, so that we know
> when we've removed the last one and the block group is fully remapped.
>
> The change in btrfs_read_roots() is because data relocations no longer
> rely on the data reloc tree as a hidden subvolume in which to do
> snapshots.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
> fs/btrfs/block-group.c | 6 +-
> fs/btrfs/block-group.h | 4 +
> fs/btrfs/free-space-tree.c | 4 +-
> fs/btrfs/free-space-tree.h | 5 +-
> fs/btrfs/relocation.c | 423 ++++++++++++++++++++++++++++++++++++-
> fs/btrfs/relocation.h | 2 +-
> fs/btrfs/space-info.c | 9 +-
> fs/btrfs/volumes.c | 15 +-
> 8 files changed, 447 insertions(+), 21 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 3bf5f20d90ec..8feddb472882 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -2423,6 +2423,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
> cache->used = btrfs_stack_block_group_v2_used(bgi);
> cache->commit_used = cache->used;
> cache->flags = btrfs_stack_block_group_v2_flags(bgi);
> + cache->commit_flags = cache->flags;
> cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi);
> cache->space_info = btrfs_find_space_info(info, cache->flags);
> cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi);
> @@ -2732,6 +2733,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
> block_group->commit_remap_bytes = block_group->remap_bytes;
> block_group->commit_identity_remap_count =
> block_group->identity_remap_count;
> + block_group->commit_flags = block_group->flags;
> key.objectid = block_group->start;
> key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
> key.offset = block_group->length;
> @@ -3220,13 +3222,15 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
> /* No change in values, can safely skip it. */
> if (cache->commit_used == used &&
> cache->commit_remap_bytes == remap_bytes &&
> - cache->commit_identity_remap_count == identity_remap_count) {
> + cache->commit_identity_remap_count == identity_remap_count &&
> + cache->commit_flags == cache->flags) {
> spin_unlock(&cache->lock);
> return 0;
> }
> cache->commit_used = used;
> cache->commit_remap_bytes = remap_bytes;
> cache->commit_identity_remap_count = identity_remap_count;
> + cache->commit_flags = cache->flags;
> spin_unlock(&cache->lock);
>
> key.objectid = cache->start;
> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
> index d85f3c2546d0..4522074a45c2 100644
> --- a/fs/btrfs/block-group.h
> +++ b/fs/btrfs/block-group.h
> @@ -146,6 +146,10 @@ struct btrfs_block_group {
> * The last committed identity_remap_count value of this block group.
> */
> u32 commit_identity_remap_count;
> + /*
> + * The last committed flags value for this block group.
> + */
> + u64 commit_flags;
>
> /*
> * If the free space extent count exceeds this number, convert the block
> diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
> index 26eae347739f..e46b1fa86f80 100644
> --- a/fs/btrfs/free-space-tree.c
> +++ b/fs/btrfs/free-space-tree.c
> @@ -21,8 +21,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
> struct btrfs_block_group *block_group,
> struct btrfs_path *path);
>
> -static struct btrfs_root *btrfs_free_space_root(
> - struct btrfs_block_group *block_group)
> +struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group)
> {
> struct btrfs_key key = {
> .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
> @@ -93,7 +92,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
> return 0;
> }
>
> -EXPORT_FOR_TESTS
> struct btrfs_free_space_info *btrfs_search_free_space_info(
> struct btrfs_trans_handle *trans,
> struct btrfs_block_group *block_group,
> diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
> index 3d9a5d4477fc..89d2ff7e5c18 100644
> --- a/fs/btrfs/free-space-tree.h
> +++ b/fs/btrfs/free-space-tree.h
> @@ -35,12 +35,13 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
> u64 start, u64 size);
> int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
> u64 start, u64 size);
> -
> -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> struct btrfs_free_space_info *
> btrfs_search_free_space_info(struct btrfs_trans_handle *trans,
> struct btrfs_block_group *block_group,
> struct btrfs_path *path, int cow);
> +struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group);
> +
> +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
> int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
> struct btrfs_block_group *block_group,
> struct btrfs_path *path, u64 start, u64 size);
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index 9f3ce3395d6a..cd53509c2fda 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3627,7 +3627,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
> btrfs_btree_balance_dirty(fs_info);
> }
>
> - if (!err) {
> + if (!err && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
> ret = relocate_file_extent_cluster(rc);
> if (ret < 0)
> err = ret;
> @@ -3871,6 +3871,90 @@ static const char *stage_to_string(enum reloc_stage stage)
> return "unknown";
> }
>
> +static int add_remap_tree_entries(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + struct btrfs_key *entries,
> + unsigned int num_entries)
> +{
> + int ret;
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_item_batch batch;
> + u32 *data_sizes;
> + u32 max_items;
> +
> + max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item);
> +
> + data_sizes = kzalloc(sizeof(u32) * min_t(u32, num_entries, max_items),
> + GFP_NOFS);
> + if (!data_sizes)
> + return -ENOMEM;
> +
> + while (true) {
> + batch.keys = entries;
> + batch.data_sizes = data_sizes;
> + batch.total_data_size = 0;
> + batch.nr = min_t(u32, num_entries, max_items);
> +
> + ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path,
> + &batch);
> + btrfs_release_path(path);
> +
> + if (num_entries <= max_items)
> + break;
> +
> + num_entries -= max_items;
> + entries += max_items;
> + }
> +
> + kfree(data_sizes);
> +
> + return ret;
> +}
> +
> +struct space_run {
> + u64 start;
> + u64 end;
> +};
> +
> +static void parse_bitmap(u64 block_size, const unsigned long *bitmap,
> + unsigned long size, u64 address,
> + struct space_run *space_runs,
> + unsigned int *num_space_runs)
> +{
> + unsigned long pos, end;
> + u64 run_start, run_length;
> +
> + pos = find_first_bit(bitmap, size);
> +
> + if (pos == size)
> + return;
> +
> + while (true) {
> + end = find_next_zero_bit(bitmap, size, pos);
> +
> + run_start = address + (pos * block_size);
> + run_length = (end - pos) * block_size;
> +
> + if (*num_space_runs != 0 &&
> + space_runs[*num_space_runs - 1].end == run_start) {
> + space_runs[*num_space_runs - 1].end += run_length;
> + } else {
> + space_runs[*num_space_runs].start = run_start;
> + space_runs[*num_space_runs].end = run_start + run_length;
> +
> + (*num_space_runs)++;
> + }
> +
> + if (end == size)
> + break;
> +
> + pos = find_next_bit(bitmap, size, end + 1);
> +
> + if (pos == size)
> + break;
> + }
> +}
> +
> static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
> struct btrfs_block_group *bg,
> s64 diff)
> @@ -3903,6 +3987,184 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
> btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> }
>
> +static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_free_space_info *fsi;
> + struct btrfs_key key, found_key;
> + struct extent_buffer *leaf;
> + struct btrfs_root *space_root;
> + u32 extent_count;
> + struct space_run *space_runs = NULL;
> + unsigned int num_space_runs = 0;
> + struct btrfs_key *entries = NULL;
> + unsigned int max_entries, num_entries;
> + int ret;
> +
> + mutex_lock(&bg->free_space_lock);
> +
> + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &bg->runtime_flags)) {
> + mutex_unlock(&bg->free_space_lock);
> +
> + ret = btrfs_add_block_group_free_space(trans, bg);
> + if (ret)
> + return ret;
> +
> + mutex_lock(&bg->free_space_lock);
> + }
> +
> + fsi = btrfs_search_free_space_info(trans, bg, path, 0);
> + if (IS_ERR(fsi)) {
> + mutex_unlock(&bg->free_space_lock);
> + return PTR_ERR(fsi);
> + }
> +
> + extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi);
> +
> + btrfs_release_path(path);
> +
> + space_runs = kmalloc(sizeof(*space_runs) * extent_count, GFP_NOFS);
> + if (!space_runs) {
> + mutex_unlock(&bg->free_space_lock);
> + return -ENOMEM;
> + }
> +
> + key.objectid = bg->start;
> + key.type = 0;
> + key.offset = 0;
> +
> + space_root = btrfs_free_space_root(bg);
> +
> + ret = btrfs_search_slot(trans, space_root, &key, path, 0, 0);
> + if (ret < 0) {
> + mutex_unlock(&bg->free_space_lock);
> + goto out;
> + }
> +
> + ret = 0;
> +
> + while (true) {
> + leaf = path->nodes[0];
> +
> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> +
> + if (found_key.objectid >= bg->start + bg->length)
> + break;
> +
> + if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
> + if (num_space_runs != 0 &&
> + space_runs[num_space_runs - 1].end == found_key.objectid) {
> + space_runs[num_space_runs - 1].end =
> + found_key.objectid + found_key.offset;
> + } else {
> + BUG_ON(num_space_runs >= extent_count);
> +
> + space_runs[num_space_runs].start = found_key.objectid;
> + space_runs[num_space_runs].end =
> + found_key.objectid + found_key.offset;
> +
> + num_space_runs++;
> + }
> + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
> + void *bitmap;
> + unsigned long offset;
> + u32 data_size;
> +
> + offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
> + data_size = btrfs_item_size(leaf, path->slots[0]);
> +
> + if (data_size != 0) {
> + bitmap = kmalloc(data_size, GFP_NOFS);
> + if (!bitmap) {
> + mutex_unlock(&bg->free_space_lock);
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + read_extent_buffer(leaf, bitmap, offset,
> + data_size);
> +
> + parse_bitmap(fs_info->sectorsize, bitmap,
> + data_size * BITS_PER_BYTE,
> + found_key.objectid, space_runs,
> + &num_space_runs);
> +
> + BUG_ON(num_space_runs > extent_count);
> +
> + kfree(bitmap);
> + }
> + }
> +
> + path->slots[0]++;
> +
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(space_root, path);
> + if (ret != 0) {
> + if (ret == 1)
> + ret = 0;
> + break;
> + }
> + leaf = path->nodes[0];
> + }
> + }
> +
> + btrfs_release_path(path);
> +
> + mutex_unlock(&bg->free_space_lock);
> +
> + max_entries = extent_count + 2;
> + entries = kmalloc(sizeof(*entries) * max_entries, GFP_NOFS);
> + if (!entries) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + num_entries = 0;
> +
> + if (num_space_runs > 0 && space_runs[0].start > bg->start) {
> + entries[num_entries].objectid = bg->start;
> + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
> + entries[num_entries].offset = space_runs[0].start - bg->start;
> + num_entries++;
> + }
> +
> + for (unsigned int i = 1; i < num_space_runs; i++) {
> + entries[num_entries].objectid = space_runs[i - 1].end;
> + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
> + entries[num_entries].offset =
> + space_runs[i].start - space_runs[i - 1].end;
> + num_entries++;
> + }
> +
> + if (num_space_runs == 0) {
> + entries[num_entries].objectid = bg->start;
> + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
> + entries[num_entries].offset = bg->length;
> + num_entries++;
> + } else if (space_runs[num_space_runs - 1].end < bg->start + bg->length) {
> + entries[num_entries].objectid = space_runs[num_space_runs - 1].end;
> + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
> + entries[num_entries].offset =
> + bg->start + bg->length - space_runs[num_space_runs - 1].end;
> + num_entries++;
> + }
> +
> + if (num_entries == 0)
> + goto out;
> +
> + bg->identity_remap_count = num_entries;
> +
> + ret = add_remap_tree_entries(trans, path, entries, num_entries);
> +
> +out:
> + kfree(entries);
> + kfree(space_runs);
> +
> + return ret;
> +}
> +
> static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
> struct btrfs_chunk_map *chunk,
> struct btrfs_path *path)
> @@ -4016,6 +4278,55 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
> btrfs_mark_bg_fully_remapped(bg, trans);
> }
>
> +static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, uint64_t start)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_chunk_map *chunk;
> + struct btrfs_key key;
> + u64 type;
> + int ret;
> + struct extent_buffer *leaf;
> + struct btrfs_chunk *c;
> +
> + read_lock(&fs_info->mapping_tree_lock);
> +
> + chunk = btrfs_find_chunk_map_nolock(fs_info, start, 1);
> + if (!chunk) {
> + read_unlock(&fs_info->mapping_tree_lock);
> + return -ENOENT;
> + }
> +
> + chunk->type |= BTRFS_BLOCK_GROUP_REMAPPED;
> + type = chunk->type;
> +
> + read_unlock(&fs_info->mapping_tree_lock);
> +
> + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
> + key.type = BTRFS_CHUNK_ITEM_KEY;
> + key.offset = start;
> +
> + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path,
> + 0, 1);
> + if (ret == 1) {
> + ret = -ENOENT;
> + goto end;
> + } else if (ret < 0)
> + goto end;
> +
> + leaf = path->nodes[0];
> +
> + c = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
> + btrfs_set_chunk_type(leaf, c, type);
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + ret = 0;
> +end:
> + btrfs_free_chunk_map(chunk);
> + btrfs_release_path(path);
> + return ret;
> +}
> +
> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> u64 *length)
> {
> @@ -4070,17 +4381,94 @@ int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> return 0;
> }
>
> +static int start_block_group_remapping(struct btrfs_fs_info *fs_info,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg)
> +{
> + struct btrfs_trans_handle *trans;
> + bool bg_already_dirty = true;
> + int ret, ret2;
> +
> + ret = btrfs_cache_block_group(bg, true);
> + if (ret)
> + return ret;
> +
> + trans = btrfs_start_transaction(fs_info->remap_root, 0);
> + if (IS_ERR(trans))
> + return PTR_ERR(trans);
> +
> + /* We need to run delayed refs, to make sure FST is up to date. */
> + ret = btrfs_run_delayed_refs(trans, U64_MAX);
> + if (ret) {
> + btrfs_end_transaction(trans);
> + return ret;
> + }
> +
> + mutex_lock(&fs_info->remap_mutex);
> +
> + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
> + ret = 0;
> + goto end;
> + }
> +
> + ret = create_remap_tree_entries(trans, path, bg);
> + if (ret) {
> + btrfs_abort_transaction(trans, ret);
> + goto end;
> + }
> +
> + spin_lock(&bg->lock);
> + bg->flags |= BTRFS_BLOCK_GROUP_REMAPPED;
> + spin_unlock(&bg->lock);
> +
> + spin_lock(&trans->transaction->dirty_bgs_lock);
> + if (list_empty(&bg->dirty_list)) {
> + list_add_tail(&bg->dirty_list,
> + &trans->transaction->dirty_bgs);
> + bg_already_dirty = false;
> + btrfs_get_block_group(bg);
> + }
> + spin_unlock(&trans->transaction->dirty_bgs_lock);
> +
> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
> + if (!bg_already_dirty)
> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> +
> + ret = mark_chunk_remapped(trans, path, bg->start);
> + if (ret) {
> + btrfs_abort_transaction(trans, ret);
> + goto end;
> + }
> +
> + ret = btrfs_remove_block_group_free_space(trans, bg);
> + if (ret) {
> + btrfs_abort_transaction(trans, ret);
> + goto end;
> + }
> +
> + btrfs_remove_free_space_cache(bg);
> +
> +end:
> + mutex_unlock(&fs_info->remap_mutex);
> +
> + ret2 = btrfs_end_transaction(trans);
> + if (!ret)
> + ret = ret2;
> +
> + return ret;
> +}
> +
> /*
> * function to relocate all extents in a block group.
> */
> int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> - bool verbose)
> + bool verbose, bool *using_remap_tree)
> {
> struct btrfs_block_group *bg;
> struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
> struct reloc_control *rc;
> struct inode *inode;
> - struct btrfs_path *path;
> + struct btrfs_path *path = NULL;
> int ret;
> bool bg_is_ro = false;
>
> @@ -4142,7 +4530,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> }
>
> inode = lookup_free_space_inode(rc->block_group, path);
> - btrfs_free_path(path);
> + btrfs_release_path(path);
>
> if (!IS_ERR(inode))
> ret = delete_block_group_cache(rc->block_group, inode, 0);
> @@ -4152,11 +4540,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> if (ret && ret != -ENOENT)
> goto out;
>
> - rc->data_inode = create_reloc_inode(rc->block_group);
> - if (IS_ERR(rc->data_inode)) {
> - ret = PTR_ERR(rc->data_inode);
> - rc->data_inode = NULL;
> - goto out;
> + *using_remap_tree = btrfs_fs_incompat(fs_info, REMAP_TREE) &&
> + !(bg->flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
> + !(bg->flags & BTRFS_BLOCK_GROUP_REMAP);
> +
> + if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) {
> + rc->data_inode = create_reloc_inode(rc->block_group);
> + if (IS_ERR(rc->data_inode)) {
> + ret = PTR_ERR(rc->data_inode);
> + rc->data_inode = NULL;
> + goto out;
> + }
> }
>
> if (verbose)
> @@ -4169,6 +4563,11 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> ret = btrfs_zone_finish(rc->block_group);
> WARN_ON(ret && ret != -EAGAIN);
>
> + if (*using_remap_tree) {
> + ret = start_block_group_remapping(fs_info, path, bg);
> + goto out;
> + }
> +
This new control flow is way too cute. There is very little shared code,
IMO. I think it is much clearer to route to a different implementation
only once rather than both in the caller and the callee.
> while (1) {
> enum reloc_stage finishes_stage;
>
> @@ -4216,7 +4615,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> out:
> if (ret && bg_is_ro)
> btrfs_dec_block_group_ro(rc->block_group);
> - iput(rc->data_inode);
> + if (!btrfs_fs_incompat(fs_info, REMAP_TREE))
> + iput(rc->data_inode);
> + btrfs_free_path(path);
> reloc_chunk_end(fs_info);
> out_put_bg:
> btrfs_put_block_group(bg);
> @@ -4410,7 +4811,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
>
> btrfs_free_path(path);
>
> - if (ret == 0) {
> + if (ret == 0 && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
> /* cleanup orphan inode in data relocation tree */
> fs_root = btrfs_grab_root(fs_info->data_reloc_root);
> ASSERT(fs_root);
> diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
> index 7cfe91971cab..fbe191ff5d08 100644
> --- a/fs/btrfs/relocation.h
> +++ b/fs/btrfs/relocation.h
> @@ -13,7 +13,7 @@ struct btrfs_ordered_extent;
> struct btrfs_pending_snapshot;
>
> int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> - bool verbose);
> + bool verbose, bool *using_remap_tree);
> int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
> int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
> struct btrfs_root *root);
> diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
> index a2ce72d3e873..752d098d1a6a 100644
> --- a/fs/btrfs/space-info.c
> +++ b/fs/btrfs/space-info.c
> @@ -375,8 +375,13 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
> factor = btrfs_bg_type_to_factor(block_group->flags);
>
> spin_lock(&space_info->lock);
> - space_info->total_bytes += block_group->length;
> - space_info->disk_total += block_group->length * factor;
> +
> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) ||
> + block_group->identity_remap_count != 0) {
> + space_info->total_bytes += block_group->length;
> + space_info->disk_total += block_group->length * factor;
> + }
> +
> space_info->bytes_used += block_group->used;
> space_info->disk_used += block_group->used * factor;
> space_info->bytes_readonly += block_group->bytes_super;
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 99ad95e1c300..cda94c6f5239 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -3418,6 +3418,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
> struct btrfs_block_group *block_group;
> u64 length;
> int ret;
> + bool using_remap_tree;
>
> if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
> btrfs_err(fs_info,
> @@ -3441,7 +3442,8 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
>
> /* step one, relocate all the extents inside this chunk */
> btrfs_scrub_pause(fs_info);
> - ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);
> + ret = btrfs_relocate_block_group(fs_info, chunk_offset, true,
> + &using_remap_tree);
> btrfs_scrub_continue(fs_info);
> if (ret) {
> /*
> @@ -3453,6 +3455,9 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
> return ret;
> }
>
> + if (using_remap_tree)
> + return 0;
> +
> block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
> if (!block_group)
> return -ENOENT;
> @@ -4156,6 +4161,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
> chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
> chunk_type = btrfs_chunk_type(leaf, chunk);
>
> + /* Check if chunk has already been fully relocated. */
> + if (chunk_type & BTRFS_BLOCK_GROUP_REMAPPED &&
> + btrfs_chunk_num_stripes(leaf, chunk) == 0) {
> + btrfs_release_path(path);
> + mutex_unlock(&fs_info->reclaim_bgs_lock);
> + goto loop;
> + }
> +
> if (!counting) {
> spin_lock(&fs_info->balance_lock);
> bctl->stat.considered++;
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 10/16] btrfs: handle setting up relocation of block group with remap-tree
2025-10-31 23:43 ` Boris Burkov
@ 2025-11-03 18:45 ` Mark Harmstone
0 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-11-03 18:45 UTC (permalink / raw)
To: Boris Burkov; +Cc: linux-btrfs
On 31/10/2025 11.43 pm, Boris Burkov wrote:
> On Fri, Oct 24, 2025 at 07:12:11PM +0100, Mark Harmstone wrote:
>> Handle the preliminary work for relocating a block group in a filesystem
>> with the remap-tree flag set.
>>
>> If the block group is SYSTEM btrfs_relocate_block_group() proceeds as it
>> does already, as bootstrapping issues mean that these block groups have
>> to be processed the existing way. Similarly with REMAP blocks, which are
>> dealt with in a later patch.
>>
>> Otherwise we walk the free-space tree for the block group in question,
>> recording any holes. These get converted into identity remaps and placed
>> in the remap tree, and the block group's REMAPPED flag is set. From now
>> on no new allocations are possible within this block group, and any I/O
>> to it will be funnelled through btrfs_translate_remap(). We store the
>> number of identity remaps in `identity_remap_count`, so that we know
>> when we've removed the last one and the block group is fully remapped.
>>
>> The change in btrfs_read_roots() is because data relocations no longer
>> rely on the data reloc tree as a hidden subvolume in which to do
>> snapshots.
>>
>> Signed-off-by: Mark Harmstone <mark@harmstone.com>
>> ---
>> fs/btrfs/block-group.c | 6 +-
>> fs/btrfs/block-group.h | 4 +
>> fs/btrfs/free-space-tree.c | 4 +-
>> fs/btrfs/free-space-tree.h | 5 +-
>> fs/btrfs/relocation.c | 423 ++++++++++++++++++++++++++++++++++++-
>> fs/btrfs/relocation.h | 2 +-
>> fs/btrfs/space-info.c | 9 +-
>> fs/btrfs/volumes.c | 15 +-
>> 8 files changed, 447 insertions(+), 21 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index 3bf5f20d90ec..8feddb472882 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -2423,6 +2423,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
>> cache->used = btrfs_stack_block_group_v2_used(bgi);
>> cache->commit_used = cache->used;
>> cache->flags = btrfs_stack_block_group_v2_flags(bgi);
>> + cache->commit_flags = cache->flags;
>> cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi);
>> cache->space_info = btrfs_find_space_info(info, cache->flags);
>> cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi);
>> @@ -2732,6 +2733,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans,
>> block_group->commit_remap_bytes = block_group->remap_bytes;
>> block_group->commit_identity_remap_count =
>> block_group->identity_remap_count;
>> + block_group->commit_flags = block_group->flags;
>> key.objectid = block_group->start;
>> key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
>> key.offset = block_group->length;
>> @@ -3220,13 +3222,15 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
>> /* No change in values, can safely skip it. */
>> if (cache->commit_used == used &&
>> cache->commit_remap_bytes == remap_bytes &&
>> - cache->commit_identity_remap_count == identity_remap_count) {
>> + cache->commit_identity_remap_count == identity_remap_count &&
>> + cache->commit_flags == cache->flags) {
>> spin_unlock(&cache->lock);
>> return 0;
>> }
>> cache->commit_used = used;
>> cache->commit_remap_bytes = remap_bytes;
>> cache->commit_identity_remap_count = identity_remap_count;
>> + cache->commit_flags = cache->flags;
>> spin_unlock(&cache->lock);
>>
>> key.objectid = cache->start;
>> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
>> index d85f3c2546d0..4522074a45c2 100644
>> --- a/fs/btrfs/block-group.h
>> +++ b/fs/btrfs/block-group.h
>> @@ -146,6 +146,10 @@ struct btrfs_block_group {
>> * The last commited identity_remap_count value of this block group.
>> */
>> u32 commit_identity_remap_count;
>> + /*
>> + * The last committed flags value for this block group.
>> + */
>> + u64 commit_flags;
>>
>> /*
>> * If the free space extent count exceeds this number, convert the block
>> diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
>> index 26eae347739f..e46b1fa86f80 100644
>> --- a/fs/btrfs/free-space-tree.c
>> +++ b/fs/btrfs/free-space-tree.c
>> @@ -21,8 +21,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
>> struct btrfs_block_group *block_group,
>> struct btrfs_path *path);
>>
>> -static struct btrfs_root *btrfs_free_space_root(
>> - struct btrfs_block_group *block_group)
>> +struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group)
>> {
>> struct btrfs_key key = {
>> .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID,
>> @@ -93,7 +92,6 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans,
>> return 0;
>> }
>>
>> -EXPORT_FOR_TESTS
>> struct btrfs_free_space_info *btrfs_search_free_space_info(
>> struct btrfs_trans_handle *trans,
>> struct btrfs_block_group *block_group,
>> diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
>> index 3d9a5d4477fc..89d2ff7e5c18 100644
>> --- a/fs/btrfs/free-space-tree.h
>> +++ b/fs/btrfs/free-space-tree.h
>> @@ -35,12 +35,13 @@ int btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
>> u64 start, u64 size);
>> int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans,
>> u64 start, u64 size);
>> -
>> -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>> struct btrfs_free_space_info *
>> btrfs_search_free_space_info(struct btrfs_trans_handle *trans,
>> struct btrfs_block_group *block_group,
>> struct btrfs_path *path, int cow);
>> +struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group);
>> +
>> +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
>> int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans,
>> struct btrfs_block_group *block_group,
>> struct btrfs_path *path, u64 start, u64 size);
>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>> index 9f3ce3395d6a..cd53509c2fda 100644
>> --- a/fs/btrfs/relocation.c
>> +++ b/fs/btrfs/relocation.c
>> @@ -3627,7 +3627,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
>> btrfs_btree_balance_dirty(fs_info);
>> }
>>
>> - if (!err) {
>> + if (!err && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
>> ret = relocate_file_extent_cluster(rc);
>> if (ret < 0)
>> err = ret;
>> @@ -3871,6 +3871,90 @@ static const char *stage_to_string(enum reloc_stage stage)
>> return "unknown";
>> }
>>
>> +static int add_remap_tree_entries(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + struct btrfs_key *entries,
>> + unsigned int num_entries)
>> +{
>> + int ret;
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_item_batch batch;
>> + u32 *data_sizes;
>> + u32 max_items;
>> +
>> + max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item);
>> +
>> + data_sizes = kzalloc(sizeof(u32) * min_t(u32, num_entries, max_items),
>> + GFP_NOFS);
>> + if (!data_sizes)
>> + return -ENOMEM;
>> +
>> + while (true) {
>> + batch.keys = entries;
>> + batch.data_sizes = data_sizes;
>> + batch.total_data_size = 0;
>> + batch.nr = min_t(u32, num_entries, max_items);
>> +
>> + ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path,
>> + &batch);
>> + btrfs_release_path(path);
>> +
>> + if (num_entries <= max_items)
>> + break;
>> +
>> + num_entries -= max_items;
>> + entries += max_items;
>> + }
>> +
>> + kfree(data_sizes);
>> +
>> + return ret;
>> +}
>> +
>> +struct space_run {
>> + u64 start;
>> + u64 end;
>> +};
>> +
>> +static void parse_bitmap(u64 block_size, const unsigned long *bitmap,
>> + unsigned long size, u64 address,
>> + struct space_run *space_runs,
>> + unsigned int *num_space_runs)
>> +{
>> + unsigned long pos, end;
>> + u64 run_start, run_length;
>> +
>> + pos = find_first_bit(bitmap, size);
>> +
>> + if (pos == size)
>> + return;
>> +
>> + while (true) {
>> + end = find_next_zero_bit(bitmap, size, pos);
>> +
>> + run_start = address + (pos * block_size);
>> + run_length = (end - pos) * block_size;
>> +
>> + if (*num_space_runs != 0 &&
>> + space_runs[*num_space_runs - 1].end == run_start) {
>> + space_runs[*num_space_runs - 1].end += run_length;
>> + } else {
>> + space_runs[*num_space_runs].start = run_start;
>> + space_runs[*num_space_runs].end = run_start + run_length;
>> +
>> + (*num_space_runs)++;
>> + }
>> +
>> + if (end == size)
>> + break;
>> +
>> + pos = find_next_bit(bitmap, size, end + 1);
>> +
>> + if (pos == size)
>> + break;
>> + }
>> +}
>> +
>> static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
>> struct btrfs_block_group *bg,
>> s64 diff)
>> @@ -3903,6 +3987,184 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
>> btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
>> }
>>
>> +static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + struct btrfs_block_group *bg)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_free_space_info *fsi;
>> + struct btrfs_key key, found_key;
>> + struct extent_buffer *leaf;
>> + struct btrfs_root *space_root;
>> + u32 extent_count;
>> + struct space_run *space_runs = NULL;
>> + unsigned int num_space_runs = 0;
>> + struct btrfs_key *entries = NULL;
>> + unsigned int max_entries, num_entries;
>> + int ret;
>> +
>> + mutex_lock(&bg->free_space_lock);
>> +
>> + if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &bg->runtime_flags)) {
>> + mutex_unlock(&bg->free_space_lock);
>> +
>> + ret = btrfs_add_block_group_free_space(trans, bg);
>> + if (ret)
>> + return ret;
>> +
>> + mutex_lock(&bg->free_space_lock);
>> + }
>> +
>> + fsi = btrfs_search_free_space_info(trans, bg, path, 0);
>> + if (IS_ERR(fsi)) {
>> + mutex_unlock(&bg->free_space_lock);
>> + return PTR_ERR(fsi);
>> + }
>> +
>> + extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi);
>> +
>> + btrfs_release_path(path);
>> +
>> + space_runs = kmalloc(sizeof(*space_runs) * extent_count, GFP_NOFS);
>> + if (!space_runs) {
>> + mutex_unlock(&bg->free_space_lock);
>> + return -ENOMEM;
>> + }
>> +
>> + key.objectid = bg->start;
>> + key.type = 0;
>> + key.offset = 0;
>> +
>> + space_root = btrfs_free_space_root(bg);
>> +
>> + ret = btrfs_search_slot(trans, space_root, &key, path, 0, 0);
>> + if (ret < 0) {
>> + mutex_unlock(&bg->free_space_lock);
>> + goto out;
>> + }
>> +
>> + ret = 0;
>> +
>> + while (true) {
>> + leaf = path->nodes[0];
>> +
>> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
>> +
>> + if (found_key.objectid >= bg->start + bg->length)
>> + break;
>> +
>> + if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
>> + if (num_space_runs != 0 &&
>> + space_runs[num_space_runs - 1].end == found_key.objectid) {
>> + space_runs[num_space_runs - 1].end =
>> + found_key.objectid + found_key.offset;
>> + } else {
>> + BUG_ON(num_space_runs >= extent_count);
>> +
>> + space_runs[num_space_runs].start = found_key.objectid;
>> + space_runs[num_space_runs].end =
>> + found_key.objectid + found_key.offset;
>> +
>> + num_space_runs++;
>> + }
>> + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
>> + void *bitmap;
>> + unsigned long offset;
>> + u32 data_size;
>> +
>> + offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
>> + data_size = btrfs_item_size(leaf, path->slots[0]);
>> +
>> + if (data_size != 0) {
>> + bitmap = kmalloc(data_size, GFP_NOFS);
>> + if (!bitmap) {
>> + mutex_unlock(&bg->free_space_lock);
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> +
>> + read_extent_buffer(leaf, bitmap, offset,
>> + data_size);
>> +
>> + parse_bitmap(fs_info->sectorsize, bitmap,
>> + data_size * BITS_PER_BYTE,
>> + found_key.objectid, space_runs,
>> + &num_space_runs);
>> +
>> + BUG_ON(num_space_runs > extent_count);
>> +
>> + kfree(bitmap);
>> + }
>> + }
>> +
>> + path->slots[0]++;
>> +
>> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
>> + ret = btrfs_next_leaf(space_root, path);
>> + if (ret != 0) {
>> + if (ret == 1)
>> + ret = 0;
>> + break;
>> + }
>> + leaf = path->nodes[0];
>> + }
>> + }
>> +
>> + btrfs_release_path(path);
>> +
>> + mutex_unlock(&bg->free_space_lock);
>> +
>> + max_entries = extent_count + 2;
>> + entries = kmalloc(sizeof(*entries) * max_entries, GFP_NOFS);
>> + if (!entries) {
>> + ret = -ENOMEM;
>> + goto out;
>> + }
>> +
>> + num_entries = 0;
>> +
>> + if (num_space_runs > 0 && space_runs[0].start > bg->start) {
>> + entries[num_entries].objectid = bg->start;
>> + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
>> + entries[num_entries].offset = space_runs[0].start - bg->start;
>> + num_entries++;
>> + }
>> +
>> + for (unsigned int i = 1; i < num_space_runs; i++) {
>> + entries[num_entries].objectid = space_runs[i - 1].end;
>> + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
>> + entries[num_entries].offset =
>> + space_runs[i].start - space_runs[i - 1].end;
>> + num_entries++;
>> + }
>> +
>> + if (num_space_runs == 0) {
>> + entries[num_entries].objectid = bg->start;
>> + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
>> + entries[num_entries].offset = bg->length;
>> + num_entries++;
>> + } else if (space_runs[num_space_runs - 1].end < bg->start + bg->length) {
>> + entries[num_entries].objectid = space_runs[num_space_runs - 1].end;
>> + entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
>> + entries[num_entries].offset =
>> + bg->start + bg->length - space_runs[num_space_runs - 1].end;
>> + num_entries++;
>> + }
>> +
>> + if (num_entries == 0)
>> + goto out;
>> +
>> + bg->identity_remap_count = num_entries;
>> +
>> + ret = add_remap_tree_entries(trans, path, entries, num_entries);
>> +
>> +out:
>> + kfree(entries);
>> + kfree(space_runs);
>> +
>> + return ret;
>> +}
>> +
>> static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
>> struct btrfs_chunk_map *chunk,
>> struct btrfs_path *path)
>> @@ -4016,6 +4278,55 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
>> btrfs_mark_bg_fully_remapped(bg, trans);
>> }
>>
>> +static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path, uint64_t start)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_chunk_map *chunk;
>> + struct btrfs_key key;
>> + u64 type;
>> + int ret;
>> + struct extent_buffer *leaf;
>> + struct btrfs_chunk *c;
>> +
>> + read_lock(&fs_info->mapping_tree_lock);
>> +
>> + chunk = btrfs_find_chunk_map_nolock(fs_info, start, 1);
>> + if (!chunk) {
>> + read_unlock(&fs_info->mapping_tree_lock);
>> + return -ENOENT;
>> + }
>> +
>> + chunk->type |= BTRFS_BLOCK_GROUP_REMAPPED;
>> + type = chunk->type;
>> +
>> + read_unlock(&fs_info->mapping_tree_lock);
>> +
>> + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
>> + key.type = BTRFS_CHUNK_ITEM_KEY;
>> + key.offset = start;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path,
>> + 0, 1);
>> + if (ret == 1) {
>> + ret = -ENOENT;
>> + goto end;
>> + } else if (ret < 0)
>> + goto end;
>> +
>> + leaf = path->nodes[0];
>> +
>> + c = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
>> + btrfs_set_chunk_type(leaf, c, type);
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + ret = 0;
>> +end:
>> + btrfs_free_chunk_map(chunk);
>> + btrfs_release_path(path);
>> + return ret;
>> +}
>> +
>> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
>> u64 *length)
>> {
>> @@ -4070,17 +4381,94 @@ int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
>> return 0;
>> }
>>
>> +static int start_block_group_remapping(struct btrfs_fs_info *fs_info,
>> + struct btrfs_path *path,
>> + struct btrfs_block_group *bg)
>> +{
>> + struct btrfs_trans_handle *trans;
>> + bool bg_already_dirty = true;
>> + int ret, ret2;
>> +
>> + ret = btrfs_cache_block_group(bg, true);
>> + if (ret)
>> + return ret;
>> +
>> + trans = btrfs_start_transaction(fs_info->remap_root, 0);
>> + if (IS_ERR(trans))
>> + return PTR_ERR(trans);
>> +
>> + /* We need to run delayed refs, to make sure FST is up to date. */
>> + ret = btrfs_run_delayed_refs(trans, U64_MAX);
>> + if (ret) {
>> + btrfs_end_transaction(trans);
>> + return ret;
>> + }
>> +
>> + mutex_lock(&fs_info->remap_mutex);
>> +
>> + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
>> + ret = 0;
>> + goto end;
>> + }
>> +
>> + ret = create_remap_tree_entries(trans, path, bg);
>> + if (ret) {
>> + btrfs_abort_transaction(trans, ret);
>> + goto end;
>> + }
>> +
>> + spin_lock(&bg->lock);
>> + bg->flags |= BTRFS_BLOCK_GROUP_REMAPPED;
>> + spin_unlock(&bg->lock);
>> +
>> + spin_lock(&trans->transaction->dirty_bgs_lock);
>> + if (list_empty(&bg->dirty_list)) {
>> + list_add_tail(&bg->dirty_list,
>> + &trans->transaction->dirty_bgs);
>> + bg_already_dirty = false;
>> + btrfs_get_block_group(bg);
>> + }
>> + spin_unlock(&trans->transaction->dirty_bgs_lock);
>> +
>> + /* Modified block groups are accounted for in the delayed_refs_rsv. */
>> + if (!bg_already_dirty)
>> + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
>> +
>> + ret = mark_chunk_remapped(trans, path, bg->start);
>> + if (ret) {
>> + btrfs_abort_transaction(trans, ret);
>> + goto end;
>> + }
>> +
>> + ret = btrfs_remove_block_group_free_space(trans, bg);
>> + if (ret) {
>> + btrfs_abort_transaction(trans, ret);
>> + goto end;
>> + }
>> +
>> + btrfs_remove_free_space_cache(bg);
>> +
>> +end:
>> + mutex_unlock(&fs_info->remap_mutex);
>> +
>> + ret2 = btrfs_end_transaction(trans);
>> + if (!ret)
>> + ret = ret2;
>> +
>> + return ret;
>> +}
>> +
>> /*
>> * function to relocate all extents in a block group.
>> */
>> int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>> - bool verbose)
>> + bool verbose, bool *using_remap_tree)
>> {
>> struct btrfs_block_group *bg;
>> struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
>> struct reloc_control *rc;
>> struct inode *inode;
>> - struct btrfs_path *path;
>> + struct btrfs_path *path = NULL;
>> int ret;
>> bool bg_is_ro = false;
>>
>> @@ -4142,7 +4530,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>> }
>>
>> inode = lookup_free_space_inode(rc->block_group, path);
>> - btrfs_free_path(path);
>> + btrfs_release_path(path);
>>
>> if (!IS_ERR(inode))
>> ret = delete_block_group_cache(rc->block_group, inode, 0);
>> @@ -4152,11 +4540,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>> if (ret && ret != -ENOENT)
>> goto out;
>>
>> - rc->data_inode = create_reloc_inode(rc->block_group);
>> - if (IS_ERR(rc->data_inode)) {
>> - ret = PTR_ERR(rc->data_inode);
>> - rc->data_inode = NULL;
>> - goto out;
>> + *using_remap_tree = btrfs_fs_incompat(fs_info, REMAP_TREE) &&
>> + !(bg->flags & BTRFS_BLOCK_GROUP_SYSTEM) &&
>> + !(bg->flags & BTRFS_BLOCK_GROUP_REMAP);
>> +
>> + if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) {
>> + rc->data_inode = create_reloc_inode(rc->block_group);
>> + if (IS_ERR(rc->data_inode)) {
>> + ret = PTR_ERR(rc->data_inode);
>> + rc->data_inode = NULL;
>> + goto out;
>> + }
>> }
>>
>> if (verbose)
>> @@ -4169,6 +4563,11 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>> ret = btrfs_zone_finish(rc->block_group);
>> WARN_ON(ret && ret != -EAGAIN);
>>
>> + if (*using_remap_tree) {
>> + ret = start_block_group_remapping(fs_info, path, bg);
>> + goto out;
>> + }
>> +
>
> This new control flow is way too cute. There is very little shared code,
> IMO. I think it is much clearer to route to a different implementation
> only once rather than both in the caller and the callee.
Setting the using_remap_tree pointer was an attempt to avoid a call to
btrfs_lookup_block_group(), but at the expense of making the control flow
more confusing. I'm going to rejig it so it's cleaner.
>> while (1) {
>> enum reloc_stage finishes_stage;
>>
>> @@ -4216,7 +4615,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>> out:
>> if (ret && bg_is_ro)
>> btrfs_dec_block_group_ro(rc->block_group);
>> - iput(rc->data_inode);
>> + if (!btrfs_fs_incompat(fs_info, REMAP_TREE))
>> + iput(rc->data_inode);
>> + btrfs_free_path(path);
>> reloc_chunk_end(fs_info);
>> out_put_bg:
>> btrfs_put_block_group(bg);
>> @@ -4410,7 +4811,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
>>
>> btrfs_free_path(path);
>>
>> - if (ret == 0) {
>> + if (ret == 0 && !btrfs_fs_incompat(fs_info, REMAP_TREE)) {
>> /* cleanup orphan inode in data relocation tree */
>> fs_root = btrfs_grab_root(fs_info->data_reloc_root);
>> ASSERT(fs_root);
>> diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h
>> index 7cfe91971cab..fbe191ff5d08 100644
>> --- a/fs/btrfs/relocation.h
>> +++ b/fs/btrfs/relocation.h
>> @@ -13,7 +13,7 @@ struct btrfs_ordered_extent;
>> struct btrfs_pending_snapshot;
>>
>> int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>> - bool verbose);
>> + bool verbose, bool *using_remap_tree);
>> int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
>> int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
>> struct btrfs_root *root);
>> diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
>> index a2ce72d3e873..752d098d1a6a 100644
>> --- a/fs/btrfs/space-info.c
>> +++ b/fs/btrfs/space-info.c
>> @@ -375,8 +375,13 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
>> factor = btrfs_bg_type_to_factor(block_group->flags);
>>
>> spin_lock(&space_info->lock);
>> - space_info->total_bytes += block_group->length;
>> - space_info->disk_total += block_group->length * factor;
>> +
>> + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) ||
>> + block_group->identity_remap_count != 0) {
>> + space_info->total_bytes += block_group->length;
>> + space_info->disk_total += block_group->length * factor;
>> + }
>> +
>> space_info->bytes_used += block_group->used;
>> space_info->disk_used += block_group->used * factor;
>> space_info->bytes_readonly += block_group->bytes_super;
>> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
>> index 99ad95e1c300..cda94c6f5239 100644
>> --- a/fs/btrfs/volumes.c
>> +++ b/fs/btrfs/volumes.c
>> @@ -3418,6 +3418,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
>> struct btrfs_block_group *block_group;
>> u64 length;
>> int ret;
>> + bool using_remap_tree;
>>
>> if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
>> btrfs_err(fs_info,
>> @@ -3441,7 +3442,8 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
>>
>> /* step one, relocate all the extents inside this chunk */
>> btrfs_scrub_pause(fs_info);
>> - ret = btrfs_relocate_block_group(fs_info, chunk_offset, true);
>> + ret = btrfs_relocate_block_group(fs_info, chunk_offset, true,
>> + &using_remap_tree);
>> btrfs_scrub_continue(fs_info);
>> if (ret) {
>> /*
>> @@ -3453,6 +3455,9 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
>> return ret;
>> }
>>
>> + if (using_remap_tree)
>> + return 0;
>> +
>> block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
>> if (!block_group)
>> return -ENOENT;
>> @@ -4156,6 +4161,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
>> chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
>> chunk_type = btrfs_chunk_type(leaf, chunk);
>>
>> + /* Check if chunk has already been fully relocated. */
>> + if (chunk_type & BTRFS_BLOCK_GROUP_REMAPPED &&
>> + btrfs_chunk_num_stripes(leaf, chunk) == 0) {
>> + btrfs_release_path(path);
>> + mutex_unlock(&fs_info->reclaim_bgs_lock);
>> + goto loop;
>> + }
>> +
>> if (!counting) {
>> spin_lock(&fs_info->balance_lock);
>> bctl->stat.considered++;
>> --
>> 2.49.1
>>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 11/16] btrfs: move existing remaps before relocating block group
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (9 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 10/16] btrfs: handle setting up relocation of block group with remap-tree Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-11-01 0:02 ` Boris Burkov
2025-11-01 0:10 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations Mark Harmstone
` (4 subsequent siblings)
15 siblings, 2 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
If when relocating a block group we find that `remap_bytes` > 0 in its
block group item, that means that it has been the destination block
group for another that has been remapped.
We need to search the remap tree for any remap backrefs within this
range, and move the data to a third block group. This is because
otherwise btrfs_translate_remap() could end up following an unbounded
chain of remaps, which would only get worse over time.
We only relocate one block group at a time, so `remap_bytes` will only
ever go down while we are doing this. Once we're finished we set the
REMAPPED flag on the block group, which will permanently prevent any
other data from being moved to within it.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/extent-tree.c | 6 +-
fs/btrfs/relocation.c | 481 +++++++++++++++++++++++++++++++++++++++++
2 files changed, 485 insertions(+), 2 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1c14e0c82c03..10dc6f8d2f71 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4545,7 +4545,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
block_group->cached != BTRFS_CACHE_NO) {
down_read(&space_info->groups_sem);
if (list_empty(&block_group->list) ||
- block_group->ro) {
+ block_group->ro ||
+ block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
/*
* someone is removing this block group,
* we can't jump into the have_block_group
@@ -4579,7 +4580,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
ffe_ctl->hinted = false;
/* If the block group is read-only, we can skip it entirely. */
- if (unlikely(block_group->ro)) {
+ if (unlikely(block_group->ro) ||
+ block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
if (ffe_ctl->for_treelog)
btrfs_clear_treelog_bg(block_group);
if (ffe_ctl->for_data_reloc)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index cd53509c2fda..d31817379078 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3987,6 +3987,481 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
}
+struct reloc_io_private {
+ struct completion done;
+ refcount_t pending_refs;
+ blk_status_t status;
+};
+
+static void reloc_endio(struct btrfs_bio *bbio)
+{
+ struct reloc_io_private *priv = bbio->private;
+
+ if (bbio->bio.bi_status)
+ WRITE_ONCE(priv->status, bbio->bio.bi_status);
+
+ if (refcount_dec_and_test(&priv->pending_refs))
+ complete(&priv->done);
+
+ bio_put(&bbio->bio);
+}
+
+static int copy_remapped_data_io(struct btrfs_fs_info *fs_info,
+ struct reloc_io_private *priv,
+ struct page **pages, u64 addr, u64 length,
+ bool do_write)
+{
+ struct btrfs_bio *bbio;
+ unsigned long i = 0;
+ blk_opf_t op = do_write ? REQ_OP_WRITE : REQ_OP_READ;
+
+ init_completion(&priv->done);
+ refcount_set(&priv->pending_refs, 1);
+ priv->status = 0;
+
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, fs_info, reloc_endio,
+ priv);
+ bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
+
+ do {
+ size_t bytes = min_t(u64, length, PAGE_SIZE);
+
+ if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
+ refcount_inc(&priv->pending_refs);
+ btrfs_submit_bbio(bbio, 0);
+
+ bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, fs_info,
+ reloc_endio, priv);
+ bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
+ continue;
+ }
+
+ i++;
+ addr += bytes;
+ length -= bytes;
+ } while (length);
+
+ refcount_inc(&priv->pending_refs);
+ btrfs_submit_bbio(bbio, 0);
+
+ if (!refcount_dec_and_test(&priv->pending_refs))
+ wait_for_completion_io(&priv->done);
+
+ return blk_status_to_errno(READ_ONCE(priv->status));
+}
+
+static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr,
+ u64 new_addr, u64 length)
+{
+ int ret;
+ struct page **pages;
+ unsigned int nr_pages;
+ struct reloc_io_private priv;
+
+ nr_pages = DIV_ROUND_UP(length, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+ if (ret) {
+ ret = -ENOMEM;
+ goto end;
+ }
+
+ ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr, length,
+ false);
+ if (ret)
+ goto end;
+
+ ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr, length,
+ true);
+
+end:
+ for (unsigned int i = 0; i < nr_pages; i++) {
+ if (pages[i])
+ __free_page(pages[i]);
+ }
+ kfree(pages);
+
+ return ret;
+}
+
+static int do_copy(struct btrfs_fs_info *fs_info, u64 old_addr, u64 new_addr,
+ u64 length)
+{
+ int ret;
+
+ /* Copy 1MB at a time, to avoid using too much memory. */
+
+ do {
+ u64 to_copy = min_t(u64, length, SZ_1M);
+
+ ret = copy_remapped_data(fs_info, old_addr, new_addr,
+ to_copy);
+ if (ret)
+ return ret;
+
+ if (to_copy == length)
+ break;
+
+ old_addr += to_copy;
+ new_addr += to_copy;
+ length -= to_copy;
+ } while (true);
+
+ return 0;
+}
+
+static int add_remap_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 new_addr, u64 length,
+ u64 old_addr)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_remap remap;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = old_addr;
+ key.type = BTRFS_REMAP_KEY;
+ key.offset = length;
+
+ ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
+ &key, sizeof(struct btrfs_remap));
+ if (ret)
+ return ret;
+
+ leaf = path->nodes[0];
+
+ btrfs_set_stack_remap_address(&remap, new_addr);
+
+ write_extent_buffer(leaf, &remap,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_remap));
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int add_remap_backref_item(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 new_addr,
+ u64 length, u64 old_addr)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_remap remap;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ int ret;
+
+ key.objectid = new_addr;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = length;
+
+ ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
+ path, &key, sizeof(struct btrfs_remap));
+ if (ret)
+ return ret;
+
+ leaf = path->nodes[0];
+
+ btrfs_set_stack_remap_address(&remap, old_addr);
+
+ write_extent_buffer(leaf, &remap,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_remap));
+
+ btrfs_release_path(path);
+
+ return 0;
+}
+
+static int move_existing_remap(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_block_group *bg, u64 new_addr,
+ u64 length, u64 old_addr)
+{
+ struct btrfs_trans_handle *trans;
+ struct extent_buffer *leaf;
+ struct btrfs_remap *remap_ptr, remap;
+ struct btrfs_key key, ins;
+ u64 dest_addr, dest_length, min_size;
+ struct btrfs_block_group *dest_bg;
+ int ret;
+ bool is_data = bg->flags & BTRFS_BLOCK_GROUP_DATA;
+ struct btrfs_space_info *sinfo = bg->space_info;
+ bool mutex_taken = false, bg_needs_free_space;
+
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, length);
+ spin_unlock(&sinfo->lock);
+
+ if (is_data)
+ min_size = fs_info->sectorsize;
+ else
+ min_size = fs_info->nodesize;
+
+ ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size,
+ 0, 0, &ins, is_data, false);
+ if (ret) {
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, -length);
+ spin_unlock(&sinfo->lock);
+ return ret;
+ }
+
+ dest_addr = ins.objectid;
+ dest_length = ins.offset;
+
+ if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) {
+ u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize);
+
+ btrfs_free_reserved_extent(fs_info, dest_addr + new_length,
+ dest_length - new_length, 0);
+
+ dest_length = new_length;
+ }
+
+ trans = btrfs_join_transaction(fs_info->remap_root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ trans = NULL;
+ goto end;
+ }
+
+ mutex_lock(&fs_info->remap_mutex);
+ mutex_taken = true;
+
+ /* Find old remap entry. */
+
+ key.objectid = old_addr;
+ key.type = BTRFS_REMAP_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
+ path, 0, 1);
+ if (ret == 1) {
+ /*
+ * Not a problem if the remap entry wasn't found: that means
+ * that another transaction has deallocated the data.
+ * move_existing_remaps() loops until the BG contains no
+ * remaps, so we can just return 0 in this case.
+ */
+ btrfs_release_path(path);
+ ret = 0;
+ goto end;
+ } else if (ret) {
+ goto end;
+ }
+
+ ret = do_copy(fs_info, new_addr, dest_addr, dest_length);
+ if (ret)
+ goto end;
+
+ /* Change data of old remap entry. */
+
+ leaf = path->nodes[0];
+
+ remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap);
+ btrfs_set_remap_address(leaf, remap_ptr, dest_addr);
+
+ btrfs_mark_buffer_dirty(trans, leaf);
+
+ if (dest_length != length) {
+ key.offset = dest_length;
+ btrfs_set_item_key_safe(trans, path, &key);
+ }
+
+ btrfs_release_path(path);
+
+ if (dest_length != length) {
+ /* Add remap item for remainder. */
+
+ ret = add_remap_item(trans, path, new_addr + dest_length,
+ length - dest_length,
+ old_addr + dest_length);
+ if (ret)
+ goto end;
+ }
+
+ /* Change or remove old backref. */
+
+ key.objectid = new_addr;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = length;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
+ path, -1, 1);
+ if (ret) {
+ if (ret == 1) {
+ btrfs_release_path(path);
+ ret = -ENOENT;
+ }
+ goto end;
+ }
+
+ leaf = path->nodes[0];
+
+ if (dest_length == length) {
+ ret = btrfs_del_item(trans, fs_info->remap_root, path);
+ if (ret) {
+ btrfs_release_path(path);
+ goto end;
+ }
+ } else {
+ key.objectid += dest_length;
+ key.offset -= dest_length;
+ btrfs_set_item_key_safe(trans, path, &key);
+
+ btrfs_set_stack_remap_address(&remap, old_addr + dest_length);
+
+ write_extent_buffer(leaf, &remap,
+ btrfs_item_ptr_offset(leaf, path->slots[0]),
+ sizeof(struct btrfs_remap));
+ }
+
+ btrfs_release_path(path);
+
+ /* Add new backref. */
+
+ ret = add_remap_backref_item(trans, path, dest_addr, dest_length,
+ old_addr);
+ if (ret)
+ goto end;
+
+ adjust_block_group_remap_bytes(trans, bg, -dest_length);
+
+ ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length);
+ if (ret)
+ goto end;
+
+ dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
+
+ adjust_block_group_remap_bytes(trans, dest_bg, dest_length);
+
+ mutex_lock(&dest_bg->free_space_lock);
+ bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ &dest_bg->runtime_flags);
+ mutex_unlock(&dest_bg->free_space_lock);
+ btrfs_put_block_group(dest_bg);
+
+ if (bg_needs_free_space) {
+ ret = btrfs_add_block_group_free_space(trans, dest_bg);
+ if (ret)
+ goto end;
+ }
+
+ ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length);
+ if (ret) {
+ btrfs_remove_from_free_space_tree(trans, new_addr,
+ dest_length);
+ goto end;
+ }
+
+ ret = 0;
+
+end:
+ if (mutex_taken)
+ mutex_unlock(&fs_info->remap_mutex);
+
+ btrfs_dec_block_group_reservations(fs_info, dest_addr);
+
+ if (ret) {
+ btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0);
+
+ if (trans) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ }
+ } else {
+ dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
+ btrfs_free_reserved_bytes(dest_bg, dest_length, 0);
+ btrfs_put_block_group(dest_bg);
+
+ ret = btrfs_commit_transaction(trans);
+ }
+
+ return ret;
+}
+
+static int move_existing_remaps(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *bg,
+ struct btrfs_path *path)
+{
+ int ret;
+ struct btrfs_key key;
+ struct extent_buffer *leaf;
+ struct btrfs_remap *remap;
+ u64 old_addr;
+
+ /* Look for backrefs in remap tree. */
+
+ while (bg->remap_bytes > 0) {
+ key.objectid = bg->start;
+ key.type = BTRFS_REMAP_BACKREF_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path,
+ 0, 0);
+ if (ret < 0)
+ return ret;
+
+ leaf = path->nodes[0];
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(fs_info->remap_root, path);
+ if (ret < 0) {
+ btrfs_release_path(path);
+ return ret;
+ }
+
+ if (ret) {
+ btrfs_release_path(path);
+ break;
+ }
+
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+ if (key.type != BTRFS_REMAP_BACKREF_KEY) {
+ path->slots[0]++;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(fs_info->remap_root, path);
+ if (ret < 0) {
+ btrfs_release_path(path);
+ return ret;
+ }
+
+ if (ret) {
+ btrfs_release_path(path);
+ break;
+ }
+
+ leaf = path->nodes[0];
+ }
+ }
+
+ remap = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_remap);
+
+ old_addr = btrfs_remap_address(leaf, remap);
+
+ btrfs_release_path(path);
+
+ ret = move_existing_remap(fs_info, path, bg, key.objectid,
+ key.offset, old_addr);
+ if (ret)
+ return ret;
+ }
+
+ BUG_ON(bg->remap_bytes > 0);
+
+ return 0;
+}
+
static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_block_group *bg)
@@ -4564,6 +5039,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
WARN_ON(ret && ret != -EAGAIN);
if (*using_remap_tree) {
+ if (bg->remap_bytes != 0) {
+ ret = move_existing_remaps(fs_info, bg, path);
+ if (ret)
+ goto out;
+ }
+
ret = start_block_group_remapping(fs_info, path, bg);
goto out;
}
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 11/16] btrfs: move existing remaps before relocating block group
2025-10-24 18:12 ` [PATCH v4 11/16] btrfs: move existing remaps before relocating block group Mark Harmstone
@ 2025-11-01 0:02 ` Boris Burkov
2025-11-04 13:00 ` Mark Harmstone
2025-11-01 0:10 ` Boris Burkov
1 sibling, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-11-01 0:02 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:12PM +0100, Mark Harmstone wrote:
> If when relocating a block group we find that `remap_bytes` > 0 in its
> block group item, that means that it has been the destination block
> group for another that has been remapped.
>
We need to search the remap tree for any remap backrefs within this
> range, and move the data to a third block group. This is because
> otherwise btrfs_translate_remap() could end up following an unbounded
> chain of remaps, which would only get worse over time.
>
> We only relocate one block group at a time, so `remap_bytes` will only
> ever go down while we are doing this. Once we're finished we set the
> REMAPPED flag on the block group, which will permanently prevent any
> other data from being moved to within it.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
> fs/btrfs/extent-tree.c | 6 +-
> fs/btrfs/relocation.c | 481 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 485 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 1c14e0c82c03..10dc6f8d2f71 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4545,7 +4545,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
> block_group->cached != BTRFS_CACHE_NO) {
> down_read(&space_info->groups_sem);
> if (list_empty(&block_group->list) ||
> - block_group->ro) {
> + block_group->ro ||
> + block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
> /*
> * someone is removing this block group,
> * we can't jump into the have_block_group
> @@ -4579,7 +4580,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>
> ffe_ctl->hinted = false;
> /* If the block group is read-only, we can skip it entirely. */
> - if (unlikely(block_group->ro)) {
> + if (unlikely(block_group->ro) ||
> + block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
> if (ffe_ctl->for_treelog)
> btrfs_clear_treelog_bg(block_group);
> if (ffe_ctl->for_data_reloc)
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index cd53509c2fda..d31817379078 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3987,6 +3987,481 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
> btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> }
>
> +struct reloc_io_private {
> + struct completion done;
> + refcount_t pending_refs;
> + blk_status_t status;
> +};
> +
> +static void reloc_endio(struct btrfs_bio *bbio)
> +{
> + struct reloc_io_private *priv = bbio->private;
> +
> + if (bbio->bio.bi_status)
> + WRITE_ONCE(priv->status, bbio->bio.bi_status);
> +
> + if (refcount_dec_and_test(&priv->pending_refs))
> + complete(&priv->done);
> +
> + bio_put(&bbio->bio);
> +}
> +
> +static int copy_remapped_data_io(struct btrfs_fs_info *fs_info,
> + struct reloc_io_private *priv,
> + struct page **pages, u64 addr, u64 length,
> + bool do_write)
> +{
> + struct btrfs_bio *bbio;
> + unsigned long i = 0;
> + blk_opf_t op = do_write ? REQ_OP_WRITE : REQ_OP_READ;
> +
> + init_completion(&priv->done);
> + refcount_set(&priv->pending_refs, 1);
> + priv->status = 0;
> +
> + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, fs_info, reloc_endio,
> + priv);
> + bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
> +
> + do {
> + size_t bytes = min_t(u64, length, PAGE_SIZE);
> +
> + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
> + refcount_inc(&priv->pending_refs);
> + btrfs_submit_bbio(bbio, 0);
> +
> + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, fs_info,
> + reloc_endio, priv);
> + bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
> + continue;
> + }
> +
> + i++;
> + addr += bytes;
> + length -= bytes;
> + } while (length);
> +
> + refcount_inc(&priv->pending_refs);
> + btrfs_submit_bbio(bbio, 0);
> +
> + if (!refcount_dec_and_test(&priv->pending_refs))
> + wait_for_completion_io(&priv->done);
> +
> + return blk_status_to_errno(READ_ONCE(priv->status));
> +}
> +
> +static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr,
> + u64 new_addr, u64 length)
> +{
> + int ret;
> + struct page **pages;
> + unsigned int nr_pages;
> + struct reloc_io_private priv;
> +
> + nr_pages = DIV_ROUND_UP(length, PAGE_SIZE);
(length + PAGE_SIZE - 1) >> PAGE_SHIFT avoids the division
You may also want to bail out if you detect the bs > ps case Qu is
working on, as I believe that will require using large folios here.
> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> + if (!pages)
> + return -ENOMEM;
> + ret = btrfs_alloc_page_array(nr_pages, pages, 0);
> + if (ret) {
> + ret = -ENOMEM;
> + goto end;
> + }
> +
> + ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr, length,
> + false);
> + if (ret)
> + goto end;
> +
> + ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr, length,
> + true);
> +
> +end:
> + for (unsigned int i = 0; i < nr_pages; i++) {
> + if (pages[i])
> + __free_page(pages[i]);
> + }
> + kfree(pages);
> +
> + return ret;
> +}
> +
> +static int do_copy(struct btrfs_fs_info *fs_info, u64 old_addr, u64 new_addr,
> + u64 length)
> +{
> + int ret;
> +
> + /* Copy 1MB at a time, to avoid using too much memory. */
Seems sort of arbitrary.
How does this relate to the max via BIO_MAX_VECS?
> +
> + do {
> + u64 to_copy = min_t(u64, length, SZ_1M);
> +
> + ret = copy_remapped_data(fs_info, old_addr, new_addr,
> + to_copy);
> + if (ret)
> + return ret;
> +
> + if (to_copy == length)
> + break;
> +
> + old_addr += to_copy;
> + new_addr += to_copy;
> + length -= to_copy;
> + } while (true);
> +
> + return 0;
> +}
> +
> +static int add_remap_item(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, u64 new_addr, u64 length,
> + u64 old_addr)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_remap remap;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + int ret;
> +
> + key.objectid = old_addr;
> + key.type = BTRFS_REMAP_KEY;
> + key.offset = length;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
> + &key, sizeof(struct btrfs_remap));
> + if (ret)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + btrfs_set_stack_remap_address(&remap, new_addr);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + return 0;
> +}
> +
> +static int add_remap_backref_item(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, u64 new_addr,
> + u64 length, u64 old_addr)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_remap remap;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + int ret;
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = length;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &key, sizeof(struct btrfs_remap));
> + if (ret)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + btrfs_set_stack_remap_address(&remap, old_addr);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + return 0;
> +}
> +
> +static int move_existing_remap(struct btrfs_fs_info *fs_info,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg, u64 new_addr,
> + u64 length, u64 old_addr)
> +{
> + struct btrfs_trans_handle *trans;
> + struct extent_buffer *leaf;
> + struct btrfs_remap *remap_ptr, remap;
> + struct btrfs_key key, ins;
> + u64 dest_addr, dest_length, min_size;
> + struct btrfs_block_group *dest_bg;
> + int ret;
> + bool is_data = bg->flags & BTRFS_BLOCK_GROUP_DATA;
> + struct btrfs_space_info *sinfo = bg->space_info;
> + bool mutex_taken = false, bg_needs_free_space;
> +
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, length);
> + spin_unlock(&sinfo->lock);
> +
> + if (is_data)
> + min_size = fs_info->sectorsize;
> + else
> + min_size = fs_info->nodesize;
> +
> + ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size,
> + 0, 0, &ins, is_data, false);
> + if (ret) {
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, -length);
> + spin_unlock(&sinfo->lock);
> + return ret;
> + }
> +
> + dest_addr = ins.objectid;
> + dest_length = ins.offset;
> +
> + if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) {
> + u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize);
> +
> + btrfs_free_reserved_extent(fs_info, dest_addr + new_length,
> + dest_length - new_length, 0);
> +
> + dest_length = new_length;
> + }
> +
> + trans = btrfs_join_transaction(fs_info->remap_root);
> + if (IS_ERR(trans)) {
> + ret = PTR_ERR(trans);
> + trans = NULL;
> + goto end;
> + }
> +
> + mutex_lock(&fs_info->remap_mutex);
> + mutex_taken = true;
> +
> + /* Find old remap entry. */
> +
> + key.objectid = old_addr;
> + key.type = BTRFS_REMAP_KEY;
> + key.offset = length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
> + path, 0, 1);
> + if (ret == 1) {
> + /*
> + * Not a problem if the remap entry wasn't found: that means
> + * that another transaction has deallocated the data.
> + * move_existing_remaps() loops until the BG contains no
> + * remaps, so we can just return 0 in this case.
> + */
I agree with this reasoning. However, what prevents someone from
deallocating this data after we have found the entry? Is there some
higher locking that protects us? As far as I can tell if the last extent
goes away we could delete the remap entry while simultaneously moving
it here?
> + btrfs_release_path(path);
> + ret = 0;
> + goto end;
> + } else if (ret) {
> + goto end;
> + }
> +
> + ret = do_copy(fs_info, new_addr, dest_addr, dest_length);
> + if (ret)
> + goto end;
> +
> + /* Change data of old remap entry. */
> +
> + leaf = path->nodes[0];
> +
> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap);
> + btrfs_set_remap_address(leaf, remap_ptr, dest_addr);
> +
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + if (dest_length != length) {
> + key.offset = dest_length;
> + btrfs_set_item_key_safe(trans, path, &key);
> + }
> +
> + btrfs_release_path(path);
> +
> + if (dest_length != length) {
> + /* Add remap item for remainder. */
> +
> + ret = add_remap_item(trans, path, new_addr + dest_length,
> + length - dest_length,
> + old_addr + dest_length);
> + if (ret)
> + goto end;
> + }
> +
> + /* Change or remove old backref. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
> + path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + leaf = path->nodes[0];
> +
> + if (dest_length == length) {
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> + if (ret) {
> + btrfs_release_path(path);
> + goto end;
> + }
> + } else {
> + key.objectid += dest_length;
> + key.offset -= dest_length;
> + btrfs_set_item_key_safe(trans, path, &key);
> +
> + btrfs_set_stack_remap_address(&remap, old_addr + dest_length);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> + }
> +
> + btrfs_release_path(path);
> +
> + /* Add new backref. */
> +
> + ret = add_remap_backref_item(trans, path, dest_addr, dest_length,
> + old_addr);
> + if (ret)
> + goto end;
> +
> + adjust_block_group_remap_bytes(trans, bg, -dest_length);
> +
> + ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length);
> + if (ret)
> + goto end;
> +
> + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg, dest_length);
> +
> + mutex_lock(&dest_bg->free_space_lock);
> + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
> + &dest_bg->runtime_flags);
> + mutex_unlock(&dest_bg->free_space_lock);
> + btrfs_put_block_group(dest_bg);
> +
> + if (bg_needs_free_space) {
> + ret = btrfs_add_block_group_free_space(trans, dest_bg);
> + if (ret)
> + goto end;
> + }
> +
> + ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length);
> + if (ret) {
> + btrfs_remove_from_free_space_tree(trans, new_addr,
> + dest_length);
> + goto end;
> + }
> +
> + ret = 0;
> +
> +end:
> + if (mutex_taken)
> + mutex_unlock(&fs_info->remap_mutex);
> +
> + btrfs_dec_block_group_reservations(fs_info, dest_addr);
> +
> + if (ret) {
> + btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0);
> +
> + if (trans) {
> + btrfs_abort_transaction(trans, ret);
> + btrfs_end_transaction(trans);
> + }
> + } else {
> + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
> + btrfs_free_reserved_bytes(dest_bg, dest_length, 0);
> + btrfs_put_block_group(dest_bg);
> +
> + ret = btrfs_commit_transaction(trans);
> + }
> +
> + return ret;
> +}
> +
> +static int move_existing_remaps(struct btrfs_fs_info *fs_info,
> + struct btrfs_block_group *bg,
> + struct btrfs_path *path)
> +{
> + int ret;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + struct btrfs_remap *remap;
> + u64 old_addr;
> +
> + /* Look for backrefs in remap tree. */
> +
> + while (bg->remap_bytes > 0) {
> + key.objectid = bg->start;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = 0;
> +
> + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path,
> + 0, 0);
> + if (ret < 0)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(fs_info->remap_root, path);
> + if (ret < 0) {
> + btrfs_release_path(path);
> + return ret;
> + }
> +
> + if (ret) {
> + btrfs_release_path(path);
> + break;
> + }
> +
> + leaf = path->nodes[0];
> + }
> +
> + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
> +
> + if (key.type != BTRFS_REMAP_BACKREF_KEY) {
> + path->slots[0]++;
> +
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(fs_info->remap_root, path);
> + if (ret < 0) {
> + btrfs_release_path(path);
> + return ret;
> + }
> +
> + if (ret) {
> + btrfs_release_path(path);
> + break;
> + }
> +
> + leaf = path->nodes[0];
> + }
> + }
> +
> + remap = btrfs_item_ptr(leaf, path->slots[0],
> + struct btrfs_remap);
> +
> + old_addr = btrfs_remap_address(leaf, remap);
> +
> + btrfs_release_path(path);
> +
> + ret = move_existing_remap(fs_info, path, bg, key.objectid,
> + key.offset, old_addr);
> + if (ret)
> + return ret;
> + }
> +
> + BUG_ON(bg->remap_bytes > 0);
> +
> + return 0;
> +}
> +
> static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
> struct btrfs_path *path,
> struct btrfs_block_group *bg)
> @@ -4564,6 +5039,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> WARN_ON(ret && ret != -EAGAIN);
>
> if (*using_remap_tree) {
> + if (bg->remap_bytes != 0) {
> + ret = move_existing_remaps(fs_info, bg, path);
> + if (ret)
> + goto out;
> + }
> +
> ret = start_block_group_remapping(fs_info, path, bg);
> goto out;
> }
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 11/16] btrfs: move existing remaps before relocating block group
2025-11-01 0:02 ` Boris Burkov
@ 2025-11-04 13:00 ` Mark Harmstone
0 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-11-04 13:00 UTC (permalink / raw)
To: Boris Burkov; +Cc: linux-btrfs
On 01/11/2025 12.02 am, Boris Burkov wrote:
> On Fri, Oct 24, 2025 at 07:12:12PM +0100, Mark Harmstone wrote:
>> If when relocating a block group we find that `remap_bytes` > 0 in its
>> block group item, that means that it has been the destination block
>> group for another that has been remapped.
>>
>> We need to search the remap tree for any remap backrefs within this
>> range, and move the data to a third block group. This is because
>> otherwise btrfs_translate_remap() could end up following an unbounded
>> chain of remaps, which would only get worse over time.
>>
>> We only relocate one block group at a time, so `remap_bytes` will only
>> ever go down while we are doing this. Once we're finished we set the
>> REMAPPED flag on the block group, which will permanently prevent any
>> other data from being moved to within it.
>>
>> Signed-off-by: Mark Harmstone <mark@harmstone.com>
>> ---
>> fs/btrfs/extent-tree.c | 6 +-
>> fs/btrfs/relocation.c | 481 +++++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 485 insertions(+), 2 deletions(-)
>>
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 1c14e0c82c03..10dc6f8d2f71 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -4545,7 +4545,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>> block_group->cached != BTRFS_CACHE_NO) {
>> down_read(&space_info->groups_sem);
>> if (list_empty(&block_group->list) ||
>> - block_group->ro) {
>> + block_group->ro ||
>> + block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
>> /*
>> * someone is removing this block group,
>> * we can't jump into the have_block_group
>> @@ -4579,7 +4580,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>>
>> ffe_ctl->hinted = false;
>> /* If the block group is read-only, we can skip it entirely. */
>> - if (unlikely(block_group->ro)) {
>> + if (unlikely(block_group->ro) ||
>> + block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
>> if (ffe_ctl->for_treelog)
>> btrfs_clear_treelog_bg(block_group);
>> if (ffe_ctl->for_data_reloc)
>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>> index cd53509c2fda..d31817379078 100644
>> --- a/fs/btrfs/relocation.c
>> +++ b/fs/btrfs/relocation.c
>> @@ -3987,6 +3987,481 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
>> btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
>> }
>>
>> +struct reloc_io_private {
>> + struct completion done;
>> + refcount_t pending_refs;
>> + blk_status_t status;
>> +};
>> +
>> +static void reloc_endio(struct btrfs_bio *bbio)
>> +{
>> + struct reloc_io_private *priv = bbio->private;
>> +
>> + if (bbio->bio.bi_status)
>> + WRITE_ONCE(priv->status, bbio->bio.bi_status);
>> +
>> + if (refcount_dec_and_test(&priv->pending_refs))
>> + complete(&priv->done);
>> +
>> + bio_put(&bbio->bio);
>> +}
>> +
>> +static int copy_remapped_data_io(struct btrfs_fs_info *fs_info,
>> + struct reloc_io_private *priv,
>> + struct page **pages, u64 addr, u64 length,
>> + bool do_write)
>> +{
>> + struct btrfs_bio *bbio;
>> + unsigned long i = 0;
>> + blk_opf_t op = do_write ? REQ_OP_WRITE : REQ_OP_READ;
>> +
>> + init_completion(&priv->done);
>> + refcount_set(&priv->pending_refs, 1);
>> + priv->status = 0;
>> +
>> + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, fs_info, reloc_endio,
>> + priv);
>> + bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
>> +
>> + do {
>> + size_t bytes = min_t(u64, length, PAGE_SIZE);
>> +
>> + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
>> + refcount_inc(&priv->pending_refs);
>> + btrfs_submit_bbio(bbio, 0);
>> +
>> + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, fs_info,
>> + reloc_endio, priv);
>> + bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
>> + continue;
>> + }
>> +
>> + i++;
>> + addr += bytes;
>> + length -= bytes;
>> + } while (length);
>> +
>> + refcount_inc(&priv->pending_refs);
>> + btrfs_submit_bbio(bbio, 0);
>> +
>> + if (!refcount_dec_and_test(&priv->pending_refs))
>> + wait_for_completion_io(&priv->done);
>> +
>> + return blk_status_to_errno(READ_ONCE(priv->status));
>> +}
>> +
>> +static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr,
>> + u64 new_addr, u64 length)
>> +{
>> + int ret;
>> + struct page **pages;
>> + unsigned int nr_pages;
>> + struct reloc_io_private priv;
>> +
>> + nr_pages = DIV_ROUND_UP(length, PAGE_SIZE);
>
> (length + PAGE_SIZE - 1) >> PAGE_SHIFT avoids the division
Nice. The compiler optimizes a / 4096 to a >> 12 anyway, so it's
probably the same asm.
> You may also want to bail out if you detect the bs > ps case Qu is
> working on, as I believe that will require using large folios here.
Yes, good catch. For now I'll make it so it'll refuse to mount in this case.
>> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
>> + if (!pages)
>> + return -ENOMEM;
>> + ret = btrfs_alloc_page_array(nr_pages, pages, 0);
>> + if (ret) {
>> + ret = -ENOMEM;
>> + goto end;
>> + }
>> +
>> + ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr, length,
>> + false);
>> + if (ret)
>> + goto end;
>> +
>> + ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr, length,
>> + true);
>> +
>> +end:
>> + for (unsigned int i = 0; i < nr_pages; i++) {
>> + if (pages[i])
>> + __free_page(pages[i]);
>> + }
>> + kfree(pages);
>> +
>> + return ret;
>> +}
>> +
>> +static int do_copy(struct btrfs_fs_info *fs_info, u64 old_addr, u64 new_addr,
>> + u64 length)
>> +{
>> + int ret;
>> +
>> + /* Copy 1MB at a time, to avoid using too much memory. */
>
> Seems sort of arbitrary.
It is, but so is e.g. how much data we process in async discard at once.
We can always tweak it later if it turns out to be too big or too small.
> How does this relate to the max via BIO_MAX_VECS?
It doesn't, but it probably should. BIO_MAX_VECS == 256, and 256 * 4KB
== 1MB.
So until I fix it to use large folios it should also be capped at
BIO_MAX_VECS << PAGE_SHIFT (which happens to be the same at the moment).
>> +
>> + do {
>> + u64 to_copy = min_t(u64, length, SZ_1M);
>> +
>> + ret = copy_remapped_data(fs_info, old_addr, new_addr,
>> + to_copy);
>> + if (ret)
>> + return ret;
>> +
>> + if (to_copy == length)
>> + break;
>> +
>> + old_addr += to_copy;
>> + new_addr += to_copy;
>> + length -= to_copy;
>> + } while (true);
>> +
>> + return 0;
>> +}
>> +
>> +static int add_remap_item(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path, u64 new_addr, u64 length,
>> + u64 old_addr)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_remap remap;
>> + struct btrfs_key key;
>> + struct extent_buffer *leaf;
>> + int ret;
>> +
>> + key.objectid = old_addr;
>> + key.type = BTRFS_REMAP_KEY;
>> + key.offset = length;
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
>> + &key, sizeof(struct btrfs_remap));
>> + if (ret)
>> + return ret;
>> +
>> + leaf = path->nodes[0];
>> +
>> + btrfs_set_stack_remap_address(&remap, new_addr);
>> +
>> + write_extent_buffer(leaf, &remap,
>> + btrfs_item_ptr_offset(leaf, path->slots[0]),
>> + sizeof(struct btrfs_remap));
>> +
>> + btrfs_release_path(path);
>> +
>> + return 0;
>> +}
>> +
>> +static int add_remap_backref_item(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path, u64 new_addr,
>> + u64 length, u64 old_addr)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_remap remap;
>> + struct btrfs_key key;
>> + struct extent_buffer *leaf;
>> + int ret;
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = length;
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
>> + path, &key, sizeof(struct btrfs_remap));
>> + if (ret)
>> + return ret;
>> +
>> + leaf = path->nodes[0];
>> +
>> + btrfs_set_stack_remap_address(&remap, old_addr);
>> +
>> + write_extent_buffer(leaf, &remap,
>> + btrfs_item_ptr_offset(leaf, path->slots[0]),
>> + sizeof(struct btrfs_remap));
>> +
>> + btrfs_release_path(path);
>> +
>> + return 0;
>> +}
>> +
>> +static int move_existing_remap(struct btrfs_fs_info *fs_info,
>> + struct btrfs_path *path,
>> + struct btrfs_block_group *bg, u64 new_addr,
>> + u64 length, u64 old_addr)
>> +{
>> + struct btrfs_trans_handle *trans;
>> + struct extent_buffer *leaf;
>> + struct btrfs_remap *remap_ptr, remap;
>> + struct btrfs_key key, ins;
>> + u64 dest_addr, dest_length, min_size;
>> + struct btrfs_block_group *dest_bg;
>> + int ret;
>> + bool is_data = bg->flags & BTRFS_BLOCK_GROUP_DATA;
>> + struct btrfs_space_info *sinfo = bg->space_info;
>> + bool mutex_taken = false, bg_needs_free_space;
>> +
>> + spin_lock(&sinfo->lock);
>> + btrfs_space_info_update_bytes_may_use(sinfo, length);
>> + spin_unlock(&sinfo->lock);
>> +
>> + if (is_data)
>> + min_size = fs_info->sectorsize;
>> + else
>> + min_size = fs_info->nodesize;
>> +
>> + ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size,
>> + 0, 0, &ins, is_data, false);
>> + if (ret) {
>> + spin_lock(&sinfo->lock);
>> + btrfs_space_info_update_bytes_may_use(sinfo, -length);
>> + spin_unlock(&sinfo->lock);
>> + return ret;
>> + }
>> +
>> + dest_addr = ins.objectid;
>> + dest_length = ins.offset;
>> +
>> + if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) {
>> + u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize);
>> +
>> + btrfs_free_reserved_extent(fs_info, dest_addr + new_length,
>> + dest_length - new_length, 0);
>> +
>> + dest_length = new_length;
>> + }
>> +
>> + trans = btrfs_join_transaction(fs_info->remap_root);
>> + if (IS_ERR(trans)) {
>> + ret = PTR_ERR(trans);
>> + trans = NULL;
>> + goto end;
>> + }
>> +
>> + mutex_lock(&fs_info->remap_mutex);
>> + mutex_taken = true;
>> +
>> + /* Find old remap entry. */
>> +
>> + key.objectid = old_addr;
>> + key.type = BTRFS_REMAP_KEY;
>> + key.offset = length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
>> + path, 0, 1);
>> + if (ret == 1) {
>> + /*
>> + * Not a problem if the remap entry wasn't found: that means
>> + * that another transaction has deallocated the data.
>> + * move_existing_remaps() loops until the BG contains no
>> + * remaps, so we can just return 0 in this case.
>> + */
>
> I agree with this reasoning. However, what prevents someone from
> deallocating this data after we have found the entry? Is there some
> higher locking that protects us? As far as I can tell if the last extent
> goes away we could delete the remap entry while simultaneously moving
> it here?
Yes, as you said in your other message we're protected by the remap_mutex.
>> + btrfs_release_path(path);
>> + ret = 0;
>> + goto end;
>> + } else if (ret) {
>> + goto end;
>> + }
>> +
>> + ret = do_copy(fs_info, new_addr, dest_addr, dest_length);
>> + if (ret)
>> + goto end;
>> +
>> + /* Change data of old remap entry. */
>> +
>> + leaf = path->nodes[0];
>> +
>> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap);
>> + btrfs_set_remap_address(leaf, remap_ptr, dest_addr);
>> +
>> + btrfs_mark_buffer_dirty(trans, leaf);
>> +
>> + if (dest_length != length) {
>> + key.offset = dest_length;
>> + btrfs_set_item_key_safe(trans, path, &key);
>> + }
>> +
>> + btrfs_release_path(path);
>> +
>> + if (dest_length != length) {
>> + /* Add remap item for remainder. */
>> +
>> + ret = add_remap_item(trans, path, new_addr + dest_length,
>> + length - dest_length,
>> + old_addr + dest_length);
>> + if (ret)
>> + goto end;
>> + }
>> +
>> + /* Change or remove old backref. */
>> +
>> + key.objectid = new_addr;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = length;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
>> + path, -1, 1);
>> + if (ret) {
>> + if (ret == 1) {
>> + btrfs_release_path(path);
>> + ret = -ENOENT;
>> + }
>> + goto end;
>> + }
>> +
>> + leaf = path->nodes[0];
>> +
>> + if (dest_length == length) {
>> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
>> + if (ret) {
>> + btrfs_release_path(path);
>> + goto end;
>> + }
>> + } else {
>> + key.objectid += dest_length;
>> + key.offset -= dest_length;
>> + btrfs_set_item_key_safe(trans, path, &key);
>> +
>> + btrfs_set_stack_remap_address(&remap, old_addr + dest_length);
>> +
>> + write_extent_buffer(leaf, &remap,
>> + btrfs_item_ptr_offset(leaf, path->slots[0]),
>> + sizeof(struct btrfs_remap));
>> + }
>> +
>> + btrfs_release_path(path);
>> +
>> + /* Add new backref. */
>> +
>> + ret = add_remap_backref_item(trans, path, dest_addr, dest_length,
>> + old_addr);
>> + if (ret)
>> + goto end;
>> +
>> + adjust_block_group_remap_bytes(trans, bg, -dest_length);
>> +
>> + ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length);
>> + if (ret)
>> + goto end;
>> +
>> + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg, dest_length);
>> +
>> + mutex_lock(&dest_bg->free_space_lock);
>> + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
>> + &dest_bg->runtime_flags);
>> + mutex_unlock(&dest_bg->free_space_lock);
>> + btrfs_put_block_group(dest_bg);
>> +
>> + if (bg_needs_free_space) {
>> + ret = btrfs_add_block_group_free_space(trans, dest_bg);
>> + if (ret)
>> + goto end;
>> + }
>> +
>> + ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length);
>> + if (ret) {
>> + btrfs_remove_from_free_space_tree(trans, new_addr,
>> + dest_length);
>> + goto end;
>> + }
>> +
>> + ret = 0;
>> +
>> +end:
>> + if (mutex_taken)
>> + mutex_unlock(&fs_info->remap_mutex);
>> +
>> + btrfs_dec_block_group_reservations(fs_info, dest_addr);
>> +
>> + if (ret) {
>> + btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0);
>> +
>> + if (trans) {
>> + btrfs_abort_transaction(trans, ret);
>> + btrfs_end_transaction(trans);
>> + }
>> + } else {
>> + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
>> + btrfs_free_reserved_bytes(dest_bg, dest_length, 0);
>> + btrfs_put_block_group(dest_bg);
>> +
>> + ret = btrfs_commit_transaction(trans);
>> + }
>> +
>> + return ret;
>> +}
>> +
>> +static int move_existing_remaps(struct btrfs_fs_info *fs_info,
>> + struct btrfs_block_group *bg,
>> + struct btrfs_path *path)
>> +{
>> + int ret;
>> + struct btrfs_key key;
>> + struct extent_buffer *leaf;
>> + struct btrfs_remap *remap;
>> + u64 old_addr;
>> +
>> + /* Look for backrefs in remap tree. */
>> +
>> + while (bg->remap_bytes > 0) {
>> + key.objectid = bg->start;
>> + key.type = BTRFS_REMAP_BACKREF_KEY;
>> + key.offset = 0;
>> +
>> + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path,
>> + 0, 0);
>> + if (ret < 0)
>> + return ret;
>> +
>> + leaf = path->nodes[0];
>> +
>> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
>> + ret = btrfs_next_leaf(fs_info->remap_root, path);
>> + if (ret < 0) {
>> + btrfs_release_path(path);
>> + return ret;
>> + }
>> +
>> + if (ret) {
>> + btrfs_release_path(path);
>> + break;
>> + }
>> +
>> + leaf = path->nodes[0];
>> + }
>> +
>> + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
>> +
>> + if (key.type != BTRFS_REMAP_BACKREF_KEY) {
>> + path->slots[0]++;
>> +
>> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
>> + ret = btrfs_next_leaf(fs_info->remap_root, path);
>> + if (ret < 0) {
>> + btrfs_release_path(path);
>> + return ret;
>> + }
>> +
>> + if (ret) {
>> + btrfs_release_path(path);
>> + break;
>> + }
>> +
>> + leaf = path->nodes[0];
>> + }
>> + }
>> +
>> + remap = btrfs_item_ptr(leaf, path->slots[0],
>> + struct btrfs_remap);
>> +
>> + old_addr = btrfs_remap_address(leaf, remap);
>> +
>> + btrfs_release_path(path);
>> +
>> + ret = move_existing_remap(fs_info, path, bg, key.objectid,
>> + key.offset, old_addr);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> + BUG_ON(bg->remap_bytes > 0);
>> +
>> + return 0;
>> +}
>> +
>> static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
>> struct btrfs_path *path,
>> struct btrfs_block_group *bg)
>> @@ -4564,6 +5039,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>> WARN_ON(ret && ret != -EAGAIN);
>>
>> if (*using_remap_tree) {
>> + if (bg->remap_bytes != 0) {
>> + ret = move_existing_remaps(fs_info, bg, path);
>> + if (ret)
>> + goto out;
>> + }
>> +
>> ret = start_block_group_remapping(fs_info, path, bg);
>> goto out;
>> }
>> --
>> 2.49.1
>>
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v4 11/16] btrfs: move existing remaps before relocating block group
2025-10-24 18:12 ` [PATCH v4 11/16] btrfs: move existing remaps before relocating block group Mark Harmstone
2025-11-01 0:02 ` Boris Burkov
@ 2025-11-01 0:10 ` Boris Burkov
1 sibling, 0 replies; 42+ messages in thread
From: Boris Burkov @ 2025-11-01 0:10 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:12PM +0100, Mark Harmstone wrote:
> If when relocating a block group we find that `remap_bytes` > 0 in its
> block group item, that means that it has been the destination block
> group for another that has been remapped.
>
> We need to search the remap tree for any remap backrefs within this
> range, and move the data to a third block group. This is because
> otherwise btrfs_translate_remap() could end up following an unbounded
> chain of remaps, which would only get worse over time.
>
> We only relocate one block group at a time, so `remap_bytes` will only
> ever go down while we are doing this. Once we're finished we set the
> REMAPPED flag on the block group, which will permanently prevent any
> other data from being moved to within it.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
> fs/btrfs/extent-tree.c | 6 +-
> fs/btrfs/relocation.c | 481 +++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 485 insertions(+), 2 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 1c14e0c82c03..10dc6f8d2f71 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4545,7 +4545,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
> block_group->cached != BTRFS_CACHE_NO) {
> down_read(&space_info->groups_sem);
> if (list_empty(&block_group->list) ||
> - block_group->ro) {
> + block_group->ro ||
> + block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
> /*
> * someone is removing this block group,
> * we can't jump into the have_block_group
> @@ -4579,7 +4580,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>
> ffe_ctl->hinted = false;
> /* If the block group is read-only, we can skip it entirely. */
> - if (unlikely(block_group->ro)) {
> + if (unlikely(block_group->ro) ||
> + block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) {
> if (ffe_ctl->for_treelog)
> btrfs_clear_treelog_bg(block_group);
> if (ffe_ctl->for_data_reloc)
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index cd53509c2fda..d31817379078 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -3987,6 +3987,481 @@ static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans,
> btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);
> }
>
> +struct reloc_io_private {
> + struct completion done;
> + refcount_t pending_refs;
> + blk_status_t status;
> +};
> +
> +static void reloc_endio(struct btrfs_bio *bbio)
> +{
> + struct reloc_io_private *priv = bbio->private;
> +
> + if (bbio->bio.bi_status)
> + WRITE_ONCE(priv->status, bbio->bio.bi_status);
> +
> + if (refcount_dec_and_test(&priv->pending_refs))
> + complete(&priv->done);
> +
> + bio_put(&bbio->bio);
> +}
> +
> +static int copy_remapped_data_io(struct btrfs_fs_info *fs_info,
> + struct reloc_io_private *priv,
> + struct page **pages, u64 addr, u64 length,
> + bool do_write)
> +{
> + struct btrfs_bio *bbio;
> + unsigned long i = 0;
> + blk_opf_t op = do_write ? REQ_OP_WRITE : REQ_OP_READ;
> +
> + init_completion(&priv->done);
> + refcount_set(&priv->pending_refs, 1);
> + priv->status = 0;
> +
> + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, fs_info, reloc_endio,
> + priv);
> + bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
> +
> + do {
> + size_t bytes = min_t(u64, length, PAGE_SIZE);
> +
> + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
> + refcount_inc(&priv->pending_refs);
> + btrfs_submit_bbio(bbio, 0);
> +
> + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, fs_info,
> + reloc_endio, priv);
> + bbio->bio.bi_iter.bi_sector = addr >> SECTOR_SHIFT;
> + continue;
> + }
> +
> + i++;
> + addr += bytes;
> + length -= bytes;
> + } while (length);
> +
> + refcount_inc(&priv->pending_refs);
> + btrfs_submit_bbio(bbio, 0);
> +
> + if (!refcount_dec_and_test(&priv->pending_refs))
> + wait_for_completion_io(&priv->done);
> +
> + return blk_status_to_errno(READ_ONCE(priv->status));
> +}
> +
> +static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr,
> + u64 new_addr, u64 length)
> +{
> + int ret;
> + struct page **pages;
> + unsigned int nr_pages;
> + struct reloc_io_private priv;
> +
> + nr_pages = DIV_ROUND_UP(length, PAGE_SIZE);
> + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
> + if (!pages)
> + return -ENOMEM;
> + ret = btrfs_alloc_page_array(nr_pages, pages, 0);
> + if (ret) {
> + ret = -ENOMEM;
> + goto end;
> + }
> +
> + ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr, length,
> + false);
> + if (ret)
> + goto end;
> +
> + ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr, length,
> + true);
> +
> +end:
> + for (unsigned int i = 0; i < nr_pages; i++) {
> + if (pages[i])
> + __free_page(pages[i]);
> + }
> + kfree(pages);
> +
> + return ret;
> +}
> +
> +static int do_copy(struct btrfs_fs_info *fs_info, u64 old_addr, u64 new_addr,
> + u64 length)
> +{
> + int ret;
> +
> + /* Copy 1MB at a time, to avoid using too much memory. */
> +
> + do {
> + u64 to_copy = min_t(u64, length, SZ_1M);
> +
> + ret = copy_remapped_data(fs_info, old_addr, new_addr,
> + to_copy);
> + if (ret)
> + return ret;
> +
> + if (to_copy == length)
> + break;
> +
> + old_addr += to_copy;
> + new_addr += to_copy;
> + length -= to_copy;
> + } while (true);
> +
> + return 0;
> +}
> +
> +static int add_remap_item(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, u64 new_addr, u64 length,
> + u64 old_addr)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_remap remap;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + int ret;
> +
> + key.objectid = old_addr;
> + key.type = BTRFS_REMAP_KEY;
> + key.offset = length;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path,
> + &key, sizeof(struct btrfs_remap));
> + if (ret)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + btrfs_set_stack_remap_address(&remap, new_addr);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + return 0;
> +}
> +
> +static int add_remap_backref_item(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, u64 new_addr,
> + u64 length, u64 old_addr)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_remap remap;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + int ret;
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = length;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &key, sizeof(struct btrfs_remap));
> + if (ret)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + btrfs_set_stack_remap_address(&remap, old_addr);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> +
> + btrfs_release_path(path);
> +
> + return 0;
> +}
> +
> +static int move_existing_remap(struct btrfs_fs_info *fs_info,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg, u64 new_addr,
> + u64 length, u64 old_addr)
> +{
> + struct btrfs_trans_handle *trans;
> + struct extent_buffer *leaf;
> + struct btrfs_remap *remap_ptr, remap;
> + struct btrfs_key key, ins;
> + u64 dest_addr, dest_length, min_size;
> + struct btrfs_block_group *dest_bg;
> + int ret;
> + bool is_data = bg->flags & BTRFS_BLOCK_GROUP_DATA;
> + struct btrfs_space_info *sinfo = bg->space_info;
> + bool mutex_taken = false, bg_needs_free_space;
> +
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, length);
> + spin_unlock(&sinfo->lock);
> +
> + if (is_data)
> + min_size = fs_info->sectorsize;
> + else
> + min_size = fs_info->nodesize;
> +
> + ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size,
> + 0, 0, &ins, is_data, false);
> + if (ret) {
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, -length);
> + spin_unlock(&sinfo->lock);
> + return ret;
> + }
> +
> + dest_addr = ins.objectid;
> + dest_length = ins.offset;
> +
> + if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) {
> + u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize);
> +
> + btrfs_free_reserved_extent(fs_info, dest_addr + new_length,
> + dest_length - new_length, 0);
> +
> + dest_length = new_length;
> + }
> +
> + trans = btrfs_join_transaction(fs_info->remap_root);
> + if (IS_ERR(trans)) {
> + ret = PTR_ERR(trans);
> + trans = NULL;
> + goto end;
> + }
> +
> + mutex_lock(&fs_info->remap_mutex);
I answered my own locking question. I think this mutex protects us from
the remap disappearing out from under us.
I'll take another look at the deletion side of it to make sure it still
makes sense if the remapping here wins the race and moves it before
deletion. :)
> + mutex_taken = true;
> +
> + /* Find old remap entry. */
> +
> + key.objectid = old_addr;
> + key.type = BTRFS_REMAP_KEY;
> + key.offset = length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
> + path, 0, 1);
> + if (ret == 1) {
> + /*
> + * Not a problem if the remap entry wasn't found: that means
> + * that another transaction has deallocated the data.
> + * move_existing_remaps() loops until the BG contains no
> + * remaps, so we can just return 0 in this case.
> + */
> + btrfs_release_path(path);
> + ret = 0;
> + goto end;
> + } else if (ret) {
> + goto end;
> + }
> +
> + ret = do_copy(fs_info, new_addr, dest_addr, dest_length);
> + if (ret)
> + goto end;
> +
> + /* Change data of old remap entry. */
> +
> + leaf = path->nodes[0];
> +
> + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap);
> + btrfs_set_remap_address(leaf, remap_ptr, dest_addr);
> +
> + btrfs_mark_buffer_dirty(trans, leaf);
> +
> + if (dest_length != length) {
> + key.offset = dest_length;
> + btrfs_set_item_key_safe(trans, path, &key);
> + }
> +
> + btrfs_release_path(path);
> +
> + if (dest_length != length) {
> + /* Add remap item for remainder. */
> +
> + ret = add_remap_item(trans, path, new_addr + dest_length,
> + length - dest_length,
> + old_addr + dest_length);
> + if (ret)
> + goto end;
> + }
> +
> + /* Change or remove old backref. */
> +
> + key.objectid = new_addr;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = length;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key,
> + path, -1, 1);
> + if (ret) {
> + if (ret == 1) {
> + btrfs_release_path(path);
> + ret = -ENOENT;
> + }
> + goto end;
> + }
> +
> + leaf = path->nodes[0];
> +
> + if (dest_length == length) {
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> + if (ret) {
> + btrfs_release_path(path);
> + goto end;
> + }
> + } else {
> + key.objectid += dest_length;
> + key.offset -= dest_length;
> + btrfs_set_item_key_safe(trans, path, &key);
> +
> + btrfs_set_stack_remap_address(&remap, old_addr + dest_length);
> +
> + write_extent_buffer(leaf, &remap,
> + btrfs_item_ptr_offset(leaf, path->slots[0]),
> + sizeof(struct btrfs_remap));
> + }
> +
> + btrfs_release_path(path);
> +
> + /* Add new backref. */
> +
> + ret = add_remap_backref_item(trans, path, dest_addr, dest_length,
> + old_addr);
> + if (ret)
> + goto end;
> +
> + adjust_block_group_remap_bytes(trans, bg, -dest_length);
> +
> + ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length);
> + if (ret)
> + goto end;
> +
> + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
> +
> + adjust_block_group_remap_bytes(trans, dest_bg, dest_length);
> +
> + mutex_lock(&dest_bg->free_space_lock);
> + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
> + &dest_bg->runtime_flags);
> + mutex_unlock(&dest_bg->free_space_lock);
> + btrfs_put_block_group(dest_bg);
> +
> + if (bg_needs_free_space) {
> + ret = btrfs_add_block_group_free_space(trans, dest_bg);
> + if (ret)
> + goto end;
> + }
> +
> + ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length);
> + if (ret) {
> + btrfs_remove_from_free_space_tree(trans, new_addr,
> + dest_length);
> + goto end;
> + }
> +
> + ret = 0;
> +
> +end:
> + if (mutex_taken)
> + mutex_unlock(&fs_info->remap_mutex);
> +
> + btrfs_dec_block_group_reservations(fs_info, dest_addr);
> +
> + if (ret) {
> + btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0);
> +
> + if (trans) {
> + btrfs_abort_transaction(trans, ret);
> + btrfs_end_transaction(trans);
> + }
> + } else {
> + dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
> + btrfs_free_reserved_bytes(dest_bg, dest_length, 0);
> + btrfs_put_block_group(dest_bg);
> +
> + ret = btrfs_commit_transaction(trans);
> + }
> +
> + return ret;
> +}
> +
> +static int move_existing_remaps(struct btrfs_fs_info *fs_info,
> + struct btrfs_block_group *bg,
> + struct btrfs_path *path)
> +{
> + int ret;
> + struct btrfs_key key;
> + struct extent_buffer *leaf;
> + struct btrfs_remap *remap;
> + u64 old_addr;
> +
> + /* Look for backrefs in remap tree. */
> +
> + while (bg->remap_bytes > 0) {
> + key.objectid = bg->start;
> + key.type = BTRFS_REMAP_BACKREF_KEY;
> + key.offset = 0;
> +
> + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path,
> + 0, 0);
> + if (ret < 0)
> + return ret;
> +
> + leaf = path->nodes[0];
> +
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(fs_info->remap_root, path);
> + if (ret < 0) {
> + btrfs_release_path(path);
> + return ret;
> + }
> +
> + if (ret) {
> + btrfs_release_path(path);
> + break;
> + }
> +
> + leaf = path->nodes[0];
> + }
> +
> + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
> +
> + if (key.type != BTRFS_REMAP_BACKREF_KEY) {
> + path->slots[0]++;
> +
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(fs_info->remap_root, path);
> + if (ret < 0) {
> + btrfs_release_path(path);
> + return ret;
> + }
> +
> + if (ret) {
> + btrfs_release_path(path);
> + break;
> + }
> +
> + leaf = path->nodes[0];
> + }
> + }
> +
> + remap = btrfs_item_ptr(leaf, path->slots[0],
> + struct btrfs_remap);
> +
> + old_addr = btrfs_remap_address(leaf, remap);
> +
> + btrfs_release_path(path);
> +
> + ret = move_existing_remap(fs_info, path, bg, key.objectid,
> + key.offset, old_addr);
> + if (ret)
> + return ret;
> + }
> +
> + BUG_ON(bg->remap_bytes > 0);
> +
> + return 0;
> +}
> +
> static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
> struct btrfs_path *path,
> struct btrfs_block_group *bg)
> @@ -4564,6 +5039,12 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> WARN_ON(ret && ret != -EAGAIN);
>
> if (*using_remap_tree) {
> + if (bg->remap_bytes != 0) {
> + ret = move_existing_remaps(fs_info, bg, path);
> + if (ret)
> + goto out;
> + }
> +
> ret = start_block_group_remapping(fs_info, path, bg);
> goto out;
> }
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (10 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 11/16] btrfs: move existing remaps before relocating block group Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-11-01 0:09 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 13/16] btrfs: add do_remap param to btrfs_discard_extent() Mark Harmstone
` (3 subsequent siblings)
15 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
Add a function do_remap_tree_reloc(), which does the actual work of
doing a relocation using the remap tree.
In a loop we call do_remap_tree_reloc_trans(), which searches for the
first identity remap for the block group. We call btrfs_reserve_extent()
to find space elsewhere for it, and read the data into memory and write
it to the new location. We then carve out the identity remap and replace
it with an actual remap, which points to the new location in which to
look.
Once the last identity remap has been removed we call
last_identity_remap_gone(), which, as with deletions, removes the
chunk's stripes and device extents.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/relocation.c | 317 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 317 insertions(+)
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index d31817379078..ebbc619be682 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4640,6 +4640,61 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
return ret;
}
+static int find_next_identity_remap(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path, u64 bg_end,
+ u64 last_start, u64 *start,
+ u64 *length)
+{
+ int ret;
+ struct btrfs_key key, found_key;
+ struct btrfs_root *remap_root = trans->fs_info->remap_root;
+ struct extent_buffer *leaf;
+
+ key.objectid = last_start;
+ key.type = BTRFS_IDENTITY_REMAP_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ while (true) {
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(remap_root, path);
+
+ if (ret != 0) {
+ if (ret == 1)
+ ret = -ENOENT;
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+ if (found_key.objectid >= bg_end) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) {
+ *start = found_key.objectid;
+ *length = found_key.offset;
+ ret = 0;
+ goto out;
+ }
+
+ path->slots[0]++;
+ }
+
+out:
+ btrfs_release_path(path);
+
+ return ret;
+}
+
static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
struct btrfs_chunk_map *chunk,
struct btrfs_path *path)
@@ -4753,6 +4808,96 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
btrfs_mark_bg_fully_remapped(bg, trans);
}
+static int add_remap_entry(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_block_group *src_bg, u64 old_addr,
+ u64 new_addr, u64 length)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key key, new_key;
+ int ret;
+ int identity_count_delta = 0;
+
+ key.objectid = old_addr;
+ key.type = (u8)-1;
+ key.offset = (u64)-1;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
+ if (ret < 0)
+ goto end;
+
+ if (path->slots[0] == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+
+ path->slots[0]--;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ if (key.type != BTRFS_IDENTITY_REMAP_KEY ||
+ key.objectid > old_addr ||
+ key.objectid + key.offset <= old_addr) {
+ ret = -ENOENT;
+ goto end;
+ }
+
+ /* Shorten or delete identity mapping entry. */
+
+ if (key.objectid == old_addr) {
+ ret = btrfs_del_item(trans, fs_info->remap_root, path);
+ if (ret)
+ goto end;
+
+ identity_count_delta--;
+ } else {
+ new_key.objectid = key.objectid;
+ new_key.type = BTRFS_IDENTITY_REMAP_KEY;
+ new_key.offset = old_addr - key.objectid;
+
+ btrfs_set_item_key_safe(trans, path, &new_key);
+ }
+
+ btrfs_release_path(path);
+
+ /* Create new remap entry. */
+
+ ret = add_remap_item(trans, path, new_addr, length, old_addr);
+ if (ret)
+ goto end;
+
+ /* Add entry for remainder of identity mapping, if necessary. */
+
+ if (key.objectid + key.offset != old_addr + length) {
+ new_key.objectid = old_addr + length;
+ new_key.type = BTRFS_IDENTITY_REMAP_KEY;
+ new_key.offset = key.objectid + key.offset - old_addr - length;
+
+ ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
+ path, &new_key, 0);
+ if (ret)
+ goto end;
+
+ btrfs_release_path(path);
+
+ identity_count_delta++;
+ }
+
+ /* Add backref. */
+
+ ret = add_remap_backref_item(trans, path, new_addr, length, old_addr);
+ if (ret)
+ goto end;
+
+ if (identity_count_delta != 0)
+ adjust_identity_remap_count(trans, src_bg, identity_count_delta);
+
+end:
+ btrfs_release_path(path);
+
+ return ret;
+}
+
static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
struct btrfs_path *path, uint64_t start)
{
@@ -4802,6 +4947,169 @@ static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
return ret;
}
+static int do_remap_tree_reloc_trans(struct btrfs_fs_info *fs_info,
+ struct btrfs_block_group *src_bg,
+ struct btrfs_path *path, u64 *last_start)
+{
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *extent_root;
+ struct btrfs_key ins;
+ struct btrfs_block_group *dest_bg = NULL;
+ u64 start, remap_length, length, new_addr, min_size;
+ int ret;
+ bool no_more = false;
+ bool is_data = src_bg->flags & BTRFS_BLOCK_GROUP_DATA;
+ bool made_reservation = false, bg_needs_free_space;
+ struct btrfs_space_info *sinfo = src_bg->space_info;
+
+ extent_root = btrfs_extent_root(fs_info, src_bg->start);
+
+ trans = btrfs_start_transaction(extent_root, 0);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+
+ mutex_lock(&fs_info->remap_mutex);
+
+ ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length,
+ *last_start, &start, &remap_length);
+ if (ret == -ENOENT) {
+ no_more = true;
+ goto next;
+ } else if (ret) {
+ mutex_unlock(&fs_info->remap_mutex);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ /* Try to reserve enough space for block. */
+
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, remap_length);
+ spin_unlock(&sinfo->lock);
+
+ if (is_data)
+ min_size = fs_info->sectorsize;
+ else
+ min_size = fs_info->nodesize;
+
+ ret = btrfs_reserve_extent(fs_info->fs_root, remap_length,
+ remap_length, min_size,
+ 0, 0, &ins, is_data, false);
+ if (ret) {
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(sinfo, -remap_length);
+ spin_unlock(&sinfo->lock);
+
+ mutex_unlock(&fs_info->remap_mutex);
+ btrfs_end_transaction(trans);
+ return ret;
+ }
+
+ made_reservation = true;
+
+ new_addr = ins.objectid;
+ length = ins.offset;
+
+ if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) {
+ u64 new_length = ALIGN_DOWN(length, fs_info->nodesize);
+
+ btrfs_free_reserved_extent(fs_info, new_addr + new_length,
+ length - new_length, 0);
+
+ length = new_length;
+ }
+
+ dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
+
+ mutex_lock(&dest_bg->free_space_lock);
+ bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+ &dest_bg->runtime_flags);
+ mutex_unlock(&dest_bg->free_space_lock);
+
+ if (bg_needs_free_space) {
+ ret = btrfs_add_block_group_free_space(trans, dest_bg);
+ if (ret)
+ goto fail;
+ }
+
+ ret = do_copy(fs_info, start, new_addr, length);
+ if (ret)
+ goto fail;
+
+ ret = btrfs_remove_from_free_space_tree(trans, new_addr, length);
+ if (ret)
+ goto fail;
+
+ ret = add_remap_entry(trans, path, src_bg, start, new_addr, length);
+ if (ret) {
+ btrfs_add_to_free_space_tree(trans, new_addr, length);
+ goto fail;
+ }
+
+ adjust_block_group_remap_bytes(trans, dest_bg, length);
+ btrfs_free_reserved_bytes(dest_bg, length, 0);
+
+ spin_lock(&sinfo->lock);
+ sinfo->bytes_readonly += length;
+ spin_unlock(&sinfo->lock);
+
+next:
+ if (dest_bg)
+ btrfs_put_block_group(dest_bg);
+
+ if (made_reservation)
+ btrfs_dec_block_group_reservations(fs_info, new_addr);
+
+ mutex_unlock(&fs_info->remap_mutex);
+
+ if (src_bg->identity_remap_count == 0)
+ btrfs_mark_bg_fully_remapped(src_bg, trans);
+
+ ret = btrfs_end_transaction(trans);
+ if (ret)
+ return ret;
+
+ if (no_more)
+ return 1;
+
+ *last_start = start;
+
+ return 0;
+
+fail:
+ if (dest_bg)
+ btrfs_put_block_group(dest_bg);
+
+ btrfs_free_reserved_extent(fs_info, new_addr, length, 0);
+
+ mutex_unlock(&fs_info->remap_mutex);
+ btrfs_end_transaction(trans);
+
+ return ret;
+}
+
+static int do_remap_tree_reloc(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct btrfs_block_group *bg)
+{
+ u64 last_start;
+ int ret;
+
+ last_start = bg->start;
+
+ while (true) {
+ ret = do_remap_tree_reloc_trans(fs_info, bg, path,
+ &last_start);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+ break;
+ }
+ }
+
+ return ret;
+}
+
int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
u64 *length)
{
@@ -5046,6 +5354,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
}
ret = start_block_group_remapping(fs_info, path, bg);
+ if (ret)
+ goto out;
+
+ ret = do_remap_tree_reloc(fs_info, path, rc->block_group);
+ if (ret)
+ goto out;
+
+ btrfs_delete_unused_bgs(fs_info);
+
goto out;
}
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations
2025-10-24 18:12 ` [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations Mark Harmstone
@ 2025-11-01 0:09 ` Boris Burkov
2025-11-04 14:31 ` Mark Harmstone
0 siblings, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-11-01 0:09 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:13PM +0100, Mark Harmstone wrote:
> Add a function do_remap_tree_reloc(), which does the actual work of
> doing a relocation using the remap tree.
>
> In a loop we call do_remap_tree_reloc_trans(), which searches for the
> first identity remap for the block group. We call btrfs_reserve_extent()
> to find space elsewhere for it, and read the data into memory and write
> it to the new location. We then carve out the identity remap and replace
> it with an actual remap, which points to the new location in which to
> look.
>
> Once the last identity remap has been removed we call
> last_identity_remap_gone(), which, as with deletions, removes the
> chunk's stripes and device extents.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
Want to be as clear as possible on the reservation fragmentation
stuff, but otherwise LGTM.
Reviewed-by: Boris Burkov <boris@bur.io>
> ---
> fs/btrfs/relocation.c | 317 ++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 317 insertions(+)
>
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index d31817379078..ebbc619be682 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -4640,6 +4640,61 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> +static int find_next_identity_remap(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path, u64 bg_end,
> + u64 last_start, u64 *start,
> + u64 *length)
> +{
> + int ret;
> + struct btrfs_key key, found_key;
> + struct btrfs_root *remap_root = trans->fs_info->remap_root;
> + struct extent_buffer *leaf;
> +
> + key.objectid = last_start;
> + key.type = BTRFS_IDENTITY_REMAP_KEY;
> + key.offset = 0;
> +
> + ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0);
> + if (ret < 0)
> + goto out;
> +
> + leaf = path->nodes[0];
> + while (true) {
> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
> + ret = btrfs_next_leaf(remap_root, path);
> +
> + if (ret != 0) {
> + if (ret == 1)
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + leaf = path->nodes[0];
> + }
> +
> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
> +
> + if (found_key.objectid >= bg_end) {
> + ret = -ENOENT;
> + goto out;
> + }
> +
> + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) {
> + *start = found_key.objectid;
> + *length = found_key.offset;
> + ret = 0;
> + goto out;
> + }
> +
> + path->slots[0]++;
> + }
> +
> +out:
> + btrfs_release_path(path);
> +
> + return ret;
> +}
> +
> static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
> struct btrfs_chunk_map *chunk,
> struct btrfs_path *path)
> @@ -4753,6 +4808,96 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
> btrfs_mark_bg_fully_remapped(bg, trans);
> }
>
> +static int add_remap_entry(struct btrfs_trans_handle *trans,
> + struct btrfs_path *path,
> + struct btrfs_block_group *src_bg, u64 old_addr,
> + u64 new_addr, u64 length)
> +{
> + struct btrfs_fs_info *fs_info = trans->fs_info;
> + struct btrfs_key key, new_key;
> + int ret;
> + int identity_count_delta = 0;
> +
> + key.objectid = old_addr;
> + key.type = (u8)-1;
> + key.offset = (u64)-1;
> +
> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
> + if (ret < 0)
> + goto end;
> +
> + if (path->slots[0] == 0) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + path->slots[0]--;
> +
> + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
> +
> + if (key.type != BTRFS_IDENTITY_REMAP_KEY ||
> + key.objectid > old_addr ||
> + key.objectid + key.offset <= old_addr) {
> + ret = -ENOENT;
> + goto end;
> + }
> +
> + /* Shorten or delete identity mapping entry. */
> +
> + if (key.objectid == old_addr) {
> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
> + if (ret)
> + goto end;
> +
> + identity_count_delta--;
> + } else {
> + new_key.objectid = key.objectid;
> + new_key.type = BTRFS_IDENTITY_REMAP_KEY;
> + new_key.offset = old_addr - key.objectid;
> +
> + btrfs_set_item_key_safe(trans, path, &new_key);
> + }
> +
> + btrfs_release_path(path);
> +
> + /* Create new remap entry. */
> +
> + ret = add_remap_item(trans, path, new_addr, length, old_addr);
> + if (ret)
> + goto end;
> +
> + /* Add entry for remainder of identity mapping, if necessary. */
> +
> + if (key.objectid + key.offset != old_addr + length) {
> + new_key.objectid = old_addr + length;
> + new_key.type = BTRFS_IDENTITY_REMAP_KEY;
> + new_key.offset = key.objectid + key.offset - old_addr - length;
> +
> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
> + path, &new_key, 0);
> + if (ret)
> + goto end;
> +
> + btrfs_release_path(path);
> +
> + identity_count_delta++;
> + }
> +
> + /* Add backref. */
> +
> + ret = add_remap_backref_item(trans, path, new_addr, length, old_addr);
> + if (ret)
> + goto end;
> +
> + if (identity_count_delta != 0)
> + adjust_identity_remap_count(trans, src_bg, identity_count_delta);
> +
> +end:
> + btrfs_release_path(path);
> +
> + return ret;
> +}
> +
> static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
> struct btrfs_path *path, uint64_t start)
> {
> @@ -4802,6 +4947,169 @@ static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
> return ret;
> }
>
> +static int do_remap_tree_reloc_trans(struct btrfs_fs_info *fs_info,
> + struct btrfs_block_group *src_bg,
> + struct btrfs_path *path, u64 *last_start)
> +{
> + struct btrfs_trans_handle *trans;
> + struct btrfs_root *extent_root;
> + struct btrfs_key ins;
> + struct btrfs_block_group *dest_bg = NULL;
> + u64 start, remap_length, length, new_addr, min_size;
> + int ret;
> + bool no_more = false;
> + bool is_data = src_bg->flags & BTRFS_BLOCK_GROUP_DATA;
> + bool made_reservation = false, bg_needs_free_space;
> + struct btrfs_space_info *sinfo = src_bg->space_info;
> +
> + extent_root = btrfs_extent_root(fs_info, src_bg->start);
> +
> + trans = btrfs_start_transaction(extent_root, 0);
> + if (IS_ERR(trans))
> + return PTR_ERR(trans);
> +
> + mutex_lock(&fs_info->remap_mutex);
> +
> + ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length,
> + *last_start, &start, &remap_length);
> + if (ret == -ENOENT) {
> + no_more = true;
> + goto next;
> + } else if (ret) {
> + mutex_unlock(&fs_info->remap_mutex);
> + btrfs_end_transaction(trans);
> + return ret;
> + }
> +
> + /* Try to reserve enough space for block. */
> +
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, remap_length);
> + spin_unlock(&sinfo->lock);
> +
> + if (is_data)
> + min_size = fs_info->sectorsize;
> + else
> + min_size = fs_info->nodesize;
As Qu mentioned, I think it makes sense to not change too much at once
and not add the extra fragmentation factor baked in with the remap tree.
This isn't a format change, so we can change it later if we have data
showing lots of failing relocations that would support it.
On the other hand, we are relocating one contiguous non-free range at a
time rather than one extent at a time, so maybe this is actually quite
necessary. Let's document / highlight that, if that's the reasoning.
> +
> + ret = btrfs_reserve_extent(fs_info->fs_root, remap_length,
> + remap_length, min_size,
> + 0, 0, &ins, is_data, false);
> + if (ret) {
> + spin_lock(&sinfo->lock);
> + btrfs_space_info_update_bytes_may_use(sinfo, -remap_length);
> + spin_unlock(&sinfo->lock);
> +
> + mutex_unlock(&fs_info->remap_mutex);
> + btrfs_end_transaction(trans);
> + return ret;
> + }
> +
> + made_reservation = true;
> +
> + new_addr = ins.objectid;
> + length = ins.offset;
> +
> + if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) {
> + u64 new_length = ALIGN_DOWN(length, fs_info->nodesize);
> +
> + btrfs_free_reserved_extent(fs_info, new_addr + new_length,
> + length - new_length, 0);
> +
> + length = new_length;
> + }
> +
> + dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
> +
> + mutex_lock(&dest_bg->free_space_lock);
> + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
> + &dest_bg->runtime_flags);
> + mutex_unlock(&dest_bg->free_space_lock);
> +
> + if (bg_needs_free_space) {
> + ret = btrfs_add_block_group_free_space(trans, dest_bg);
> + if (ret)
> + goto fail;
> + }
> +
> + ret = do_copy(fs_info, start, new_addr, length);
> + if (ret)
> + goto fail;
> +
> + ret = btrfs_remove_from_free_space_tree(trans, new_addr, length);
> + if (ret)
> + goto fail;
> +
> + ret = add_remap_entry(trans, path, src_bg, start, new_addr, length);
> + if (ret) {
> + btrfs_add_to_free_space_tree(trans, new_addr, length);
> + goto fail;
> + }
> +
> + adjust_block_group_remap_bytes(trans, dest_bg, length);
> + btrfs_free_reserved_bytes(dest_bg, length, 0);
> +
> + spin_lock(&sinfo->lock);
> + sinfo->bytes_readonly += length;
> + spin_unlock(&sinfo->lock);
> +
> +next:
> + if (dest_bg)
> + btrfs_put_block_group(dest_bg);
> +
> + if (made_reservation)
> + btrfs_dec_block_group_reservations(fs_info, new_addr);
> +
> + mutex_unlock(&fs_info->remap_mutex);
> +
> + if (src_bg->identity_remap_count == 0)
> + btrfs_mark_bg_fully_remapped(src_bg, trans);
> +
> + ret = btrfs_end_transaction(trans);
> + if (ret)
> + return ret;
> +
> + if (no_more)
> + return 1;
> +
> + *last_start = start;
> +
> + return 0;
> +
> +fail:
> + if (dest_bg)
> + btrfs_put_block_group(dest_bg);
> +
> + btrfs_free_reserved_extent(fs_info, new_addr, length, 0);
> +
> + mutex_unlock(&fs_info->remap_mutex);
> + btrfs_end_transaction(trans);
> +
> + return ret;
> +}
> +
> +static int do_remap_tree_reloc(struct btrfs_fs_info *fs_info,
> + struct btrfs_path *path,
> + struct btrfs_block_group *bg)
> +{
> + u64 last_start;
> + int ret;
> +
> + last_start = bg->start;
> +
> + while (true) {
> + ret = do_remap_tree_reloc_trans(fs_info, bg, path,
> + &last_start);
> + if (ret) {
> + if (ret == 1)
> + ret = 0;
> + break;
> + }
> + }
> +
> + return ret;
> +}
> +
> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
> u64 *length)
> {
> @@ -5046,6 +5354,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
> }
>
> ret = start_block_group_remapping(fs_info, path, bg);
> + if (ret)
> + goto out;
> +
> + ret = do_remap_tree_reloc(fs_info, path, rc->block_group);
> + if (ret)
> + goto out;
> +
> + btrfs_delete_unused_bgs(fs_info);
> +
> goto out;
> }
>
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations
2025-11-01 0:09 ` Boris Burkov
@ 2025-11-04 14:31 ` Mark Harmstone
0 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-11-04 14:31 UTC (permalink / raw)
To: Boris Burkov; +Cc: linux-btrfs
On 01/11/2025 12.09 am, Boris Burkov wrote:
> On Fri, Oct 24, 2025 at 07:12:13PM +0100, Mark Harmstone wrote:
>> Add a function do_remap_tree_reloc(), which does the actual work of
>> doing a relocation using the remap tree.
>>
>> In a loop we call do_remap_tree_reloc_trans(), which searches for the
>> first identity remap for the block group. We call btrfs_reserve_extent()
>> to find space elsewhere for it, and read the data into memory and write
>> it to the new location. We then carve out the identity remap and replace
>> it with an actual remap, which points to the new location in which to
>> look.
>>
>> Once the last identity remap has been removed we call
>> last_identity_remap_gone(), which, as with deletions, removes the
>> chunk's stripes and device extents.
>>
>> Signed-off-by: Mark Harmstone <mark@harmstone.com>
>
> Want to be as clear as possible on the reservation fragmentation
> stuff, but otherwise LGTM.
>
> Reviewed-by: Boris Burkov <boris@bur.io>
>> ---
>> fs/btrfs/relocation.c | 317 ++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 317 insertions(+)
>>
>> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
>> index d31817379078..ebbc619be682 100644
>> --- a/fs/btrfs/relocation.c
>> +++ b/fs/btrfs/relocation.c
>> @@ -4640,6 +4640,61 @@ static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
>> return ret;
>> }
>>
>> +static int find_next_identity_remap(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path, u64 bg_end,
>> + u64 last_start, u64 *start,
>> + u64 *length)
>> +{
>> + int ret;
>> + struct btrfs_key key, found_key;
>> + struct btrfs_root *remap_root = trans->fs_info->remap_root;
>> + struct extent_buffer *leaf;
>> +
>> + key.objectid = last_start;
>> + key.type = BTRFS_IDENTITY_REMAP_KEY;
>> + key.offset = 0;
>> +
>> + ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0);
>> + if (ret < 0)
>> + goto out;
>> +
>> + leaf = path->nodes[0];
>> + while (true) {
>> + if (path->slots[0] >= btrfs_header_nritems(leaf)) {
>> + ret = btrfs_next_leaf(remap_root, path);
>> +
>> + if (ret != 0) {
>> + if (ret == 1)
>> + ret = -ENOENT;
>> + goto out;
>> + }
>> +
>> + leaf = path->nodes[0];
>> + }
>> +
>> + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
>> +
>> + if (found_key.objectid >= bg_end) {
>> + ret = -ENOENT;
>> + goto out;
>> + }
>> +
>> + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) {
>> + *start = found_key.objectid;
>> + *length = found_key.offset;
>> + ret = 0;
>> + goto out;
>> + }
>> +
>> + path->slots[0]++;
>> + }
>> +
>> +out:
>> + btrfs_release_path(path);
>> +
>> + return ret;
>> +}
>> +
>> static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
>> struct btrfs_chunk_map *chunk,
>> struct btrfs_path *path)
>> @@ -4753,6 +4808,96 @@ static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
>> btrfs_mark_bg_fully_remapped(bg, trans);
>> }
>>
>> +static int add_remap_entry(struct btrfs_trans_handle *trans,
>> + struct btrfs_path *path,
>> + struct btrfs_block_group *src_bg, u64 old_addr,
>> + u64 new_addr, u64 length)
>> +{
>> + struct btrfs_fs_info *fs_info = trans->fs_info;
>> + struct btrfs_key key, new_key;
>> + int ret;
>> + int identity_count_delta = 0;
>> +
>> + key.objectid = old_addr;
>> + key.type = (u8)-1;
>> + key.offset = (u64)-1;
>> +
>> + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
>> + if (ret < 0)
>> + goto end;
>> +
>> + if (path->slots[0] == 0) {
>> + ret = -ENOENT;
>> + goto end;
>> + }
>> +
>> + path->slots[0]--;
>> +
>> + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
>> +
>> + if (key.type != BTRFS_IDENTITY_REMAP_KEY ||
>> + key.objectid > old_addr ||
>> + key.objectid + key.offset <= old_addr) {
>> + ret = -ENOENT;
>> + goto end;
>> + }
>> +
>> + /* Shorten or delete identity mapping entry. */
>> +
>> + if (key.objectid == old_addr) {
>> + ret = btrfs_del_item(trans, fs_info->remap_root, path);
>> + if (ret)
>> + goto end;
>> +
>> + identity_count_delta--;
>> + } else {
>> + new_key.objectid = key.objectid;
>> + new_key.type = BTRFS_IDENTITY_REMAP_KEY;
>> + new_key.offset = old_addr - key.objectid;
>> +
>> + btrfs_set_item_key_safe(trans, path, &new_key);
>> + }
>> +
>> + btrfs_release_path(path);
>> +
>> + /* Create new remap entry. */
>> +
>> + ret = add_remap_item(trans, path, new_addr, length, old_addr);
>> + if (ret)
>> + goto end;
>> +
>> + /* Add entry for remainder of identity mapping, if necessary. */
>> +
>> + if (key.objectid + key.offset != old_addr + length) {
>> + new_key.objectid = old_addr + length;
>> + new_key.type = BTRFS_IDENTITY_REMAP_KEY;
>> + new_key.offset = key.objectid + key.offset - old_addr - length;
>> +
>> + ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
>> + path, &new_key, 0);
>> + if (ret)
>> + goto end;
>> +
>> + btrfs_release_path(path);
>> +
>> + identity_count_delta++;
>> + }
>> +
>> + /* Add backref. */
>> +
>> + ret = add_remap_backref_item(trans, path, new_addr, length, old_addr);
>> + if (ret)
>> + goto end;
>> +
>> + if (identity_count_delta != 0)
>> + adjust_identity_remap_count(trans, src_bg, identity_count_delta);
>> +
>> +end:
>> + btrfs_release_path(path);
>> +
>> + return ret;
>> +}
>> +
>> static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
>> struct btrfs_path *path, uint64_t start)
>> {
>> @@ -4802,6 +4947,169 @@ static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
>> return ret;
>> }
>>
>> +static int do_remap_tree_reloc_trans(struct btrfs_fs_info *fs_info,
>> + struct btrfs_block_group *src_bg,
>> + struct btrfs_path *path, u64 *last_start)
>> +{
>> + struct btrfs_trans_handle *trans;
>> + struct btrfs_root *extent_root;
>> + struct btrfs_key ins;
>> + struct btrfs_block_group *dest_bg = NULL;
>> + u64 start, remap_length, length, new_addr, min_size;
>> + int ret;
>> + bool no_more = false;
>> + bool is_data = src_bg->flags & BTRFS_BLOCK_GROUP_DATA;
>> + bool made_reservation = false, bg_needs_free_space;
>> + struct btrfs_space_info *sinfo = src_bg->space_info;
>> +
>> + extent_root = btrfs_extent_root(fs_info, src_bg->start);
>> +
>> + trans = btrfs_start_transaction(extent_root, 0);
>> + if (IS_ERR(trans))
>> + return PTR_ERR(trans);
>> +
>> + mutex_lock(&fs_info->remap_mutex);
>> +
>> + ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length,
>> + *last_start, &start, &remap_length);
>> + if (ret == -ENOENT) {
>> + no_more = true;
>> + goto next;
>> + } else if (ret) {
>> + mutex_unlock(&fs_info->remap_mutex);
>> + btrfs_end_transaction(trans);
>> + return ret;
>> + }
>> +
>> + /* Try to reserve enough space for block. */
>> +
>> + spin_lock(&sinfo->lock);
>> + btrfs_space_info_update_bytes_may_use(sinfo, remap_length);
>> + spin_unlock(&sinfo->lock);
>> +
>> + if (is_data)
>> + min_size = fs_info->sectorsize;
>> + else
>> + min_size = fs_info->nodesize;
>
> As Qu mentioned, I think it makes sense to not change too much at once
> and not add the extra fragmentation factor baked in with remap tree.
> This isn't a format change so we can change it later if we have data
> about lots of failing relocations to support that.
>
> On the other hand, we are going contiguous non-free at a time rather
> than extent at a time, so maybe this is actually quite necessary.
Exactly, this is a contiguous non-free range, rather than an extent in
the same sense as in the extent tree. I'll add a comment here as the
call to btrfs_reserve_extent() is misleading.
Reworking this to deal with actual extents would make it considerably
more complicated: the initial state of the remap tree is derived from
the free-space tree, and the free-space tree doesn't know about extents.
> Let's document / highlight that if it's the reasoning.
>
>> +
>> + ret = btrfs_reserve_extent(fs_info->fs_root, remap_length,
>> + remap_length, min_size,
>> + 0, 0, &ins, is_data, false);
>> + if (ret) {
>> + spin_lock(&sinfo->lock);
>> + btrfs_space_info_update_bytes_may_use(sinfo, -remap_length);
>> + spin_unlock(&sinfo->lock);
>> +
>> + mutex_unlock(&fs_info->remap_mutex);
>> + btrfs_end_transaction(trans);
>> + return ret;
>> + }
>> +
>> + made_reservation = true;
>> +
>> + new_addr = ins.objectid;
>> + length = ins.offset;
>> +
>> + if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) {
>> + u64 new_length = ALIGN_DOWN(length, fs_info->nodesize);
>> +
>> + btrfs_free_reserved_extent(fs_info, new_addr + new_length,
>> + length - new_length, 0);
>> +
>> + length = new_length;
>> + }
>> +
>> + dest_bg = btrfs_lookup_block_group(fs_info, new_addr);
>> +
>> + mutex_lock(&dest_bg->free_space_lock);
>> + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
>> + &dest_bg->runtime_flags);
>> + mutex_unlock(&dest_bg->free_space_lock);
>> +
>> + if (bg_needs_free_space) {
>> + ret = btrfs_add_block_group_free_space(trans, dest_bg);
>> + if (ret)
>> + goto fail;
>> + }
>> +
>> + ret = do_copy(fs_info, start, new_addr, length);
>> + if (ret)
>> + goto fail;
>> +
>> + ret = btrfs_remove_from_free_space_tree(trans, new_addr, length);
>> + if (ret)
>> + goto fail;
>> +
>> + ret = add_remap_entry(trans, path, src_bg, start, new_addr, length);
>> + if (ret) {
>> + btrfs_add_to_free_space_tree(trans, new_addr, length);
>> + goto fail;
>> + }
>> +
>> + adjust_block_group_remap_bytes(trans, dest_bg, length);
>> + btrfs_free_reserved_bytes(dest_bg, length, 0);
>> +
>> + spin_lock(&sinfo->lock);
>> + sinfo->bytes_readonly += length;
>> + spin_unlock(&sinfo->lock);
>> +
>> +next:
>> + if (dest_bg)
>> + btrfs_put_block_group(dest_bg);
>> +
>> + if (made_reservation)
>> + btrfs_dec_block_group_reservations(fs_info, new_addr);
>> +
>> + mutex_unlock(&fs_info->remap_mutex);
>> +
>> + if (src_bg->identity_remap_count == 0)
>> + btrfs_mark_bg_fully_remapped(src_bg, trans);
>> +
>> + ret = btrfs_end_transaction(trans);
>> + if (ret)
>> + return ret;
>> +
>> + if (no_more)
>> + return 1;
>> +
>> + *last_start = start;
>> +
>> + return 0;
>> +
>> +fail:
>> + if (dest_bg)
>> + btrfs_put_block_group(dest_bg);
>> +
>> + btrfs_free_reserved_extent(fs_info, new_addr, length, 0);
>> +
>> + mutex_unlock(&fs_info->remap_mutex);
>> + btrfs_end_transaction(trans);
>> +
>> + return ret;
>> +}
>> +
>> +static int do_remap_tree_reloc(struct btrfs_fs_info *fs_info,
>> + struct btrfs_path *path,
>> + struct btrfs_block_group *bg)
>> +{
>> + u64 last_start;
>> + int ret;
>> +
>> + last_start = bg->start;
>> +
>> + while (true) {
>> + ret = do_remap_tree_reloc_trans(fs_info, bg, path,
>> + &last_start);
>> + if (ret) {
>> + if (ret == 1)
>> + ret = 0;
>> + break;
>> + }
>> + }
>> +
>> + return ret;
>> +}
>> +
>> int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical,
>> u64 *length)
>> {
>> @@ -5046,6 +5354,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start,
>> }
>>
>> ret = start_block_group_remapping(fs_info, path, bg);
>> + if (ret)
>> + goto out;
>> +
>> + ret = do_remap_tree_reloc(fs_info, path, rc->block_group);
>> + if (ret)
>> + goto out;
>> +
>> + btrfs_delete_unused_bgs(fs_info);
>> +
>> goto out;
>> }
>>
>> --
>> 2.49.1
>>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 13/16] btrfs: add do_remap param to btrfs_discard_extent()
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (11 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 12/16] btrfs: replace identity remaps with actual remaps when doing relocations Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-11-01 0:12 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 14/16] btrfs: allow balancing remap tree Mark Harmstone
` (2 subsequent siblings)
15 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
btrfs_discard_extent() can be called either when an extent is removed
or from walking the free-space tree. With a remapped block group these
two things are no longer equivalent: the extent's addresses are
remapped, while the free-space tree exclusively uses underlying
addresses.
Add a do_remap parameter to btrfs_discard_extent() and
btrfs_map_discard(), saying whether or not the address needs to be run
through the remap tree first.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/extent-tree.c | 11 +++++++----
fs/btrfs/extent-tree.h | 2 +-
fs/btrfs/free-space-cache.c | 2 +-
fs/btrfs/inode.c | 2 +-
fs/btrfs/volumes.c | 24 ++++++++++++++++++++++--
fs/btrfs/volumes.h | 2 +-
6 files changed, 33 insertions(+), 10 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 10dc6f8d2f71..82dc88915b7e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1380,7 +1380,7 @@ static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
}
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes)
+ u64 num_bytes, u64 *actual_bytes, bool do_remap)
{
int ret = 0;
u64 discarded_bytes = 0;
@@ -1398,7 +1398,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
int i;
num_bytes = end - cur;
- stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
+ stripes = btrfs_map_discard(fs_info, cur, &num_bytes,
+ &num_stripes, do_remap);
if (IS_ERR(stripes)) {
ret = PTR_ERR(stripes);
if (ret == -EOPNOTSUPP)
@@ -2914,7 +2915,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
if (btrfs_test_opt(fs_info, DISCARD_SYNC))
ret = btrfs_discard_extent(fs_info, start,
- end + 1 - start, NULL);
+ end + 1 - start, NULL,
+ true);
next_state = btrfs_next_extent_state(unpin, cached_state);
btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
@@ -2972,7 +2974,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
ret = -EROFS;
if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info, block_group->start,
- block_group->length, NULL);
+ block_group->length, NULL,
+ true);
/*
* Not strictly necessary to lock, as the block_group should be
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index 6b67a4e528da..721b03d682b4 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -162,7 +162,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *parent);
void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
- u64 num_bytes, u64 *actual_bytes);
+ u64 num_bytes, u64 *actual_bytes, bool do_remap);
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ec9a97d75d10..91670d0af179 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3675,7 +3675,7 @@ static int do_trimming(struct btrfs_block_group *block_group,
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
- ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
+ ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed, false);
if (!ret) {
*total_trimmed += trimmed;
trim_state = BTRFS_TRIM_STATE_TRIMMED;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 79732756b87f..b31f6f1d53b0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3305,7 +3305,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
btrfs_discard_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes,
- NULL);
+ NULL, true);
btrfs_free_reserved_extent(fs_info,
ordered_extent->disk_bytenr,
ordered_extent->disk_num_bytes, true);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index cda94c6f5239..76c521485542 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3472,7 +3472,8 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
* filesystem's point of view.
*/
if (btrfs_is_zoned(fs_info)) {
- ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
+ ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL,
+ true);
if (ret)
btrfs_info(fs_info,
"failed to reset zone %llu after relocation",
@@ -6112,7 +6113,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc)
*/
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- u32 *num_stripes)
+ u32 *num_stripes, bool do_remap)
{
struct btrfs_chunk_map *map;
struct btrfs_discard_stripe *stripes;
@@ -6136,6 +6137,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
if (IS_ERR(map))
return ERR_CAST(map);
+ if (do_remap && map->type & BTRFS_BLOCK_GROUP_REMAPPED) {
+ u64 new_logical = logical;
+
+ ret = btrfs_translate_remap(fs_info, &new_logical, &length);
+ if (ret)
+ goto out_free_map;
+
+ if (new_logical != logical) {
+ btrfs_free_chunk_map(map);
+
+ map = btrfs_get_chunk_map(fs_info, new_logical,
+ length);
+ if (IS_ERR(map))
+ return ERR_CAST(map);
+
+ logical = new_logical;
+ }
+ }
+
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 0c64cae59f1c..ce8751c1b06a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -732,7 +732,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
u32 length, int mirror_num);
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
- u32 *num_stripes);
+ u32 *num_stripes, bool do_remap);
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 13/16] btrfs: add do_remap param to btrfs_discard_extent()
2025-10-24 18:12 ` [PATCH v4 13/16] btrfs: add do_remap param to btrfs_discard_extent() Mark Harmstone
@ 2025-11-01 0:12 ` Boris Burkov
0 siblings, 0 replies; 42+ messages in thread
From: Boris Burkov @ 2025-11-01 0:12 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:14PM +0100, Mark Harmstone wrote:
> btrfs_discard_extent() can be called either when an extent is removed
> or from walking the free-space tree. With a remapped block group these
> two things are no longer equivalent: the extent's addresses are
> remapped, while the free-space tree exclusively uses underlying
> addresses.
>
> Add a do_remap parameter to btrfs_discard_extent() and
> btrfs_map_discard(), saying whether or not the address needs to be run
> through the remap tree first.
>
Reviewed-by: Boris Burkov <boris@bur.io>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
> fs/btrfs/extent-tree.c | 11 +++++++----
> fs/btrfs/extent-tree.h | 2 +-
> fs/btrfs/free-space-cache.c | 2 +-
> fs/btrfs/inode.c | 2 +-
> fs/btrfs/volumes.c | 24 ++++++++++++++++++++++--
> fs/btrfs/volumes.h | 2 +-
> 6 files changed, 33 insertions(+), 10 deletions(-)
>
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 10dc6f8d2f71..82dc88915b7e 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -1380,7 +1380,7 @@ static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes)
> }
>
> int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
> - u64 num_bytes, u64 *actual_bytes)
> + u64 num_bytes, u64 *actual_bytes, bool do_remap)
> {
> int ret = 0;
> u64 discarded_bytes = 0;
> @@ -1398,7 +1398,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
> int i;
>
> num_bytes = end - cur;
> - stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes);
> + stripes = btrfs_map_discard(fs_info, cur, &num_bytes,
> + &num_stripes, do_remap);
> if (IS_ERR(stripes)) {
> ret = PTR_ERR(stripes);
> if (ret == -EOPNOTSUPP)
> @@ -2914,7 +2915,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
>
> if (btrfs_test_opt(fs_info, DISCARD_SYNC))
> ret = btrfs_discard_extent(fs_info, start,
> - end + 1 - start, NULL);
> + end + 1 - start, NULL,
> + true);
>
> next_state = btrfs_next_extent_state(unpin, cached_state);
> btrfs_clear_extent_dirty(unpin, start, end, &cached_state);
> @@ -2972,7 +2974,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
> ret = -EROFS;
> if (!TRANS_ABORTED(trans))
> ret = btrfs_discard_extent(fs_info, block_group->start,
> - block_group->length, NULL);
> + block_group->length, NULL,
> + true);
>
> /*
> * Not strictly necessary to lock, as the block_group should be
> diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
> index 6b67a4e528da..721b03d682b4 100644
> --- a/fs/btrfs/extent-tree.h
> +++ b/fs/btrfs/extent-tree.h
> @@ -162,7 +162,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
> struct extent_buffer *parent);
> void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
> int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
> - u64 num_bytes, u64 *actual_bytes);
> + u64 num_bytes, u64 *actual_bytes, bool do_remap);
> int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
> int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans);
>
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index ec9a97d75d10..91670d0af179 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -3675,7 +3675,7 @@ static int do_trimming(struct btrfs_block_group *block_group,
> spin_unlock(&block_group->lock);
> spin_unlock(&space_info->lock);
>
> - ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed);
> + ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed, false);
> if (!ret) {
> *total_trimmed += trimmed;
> trim_state = BTRFS_TRIM_STATE_TRIMMED;
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 79732756b87f..b31f6f1d53b0 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -3305,7 +3305,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
> btrfs_discard_extent(fs_info,
> ordered_extent->disk_bytenr,
> ordered_extent->disk_num_bytes,
> - NULL);
> + NULL, true);
> btrfs_free_reserved_extent(fs_info,
> ordered_extent->disk_bytenr,
> ordered_extent->disk_num_bytes, true);
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index cda94c6f5239..76c521485542 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -3472,7 +3472,8 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset,
> * filesystem's point of view.
> */
> if (btrfs_is_zoned(fs_info)) {
> - ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
> + ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL,
> + true);
> if (ret)
> btrfs_info(fs_info,
> "failed to reset zone %llu after relocation",
> @@ -6112,7 +6113,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc)
> */
> struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
> u64 logical, u64 *length_ret,
> - u32 *num_stripes)
> + u32 *num_stripes, bool do_remap)
> {
> struct btrfs_chunk_map *map;
> struct btrfs_discard_stripe *stripes;
> @@ -6136,6 +6137,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
> if (IS_ERR(map))
> return ERR_CAST(map);
>
> + if (do_remap && map->type & BTRFS_BLOCK_GROUP_REMAPPED) {
> + u64 new_logical = logical;
> +
> + ret = btrfs_translate_remap(fs_info, &new_logical, &length);
> + if (ret)
> + goto out_free_map;
> +
> + if (new_logical != logical) {
> + btrfs_free_chunk_map(map);
> +
> + map = btrfs_get_chunk_map(fs_info, new_logical,
> + length);
> + if (IS_ERR(map))
> + return ERR_CAST(map);
> +
> + logical = new_logical;
> + }
> + }
> +
> /* we don't discard raid56 yet */
> if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
> ret = -EOPNOTSUPP;
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index 0c64cae59f1c..ce8751c1b06a 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -732,7 +732,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
> u32 length, int mirror_num);
> struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
> u64 logical, u64 *length_ret,
> - u32 *num_stripes);
> + u32 *num_stripes, bool do_remap);
> int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
> int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
> struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 14/16] btrfs: allow balancing remap tree
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (12 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 13/16] btrfs: add do_remap param to btrfs_discard_extent() Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups Mark Harmstone
2025-10-24 18:12 ` [PATCH v4 16/16] btrfs: add stripe removal pending flag Mark Harmstone
15 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone, Boris Burkov
Balancing the REMAP chunk, i.e. the chunk in which the remap tree lives,
is a special case.
We can't use the remap tree itself for this, as then we'd have no way to
bootstrap it on mount. And we can't use the pre-remap tree code for this
as it relies on walking the extent tree, and we're not creating backrefs
for REMAP chunks.
So instead, if a balance would relocate any REMAP block groups, mark
those block groups as readonly and COW every leaf of the remap tree.
There's more sophisticated ways of doing this, such as only COWing nodes
within a block group that's to be relocated, but they're fiddly and with
lots of edge cases. Plus it's not anticipated that a) the number of
REMAP chunks is going to be particularly large, or b) that users will
want to only relocate some of these chunks - the main use case here is
to unbreak RAID conversion and device removal.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
Reviewed-by: Boris Burkov <boris@bur.io>
---
fs/btrfs/volumes.c | 159 +++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 155 insertions(+), 4 deletions(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 76c521485542..967a1d13cf59 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3990,8 +3990,11 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk
struct btrfs_balance_args *bargs = NULL;
u64 chunk_type = btrfs_chunk_type(leaf, chunk);
- if (chunk_type & BTRFS_BLOCK_GROUP_REMAP)
- return false;
+ /* treat REMAP chunks as METADATA */
+ if (chunk_type & BTRFS_BLOCK_GROUP_REMAP) {
+ chunk_type &= ~BTRFS_BLOCK_GROUP_REMAP;
+ chunk_type |= BTRFS_BLOCK_GROUP_METADATA;
+ }
/* type filter */
if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
@@ -4074,6 +4077,113 @@ static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk
return true;
}
+struct remap_chunk_info {
+ struct list_head list;
+ u64 offset;
+ struct btrfs_block_group *bg;
+ bool made_ro;
+};
+
+static int cow_remap_tree(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_key key = { 0 };
+ int ret;
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1);
+ if (ret < 0)
+ return ret;
+
+ while (true) {
+ ret = btrfs_next_leaf(fs_info->remap_root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ ret = 0;
+ break;
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+ btrfs_release_path(path);
+
+ ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path,
+ 0, 1);
+ if (ret < 0)
+ break;
+ }
+
+ return ret;
+}
+
+static int balance_remap_chunks(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct list_head *chunks)
+{
+ struct remap_chunk_info *rci, *tmp;
+ struct btrfs_trans_handle *trans;
+ int ret;
+
+ list_for_each_entry_safe(rci, tmp, chunks, list) {
+ rci->bg = btrfs_lookup_block_group(fs_info, rci->offset);
+ if (!rci->bg) {
+ list_del(&rci->list);
+ kfree(rci);
+ continue;
+ }
+
+ ret = btrfs_inc_block_group_ro(rci->bg, false);
+ if (ret)
+ goto end;
+
+ rci->made_ro = true;
+ }
+
+ if (list_empty(chunks))
+ return 0;
+
+ trans = btrfs_start_transaction(fs_info->remap_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto end;
+ }
+
+ mutex_lock(&fs_info->remap_mutex);
+
+ ret = cow_remap_tree(trans, path);
+
+ btrfs_release_path(path);
+
+ mutex_unlock(&fs_info->remap_mutex);
+
+ btrfs_commit_transaction(trans);
+
+end:
+ while (!list_empty(chunks)) {
+ bool unused;
+
+ rci = list_first_entry(chunks, struct remap_chunk_info, list);
+
+ spin_lock(&rci->bg->lock);
+ unused = !btrfs_is_block_group_used(rci->bg);
+ spin_unlock(&rci->bg->lock);
+
+ if (unused)
+ btrfs_mark_bg_unused(rci->bg);
+
+ if (rci->made_ro)
+ btrfs_dec_block_group_ro(rci->bg);
+
+ btrfs_put_block_group(rci->bg);
+
+ list_del(&rci->list);
+ kfree(rci);
+ }
+
+ return ret;
+}
+
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
@@ -4096,6 +4206,9 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u32 count_meta = 0;
u32 count_sys = 0;
int chunk_reserved = 0;
+ struct remap_chunk_info *rci;
+ unsigned int num_remap_chunks = 0;
+ LIST_HEAD(remap_chunks);
path = btrfs_alloc_path();
if (!path) {
@@ -4194,7 +4307,8 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
count_data++;
else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
count_sys++;
- else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ else if (chunk_type & (BTRFS_BLOCK_GROUP_METADATA |
+ BTRFS_BLOCK_GROUP_REMAP))
count_meta++;
goto loop;
@@ -4214,6 +4328,30 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
goto loop;
}
+ /*
+ * Balancing REMAP chunks takes place separately - add the
+ * details to a list so it can be processed later.
+ */
+ if (chunk_type & BTRFS_BLOCK_GROUP_REMAP) {
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+
+ rci = kmalloc(sizeof(struct remap_chunk_info),
+ GFP_NOFS);
+ if (!rci) {
+ ret = -ENOMEM;
+ goto error;
+ }
+
+ rci->offset = found_key.offset;
+ rci->bg = NULL;
+ rci->made_ro = false;
+ list_add_tail(&rci->list, &remap_chunks);
+
+ num_remap_chunks++;
+
+ goto loop;
+ }
+
if (!chunk_reserved) {
/*
* We may be relocating the only data chunk we have,
@@ -4253,11 +4391,24 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
key.offset = found_key.offset - 1;
}
+ btrfs_release_path(path);
+
if (counting) {
- btrfs_release_path(path);
counting = false;
goto again;
}
+
+ if (!list_empty(&remap_chunks)) {
+ ret = balance_remap_chunks(fs_info, path, &remap_chunks);
+ if (ret == -ENOSPC)
+ enospc_errors++;
+
+ if (!ret) {
+ spin_lock(&fs_info->balance_lock);
+ bctl->stat.completed += num_remap_chunks;
+ spin_unlock(&fs_info->balance_lock);
+ }
+ }
error:
btrfs_free_path(path);
if (enospc_errors) {
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (13 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 14/16] btrfs: allow balancing remap tree Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
2025-10-27 16:04 ` kernel test robot
2025-10-31 22:11 ` Boris Burkov
2025-10-24 18:12 ` [PATCH v4 16/16] btrfs: add stripe removal pending flag Mark Harmstone
15 siblings, 2 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
Discard normally works by iterating over the free-space entries of a
block group. This doesn't work for fully-remapped block groups, as we
removed their free-space entries when we started relocation.
For sync discard, call btrfs_discard_extent() when we commit the
transaction in which the last identity remap was removed.
For async discard, add a new function btrfs_trim_fully_remapped_block_group()
to be called by the discard worker, which iterates over the block
group's range using the normal async discard rules. Once we reach the
end, remove the chunk's stripes and device extents to get back its free
space.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/block-group.c | 2 ++
fs/btrfs/block-group.h | 1 +
fs/btrfs/discard.c | 57 ++++++++++++++++++++++++++----
fs/btrfs/extent-tree.c | 10 ++++++
fs/btrfs/free-space-cache.c | 70 +++++++++++++++++++++++++++++++++++++
fs/btrfs/free-space-cache.h | 1 +
6 files changed, 134 insertions(+), 7 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 8feddb472882..0c91553b02cf 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -4833,4 +4833,6 @@ void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
spin_unlock(&fs_info->unused_bgs_lock);
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&fs_info->discard_ctl, bg);
}
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 4522074a45c2..b0b16efea19a 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -49,6 +49,7 @@ enum btrfs_discard_state {
BTRFS_DISCARD_EXTENTS,
BTRFS_DISCARD_BITMAPS,
BTRFS_DISCARD_RESET_CURSOR,
+ BTRFS_DISCARD_FULLY_REMAPPED,
};
/*
diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
index ee5f5b2788e1..f9890037395a 100644
--- a/fs/btrfs/discard.c
+++ b/fs/btrfs/discard.c
@@ -215,6 +215,27 @@ static struct btrfs_block_group *find_next_block_group(
return ret_block_group;
}
+/*
+ * Returns whether a block group is empty.
+ *
+ * @block_group: block_group of interest
+ *
+ * "Empty" here means that there are no extents physically located within the
+ * device extents corresponding to this block group.
+ *
+ * For a remapped block group, this means that all of its identity remaps have
+ * been removed. For a non-remapped block group, this means that no extents
+ * have an address within its range, and that nothing has been remapped to be
+ * within it.
+ */
+static bool block_group_is_empty(struct btrfs_block_group *block_group)
+{
+ if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)
+ return block_group->identity_remap_count == 0;
+ else
+ return block_group->used == 0 && block_group->remap_bytes == 0;
+}
+
/*
* Look up next block group and set it for use.
*
@@ -241,8 +262,10 @@ static struct btrfs_block_group *peek_discard_list(
block_group = find_next_block_group(discard_ctl, now);
if (block_group && now >= block_group->discard_eligible_time) {
+ bool empty = block_group_is_empty(block_group);
+
if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
- block_group->used != 0) {
+ !empty) {
if (btrfs_is_block_group_data_only(block_group)) {
__add_to_discard_list(discard_ctl, block_group);
/*
@@ -267,7 +290,15 @@ static struct btrfs_block_group *peek_discard_list(
}
if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
block_group->discard_cursor = block_group->start;
- block_group->discard_state = BTRFS_DISCARD_EXTENTS;
+
+ if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
+ empty) {
+ block_group->discard_state =
+ BTRFS_DISCARD_FULLY_REMAPPED;
+ } else {
+ block_group->discard_state =
+ BTRFS_DISCARD_EXTENTS;
+ }
}
}
if (block_group) {
@@ -373,7 +404,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
return;
- if (block_group->used == 0 && block_group->remap_bytes == 0)
+ if (block_group_is_empty(block_group))
add_to_discard_unused_list(discard_ctl, block_group);
else
add_to_discard_list(discard_ctl, block_group);
@@ -470,7 +501,7 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
{
remove_from_discard_list(discard_ctl, block_group);
- if (block_group->used == 0) {
+ if (block_group_is_empty(block_group)) {
if (btrfs_is_free_space_trimmed(block_group))
btrfs_mark_bg_unused(block_group);
else
@@ -524,7 +555,8 @@ static void btrfs_discard_workfn(struct work_struct *work)
/* Perform discarding */
minlen = discard_minlen[discard_index];
- if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ switch (discard_state) {
+ case BTRFS_DISCARD_BITMAPS: {
u64 maxlen = 0;
/*
@@ -541,17 +573,28 @@ static void btrfs_discard_workfn(struct work_struct *work)
btrfs_block_group_end(block_group),
minlen, maxlen, true);
discard_ctl->discard_bitmap_bytes += trimmed;
- } else {
+
+ break;
+ }
+
+ case BTRFS_DISCARD_FULLY_REMAPPED:
+ btrfs_trim_fully_remapped_block_group(block_group);
+ break;
+
+ default:
btrfs_trim_block_group_extents(block_group, &trimmed,
block_group->discard_cursor,
btrfs_block_group_end(block_group),
minlen, true);
discard_ctl->discard_extent_bytes += trimmed;
+
+ break;
}
/* Determine next steps for a block_group */
if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
- if (discard_state == BTRFS_DISCARD_BITMAPS) {
+ if (discard_state == BTRFS_DISCARD_BITMAPS ||
+ discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
btrfs_finish_discard_pass(discard_ctl, block_group);
} else {
block_group->discard_cursor = block_group->start;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 82dc88915b7e..82d102a157e9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2860,6 +2860,12 @@ int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
list_for_each_entry_safe(block_group, tmp, fully_remapped_bgs, bg_list) {
struct btrfs_chunk_map *map;
+ btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
+
+ /* for async discard the below gets done in discard job */
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ continue;
+
map = btrfs_get_chunk_map(fs_info, block_group->start, 1);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -2870,6 +2876,10 @@ int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
return ret;
}
+ if (!TRANS_ABORTED(trans))
+ btrfs_discard_extent(fs_info, block_group->start,
+ block_group->length, NULL, false);
+
/*
* Set num_stripes to 0, so that btrfs_remove_dev_extents()
* won't run a second time.
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 91670d0af179..5d5e3401e723 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -29,6 +29,7 @@
#include "file-item.h"
#include "file.h"
#include "super.h"
+#include "relocation.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
@@ -3066,6 +3067,11 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
struct rb_node *node;
bool ret = true;
+ if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
+ block_group->identity_remap_count == 0) {
+ return true;
+ }
+
spin_lock(&ctl->tree_lock);
node = rb_first(&ctl->free_space_offset);
@@ -3830,6 +3836,70 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
return ret;
}
+void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
+ int ret = 0;
+ u64 bytes, trimmed;
+ const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
+ u64 end = btrfs_block_group_end(bg);
+ struct btrfs_trans_handle *trans;
+ struct btrfs_chunk_map *map;
+
+ bytes = end - bg->discard_cursor;
+
+ if (max_discard_size &&
+ bytes >= (max_discard_size +
+ BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
+ bytes = max_discard_size;
+ }
+
+ ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed,
+ false);
+ if (ret)
+ return;
+
+ bg->discard_cursor += trimmed;
+
+ if (bg->discard_cursor < end)
+ return;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans))
+ return;
+
+ map = btrfs_get_chunk_map(fs_info, bg->start, 1);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
+ btrfs_abort_transaction(trans, ret);
+ return;
+ }
+
+ ret = btrfs_last_identity_remap_gone(trans, map, bg);
+ if (ret) {
+ btrfs_free_chunk_map(map);
+ btrfs_abort_transaction(trans, ret);
+ return;
+ }
+
+ btrfs_end_transaction(trans);
+
+ /*
+ * Set num_stripes to 0, so that btrfs_remove_dev_extents()
+ * won't run a second time.
+ */
+ map->num_stripes = 0;
+
+ btrfs_free_chunk_map(map);
+
+ if (bg->used == 0) {
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ }
+}
+
/*
* If we break out of trimming a bitmap prematurely, we should reset the
* trimming bit. In a rather contrived case, it's possible to race here so
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 9f1dbfdee8ca..33fc3b245648 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -166,6 +166,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
u64 *trimmed, u64 start, u64 end, u64 minlen,
u64 maxlen, bool async);
+void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg);
bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread* Re: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
2025-10-24 18:12 ` [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups Mark Harmstone
@ 2025-10-27 16:04 ` kernel test robot
2025-10-31 22:12 ` Boris Burkov
2025-10-31 22:11 ` Boris Burkov
1 sibling, 1 reply; 42+ messages in thread
From: kernel test robot @ 2025-10-27 16:04 UTC (permalink / raw)
To: Mark Harmstone, linux-btrfs; +Cc: oe-kbuild-all, Mark Harmstone
Hi Mark,
kernel test robot noticed the following build warnings:
[auto build test WARNING on kdave/for-next]
[also build test WARNING on next-20251027]
[cannot apply to linus/master v6.18-rc3]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Mark-Harmstone/btrfs-add-definitions-and-constants-for-remap-tree/20251025-021910
base: https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
patch link: https://lore.kernel.org/r/20251024181227.32228-16-mark%40harmstone.com
patch subject: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
config: arm-randconfig-003-20251027 (https://download.01.org/0day-ci/archive/20251027/202510272322.N1S5rdDc-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 8.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251027/202510272322.N1S5rdDc-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202510272322.N1S5rdDc-lkp@intel.com/
Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
All warnings (new ones prefixed by >>):
fs/btrfs/discard.c: In function 'btrfs_discard_workfn':
>> fs/btrfs/discard.c:596:6: warning: 'discard_state' may be used uninitialized in this function [-Wmaybe-uninitialized]
if (discard_state == BTRFS_DISCARD_BITMAPS ||
^
vim +/discard_state +596 fs/btrfs/discard.c
513
514 /*
515 * Discard work queue callback
516 *
517 * @work: work
518 *
519 * Find the next block_group to start discarding and then discard a single
520 * region. It does this in a two-pass fashion: first extents and second
521 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
522 */
523 static void btrfs_discard_workfn(struct work_struct *work)
524 {
525 struct btrfs_discard_ctl *discard_ctl;
526 struct btrfs_block_group *block_group;
527 enum btrfs_discard_state discard_state;
528 int discard_index = 0;
529 u64 trimmed = 0;
530 u64 minlen = 0;
531 u64 now = ktime_get_ns();
532
533 discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
534
535 block_group = peek_discard_list(discard_ctl, &discard_state,
536 &discard_index, now);
537 if (!block_group)
538 return;
539 if (!btrfs_run_discard_work(discard_ctl)) {
540 spin_lock(&discard_ctl->lock);
541 btrfs_put_block_group(block_group);
542 discard_ctl->block_group = NULL;
543 spin_unlock(&discard_ctl->lock);
544 return;
545 }
546 if (now < block_group->discard_eligible_time) {
547 spin_lock(&discard_ctl->lock);
548 btrfs_put_block_group(block_group);
549 discard_ctl->block_group = NULL;
550 spin_unlock(&discard_ctl->lock);
551 btrfs_discard_schedule_work(discard_ctl, false);
552 return;
553 }
554
555 /* Perform discarding */
556 minlen = discard_minlen[discard_index];
557
558 switch (discard_state) {
559 case BTRFS_DISCARD_BITMAPS: {
560 u64 maxlen = 0;
561
562 /*
563 * Use the previous levels minimum discard length as the max
564 * length filter. In the case something is added to make a
565 * region go beyond the max filter, the entire bitmap is set
566 * back to BTRFS_TRIM_STATE_UNTRIMMED.
567 */
568 if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
569 maxlen = discard_minlen[discard_index - 1];
570
571 btrfs_trim_block_group_bitmaps(block_group, &trimmed,
572 block_group->discard_cursor,
573 btrfs_block_group_end(block_group),
574 minlen, maxlen, true);
575 discard_ctl->discard_bitmap_bytes += trimmed;
576
577 break;
578 }
579
580 case BTRFS_DISCARD_FULLY_REMAPPED:
581 btrfs_trim_fully_remapped_block_group(block_group);
582 break;
583
584 default:
585 btrfs_trim_block_group_extents(block_group, &trimmed,
586 block_group->discard_cursor,
587 btrfs_block_group_end(block_group),
588 minlen, true);
589 discard_ctl->discard_extent_bytes += trimmed;
590
591 break;
592 }
593
594 /* Determine next steps for a block_group */
595 if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
> 596 if (discard_state == BTRFS_DISCARD_BITMAPS ||
597 discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
598 btrfs_finish_discard_pass(discard_ctl, block_group);
599 } else {
600 block_group->discard_cursor = block_group->start;
601 spin_lock(&discard_ctl->lock);
602 if (block_group->discard_state !=
603 BTRFS_DISCARD_RESET_CURSOR)
604 block_group->discard_state =
605 BTRFS_DISCARD_BITMAPS;
606 spin_unlock(&discard_ctl->lock);
607 }
608 }
609
610 now = ktime_get_ns();
611 spin_lock(&discard_ctl->lock);
612 discard_ctl->prev_discard = trimmed;
613 discard_ctl->prev_discard_time = now;
614 btrfs_put_block_group(block_group);
615 discard_ctl->block_group = NULL;
616 __btrfs_discard_schedule_work(discard_ctl, now, false);
617 spin_unlock(&discard_ctl->lock);
618 }
619
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
2025-10-27 16:04 ` kernel test robot
@ 2025-10-31 22:12 ` Boris Burkov
2025-11-03 16:49 ` Mark Harmstone
0 siblings, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 22:12 UTC (permalink / raw)
To: kernel test robot; +Cc: Mark Harmstone, linux-btrfs, oe-kbuild-all
On Tue, Oct 28, 2025 at 12:04:11AM +0800, kernel test robot wrote:
> Hi Mark,
>
> kernel test robot noticed the following build warnings:
>
> [auto build test WARNING on kdave/for-next]
> [also build test WARNING on next-20251027]
> [cannot apply to linus/master v6.18-rc3]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Mark-Harmstone/btrfs-add-definitions-and-constants-for-remap-tree/20251025-021910
> base: https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
> patch link: https://lore.kernel.org/r/20251024181227.32228-16-mark%40harmstone.com
> patch subject: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
> config: arm-randconfig-003-20251027 (https://download.01.org/0day-ci/archive/20251027/202510272322.N1S5rdDc-lkp@intel.com/config)
> compiler: arm-linux-gnueabi-gcc (GCC) 8.5.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251027/202510272322.N1S5rdDc-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202510272322.N1S5rdDc-lkp@intel.com/
>
> Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
> http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
>
> All warnings (new ones prefixed by >>):
>
> fs/btrfs/discard.c: In function 'btrfs_discard_workfn':
> >> fs/btrfs/discard.c:596:6: warning: 'discard_state' may be used uninitialized in this function [-Wmaybe-uninitialized]
> if (discard_state == BTRFS_DISCARD_BITMAPS ||
> ^
I think this gets set by peek_discard_list() so I don't think this is
a valid warning.
>
>
> vim +/discard_state +596 fs/btrfs/discard.c
>
> 513
> 514 /*
> 515 * Discard work queue callback
> 516 *
> 517 * @work: work
> 518 *
> 519 * Find the next block_group to start discarding and then discard a single
> 520 * region. It does this in a two-pass fashion: first extents and second
> 521 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
> 522 */
> 523 static void btrfs_discard_workfn(struct work_struct *work)
> 524 {
> 525 struct btrfs_discard_ctl *discard_ctl;
> 526 struct btrfs_block_group *block_group;
> 527 enum btrfs_discard_state discard_state;
> 528 int discard_index = 0;
> 529 u64 trimmed = 0;
> 530 u64 minlen = 0;
> 531 u64 now = ktime_get_ns();
> 532
> 533 discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
> 534
> 535 block_group = peek_discard_list(discard_ctl, &discard_state,
> 536 &discard_index, now);
> 537 if (!block_group)
> 538 return;
> 539 if (!btrfs_run_discard_work(discard_ctl)) {
> 540 spin_lock(&discard_ctl->lock);
> 541 btrfs_put_block_group(block_group);
> 542 discard_ctl->block_group = NULL;
> 543 spin_unlock(&discard_ctl->lock);
> 544 return;
> 545 }
> 546 if (now < block_group->discard_eligible_time) {
> 547 spin_lock(&discard_ctl->lock);
> 548 btrfs_put_block_group(block_group);
> 549 discard_ctl->block_group = NULL;
> 550 spin_unlock(&discard_ctl->lock);
> 551 btrfs_discard_schedule_work(discard_ctl, false);
> 552 return;
> 553 }
> 554
> 555 /* Perform discarding */
> 556 minlen = discard_minlen[discard_index];
> 557
> 558 switch (discard_state) {
> 559 case BTRFS_DISCARD_BITMAPS: {
> 560 u64 maxlen = 0;
> 561
> 562 /*
> 563 * Use the previous levels minimum discard length as the max
> 564 * length filter. In the case something is added to make a
> 565 * region go beyond the max filter, the entire bitmap is set
> 566 * back to BTRFS_TRIM_STATE_UNTRIMMED.
> 567 */
> 568 if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
> 569 maxlen = discard_minlen[discard_index - 1];
> 570
> 571 btrfs_trim_block_group_bitmaps(block_group, &trimmed,
> 572 block_group->discard_cursor,
> 573 btrfs_block_group_end(block_group),
> 574 minlen, maxlen, true);
> 575 discard_ctl->discard_bitmap_bytes += trimmed;
> 576
> 577 break;
> 578 }
> 579
> 580 case BTRFS_DISCARD_FULLY_REMAPPED:
> 581 btrfs_trim_fully_remapped_block_group(block_group);
> 582 break;
> 583
> 584 default:
> 585 btrfs_trim_block_group_extents(block_group, &trimmed,
> 586 block_group->discard_cursor,
> 587 btrfs_block_group_end(block_group),
> 588 minlen, true);
> 589 discard_ctl->discard_extent_bytes += trimmed;
> 590
> 591 break;
> 592 }
> 593
> 594 /* Determine next steps for a block_group */
> 595 if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
> > 596 if (discard_state == BTRFS_DISCARD_BITMAPS ||
> 597 discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
> 598 btrfs_finish_discard_pass(discard_ctl, block_group);
> 599 } else {
> 600 block_group->discard_cursor = block_group->start;
> 601 spin_lock(&discard_ctl->lock);
> 602 if (block_group->discard_state !=
> 603 BTRFS_DISCARD_RESET_CURSOR)
> 604 block_group->discard_state =
> 605 BTRFS_DISCARD_BITMAPS;
> 606 spin_unlock(&discard_ctl->lock);
> 607 }
> 608 }
> 609
> 610 now = ktime_get_ns();
> 611 spin_lock(&discard_ctl->lock);
> 612 discard_ctl->prev_discard = trimmed;
> 613 discard_ctl->prev_discard_time = now;
> 614 btrfs_put_block_group(block_group);
> 615 discard_ctl->block_group = NULL;
> 616 __btrfs_discard_schedule_work(discard_ctl, now, false);
> 617 spin_unlock(&discard_ctl->lock);
> 618 }
> 619
>
> --
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
2025-10-31 22:12 ` Boris Burkov
@ 2025-11-03 16:49 ` Mark Harmstone
2025-11-09 8:42 ` Philip Li
0 siblings, 1 reply; 42+ messages in thread
From: Mark Harmstone @ 2025-11-03 16:49 UTC (permalink / raw)
To: Boris Burkov, kernel test robot; +Cc: linux-btrfs, oe-kbuild-all
On 31/10/2025 10.12 pm, Boris Burkov wrote:
> On Tue, Oct 28, 2025 at 12:04:11AM +0800, kernel test robot wrote:
>> Hi Mark,
>>
>> kernel test robot noticed the following build warnings:
>>
>> [auto build test WARNING on kdave/for-next]
>> [also build test WARNING on next-20251027]
>> [cannot apply to linus/master v6.18-rc3]
>> [If your patch is applied to the wrong git tree, kindly drop us a note.
>> And when submitting patch, we suggest to use '--base' as documented in
>> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>>
>> url: https://github.com/intel-lab-lkp/linux/commits/Mark-Harmstone/btrfs-add-definitions-and-constants-for-remap-tree/20251025-021910
>> base: https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
>> patch link: https://lore.kernel.org/r/20251024181227.32228-16-mark%40harmstone.com
>> patch subject: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
>> config: arm-randconfig-003-20251027 (https://download.01.org/0day-ci/archive/20251027/202510272322.N1S5rdDc-lkp@intel.com/config)
>> compiler: arm-linux-gnueabi-gcc (GCC) 8.5.0
>> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251027/202510272322.N1S5rdDc-lkp@intel.com/reproduce)
>>
>> If you fix the issue in a separate patch/commit (i.e. not just a new version of
>> the same patch/commit), kindly add following tags
>> | Reported-by: kernel test robot <lkp@intel.com>
>> | Closes: https://lore.kernel.org/oe-kbuild-all/202510272322.N1S5rdDc-lkp@intel.com/
>>
>> Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
>> http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
>>
>> All warnings (new ones prefixed by >>):
>>
>> fs/btrfs/discard.c: In function 'btrfs_discard_workfn':
>>>> fs/btrfs/discard.c:596:6: warning: 'discard_state' may be used uninitialized in this function [-Wmaybe-uninitialized]
>> if (discard_state == BTRFS_DISCARD_BITMAPS ||
>> ^
>
> I think this gets set by peek_discard_list() so I don't think this is
> a valid warning.
You are correct. discard_state gets initialized if the return value of
peek_discard_list() is not NULL, and if it is NULL we return before we
use it.
This is an ancient version of GCC; the warning doesn't trigger on GCC 15
- presumably it has better control flow analysis. I don't think the
robot should be compiling with this warning turned on for old compiler
versions if it's prone to false positives.
>>
>>
>> vim +/discard_state +596 fs/btrfs/discard.c
>>
>> 513
>> 514 /*
>> 515 * Discard work queue callback
>> 516 *
>> 517 * @work: work
>> 518 *
>> 519 * Find the next block_group to start discarding and then discard a single
>> 520 * region. It does this in a two-pass fashion: first extents and second
>> 521 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
>> 522 */
>> 523 static void btrfs_discard_workfn(struct work_struct *work)
>> 524 {
>> 525 struct btrfs_discard_ctl *discard_ctl;
>> 526 struct btrfs_block_group *block_group;
>> 527 enum btrfs_discard_state discard_state;
>> 528 int discard_index = 0;
>> 529 u64 trimmed = 0;
>> 530 u64 minlen = 0;
>> 531 u64 now = ktime_get_ns();
>> 532
>> 533 discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
>> 534
>> 535 block_group = peek_discard_list(discard_ctl, &discard_state,
>> 536 &discard_index, now);
>> 537 if (!block_group)
>> 538 return;
>> 539 if (!btrfs_run_discard_work(discard_ctl)) {
>> 540 spin_lock(&discard_ctl->lock);
>> 541 btrfs_put_block_group(block_group);
>> 542 discard_ctl->block_group = NULL;
>> 543 spin_unlock(&discard_ctl->lock);
>> 544 return;
>> 545 }
>> 546 if (now < block_group->discard_eligible_time) {
>> 547 spin_lock(&discard_ctl->lock);
>> 548 btrfs_put_block_group(block_group);
>> 549 discard_ctl->block_group = NULL;
>> 550 spin_unlock(&discard_ctl->lock);
>> 551 btrfs_discard_schedule_work(discard_ctl, false);
>> 552 return;
>> 553 }
>> 554
>> 555 /* Perform discarding */
>> 556 minlen = discard_minlen[discard_index];
>> 557
>> 558 switch (discard_state) {
>> 559 case BTRFS_DISCARD_BITMAPS: {
>> 560 u64 maxlen = 0;
>> 561
>> 562 /*
>> 563 * Use the previous levels minimum discard length as the max
>> 564 * length filter. In the case something is added to make a
>> 565 * region go beyond the max filter, the entire bitmap is set
>> 566 * back to BTRFS_TRIM_STATE_UNTRIMMED.
>> 567 */
>> 568 if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
>> 569 maxlen = discard_minlen[discard_index - 1];
>> 570
>> 571 btrfs_trim_block_group_bitmaps(block_group, &trimmed,
>> 572 block_group->discard_cursor,
>> 573 btrfs_block_group_end(block_group),
>> 574 minlen, maxlen, true);
>> 575 discard_ctl->discard_bitmap_bytes += trimmed;
>> 576
>> 577 break;
>> 578 }
>> 579
>> 580 case BTRFS_DISCARD_FULLY_REMAPPED:
>> 581 btrfs_trim_fully_remapped_block_group(block_group);
>> 582 break;
>> 583
>> 584 default:
>> 585 btrfs_trim_block_group_extents(block_group, &trimmed,
>> 586 block_group->discard_cursor,
>> 587 btrfs_block_group_end(block_group),
>> 588 minlen, true);
>> 589 discard_ctl->discard_extent_bytes += trimmed;
>> 590
>> 591 break;
>> 592 }
>> 593
>> 594 /* Determine next steps for a block_group */
>> 595 if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
>> > 596 if (discard_state == BTRFS_DISCARD_BITMAPS ||
>> 597 discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
>> 598 btrfs_finish_discard_pass(discard_ctl, block_group);
>> 599 } else {
>> 600 block_group->discard_cursor = block_group->start;
>> 601 spin_lock(&discard_ctl->lock);
>> 602 if (block_group->discard_state !=
>> 603 BTRFS_DISCARD_RESET_CURSOR)
>> 604 block_group->discard_state =
>> 605 BTRFS_DISCARD_BITMAPS;
>> 606 spin_unlock(&discard_ctl->lock);
>> 607 }
>> 608 }
>> 609
>> 610 now = ktime_get_ns();
>> 611 spin_lock(&discard_ctl->lock);
>> 612 discard_ctl->prev_discard = trimmed;
>> 613 discard_ctl->prev_discard_time = now;
>> 614 btrfs_put_block_group(block_group);
>> 615 discard_ctl->block_group = NULL;
>> 616 __btrfs_discard_schedule_work(discard_ctl, now, false);
>> 617 spin_unlock(&discard_ctl->lock);
>> 618 }
>> 619
>>
>> --
>> 0-DAY CI Kernel Test Service
>> https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
2025-11-03 16:49 ` Mark Harmstone
@ 2025-11-09 8:42 ` Philip Li
0 siblings, 0 replies; 42+ messages in thread
From: Philip Li @ 2025-11-09 8:42 UTC (permalink / raw)
To: Mark Harmstone
Cc: Boris Burkov, kernel test robot, linux-btrfs, oe-kbuild-all
On Mon, Nov 03, 2025 at 04:49:26PM +0000, Mark Harmstone wrote:
> On 31/10/2025 10.12 pm, Boris Burkov wrote:
> > On Tue, Oct 28, 2025 at 12:04:11AM +0800, kernel test robot wrote:
> > > Hi Mark,
> > >
> > > kernel test robot noticed the following build warnings:
> > >
> > > [auto build test WARNING on kdave/for-next]
> > > [also build test WARNING on next-20251027]
> > > [cannot apply to linus/master v6.18-rc3]
> > > [If your patch is applied to the wrong git tree, kindly drop us a note.
> > > And when submitting patch, we suggest to use '--base' as documented in
> > > https://git-scm.com/docs/git-format-patch#_base_tree_information]
> > >
> > > url: https://github.com/intel-lab-lkp/linux/commits/Mark-Harmstone/btrfs-add-definitions-and-constants-for-remap-tree/20251025-021910
> > > base: https://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux.git for-next
> > > patch link: https://lore.kernel.org/r/20251024181227.32228-16-mark%40harmstone.com
> > > patch subject: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
> > > config: arm-randconfig-003-20251027 (https://download.01.org/0day-ci/archive/20251027/202510272322.N1S5rdDc-lkp@intel.com/config)
> > > compiler: arm-linux-gnueabi-gcc (GCC) 8.5.0
> > > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251027/202510272322.N1S5rdDc-lkp@intel.com/reproduce)
> > >
> > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > the same patch/commit), kindly add following tags
> > > | Reported-by: kernel test robot <lkp@intel.com>
> > > | Closes: https://lore.kernel.org/oe-kbuild-all/202510272322.N1S5rdDc-lkp@intel.com/
> > >
> > > Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
> > > http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
> > >
> > > All warnings (new ones prefixed by >>):
> > >
> > > fs/btrfs/discard.c: In function 'btrfs_discard_workfn':
> > > > > fs/btrfs/discard.c:596:6: warning: 'discard_state' may be used uninitialized in this function [-Wmaybe-uninitialized]
> > > if (discard_state == BTRFS_DISCARD_BITMAPS ||
> > > ^
> >
> > I think this gets set by peek_discard_list() so I don't think this is
> > a valid warning.
>
> You are correct. discard_state gets initialized if the return value of
> peek_discard_list() is not NULL, and if it is NULL we return before we use
> it.
>
> This is an ancient version of GCC, the warning doesn't trigger on GCC 15 -
> presumably it has better control flow analysis. I don't think the robot
> should be compiling with this warning turned on for old compiler versions,
> if it's prone to false positives.
Thanks for the suggestion; I will update the bot to avoid sending this
out directly. Sorry for the false positive.
>
> > >
> > >
> > > vim +/discard_state +596 fs/btrfs/discard.c
> > >
> > > 513
> > > 514 /*
> > > 515 * Discard work queue callback
> > > 516 *
> > > 517 * @work: work
> > > 518 *
> > > 519 * Find the next block_group to start discarding and then discard a single
> > > 520 * region. It does this in a two-pass fashion: first extents and second
> > > 521 * bitmaps. Completely discarded block groups are sent to the unused_bgs path.
> > > 522 */
> > > 523 static void btrfs_discard_workfn(struct work_struct *work)
> > > 524 {
> > > 525 struct btrfs_discard_ctl *discard_ctl;
> > > 526 struct btrfs_block_group *block_group;
> > > 527 enum btrfs_discard_state discard_state;
> > > 528 int discard_index = 0;
> > > 529 u64 trimmed = 0;
> > > 530 u64 minlen = 0;
> > > 531 u64 now = ktime_get_ns();
> > > 532
> > > 533 discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
> > > 534
> > > 535 block_group = peek_discard_list(discard_ctl, &discard_state,
> > > 536 &discard_index, now);
> > > 537 if (!block_group)
> > > 538 return;
> > > 539 if (!btrfs_run_discard_work(discard_ctl)) {
> > > 540 spin_lock(&discard_ctl->lock);
> > > 541 btrfs_put_block_group(block_group);
> > > 542 discard_ctl->block_group = NULL;
> > > 543 spin_unlock(&discard_ctl->lock);
> > > 544 return;
> > > 545 }
> > > 546 if (now < block_group->discard_eligible_time) {
> > > 547 spin_lock(&discard_ctl->lock);
> > > 548 btrfs_put_block_group(block_group);
> > > 549 discard_ctl->block_group = NULL;
> > > 550 spin_unlock(&discard_ctl->lock);
> > > 551 btrfs_discard_schedule_work(discard_ctl, false);
> > > 552 return;
> > > 553 }
> > > 554
> > > 555 /* Perform discarding */
> > > 556 minlen = discard_minlen[discard_index];
> > > 557
> > > 558 switch (discard_state) {
> > > 559 case BTRFS_DISCARD_BITMAPS: {
> > > 560 u64 maxlen = 0;
> > > 561
> > > 562 /*
> > > 563 * Use the previous levels minimum discard length as the max
> > > 564 * length filter. In the case something is added to make a
> > > 565 * region go beyond the max filter, the entire bitmap is set
> > > 566 * back to BTRFS_TRIM_STATE_UNTRIMMED.
> > > 567 */
> > > 568 if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
> > > 569 maxlen = discard_minlen[discard_index - 1];
> > > 570
> > > 571 btrfs_trim_block_group_bitmaps(block_group, &trimmed,
> > > 572 block_group->discard_cursor,
> > > 573 btrfs_block_group_end(block_group),
> > > 574 minlen, maxlen, true);
> > > 575 discard_ctl->discard_bitmap_bytes += trimmed;
> > > 576
> > > 577 break;
> > > 578 }
> > > 579
> > > 580 case BTRFS_DISCARD_FULLY_REMAPPED:
> > > 581 btrfs_trim_fully_remapped_block_group(block_group);
> > > 582 break;
> > > 583
> > > 584 default:
> > > 585 btrfs_trim_block_group_extents(block_group, &trimmed,
> > > 586 block_group->discard_cursor,
> > > 587 btrfs_block_group_end(block_group),
> > > 588 minlen, true);
> > > 589 discard_ctl->discard_extent_bytes += trimmed;
> > > 590
> > > 591 break;
> > > 592 }
> > > 593
> > > 594 /* Determine next steps for a block_group */
> > > 595 if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
> > > > 596 if (discard_state == BTRFS_DISCARD_BITMAPS ||
> > > 597 discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
> > > 598 btrfs_finish_discard_pass(discard_ctl, block_group);
> > > 599 } else {
> > > 600 block_group->discard_cursor = block_group->start;
> > > 601 spin_lock(&discard_ctl->lock);
> > > 602 if (block_group->discard_state !=
> > > 603 BTRFS_DISCARD_RESET_CURSOR)
> > > 604 block_group->discard_state =
> > > 605 BTRFS_DISCARD_BITMAPS;
> > > 606 spin_unlock(&discard_ctl->lock);
> > > 607 }
> > > 608 }
> > > 609
> > > 610 now = ktime_get_ns();
> > > 611 spin_lock(&discard_ctl->lock);
> > > 612 discard_ctl->prev_discard = trimmed;
> > > 613 discard_ctl->prev_discard_time = now;
> > > 614 btrfs_put_block_group(block_group);
> > > 615 discard_ctl->block_group = NULL;
> > > 616 __btrfs_discard_schedule_work(discard_ctl, now, false);
> > > 617 spin_unlock(&discard_ctl->lock);
> > > 618 }
> > > 619
> > >
> > > --
> > > 0-DAY CI Kernel Test Service
> > > https://github.com/intel/lkp-tests/wiki
>
>
^ permalink raw reply [flat|nested] 42+ messages in thread
* Re: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
2025-10-24 18:12 ` [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups Mark Harmstone
2025-10-27 16:04 ` kernel test robot
@ 2025-10-31 22:11 ` Boris Burkov
2025-11-03 17:01 ` Mark Harmstone
1 sibling, 1 reply; 42+ messages in thread
From: Boris Burkov @ 2025-10-31 22:11 UTC (permalink / raw)
To: Mark Harmstone; +Cc: linux-btrfs
On Fri, Oct 24, 2025 at 07:12:16PM +0100, Mark Harmstone wrote:
> Discard normally works by iterating over the free-space entries of a
> block group. This doesn't work for fully-remapped block groups, as we
> removed their free-space entries when we started relocation.
>
> For sync discard, call btrfs_discard_extent() when we commit the
> transaction in which the last identity remap was removed.
>
> For async discard, add a new function btrfs_trim_fully_remapped_block_group()
> to be called by the discard worker, which iterates over the block
> group's range using the normal async discard rules. Once we reach the
> end, remove the chunk's stripes and device extents to get back its free
> space.
>
> Signed-off-by: Mark Harmstone <mark@harmstone.com>
> ---
> fs/btrfs/block-group.c | 2 ++
> fs/btrfs/block-group.h | 1 +
> fs/btrfs/discard.c | 57 ++++++++++++++++++++++++++----
> fs/btrfs/extent-tree.c | 10 ++++++
> fs/btrfs/free-space-cache.c | 70 +++++++++++++++++++++++++++++++++++++
> fs/btrfs/free-space-cache.h | 1 +
> 6 files changed, 134 insertions(+), 7 deletions(-)
>
> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
> index 8feddb472882..0c91553b02cf 100644
> --- a/fs/btrfs/block-group.c
> +++ b/fs/btrfs/block-group.c
> @@ -4833,4 +4833,6 @@ void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
>
> spin_unlock(&fs_info->unused_bgs_lock);
>
> + if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
> + btrfs_discard_queue_work(&fs_info->discard_ctl, bg);
> }
> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
> index 4522074a45c2..b0b16efea19a 100644
> --- a/fs/btrfs/block-group.h
> +++ b/fs/btrfs/block-group.h
> @@ -49,6 +49,7 @@ enum btrfs_discard_state {
> BTRFS_DISCARD_EXTENTS,
> BTRFS_DISCARD_BITMAPS,
> BTRFS_DISCARD_RESET_CURSOR,
> + BTRFS_DISCARD_FULLY_REMAPPED,
> };
>
> /*
> diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
> index ee5f5b2788e1..f9890037395a 100644
> --- a/fs/btrfs/discard.c
> +++ b/fs/btrfs/discard.c
> @@ -215,6 +215,27 @@ static struct btrfs_block_group *find_next_block_group(
> return ret_block_group;
> }
>
> +/*
> + * Returns whether a block group is empty.
> + *
> + * @block_group: block_group of interest
> + *
> + * "Empty" here means that there are no extents physically located within the
> + * device extents corresponding to this block group.
> + *
> + * For a remapped block group, this means that all of its identity remaps have
> + * been removed. For a non-remapped block group, this means that no extents
> + * have an address within its range, and that nothing has been remapped to be
> + * within it.
> + */
> +static bool block_group_is_empty(struct btrfs_block_group *block_group)
> +{
> + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)
> + return block_group->identity_remap_count == 0;
> + else
> + return block_group->used == 0 && block_group->remap_bytes == 0;
> +}
> +
> /*
> * Look up next block group and set it for use.
> *
> @@ -241,8 +262,10 @@ static struct btrfs_block_group *peek_discard_list(
> block_group = find_next_block_group(discard_ctl, now);
>
> if (block_group && now >= block_group->discard_eligible_time) {
> + bool empty = block_group_is_empty(block_group);
> +
> if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
> - block_group->used != 0) {
> + !empty) {
> if (btrfs_is_block_group_data_only(block_group)) {
> __add_to_discard_list(discard_ctl, block_group);
> /*
> @@ -267,7 +290,15 @@ static struct btrfs_block_group *peek_discard_list(
> }
> if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
> block_group->discard_cursor = block_group->start;
> - block_group->discard_state = BTRFS_DISCARD_EXTENTS;
> +
> + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
> + empty) {
> + block_group->discard_state =
> + BTRFS_DISCARD_FULLY_REMAPPED;
> + } else {
> + block_group->discard_state =
> + BTRFS_DISCARD_EXTENTS;
> + }
> }
> }
> if (block_group) {
> @@ -373,7 +404,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
> if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
> return;
>
> - if (block_group->used == 0 && block_group->remap_bytes == 0)
> + if (block_group_is_empty(block_group))
> add_to_discard_unused_list(discard_ctl, block_group);
> else
> add_to_discard_list(discard_ctl, block_group);
> @@ -470,7 +501,7 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
> {
> remove_from_discard_list(discard_ctl, block_group);
>
> - if (block_group->used == 0) {
> + if (block_group_is_empty(block_group)) {
> if (btrfs_is_free_space_trimmed(block_group))
> btrfs_mark_bg_unused(block_group);
> else
> @@ -524,7 +555,8 @@ static void btrfs_discard_workfn(struct work_struct *work)
> /* Perform discarding */
> minlen = discard_minlen[discard_index];
>
> - if (discard_state == BTRFS_DISCARD_BITMAPS) {
> + switch (discard_state) {
> + case BTRFS_DISCARD_BITMAPS: {
> u64 maxlen = 0;
>
> /*
> @@ -541,17 +573,28 @@ static void btrfs_discard_workfn(struct work_struct *work)
> btrfs_block_group_end(block_group),
> minlen, maxlen, true);
> discard_ctl->discard_bitmap_bytes += trimmed;
> - } else {
> +
> + break;
> + }
> +
> + case BTRFS_DISCARD_FULLY_REMAPPED:
> + btrfs_trim_fully_remapped_block_group(block_group);
> + break;
> +
> + default:
> btrfs_trim_block_group_extents(block_group, &trimmed,
> block_group->discard_cursor,
> btrfs_block_group_end(block_group),
> minlen, true);
> discard_ctl->discard_extent_bytes += trimmed;
> +
> + break;
> }
>
> /* Determine next steps for a block_group */
> if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
> - if (discard_state == BTRFS_DISCARD_BITMAPS) {
> + if (discard_state == BTRFS_DISCARD_BITMAPS ||
> + discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
> btrfs_finish_discard_pass(discard_ctl, block_group);
> } else {
> block_group->discard_cursor = block_group->start;
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 82dc88915b7e..82d102a157e9 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2860,6 +2860,12 @@ int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
> list_for_each_entry_safe(block_group, tmp, fully_remapped_bgs, bg_list) {
> struct btrfs_chunk_map *map;
>
> + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
Any reason to queue this when DISCARD_ASYNC isn't set?
i.e., put this in the if (btrfs_test_opt(..)) below?
> +
> + /* for async discard the below gets done in discard job */
> + if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
> + continue;
> +
> map = btrfs_get_chunk_map(fs_info, block_group->start, 1);
> if (IS_ERR(map))
> return PTR_ERR(map);
> @@ -2870,6 +2876,10 @@ int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
> return ret;
> }
>
> + if (!TRANS_ABORTED(trans))
> + btrfs_discard_extent(fs_info, block_group->start,
> + block_group->length, NULL, false);
> +
> /*
> * Set num_stripes to 0, so that btrfs_remove_dev_extents()
> * won't run a second time.
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 91670d0af179..5d5e3401e723 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -29,6 +29,7 @@
> #include "file-item.h"
> #include "file.h"
> #include "super.h"
> +#include "relocation.h"
>
> #define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
> #define MAX_CACHE_BYTES_PER_GIG SZ_64K
> @@ -3066,6 +3067,11 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
> struct rb_node *node;
> bool ret = true;
>
> + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
> + block_group->identity_remap_count == 0) {
> + return true;
> + }
> +
> spin_lock(&ctl->tree_lock);
> node = rb_first(&ctl->free_space_offset);
>
> @@ -3830,6 +3836,70 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
> return ret;
> }
>
> +void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg)
> +{
> + struct btrfs_fs_info *fs_info = bg->fs_info;
> + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
> + int ret = 0;
> + u64 bytes, trimmed;
> + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
> + u64 end = btrfs_block_group_end(bg);
> + struct btrfs_trans_handle *trans;
> + struct btrfs_chunk_map *map;
> +
> + bytes = end - bg->discard_cursor;
> +
> + if (max_discard_size &&
> + bytes >= (max_discard_size +
> + BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
> + bytes = max_discard_size;
> + }
> +
> + ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed,
> + false);
> + if (ret)
> + return;
> +
> + bg->discard_cursor += trimmed;
> +
> + if (bg->discard_cursor < end)
> + return;
> +
> + trans = btrfs_start_transaction(fs_info->tree_root, 0);
> + if (IS_ERR(trans))
> + return;
> +
> + map = btrfs_get_chunk_map(fs_info, bg->start, 1);
> + if (IS_ERR(map)) {
> + ret = PTR_ERR(map);
> + btrfs_abort_transaction(trans, ret);
> + return;
> + }
> +
> + ret = btrfs_last_identity_remap_gone(trans, map, bg);
> + if (ret) {
> + btrfs_free_chunk_map(map);
> + btrfs_abort_transaction(trans, ret);
> + return;
> + }
> +
> + btrfs_end_transaction(trans);
> +
> + /*
> + * Set num_stripes to 0, so that btrfs_remove_dev_extents()
> + * won't run a second time.
> + */
> + map->num_stripes = 0;
> +
> + btrfs_free_chunk_map(map);
> +
> + if (bg->used == 0) {
> + spin_lock(&fs_info->unused_bgs_lock);
> + list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
> + spin_unlock(&fs_info->unused_bgs_lock);
> + }
> +}
> +
> /*
> * If we break out of trimming a bitmap prematurely, we should reset the
> * trimming bit. In a rather contrived case, it's possible to race here so
> diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
> index 9f1dbfdee8ca..33fc3b245648 100644
> --- a/fs/btrfs/free-space-cache.h
> +++ b/fs/btrfs/free-space-cache.h
> @@ -166,6 +166,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
> int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
> u64 *trimmed, u64 start, u64 end, u64 minlen,
> u64 maxlen, bool async);
> +void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg);
>
> bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
> int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
> --
> 2.49.1
>
^ permalink raw reply [flat|nested] 42+ messages in thread* Re: [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups
2025-10-31 22:11 ` Boris Burkov
@ 2025-11-03 17:01 ` Mark Harmstone
0 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-11-03 17:01 UTC (permalink / raw)
To: Boris Burkov; +Cc: linux-btrfs
On 31/10/2025 10.11 pm, Boris Burkov wrote:
> On Fri, Oct 24, 2025 at 07:12:16PM +0100, Mark Harmstone wrote:
>> Discard normally works by iterating over the free-space entries of a
>> block group. This doesn't work for fully-remapped block groups, as we
>> removed their free-space entries when we started relocation.
>>
>> For sync discard, call btrfs_discard_extent() when we commit the
>> transaction in which the last identity remap was removed.
>>
>> For async discard, add a new function btrfs_trim_fully_remapped_block_group()
>> to be called by the discard worker, which iterates over the block
>> group's range using the normal async discard rules. Once we reach the
>> end, remove the chunk's stripes and device extents to get back its free
>> space.
>>
>> Signed-off-by: Mark Harmstone <mark@harmstone.com>
>> ---
>> fs/btrfs/block-group.c | 2 ++
>> fs/btrfs/block-group.h | 1 +
>> fs/btrfs/discard.c | 57 ++++++++++++++++++++++++++----
>> fs/btrfs/extent-tree.c | 10 ++++++
>> fs/btrfs/free-space-cache.c | 70 +++++++++++++++++++++++++++++++++++++
>> fs/btrfs/free-space-cache.h | 1 +
>> 6 files changed, 134 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
>> index 8feddb472882..0c91553b02cf 100644
>> --- a/fs/btrfs/block-group.c
>> +++ b/fs/btrfs/block-group.c
>> @@ -4833,4 +4833,6 @@ void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
>>
>> spin_unlock(&fs_info->unused_bgs_lock);
>>
>> + if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
>> + btrfs_discard_queue_work(&fs_info->discard_ctl, bg);
>> }
>> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
>> index 4522074a45c2..b0b16efea19a 100644
>> --- a/fs/btrfs/block-group.h
>> +++ b/fs/btrfs/block-group.h
>> @@ -49,6 +49,7 @@ enum btrfs_discard_state {
>> BTRFS_DISCARD_EXTENTS,
>> BTRFS_DISCARD_BITMAPS,
>> BTRFS_DISCARD_RESET_CURSOR,
>> + BTRFS_DISCARD_FULLY_REMAPPED,
>> };
>>
>> /*
>> diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
>> index ee5f5b2788e1..f9890037395a 100644
>> --- a/fs/btrfs/discard.c
>> +++ b/fs/btrfs/discard.c
>> @@ -215,6 +215,27 @@ static struct btrfs_block_group *find_next_block_group(
>> return ret_block_group;
>> }
>>
>> +/*
>> + * Returns whether a block group is empty.
>> + *
>> + * @block_group: block_group of interest
>> + *
>> + * "Empty" here means that there are no extents physically located within the
>> + * device extents corresponding to this block group.
>> + *
>> + * For a remapped block group, this means that all of its identity remaps have
>> + * been removed. For a non-remapped block group, this means that no extents
>> + * have an address within its range, and that nothing has been remapped to be
>> + * within it.
>> + */
>> +static bool block_group_is_empty(struct btrfs_block_group *block_group)
>> +{
>> + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)
>> + return block_group->identity_remap_count == 0;
>> + else
>> + return block_group->used == 0 && block_group->remap_bytes == 0;
>> +}
>> +
>> /*
>> * Look up next block group and set it for use.
>> *
>> @@ -241,8 +262,10 @@ static struct btrfs_block_group *peek_discard_list(
>> block_group = find_next_block_group(discard_ctl, now);
>>
>> if (block_group && now >= block_group->discard_eligible_time) {
>> + bool empty = block_group_is_empty(block_group);
>> +
>> if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
>> - block_group->used != 0) {
>> + !empty) {
>> if (btrfs_is_block_group_data_only(block_group)) {
>> __add_to_discard_list(discard_ctl, block_group);
>> /*
>> @@ -267,7 +290,15 @@ static struct btrfs_block_group *peek_discard_list(
>> }
>> if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
>> block_group->discard_cursor = block_group->start;
>> - block_group->discard_state = BTRFS_DISCARD_EXTENTS;
>> +
>> + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
>> + empty) {
>> + block_group->discard_state =
>> + BTRFS_DISCARD_FULLY_REMAPPED;
>> + } else {
>> + block_group->discard_state =
>> + BTRFS_DISCARD_EXTENTS;
>> + }
>> }
>> }
>> if (block_group) {
>> @@ -373,7 +404,7 @@ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
>> if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
>> return;
>>
>> - if (block_group->used == 0 && block_group->remap_bytes == 0)
>> + if (block_group_is_empty(block_group))
>> add_to_discard_unused_list(discard_ctl, block_group);
>> else
>> add_to_discard_list(discard_ctl, block_group);
>> @@ -470,7 +501,7 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
>> {
>> remove_from_discard_list(discard_ctl, block_group);
>>
>> - if (block_group->used == 0) {
>> + if (block_group_is_empty(block_group)) {
>> if (btrfs_is_free_space_trimmed(block_group))
>> btrfs_mark_bg_unused(block_group);
>> else
>> @@ -524,7 +555,8 @@ static void btrfs_discard_workfn(struct work_struct *work)
>> /* Perform discarding */
>> minlen = discard_minlen[discard_index];
>>
>> - if (discard_state == BTRFS_DISCARD_BITMAPS) {
>> + switch (discard_state) {
>> + case BTRFS_DISCARD_BITMAPS: {
>> u64 maxlen = 0;
>>
>> /*
>> @@ -541,17 +573,28 @@ static void btrfs_discard_workfn(struct work_struct *work)
>> btrfs_block_group_end(block_group),
>> minlen, maxlen, true);
>> discard_ctl->discard_bitmap_bytes += trimmed;
>> - } else {
>> +
>> + break;
>> + }
>> +
>> + case BTRFS_DISCARD_FULLY_REMAPPED:
>> + btrfs_trim_fully_remapped_block_group(block_group);
>> + break;
>> +
>> + default:
>> btrfs_trim_block_group_extents(block_group, &trimmed,
>> block_group->discard_cursor,
>> btrfs_block_group_end(block_group),
>> minlen, true);
>> discard_ctl->discard_extent_bytes += trimmed;
>> +
>> + break;
>> }
>>
>> /* Determine next steps for a block_group */
>> if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
>> - if (discard_state == BTRFS_DISCARD_BITMAPS) {
>> + if (discard_state == BTRFS_DISCARD_BITMAPS ||
>> + discard_state == BTRFS_DISCARD_FULLY_REMAPPED) {
>> btrfs_finish_discard_pass(discard_ctl, block_group);
>> } else {
>> block_group->discard_cursor = block_group->start;
>> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
>> index 82dc88915b7e..82d102a157e9 100644
>> --- a/fs/btrfs/extent-tree.c
>> +++ b/fs/btrfs/extent-tree.c
>> @@ -2860,6 +2860,12 @@ int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
>> list_for_each_entry_safe(block_group, tmp, fully_remapped_bgs, bg_list) {
>> struct btrfs_chunk_map *map;
>>
>> + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
>
> Any reason to queue this when DISCARD_ASYNC isn't set?
> i.e., put this in the if (btrfs_test_opt(..)) below?
Oops, you're right - btrfs_discard_queue_work() returns early if
DISCARD_ASYNC isn't set, so we might as well move it into the if.
>
>> +
>> + /* for async discard the below gets done in discard job */
>> + if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
>> + continue;
>> +
>> map = btrfs_get_chunk_map(fs_info, block_group->start, 1);
>> if (IS_ERR(map))
>> return PTR_ERR(map);
>> @@ -2870,6 +2876,10 @@ int btrfs_handle_fully_remapped_bgs(struct btrfs_trans_handle *trans)
>> return ret;
>> }
>>
>> + if (!TRANS_ABORTED(trans))
>> + btrfs_discard_extent(fs_info, block_group->start,
>> + block_group->length, NULL, false);
>> +
>> /*
>> * Set num_stripes to 0, so that btrfs_remove_dev_extents()
>> * won't run a second time.
>> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
>> index 91670d0af179..5d5e3401e723 100644
>> --- a/fs/btrfs/free-space-cache.c
>> +++ b/fs/btrfs/free-space-cache.c
>> @@ -29,6 +29,7 @@
>> #include "file-item.h"
>> #include "file.h"
>> #include "super.h"
>> +#include "relocation.h"
>>
>> #define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
>> #define MAX_CACHE_BYTES_PER_GIG SZ_64K
>> @@ -3066,6 +3067,11 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
>> struct rb_node *node;
>> bool ret = true;
>>
>> + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
>> + block_group->identity_remap_count == 0) {
>> + return true;
>> + }
>> +
>> spin_lock(&ctl->tree_lock);
>> node = rb_first(&ctl->free_space_offset);
>>
>> @@ -3830,6 +3836,70 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group,
>> return ret;
>> }
>>
>> +void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg)
>> +{
>> + struct btrfs_fs_info *fs_info = bg->fs_info;
>> + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
>> + int ret = 0;
>> + u64 bytes, trimmed;
>> + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size);
>> + u64 end = btrfs_block_group_end(bg);
>> + struct btrfs_trans_handle *trans;
>> + struct btrfs_chunk_map *map;
>> +
>> + bytes = end - bg->discard_cursor;
>> +
>> + if (max_discard_size &&
>> + bytes >= (max_discard_size +
>> + BTRFS_ASYNC_DISCARD_MIN_FILTER)) {
>> + bytes = max_discard_size;
>> + }
>> +
>> + ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed,
>> + false);
>> + if (ret)
>> + return;
>> +
>> + bg->discard_cursor += trimmed;
>> +
>> + if (bg->discard_cursor < end)
>> + return;
>> +
>> + trans = btrfs_start_transaction(fs_info->tree_root, 0);
>> + if (IS_ERR(trans))
>> + return;
>> +
>> + map = btrfs_get_chunk_map(fs_info, bg->start, 1);
>> + if (IS_ERR(map)) {
>> + ret = PTR_ERR(map);
>> + btrfs_abort_transaction(trans, ret);
>> + return;
>> + }
>> +
>> + ret = btrfs_last_identity_remap_gone(trans, map, bg);
>> + if (ret) {
>> + btrfs_free_chunk_map(map);
>> + btrfs_abort_transaction(trans, ret);
>> + return;
>> + }
>> +
>> + btrfs_end_transaction(trans);
>> +
>> + /*
>> + * Set num_stripes to 0, so that btrfs_remove_dev_extents()
>> + * won't run a second time.
>> + */
>> + map->num_stripes = 0;
>> +
>> + btrfs_free_chunk_map(map);
>> +
>> + if (bg->used == 0) {
>> + spin_lock(&fs_info->unused_bgs_lock);
>> + list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
>> + spin_unlock(&fs_info->unused_bgs_lock);
>> + }
>> +}
>> +
>> /*
>> * If we break out of trimming a bitmap prematurely, we should reset the
>> * trimming bit. In a rather contrived case, it's possible to race here so
>> diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
>> index 9f1dbfdee8ca..33fc3b245648 100644
>> --- a/fs/btrfs/free-space-cache.h
>> +++ b/fs/btrfs/free-space-cache.h
>> @@ -166,6 +166,7 @@ int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group,
>> int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group,
>> u64 *trimmed, u64 start, u64 end, u64 minlen,
>> u64 maxlen, bool async);
>> +void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg);
>>
>> bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info);
>> int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);
>> --
>> 2.49.1
>>
^ permalink raw reply [flat|nested] 42+ messages in thread
* [PATCH v4 16/16] btrfs: add stripe removal pending flag
2025-10-24 18:12 [PATCH v4 00/16] Remap tree Mark Harmstone
` (14 preceding siblings ...)
2025-10-24 18:12 ` [PATCH v4 15/16] btrfs: handle discarding fully-remapped block groups Mark Harmstone
@ 2025-10-24 18:12 ` Mark Harmstone
15 siblings, 0 replies; 42+ messages in thread
From: Mark Harmstone @ 2025-10-24 18:12 UTC (permalink / raw)
To: linux-btrfs; +Cc: Mark Harmstone
If the filesystem is unmounted while the async discard of a fully remapped
block group is in progress, its unused device extents will never be freed.
To counter this, add a new flag BTRFS_BLOCK_GROUP_STRIPE_REMOVAL_PENDING
to say that this has been interrupted. Set it in the transaction in which
the last identity remap has been removed, clear it when we remove the
device extents, and if we encounter it on mount, queue that block group
up for discard.
Signed-off-by: Mark Harmstone <mark@harmstone.com>
---
fs/btrfs/block-group.c | 43 ++++++++++++++++++++++++++++++++-
fs/btrfs/free-space-cache.c | 7 ++++++
fs/btrfs/relocation.c | 18 ++++++++++++++
include/uapi/linux/btrfs_tree.h | 1 +
4 files changed, 68 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0c91553b02cf..8eb452068e1f 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2526,6 +2526,24 @@ static int read_one_block_group(struct btrfs_fs_info *info,
inc_block_group_ro(cache, 1);
}
+ if (cache->flags & BTRFS_BLOCK_GROUP_STRIPE_REMOVAL_PENDING) {
+ spin_lock(&info->unused_bgs_lock);
+
+ if (list_empty(&cache->bg_list)) {
+ btrfs_get_block_group(cache);
+ list_add_tail(&cache->bg_list,
+ &info->fully_remapped_bgs);
+ } else {
+ list_move_tail(&cache->bg_list,
+ &info->fully_remapped_bgs);
+ }
+
+ spin_unlock(&info->unused_bgs_lock);
+
+ if (btrfs_test_opt(info, DISCARD_ASYNC))
+ btrfs_discard_queue_work(&info->discard_ctl, cache);
+ }
+
return 0;
error:
btrfs_put_block_group(cache);
@@ -4833,6 +4851,29 @@ void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg,
spin_unlock(&fs_info->unused_bgs_lock);
- if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
+ bool bg_already_dirty = true;
+
+ spin_lock(&bg->lock);
+ bg->flags |= BTRFS_BLOCK_GROUP_STRIPE_REMOVAL_PENDING;
+ spin_unlock(&bg->lock);
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&bg->dirty_list)) {
+ list_add_tail(&bg->dirty_list,
+ &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(bg);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /*
+ * Modified block groups are accounted for in
+ * the delayed_refs_rsv.
+ */
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(trans->fs_info);
+
btrfs_discard_queue_work(&fs_info->discard_ctl, bg);
+ }
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5d5e3401e723..60c0df6f002c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -3068,6 +3068,7 @@ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
bool ret = true;
if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED &&
+ !(block_group->flags & BTRFS_BLOCK_GROUP_STRIPE_REMOVAL_PENDING) &&
block_group->identity_remap_count == 0) {
return true;
}
@@ -3847,6 +3848,11 @@ void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg)
struct btrfs_trans_handle *trans;
struct btrfs_chunk_map *map;
+ if (!(bg->flags & BTRFS_BLOCK_GROUP_STRIPE_REMOVAL_PENDING)) {
+ bg->discard_cursor = end;
+ goto skip_discard;
+ }
+
bytes = end - bg->discard_cursor;
if (max_discard_size &&
@@ -3893,6 +3899,7 @@ void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg)
btrfs_free_chunk_map(map);
+skip_discard:
if (bg->used == 0) {
spin_lock(&fs_info->unused_bgs_lock);
list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ebbc619be682..e01ff0174fb1 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -4740,6 +4740,7 @@ int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg)
{
int ret;
+ bool bg_already_dirty = true;
BTRFS_PATH_AUTO_FREE(path);
ret = btrfs_remove_dev_extents(trans, chunk);
@@ -4764,6 +4765,23 @@ int btrfs_last_identity_remap_gone(struct btrfs_trans_handle *trans,
btrfs_remove_bg_from_sinfo(bg);
+ spin_lock(&bg->lock);
+ bg->flags &= ~BTRFS_BLOCK_GROUP_STRIPE_REMOVAL_PENDING;
+ spin_unlock(&bg->lock);
+
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&bg->dirty_list)) {
+ list_add_tail(&bg->dirty_list,
+ &trans->transaction->dirty_bgs);
+ bg_already_dirty = false;
+ btrfs_get_block_group(bg);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
+ /* Modified block groups are accounted for in the delayed_refs_rsv. */
+ if (!bg_already_dirty)
+ btrfs_inc_delayed_refs_rsv_bg_updates(trans->fs_info);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h
index 89bcb80081a6..36a7d1a3cbe3 100644
--- a/include/uapi/linux/btrfs_tree.h
+++ b/include/uapi/linux/btrfs_tree.h
@@ -1173,6 +1173,7 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10)
#define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11)
#define BTRFS_BLOCK_GROUP_REMAP (1ULL << 12)
+#define BTRFS_BLOCK_GROUP_STRIPE_REMOVAL_PENDING (1ULL << 13)
#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
BTRFS_SPACE_INFO_GLOBAL_RSV)
--
2.49.1
^ permalink raw reply related [flat|nested] 42+ messages in thread