linux-btrfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] btrfs: find_free_extent cleanups
@ 2025-09-23  1:13 Leo Martins
  2025-09-23  1:13 ` [PATCH 1/2] btrfs: remove ffe RAID loop Leo Martins
  2025-09-23  1:13 ` [PATCH 2/2] btrfs: add tracing for find_free_extent skip conditions Leo Martins
  0 siblings, 2 replies; 5+ messages in thread
From: Leo Martins @ 2025-09-23  1:13 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

This patch series improves the btrfs extent allocator with both cleanup
and enhanced debugging capabilities.

The first patch removes redundant RAID loop logic that became obsolete
after previous changes ensured allocations only occur from block groups
with matching flags.

The second patch adds comprehensive tracing to find_free_extent() to
improve debugging and performance analysis capabilities.

Testing:
- Ran xfstests btrfs/auto
- Verified trace events output correctly

Leo Martins (2):
  btrfs: remove ffe RAID loop
  btrfs: add tracing for find_free_extent skip conditions

 fs/btrfs/extent-tree.c       | 76 +++++++++++++++++-------------------
 fs/btrfs/extent-tree.h       | 18 +++++++++
 include/trace/events/btrfs.h | 65 ++++++++++++++++++++++++++++++
 3 files changed, 118 insertions(+), 41 deletions(-)

-- 
2.47.3


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH 1/2] btrfs: remove ffe RAID loop
  2025-09-23  1:13 [PATCH 0/2] btrfs: find_free_extent cleanups Leo Martins
@ 2025-09-23  1:13 ` Leo Martins
  2025-09-24 18:54   ` Boris Burkov
  2025-09-23  1:13 ` [PATCH 2/2] btrfs: add tracing for find_free_extent skip conditions Leo Martins
  1 sibling, 1 reply; 5+ messages in thread
From: Leo Martins @ 2025-09-23  1:13 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

This patch removes the RAID loop from find_free_extent since it
is impossible to allocate from a block group with a different
RAID profile.

Historically, we've been able to fulfill allocation requests
from any block group. For example, a request for RAID0 could be
fulfilled by a RAID1 block group or a request for METADATA could be
fulfilled by a DATA block group.

"btrfs: extent-tree: Make sure we only allocate extents from block
groups with the same type" changed this behavior to skip block groups
with different flags than the request. This makes the duplication
compatibility check redundant since we're going to keep searching
regardless.

Signed-off-by: Leo Martins <loemra.dev@gmail.com>
---
 fs/btrfs/extent-tree.c | 40 +++-------------------------------------
 1 file changed, 3 insertions(+), 37 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a4416c451b25..072d9bb84dd8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4153,8 +4153,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 					struct btrfs_key *ins,
 					struct find_free_extent_ctl *ffe_ctl,
-					struct btrfs_space_info *space_info,
-					bool full_search)
+					struct btrfs_space_info *space_info)
 {
 	struct btrfs_root *root = fs_info->chunk_root;
 	int ret;
@@ -4171,20 +4170,15 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
 	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
 		return 1;
 
-	ffe_ctl->index++;
-	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
-		return 1;
-
 	/* See the comments for btrfs_loop_type for an explanation of the phases. */
 	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
-		ffe_ctl->index = 0;
 		/*
 		 * We want to skip the LOOP_CACHING_WAIT step if we don't have
 		 * any uncached bgs and we've already done a full search
 		 * through.
 		 */
 		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
-		    (!ffe_ctl->orig_have_caching_bg && full_search))
+		    (!ffe_ctl->orig_have_caching_bg))
 			ffe_ctl->loop++;
 		ffe_ctl->loop++;
 
@@ -4384,7 +4378,6 @@ static noinline int find_free_extent(struct btrfs_root *root,
 	int cache_block_group_error = 0;
 	struct btrfs_block_group *block_group = NULL;
 	struct btrfs_space_info *space_info;
-	bool full_search = false;
 
 	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
 
@@ -4477,9 +4470,6 @@ static noinline int find_free_extent(struct btrfs_root *root,
 search:
 	trace_btrfs_find_free_extent_search_loop(root, ffe_ctl);
 	ffe_ctl->have_caching_bg = false;
-	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
-	    ffe_ctl->index == 0)
-		full_search = true;
 	down_read(&space_info->groups_sem);
 	list_for_each_entry(block_group,
 			    &space_info->block_groups[ffe_ctl->index], list) {
@@ -4498,30 +4488,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
 		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
 		ffe_ctl->search_start = block_group->start;
 
-		/*
-		 * this can happen if we end up cycling through all the
-		 * raid types, but we want to make sure we only allocate
-		 * for the proper type.
-		 */
 		if (!block_group_bits(block_group, ffe_ctl->flags)) {
-			u64 extra = BTRFS_BLOCK_GROUP_DUP |
-				BTRFS_BLOCK_GROUP_RAID1_MASK |
-				BTRFS_BLOCK_GROUP_RAID56_MASK |
-				BTRFS_BLOCK_GROUP_RAID10;
-
-			/*
-			 * if they asked for extra copies and this block group
-			 * doesn't provide them, bail.  This does allow us to
-			 * fill raid0 from raid1.
-			 */
-			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
-				goto loop;
-
-			/*
-			 * This block group has different flags than we want.
-			 * It's possible that we have MIXED_GROUP flag but no
-			 * block group is mixed.  Just skip such block group.
-			 */
 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
 			continue;
 		}
@@ -4620,8 +4587,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
 	}
 	up_read(&space_info->groups_sem);
 
-	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info,
-					   full_search);
+	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info);
 	if (ret > 0)
 		goto search;
 
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/2] btrfs: add tracing for find_free_extent skip conditions
  2025-09-23  1:13 [PATCH 0/2] btrfs: find_free_extent cleanups Leo Martins
  2025-09-23  1:13 ` [PATCH 1/2] btrfs: remove ffe RAID loop Leo Martins
@ 2025-09-23  1:13 ` Leo Martins
  2025-09-24 19:14   ` Boris Burkov
  1 sibling, 1 reply; 5+ messages in thread
From: Leo Martins @ 2025-09-23  1:13 UTC (permalink / raw)
  To: linux-btrfs, kernel-team

Add detailed tracing to the find_free_extent() function to improve
observability of extent allocation behavior. This patch introduces:

- A new trace event 'btrfs_find_free_extent_skip' that captures
  allocation context and skip reasons
- Comprehensive set of FFE_SKIP_* constants defining specific
  reasons why block groups are skipped during allocation
- Trace points at all major skip conditions in the allocator loop

The trace event includes key allocation parameters (root, size, flags,
loop iteration) and block group details (start, length, flags) along
with the specific skip reason and associated error codes.

These trace points will help diagnose allocation performance
issues, understand allocation patterns, and debug extent allocator
behavior.

Signed-off-by: Leo Martins <loemra.dev@gmail.com>
---
 fs/btrfs/extent-tree.c       | 36 +++++++++++++++++---
 fs/btrfs/extent-tree.h       | 18 ++++++++++
 include/trace/events/btrfs.h | 65 ++++++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 072d9bb84dd8..ed50f7357f19 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4425,8 +4425,11 @@ static noinline int find_free_extent(struct btrfs_root *root,
 	}
 
 	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
-	if (ret < 0)
+	if (ret < 0) {
+		trace_btrfs_find_free_extent_skip(root, ffe_ctl, NULL,
+						  FFE_SKIP_PREPARE_ALLOC_FAILED, ret);
 		return ret;
+	}
 
 	ffe_ctl->search_start = max(ffe_ctl->search_start,
 				    first_logical_byte(fs_info));
@@ -4453,6 +4456,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
 				 * target because our list pointers are not
 				 * valid
 				 */
+				trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+								  FFE_SKIP_HINTED_BG_INVALID, 0);
 				btrfs_put_block_group(block_group);
 				up_read(&space_info->groups_sem);
 			} else {
@@ -4464,6 +4469,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
 				goto have_block_group;
 			}
 		} else if (block_group) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_HINTED_BG_INVALID, 0);
 			btrfs_put_block_group(block_group);
 		}
 	}
@@ -4478,6 +4485,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
 		ffe_ctl->hinted = false;
 		/* If the block group is read-only, we can skip it entirely. */
 		if (unlikely(block_group->ro)) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_BG_READ_ONLY, 0);
 			if (ffe_ctl->for_treelog)
 				btrfs_clear_treelog_bg(block_group);
 			if (ffe_ctl->for_data_reloc)
@@ -4489,6 +4498,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
 		ffe_ctl->search_start = block_group->start;
 
 		if (!block_group_bits(block_group, ffe_ctl->flags)) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_BG_WRONG_FLAGS, 0);
 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
 			continue;
 		}
@@ -4508,6 +4519,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
 			 * error that caused problems, not ENOSPC.
 			 */
 			if (ret < 0) {
+				trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+								  FFE_SKIP_BG_CACHE_ERROR, ret);
 				if (!cache_block_group_error)
 					cache_block_group_error = ret;
 				ret = 0;
@@ -4517,18 +4530,26 @@ static noinline int find_free_extent(struct btrfs_root *root,
 		}
 
 		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_BG_CACHE_ERROR, -EIO);
 			if (!cache_block_group_error)
 				cache_block_group_error = -EIO;
 			goto loop;
 		}
 
-		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
+		if (!find_free_extent_check_size_class(ffe_ctl, block_group)) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_SIZE_CLASS_MISMATCH, 0);
 			goto loop;
+		}
 
 		bg_ret = NULL;
 		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
-		if (ret > 0)
+		if (ret > 0) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_DO_ALLOCATION_FAILED, ret);
 			goto loop;
+		}
 
 		if (bg_ret && bg_ret != block_group) {
 			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
@@ -4542,22 +4563,29 @@ static noinline int find_free_extent(struct btrfs_root *root,
 		/* move on to the next group */
 		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
 		    block_group->start + block_group->length) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_EXTENDS_BEYOND_BG, 0);
 			btrfs_add_free_space_unused(block_group,
 					    ffe_ctl->found_offset,
 					    ffe_ctl->num_bytes);
 			goto loop;
 		}
 
-		if (ffe_ctl->found_offset < ffe_ctl->search_start)
+		if (ffe_ctl->found_offset < ffe_ctl->search_start) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_FOUND_BEFORE_SEARCH_START, 0);
 			btrfs_add_free_space_unused(block_group,
 					ffe_ctl->found_offset,
 					ffe_ctl->search_start - ffe_ctl->found_offset);
+		}
 
 		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
 					       ffe_ctl->num_bytes,
 					       ffe_ctl->delalloc,
 					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
 		if (ret == -EAGAIN) {
+			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
+							  FFE_SKIP_ADD_RESERVED_FAILED, -EAGAIN);
 			btrfs_add_free_space_unused(block_group,
 					ffe_ctl->found_offset,
 					ffe_ctl->num_bytes);
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
index e970ac42a871..667581c454a3 100644
--- a/fs/btrfs/extent-tree.h
+++ b/fs/btrfs/extent-tree.h
@@ -23,6 +23,24 @@ enum btrfs_extent_allocation_policy {
 	BTRFS_EXTENT_ALLOC_ZONED,
 };
 
+/*
+ * Enum for find_free_extent skip reasons used in trace events.
+ * Each enum corresponds to a specific unhappy path in the allocator.
+ */
+enum {
+	FFE_SKIP_PREPARE_ALLOC_FAILED,
+	FFE_SKIP_HINTED_BG_INVALID,
+	FFE_SKIP_BG_READ_ONLY,
+	FFE_SKIP_BG_WRONG_FLAGS,
+	FFE_SKIP_BG_CACHE_FAILED,
+	FFE_SKIP_BG_CACHE_ERROR,
+	FFE_SKIP_SIZE_CLASS_MISMATCH,
+	FFE_SKIP_DO_ALLOCATION_FAILED,
+	FFE_SKIP_EXTENDS_BEYOND_BG,
+	FFE_SKIP_FOUND_BEFORE_SEARCH_START,
+	FFE_SKIP_ADD_RESERVED_FAILED,
+};
+
 struct find_free_extent_ctl {
 	/* Basic allocation info */
 	u64 ram_bytes;
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 7e418f065b94..39544a706654 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -103,6 +103,18 @@ struct find_free_extent_ctl;
 	EM( COMMIT_TRANS,		"COMMIT_TRANS")			\
 	EMe(RESET_ZONES,		"RESET_ZONES")
 
+#define FIND_FREE_EXTENT_SKIP_REASONS						\
+	EM( FFE_SKIP_PREPARE_ALLOC_FAILED,	"PREPARE_ALLOC_FAILED")		\
+	EM( FFE_SKIP_HINTED_BG_INVALID,		"HINTED_BG_INVALID")		\
+	EM( FFE_SKIP_BG_READ_ONLY,		"BG_READ_ONLY")			\
+	EM( FFE_SKIP_BG_WRONG_FLAGS,		"BG_WRONG_FLAGS")		\
+	EM( FFE_SKIP_BG_CACHE_ERROR,		"BG_CACHE_ERROR")		\
+	EM( FFE_SKIP_SIZE_CLASS_MISMATCH,	"SIZE_CLASS_MISMATCH")		\
+	EM( FFE_SKIP_DO_ALLOCATION_FAILED,	"DO_ALLOCATION_FAILED")		\
+	EM( FFE_SKIP_EXTENDS_BEYOND_BG,		"EXTENDS_BEYOND_BG")		\
+	EM( FFE_SKIP_FOUND_BEFORE_SEARCH_START,	"FOUND_BEFORE_SEARCH_START")	\
+	EMe(FFE_SKIP_ADD_RESERVED_FAILED,	"ADD_RESERVED_FAILED")
+
 /*
  * First define the enums in the above macros to be exported to userspace via
  * TRACE_DEFINE_ENUM().
@@ -118,6 +130,7 @@ FI_TYPES
 QGROUP_RSV_TYPES
 IO_TREE_OWNER
 FLUSH_STATES
+FIND_FREE_EXTENT_SKIP_REASONS
 
 /*
  * Now redefine the EM and EMe macros to map the enums to the strings that will
@@ -1388,6 +1401,58 @@ DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
 	TP_ARGS(block_group, ffe_ctl)
 );
 
+TRACE_EVENT(btrfs_find_free_extent_skip,
+
+	TP_PROTO(const struct btrfs_root *root,
+		 const struct find_free_extent_ctl *ffe_ctl,
+		 const struct btrfs_block_group *block_group,
+		 int reason,
+		 s64 error_or_value),
+
+	TP_ARGS(root, ffe_ctl, block_group, reason, error_or_value),
+
+	TP_STRUCT__entry_btrfs(
+		__field(	u64,	root_objectid		)
+		__field(	u64,	num_bytes		)
+		__field(	u64,	empty_size		)
+		__field(	u64,	flags			)
+		__field(	int,	loop			)
+		__field(	bool,	hinted			)
+		__field(	int,	size_class		)
+		__field(	u64,	bg_start		)
+		__field(	u64,	bg_length		)
+		__field(	u64,	bg_flags		)
+		__field(	int,	reason			)
+		__field(	s64,	error_or_value		)
+	),
+
+	TP_fast_assign_btrfs(root->fs_info,
+		__entry->root_objectid	= btrfs_root_id(root);
+		__entry->num_bytes	= ffe_ctl->num_bytes;
+		__entry->empty_size	= ffe_ctl->empty_size;
+		__entry->flags		= ffe_ctl->flags;
+		__entry->loop		= ffe_ctl->loop;
+		__entry->hinted		= ffe_ctl->hinted;
+		__entry->size_class	= ffe_ctl->size_class;
+		__entry->bg_start	= block_group ? block_group->start : 0;
+		__entry->bg_length	= block_group ? block_group->length : 0;
+		__entry->bg_flags	= block_group ? block_group->flags : 0;
+		__entry->reason		= reason;
+		__entry->error_or_value	= error_or_value;
+	),
+
+	TP_printk_btrfs(
+"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%d hinted=%d size_class=%d bg=[%llu+%llu] bg_flags=%llu(%s) reason=%s error_or_value=%lld",
+		  show_root_type(__entry->root_objectid),
+		  __entry->num_bytes, __entry->empty_size, __entry->flags,
+		  __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS),
+		  __entry->loop, __entry->hinted, __entry->size_class,
+		  __entry->bg_start, __entry->bg_length, __entry->bg_flags,
+		  __print_flags((unsigned long)__entry->bg_flags, "|", BTRFS_GROUP_FLAGS),
+		  __print_symbolic(__entry->reason, FIND_FREE_EXTENT_SKIP_REASONS),
+		  __entry->error_or_value)
+);
+
 TRACE_EVENT(btrfs_find_cluster,
 
 	TP_PROTO(const struct btrfs_block_group *block_group, u64 start,
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/2] btrfs: remove ffe RAID loop
  2025-09-23  1:13 ` [PATCH 1/2] btrfs: remove ffe RAID loop Leo Martins
@ 2025-09-24 18:54   ` Boris Burkov
  0 siblings, 0 replies; 5+ messages in thread
From: Boris Burkov @ 2025-09-24 18:54 UTC (permalink / raw)
  To: Leo Martins; +Cc: linux-btrfs, kernel-team

On Mon, Sep 22, 2025 at 06:13:14PM -0700, Leo Martins wrote:
> This patch removes the RAID loop from find_free_extent since it
> is impossible to allocate from a block group with a different
> RAID profile.

I hope we can pull this off; removing one of these loops will be great
for readability, even if most of them are empty/trivial 99%+ of the time
and it has no meaningful impact on performance.

> 
> Historically, we've been able to fulfill allocation requests
> from any block group. For example, a request for RAID0 could be
> fulfilled by a RAID1 block group or a request for METADATA could be
> fulfilled by a DATA block group.
> 
> "btrfs: extent-tree: Make sure we only allocate extents from block
> groups with the same type" changed this behavior to skip block groups

Please use the standardized format for referring to previous patches.
e.g.,
2a28468e525f ("btrfs: extent-tree: Make sure we only allocate extents from block groups with the same type")
I have the git alias:
fixes = show -s --pretty='format:%h (\"%s\")'
in my gitconfig that generates it for Fixes: tags, but you can adapt
that however is most helpful for you.

> with different flags than the request. This makes the duplication
> compatibility check redundant since we're going to keep searching
> regardless.

I agree with you from reading the code, but I would sleep easier at
night if you had evidence that this doesn't appreciably change the
outcome during raid rebalances. Can you check if there is an fstest that
keeps allocating during a raid rebalance, and if there isn't, can you
add one?

Thanks,
Boris

> 
> Signed-off-by: Leo Martins <loemra.dev@gmail.com>
> ---
>  fs/btrfs/extent-tree.c | 40 +++-------------------------------------
>  1 file changed, 3 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index a4416c451b25..072d9bb84dd8 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4153,8 +4153,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info,
>  static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
>  					struct btrfs_key *ins,
>  					struct find_free_extent_ctl *ffe_ctl,
> -					struct btrfs_space_info *space_info,
> -					bool full_search)
> +					struct btrfs_space_info *space_info)
>  {
>  	struct btrfs_root *root = fs_info->chunk_root;
>  	int ret;
> @@ -4171,20 +4170,15 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
>  	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
>  		return 1;
>  
> -	ffe_ctl->index++;
> -	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
> -		return 1;
> -
>  	/* See the comments for btrfs_loop_type for an explanation of the phases. */
>  	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
> -		ffe_ctl->index = 0;
>  		/*
>  		 * We want to skip the LOOP_CACHING_WAIT step if we don't have
>  		 * any uncached bgs and we've already done a full search
>  		 * through.
>  		 */
>  		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT &&
> -		    (!ffe_ctl->orig_have_caching_bg && full_search))
> +		    (!ffe_ctl->orig_have_caching_bg))
>  			ffe_ctl->loop++;
>  		ffe_ctl->loop++;
>  
> @@ -4384,7 +4378,6 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  	int cache_block_group_error = 0;
>  	struct btrfs_block_group *block_group = NULL;
>  	struct btrfs_space_info *space_info;
> -	bool full_search = false;

I'm not 100% sure this is compatible with hints. If our allocation is
hinted (or clustered for that matter, but already not handled...) we
will jump into the middle of the loop without setting full_search=true,
so it is not true that we have looked at the caching state of every bg,
and waiting for caching could be a useful step. However, we will just
loop at LOOP_UNSET_SIZE_CLASS instead, which is also probably fine.

So I would say this is *probably* fine but does constitute a behavior
change and violates an existing invariant, even if it's one that is not
that heavily relied upon.

>  
>  	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);
>  
> @@ -4477,9 +4470,6 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  search:
>  	trace_btrfs_find_free_extent_search_loop(root, ffe_ctl);
>  	ffe_ctl->have_caching_bg = false;
> -	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
> -	    ffe_ctl->index == 0)
> -		full_search = true;
>  	down_read(&space_info->groups_sem);
>  	list_for_each_entry(block_group,
>  			    &space_info->block_groups[ffe_ctl->index], list) {
> @@ -4498,30 +4488,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
>  		ffe_ctl->search_start = block_group->start;
>  
> -		/*
> -		 * this can happen if we end up cycling through all the
> -		 * raid types, but we want to make sure we only allocate
> -		 * for the proper type.
> -		 */
>  		if (!block_group_bits(block_group, ffe_ctl->flags)) {
> -			u64 extra = BTRFS_BLOCK_GROUP_DUP |
> -				BTRFS_BLOCK_GROUP_RAID1_MASK |
> -				BTRFS_BLOCK_GROUP_RAID56_MASK |
> -				BTRFS_BLOCK_GROUP_RAID10;
> -
> -			/*
> -			 * if they asked for extra copies and this block group
> -			 * doesn't provide them, bail.  This does allow us to
> -			 * fill raid0 from raid1.
> -			 */
> -			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
> -				goto loop;
> -
> -			/*
> -			 * This block group has different flags than we want.
> -			 * It's possible that we have MIXED_GROUP flag but no
> -			 * block group is mixed.  Just skip such block group.
> -			 */
>  			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
>  			continue;
>  		}
> @@ -4620,8 +4587,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  	}
>  	up_read(&space_info->groups_sem);
>  
> -	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info,
> -					   full_search);
> +	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info);
>  	if (ret > 0)
>  		goto search;
>  
> -- 
> 2.47.3
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 2/2] btrfs: add tracing for find_free_extent skip conditions
  2025-09-23  1:13 ` [PATCH 2/2] btrfs: add tracing for find_free_extent skip conditions Leo Martins
@ 2025-09-24 19:14   ` Boris Burkov
  0 siblings, 0 replies; 5+ messages in thread
From: Boris Burkov @ 2025-09-24 19:14 UTC (permalink / raw)
  To: Leo Martins; +Cc: linux-btrfs, kernel-team

On Mon, Sep 22, 2025 at 06:13:15PM -0700, Leo Martins wrote:
> Add detailed tracing to the find_free_extent() function to improve
> observability of extent allocation behavior. This patch introduces:
> 
> - A new trace event 'btrfs_find_free_extent_skip' that captures
>   allocation context and skip reasons
> - Comprehensive set of FFE_SKIP_* constants defining specific
>   reasons why block groups are skipped during allocation
> - Trace points at all major skip conditions in the allocator loop
> 
> The trace event includes key allocation parameters (root, size, flags,
> loop iteration) and block group details (start, length, flags) along
> with the specific skip reason and associated error codes.
> 
> These trace points will help diagnose allocation performance
> issues, understand allocation patterns, and debug extent allocator
> behavior.

Looks great. A couple naming/semantics improvements suggested inline.

Have you run a "full" tracing of ffe including tracing the loop with
the other tracepoints btrfs_find_free_extent_search_loop and
btrfs_reserve_extent? If the bpftrace program and output from that
aren't huge, could be nice to include them in the commit message too.

Thanks,
Boris
> 
> Signed-off-by: Leo Martins <loemra.dev@gmail.com>
> ---
>  fs/btrfs/extent-tree.c       | 36 +++++++++++++++++---
>  fs/btrfs/extent-tree.h       | 18 ++++++++++
>  include/trace/events/btrfs.h | 65 ++++++++++++++++++++++++++++++++++++
>  3 files changed, 115 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 072d9bb84dd8..ed50f7357f19 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4425,8 +4425,11 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  	}
>  
>  	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
> -	if (ret < 0)
> +	if (ret < 0) {
> +		trace_btrfs_find_free_extent_skip(root, ffe_ctl, NULL,
> +						  FFE_SKIP_PREPARE_ALLOC_FAILED, ret);

This one is less a "skip" and more of a "fail", I think mixing them is
probably more confusing than it's worth.

Then you can also name it btrfs_find_free_extent_skip_block_group which
makes more sense too.

>  		return ret;
> +	}
>  
>  	ffe_ctl->search_start = max(ffe_ctl->search_start,
>  				    first_logical_byte(fs_info));
> @@ -4453,6 +4456,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  				 * target because our list pointers are not
>  				 * valid
>  				 */
> +				trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +								  FFE_SKIP_HINTED_BG_INVALID, 0);
>  				btrfs_put_block_group(block_group);
>  				up_read(&space_info->groups_sem);
>  			} else {
> @@ -4464,6 +4469,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  				goto have_block_group;
>  			}
>  		} else if (block_group) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_HINTED_BG_INVALID, 0);

If you get rid of INVALID, you can just check the above conditions to
categorize it. Or create a special HINT_INVALID case or something?

>  			btrfs_put_block_group(block_group);
>  		}
>  	}
> @@ -4478,6 +4485,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  		ffe_ctl->hinted = false;
>  		/* If the block group is read-only, we can skip it entirely. */
>  		if (unlikely(block_group->ro)) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_BG_READ_ONLY, 0);

One of the INVALID cases is also checking bg->ro, so I think having a
READ_ONLY category is kinda confusing. Maybe make this INVALID too? Or
change that one to READ_ONLY too? Or make both of them "REMOVING" or
something. tl;dr: I feel like they're really the same condition just
hinted/un-hinted.

>  			if (ffe_ctl->for_treelog)
>  				btrfs_clear_treelog_bg(block_group);
>  			if (ffe_ctl->for_data_reloc)
> @@ -4489,6 +4498,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  		ffe_ctl->search_start = block_group->start;
>  
>  		if (!block_group_bits(block_group, ffe_ctl->flags)) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_BG_WRONG_FLAGS, 0);

you can stash the wrong flags from the bg in error_or_value

>  			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
>  			continue;
>  		}
> @@ -4508,6 +4519,8 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  			 * error that caused problems, not ENOSPC.
>  			 */
>  			if (ret < 0) {
> +				trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +								  FFE_SKIP_BG_CACHE_ERROR, ret);
>  				if (!cache_block_group_error)
>  					cache_block_group_error = ret;
>  				ret = 0;
> @@ -4517,18 +4530,26 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  		}
>  
>  		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_BG_CACHE_ERROR, -EIO);
>  			if (!cache_block_group_error)
>  				cache_block_group_error = -EIO;
>  			goto loop;
>  		}
>  
> -		if (!find_free_extent_check_size_class(ffe_ctl, block_group))
> +		if (!find_free_extent_check_size_class(ffe_ctl, block_group)) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_SIZE_CLASS_MISMATCH, 0);

you can stash the wrong size class from the bg in error_or_value

>  			goto loop;
> +		}
>  
>  		bg_ret = NULL;
>  		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
> -		if (ret > 0)
> +		if (ret > 0) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_DO_ALLOCATION_FAILED, ret);
>  			goto loop;
> +		}
>  
>  		if (bg_ret && bg_ret != block_group) {
>  			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
> @@ -4542,22 +4563,29 @@ static noinline int find_free_extent(struct btrfs_root *root,
>  		/* move on to the next group */
>  		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
>  		    block_group->start + block_group->length) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_EXTENDS_BEYOND_BG, 0);

I would prefer this name and "FOUND_BEFORE_SEARCH_START" to have a
similar structure to suggest that they are the same sort of error. Or
you can just do something like SEARCH_BOUNDS. The trace's error_or_value
can include the offending value as well?

>  			btrfs_add_free_space_unused(block_group,
>  					    ffe_ctl->found_offset,
>  					    ffe_ctl->num_bytes);
>  			goto loop;
>  		}
>  
> -		if (ffe_ctl->found_offset < ffe_ctl->search_start)
> +		if (ffe_ctl->found_offset < ffe_ctl->search_start) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_FOUND_BEFORE_SEARCH_START, 0);
>  			btrfs_add_free_space_unused(block_group,
>  					ffe_ctl->found_offset,
>  					ffe_ctl->search_start - ffe_ctl->found_offset);
> +		}
>  
>  		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
>  					       ffe_ctl->num_bytes,
>  					       ffe_ctl->delalloc,
>  					       ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS);
>  		if (ret == -EAGAIN) {
> +			trace_btrfs_find_free_extent_skip(root, ffe_ctl, block_group,
> +							  FFE_SKIP_ADD_RESERVED_FAILED, -EAGAIN);
>  			btrfs_add_free_space_unused(block_group,
>  					ffe_ctl->found_offset,
>  					ffe_ctl->num_bytes);
> diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
> index e970ac42a871..667581c454a3 100644
> --- a/fs/btrfs/extent-tree.h
> +++ b/fs/btrfs/extent-tree.h
> @@ -23,6 +23,24 @@ enum btrfs_extent_allocation_policy {
>  	BTRFS_EXTENT_ALLOC_ZONED,
>  };
>  
> +/*
> + * Enum for find_free_extent skip reasons used in trace events.
> + * Each enum corresponds to a specific unhappy path in the allocator.
> + */
> +enum {
> +	FFE_SKIP_PREPARE_ALLOC_FAILED,
> +	FFE_SKIP_HINTED_BG_INVALID,
> +	FFE_SKIP_BG_READ_ONLY,
> +	FFE_SKIP_BG_WRONG_FLAGS,
> +	FFE_SKIP_BG_CACHE_FAILED,
> +	FFE_SKIP_BG_CACHE_ERROR,
> +	FFE_SKIP_SIZE_CLASS_MISMATCH,
> +	FFE_SKIP_DO_ALLOCATION_FAILED,
> +	FFE_SKIP_EXTENDS_BEYOND_BG,
> +	FFE_SKIP_FOUND_BEFORE_SEARCH_START,
> +	FFE_SKIP_ADD_RESERVED_FAILED,
> +};
> +
>  struct find_free_extent_ctl {
>  	/* Basic allocation info */
>  	u64 ram_bytes;
> diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
> index 7e418f065b94..39544a706654 100644
> --- a/include/trace/events/btrfs.h
> +++ b/include/trace/events/btrfs.h
> @@ -103,6 +103,18 @@ struct find_free_extent_ctl;
>  	EM( COMMIT_TRANS,		"COMMIT_TRANS")			\
>  	EMe(RESET_ZONES,		"RESET_ZONES")
>  
> +#define FIND_FREE_EXTENT_SKIP_REASONS						\
> +	EM( FFE_SKIP_PREPARE_ALLOC_FAILED,	"PREPARE_ALLOC_FAILED")		\
> +	EM( FFE_SKIP_HINTED_BG_INVALID,		"HINTED_BG_INVALID")		\
> +	EM( FFE_SKIP_BG_READ_ONLY,		"BG_READ_ONLY")			\
> +	EM( FFE_SKIP_BG_WRONG_FLAGS,		"BG_WRONG_FLAGS")		\
> +	EM( FFE_SKIP_BG_CACHE_ERROR,		"BG_CACHE_ERROR")		\
> +	EM( FFE_SKIP_SIZE_CLASS_MISMATCH,	"SIZE_CLASS_MISMATCH")		\
> +	EM( FFE_SKIP_DO_ALLOCATION_FAILED,	"DO_ALLOCATION_FAILED")		\
> +	EM( FFE_SKIP_EXTENDS_BEYOND_BG,		"EXTENDS_BEYOND_BG")		\
> +	EM( FFE_SKIP_FOUND_BEFORE_SEARCH_START,	"FOUND_BEFORE_SEARCH_START")	\
> +	EMe(FFE_SKIP_ADD_RESERVED_FAILED,	"ADD_RESERVED_FAILED")
> +
>  /*
>   * First define the enums in the above macros to be exported to userspace via
>   * TRACE_DEFINE_ENUM().
> @@ -118,6 +130,7 @@ FI_TYPES
>  QGROUP_RSV_TYPES
>  IO_TREE_OWNER
>  FLUSH_STATES
> +FIND_FREE_EXTENT_SKIP_REASONS
>  
>  /*
>   * Now redefine the EM and EMe macros to map the enums to the strings that will
> @@ -1388,6 +1401,58 @@ DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
>  	TP_ARGS(block_group, ffe_ctl)
>  );
>  
> +TRACE_EVENT(btrfs_find_free_extent_skip,
> +
> +	TP_PROTO(const struct btrfs_root *root,
> +		 const struct find_free_extent_ctl *ffe_ctl,
> +		 const struct btrfs_block_group *block_group,
> +		 int reason,
> +		 s64 error_or_value),
> +
> +	TP_ARGS(root, ffe_ctl, block_group, reason, error_or_value),
> +
> +	TP_STRUCT__entry_btrfs(
> +		__field(	u64,	root_objectid		)
> +		__field(	u64,	num_bytes		)
> +		__field(	u64,	empty_size		)
> +		__field(	u64,	flags			)
> +		__field(	int,	loop			)
> +		__field(	bool,	hinted			)
> +		__field(	int,	size_class		)
> +		__field(	u64,	bg_start		)
> +		__field(	u64,	bg_length		)
> +		__field(	u64,	bg_flags		)
> +		__field(	int,	reason			)
> +		__field(	s64,	error_or_value		)
> +	),
> +
> +	TP_fast_assign_btrfs(root->fs_info,
> +		__entry->root_objectid	= btrfs_root_id(root);
> +		__entry->num_bytes	= ffe_ctl->num_bytes;
> +		__entry->empty_size	= ffe_ctl->empty_size;
> +		__entry->flags		= ffe_ctl->flags;
> +		__entry->loop		= ffe_ctl->loop;
> +		__entry->hinted		= ffe_ctl->hinted;
> +		__entry->size_class	= ffe_ctl->size_class;
> +		__entry->bg_start	= block_group ? block_group->start : 0;
> +		__entry->bg_length	= block_group ? block_group->length : 0;
> +		__entry->bg_flags	= block_group ? block_group->flags : 0;
> +		__entry->reason		= reason;
> +		__entry->error_or_value	= error_or_value;
> +	),
> +
> +	TP_printk_btrfs(
> +"root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s) loop=%d hinted=%d size_class=%d bg=[%llu+%llu] bg_flags=%llu(%s) reason=%s error_or_value=%lld",
> +		  show_root_type(__entry->root_objectid),
> +		  __entry->num_bytes, __entry->empty_size, __entry->flags,
> +		  __print_flags((unsigned long)__entry->flags, "|", BTRFS_GROUP_FLAGS),
> +		  __entry->loop, __entry->hinted, __entry->size_class,
> +		  __entry->bg_start, __entry->bg_length, __entry->bg_flags,
> +		  __print_flags((unsigned long)__entry->bg_flags, "|", BTRFS_GROUP_FLAGS),
> +		  __print_symbolic(__entry->reason, FIND_FREE_EXTENT_SKIP_REASONS),
> +		  __entry->error_or_value)
> +);
> +
>  TRACE_EVENT(btrfs_find_cluster,
>  
>  	TP_PROTO(const struct btrfs_block_group *block_group, u64 start,
> -- 
> 2.47.3
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-09-24 19:14 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-23  1:13 [PATCH 0/2] btrfs: find_free_extent cleanups Leo Martins
2025-09-23  1:13 ` [PATCH 1/2] btrfs: remove ffe RAID loop Leo Martins
2025-09-24 18:54   ` Boris Burkov
2025-09-23  1:13 ` [PATCH 2/2] btrfs: add tracing for find_free_extent skip conditions Leo Martins
2025-09-24 19:14   ` Boris Burkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).