* [PATCH 1/3] btrfs: move reclaiming of a single block group into its own function
2026-03-02 14:39 [PATCH 0/3] btrfs: zoned: fix hang with generic/551 Johannes Thumshirn
@ 2026-03-02 14:39 ` Johannes Thumshirn
2026-03-03 7:34 ` Damien Le Moal
2026-03-02 14:39 ` [PATCH 2/3] btrfs: create btrfs_reclaim_block_groups() Johannes Thumshirn
2026-03-02 14:39 ` [PATCH 3/3] btrfs: zoned: limit number of zones reclaimed in flush_space Johannes Thumshirn
2 siblings, 1 reply; 8+ messages in thread
From: Johannes Thumshirn @ 2026-03-02 14:39 UTC (permalink / raw)
To: linux-btrfs; +Cc: Naohiro Aota, Damien Le Moal, Johannes Thumshirn
The main work of reclaiming a single block-group in
btrfs_reclaim_bgs_work() is done inside the loop iterating over all the
block_groups in the fs_info->reclaim_bgs list.
Factor out reclaim of a single block group from the loop to improve
readability.
No functional change intended.
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/block-group.c | 256 +++++++++++++++++++++--------------------
1 file changed, 133 insertions(+), 123 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 896737cc432e..e95f68d246c6 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1909,6 +1909,137 @@ static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 b
return true;
}
+static int btrfs_reclaim_block_group(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+ struct btrfs_space_info *space_info = bg->space_info;
+ u64 used;
+ u64 reserved;
+ u64 old_total;
+ int ret = 0;
+
+ /* Don't race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+
+ spin_lock(&space_info->lock);
+ spin_lock(&bg->lock);
+ if (bg->reserved || bg->pinned || bg->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ return 0;
+ }
+
+ if (bg->used == 0) {
+ /*
+ * It is possible that we trigger relocation on a block
+ * group as its extents are deleted and it first goes
+ * below the threshold, then shortly after goes empty.
+ *
+ * In this case, relocating it does delete it, but has
+ * some overhead in relocation specific metadata, looking
+ * for the non-existent extents and running some extra
+ * transactions, which we can avoid by using one of the
+ * other mechanisms for dealing with empty block groups.
+ */
+ if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
+ btrfs_mark_bg_unused(bg);
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ return 0;
+ }
+
+ /*
+ * The block group might no longer meet the reclaim condition by
+ * the time we get around to reclaiming it, so to avoid
+ * reclaiming overly full block_groups, skip reclaiming them.
+ *
+ * Since the decision making process also depends on the amount
+ * being freed, pass in a fake giant value to skip that extra
+ * check, which is more meaningful when adding to the list in
+ * the first place.
+ */
+ if (!should_reclaim_block_group(bg, bg->length)) {
+ spin_unlock(&bg->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ return 0;
+ }
+
+ spin_unlock(&bg->lock);
+ old_total = space_info->total_bytes;
+ spin_unlock(&space_info->lock);
+
+ /*
+ * Get out fast, in case we're read-only or unmounting the
+ * filesystem. It is OK to drop block groups from the list even
+ * for the read-only case. As we did take the super write lock,
+ * "mount -o remount,ro" won't happen and read-only filesystem
+ * means it is forced read-only due to a fatal error. So, it
+ * never gets back to read-write to let us reclaim again.
+ */
+ if (btrfs_need_cleaner_sleep(fs_info)) {
+ up_write(&space_info->groups_sem);
+ return 0;
+ }
+
+ ret = inc_block_group_ro(bg, false);
+ up_write(&space_info->groups_sem);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * The amount of bytes reclaimed corresponds to the sum of the
+ * "used" and "reserved" counters. We have set the block group
+ * to RO above, which prevents reservations from happening but
+ * we may have existing reservations for which allocation has
+ * not yet been done - btrfs_update_block_group() was not yet
+ * called, which is where we will transfer a reserved extent's
+ * size from the "reserved" counter to the "used" counter - this
+ * happens when running delayed references. When we relocate the
+ * chunk below, relocation first flushes delalloc, waits for
+ * ordered extent completion (which is where we create delayed
+ * references for data extents) and commits the current
+ * transaction (which runs delayed references), and only after
+ * it does the actual work to move extents out of the block
+ * group. So the reported amount of reclaimed bytes is
+ * effectively the sum of the 'used' and 'reserved' counters.
+ */
+ spin_lock(&bg->lock);
+ used = bg->used;
+ reserved = bg->reserved;
+ spin_unlock(&bg->lock);
+
+ trace_btrfs_reclaim_block_group(bg);
+ ret = btrfs_relocate_chunk(fs_info, bg->start, false);
+ if (ret) {
+ btrfs_dec_block_group_ro(bg);
+ btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+ used = 0;
+ reserved = 0;
+ spin_lock(&space_info->lock);
+ space_info->reclaim_errors++;
+ spin_unlock(&space_info->lock);
+ }
+ spin_lock(&space_info->lock);
+ space_info->reclaim_count++;
+ space_info->reclaim_bytes += used;
+ space_info->reclaim_bytes += reserved;
+ if (space_info->total_bytes < old_total)
+ btrfs_set_periodic_reclaim_ready(space_info, true);
+ spin_unlock(&space_info->lock);
+
+ return ret;
+}
+
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@@ -1942,10 +2073,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
*/
list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
while (!list_empty(&fs_info->reclaim_bgs)) {
- u64 used;
- u64 reserved;
- u64 old_total;
- int ret = 0;
+ int ret;
bg = list_first_entry(&fs_info->reclaim_bgs,
struct btrfs_block_group,
@@ -1954,126 +2082,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
space_info = bg->space_info;
spin_unlock(&fs_info->unused_bgs_lock);
+ ret = btrfs_reclaim_block_group(bg);
- /* Don't race with allocators so take the groups_sem */
- down_write(&space_info->groups_sem);
-
- spin_lock(&space_info->lock);
- spin_lock(&bg->lock);
- if (bg->reserved || bg->pinned || bg->ro) {
- /*
- * We want to bail if we made new allocations or have
- * outstanding allocations in this block group. We do
- * the ro check in case balance is currently acting on
- * this block group.
- */
- spin_unlock(&bg->lock);
- spin_unlock(&space_info->lock);
- up_write(&space_info->groups_sem);
- goto next;
- }
- if (bg->used == 0) {
- /*
- * It is possible that we trigger relocation on a block
- * group as its extents are deleted and it first goes
- * below the threshold, then shortly after goes empty.
- *
- * In this case, relocating it does delete it, but has
- * some overhead in relocation specific metadata, looking
- * for the non-existent extents and running some extra
- * transactions, which we can avoid by using one of the
- * other mechanisms for dealing with empty block groups.
- */
- if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
- btrfs_mark_bg_unused(bg);
- spin_unlock(&bg->lock);
- spin_unlock(&space_info->lock);
- up_write(&space_info->groups_sem);
- goto next;
-
- }
- /*
- * The block group might no longer meet the reclaim condition by
- * the time we get around to reclaiming it, so to avoid
- * reclaiming overly full block_groups, skip reclaiming them.
- *
- * Since the decision making process also depends on the amount
- * being freed, pass in a fake giant value to skip that extra
- * check, which is more meaningful when adding to the list in
- * the first place.
- */
- if (!should_reclaim_block_group(bg, bg->length)) {
- spin_unlock(&bg->lock);
- spin_unlock(&space_info->lock);
- up_write(&space_info->groups_sem);
- goto next;
- }
-
- spin_unlock(&bg->lock);
- old_total = space_info->total_bytes;
- spin_unlock(&space_info->lock);
-
- /*
- * Get out fast, in case we're read-only or unmounting the
- * filesystem. It is OK to drop block groups from the list even
- * for the read-only case. As we did take the super write lock,
- * "mount -o remount,ro" won't happen and read-only filesystem
- * means it is forced read-only due to a fatal error. So, it
- * never gets back to read-write to let us reclaim again.
- */
- if (btrfs_need_cleaner_sleep(fs_info)) {
- up_write(&space_info->groups_sem);
- goto next;
- }
-
- ret = inc_block_group_ro(bg, false);
- up_write(&space_info->groups_sem);
- if (ret < 0)
- goto next;
-
- /*
- * The amount of bytes reclaimed corresponds to the sum of the
- * "used" and "reserved" counters. We have set the block group
- * to RO above, which prevents reservations from happening but
- * we may have existing reservations for which allocation has
- * not yet been done - btrfs_update_block_group() was not yet
- * called, which is where we will transfer a reserved extent's
- * size from the "reserved" counter to the "used" counter - this
- * happens when running delayed references. When we relocate the
- * chunk below, relocation first flushes delalloc, waits for
- * ordered extent completion (which is where we create delayed
- * references for data extents) and commits the current
- * transaction (which runs delayed references), and only after
- * it does the actual work to move extents out of the block
- * group. So the reported amount of reclaimed bytes is
- * effectively the sum of the 'used' and 'reserved' counters.
- */
- spin_lock(&bg->lock);
- used = bg->used;
- reserved = bg->reserved;
- spin_unlock(&bg->lock);
-
- trace_btrfs_reclaim_block_group(bg);
- ret = btrfs_relocate_chunk(fs_info, bg->start, false);
- if (ret) {
- btrfs_dec_block_group_ro(bg);
- btrfs_err(fs_info, "error relocating chunk %llu",
- bg->start);
- used = 0;
- reserved = 0;
- spin_lock(&space_info->lock);
- space_info->reclaim_errors++;
- spin_unlock(&space_info->lock);
- }
- spin_lock(&space_info->lock);
- space_info->reclaim_count++;
- space_info->reclaim_bytes += used;
- space_info->reclaim_bytes += reserved;
- if (space_info->total_bytes < old_total)
- btrfs_set_periodic_reclaim_ready(space_info, true);
- spin_unlock(&space_info->lock);
-
-next:
if (ret && !READ_ONCE(space_info->periodic_reclaim))
btrfs_link_bg_list(bg, &retry_list);
btrfs_put_block_group(bg);
--
2.53.0
^ permalink raw reply related [flat|nested] 8+ messages in thread* [PATCH 3/3] btrfs: zoned: limit number of zones reclaimed in flush_space
2026-03-02 14:39 [PATCH 0/3] btrfs: zoned: fix hang with generic/551 Johannes Thumshirn
2026-03-02 14:39 ` [PATCH 1/3] btrfs: move reclaiming of a single block group into its own function Johannes Thumshirn
2026-03-02 14:39 ` [PATCH 2/3] btrfs: create btrfs_reclaim_block_groups() Johannes Thumshirn
@ 2026-03-02 14:39 ` Johannes Thumshirn
2026-03-03 7:41 ` Damien Le Moal
2 siblings, 1 reply; 8+ messages in thread
From: Johannes Thumshirn @ 2026-03-02 14:39 UTC (permalink / raw)
To: linux-btrfs; +Cc: Naohiro Aota, Damien Le Moal, Johannes Thumshirn
Limit the number of zones reclaimed in flush_space()'s RECLAIM_ZONES
state.
This prevents possibly long-running reclaim sweeps from blocking other tasks
in the system while the system is under pressure anyway, causing those
tasks to hang.
An example of this can be seen here, triggered by fstests generic/551:
generic/551 [ 27.042349] run fstests generic/551 at 2026-02-27 11:05:30
BTRFS: device fsid 78c16e29-20d9-4c8e-bc04-7ba431be38ff devid 1 transid 8 /dev/vdb (254:16) scanned by mount (806)
BTRFS info (device vdb): first mount of filesystem 78c16e29-20d9-4c8e-bc04-7ba431be38ff
BTRFS info (device vdb): using crc32c checksum algorithm
BTRFS info (device vdb): host-managed zoned block device /dev/vdb, 64 zones of 268435456 bytes
BTRFS info (device vdb): zoned mode enabled with zone size 268435456
BTRFS info (device vdb): checking UUID tree
BTRFS info (device vdb): enabling free space tree
INFO: task kworker/u38:1:90 blocked for more than 120 seconds.
Not tainted 7.0.0-rc1+ #345
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:kworker/u38:1 state:D stack:0 pid:90 tgid:90 ppid:2 task_flags:0x4208060 flags:0x00080000
Workqueue: events_unbound btrfs_async_reclaim_data_space
Call Trace:
<TASK>
__schedule+0x34f/0xe70
schedule+0x41/0x140
schedule_timeout+0xa3/0x110
? mark_held_locks+0x40/0x70
? lockdep_hardirqs_on_prepare+0xd8/0x1c0
? trace_hardirqs_on+0x18/0x100
? lockdep_hardirqs_on+0x84/0x130
? _raw_spin_unlock_irq+0x33/0x50
wait_for_completion+0xa4/0x150
? __flush_work+0x24c/0x550
__flush_work+0x339/0x550
? __pfx_wq_barrier_func+0x10/0x10
? wait_for_completion+0x39/0x150
flush_space+0x243/0x660
? find_held_lock+0x2b/0x80
? kvm_sched_clock_read+0x11/0x20
? local_clock_noinstr+0x17/0x110
? local_clock+0x15/0x30
? lock_release+0x1b7/0x4b0
do_async_reclaim_data_space+0xe8/0x160
btrfs_async_reclaim_data_space+0x19/0x30
process_one_work+0x20a/0x5f0
? lock_is_held_type+0xcd/0x130
worker_thread+0x1e2/0x3c0
? __pfx_worker_thread+0x10/0x10
kthread+0x103/0x150
? __pfx_kthread+0x10/0x10
ret_from_fork+0x20d/0x320
? __pfx_kthread+0x10/0x10
ret_from_fork_asm+0x1a/0x30
</TASK>
Showing all locks held in the system:
1 lock held by khungtaskd/67:
#0: ffffffff824d58e0 (rcu_read_lock){....}-{1:3}, at: debug_show_all_locks+0x3d/0x194
2 locks held by kworker/u38:1/90:
#0: ffff8881000aa158 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x3c4/0x5f0
#1: ffffc90000c17e58 ((work_completion)(&fs_info->async_data_reclaim_work)){+.+.}-{0:0}, at: process_one_work+0x1c0/0x5f0
5 locks held by kworker/u39:1/191:
#0: ffff8881000aa158 ((wq_completion)events_unbound){+.+.}-{0:0}, at: process_one_work+0x3c4/0x5f0
#1: ffffc90000dfbe58 ((work_completion)(&fs_info->reclaim_bgs_work)){+.+.}-{0:0}, at: process_one_work+0x1c0/0x5f0
#2: ffff888101da0420 (sb_writers#9){.+.+}-{0:0}, at: process_one_work+0x20a/0x5f0
#3: ffff88811040a648 (&fs_info->reclaim_bgs_lock){+.+.}-{4:4}, at: btrfs_reclaim_bgs_work+0x1de/0x770
#4: ffff888110408a18 (&fs_info->cleaner_mutex){+.+.}-{4:4}, at: btrfs_relocate_block_group+0x95a/0x20f0
1 lock held by aio-dio-write-v/980:
#0: ffff888110093008 (&sb->s_type->i_mutex_key#15){++++}-{4:4}, at: btrfs_inode_lock+0x51/0xb0
=============================================
To prevent these long running reclaims from blocking the system, only
reclaim 5 block_groups in the RECLAIM_ZONES state of flush_space(). Also
as these reclaims are now constrained, it opens up the use for a
synchronous call to btrfs_reclaim_block_groups(), eliminating the need
to place the reclaim task on a workqueue and then flushing the workqueue
again.
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/block-group.c | 12 ++++++++----
fs/btrfs/block-group.h | 1 +
fs/btrfs/space-info.c | 3 +--
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index f0e4a1dd812f..169496d2b3c5 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1909,7 +1909,7 @@ static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 b
return true;
}
-static int btrfs_reclaim_block_group(struct btrfs_block_group *bg)
+static int btrfs_reclaim_block_group(struct btrfs_block_group *bg, int *reclaimed)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
struct btrfs_space_info *space_info = bg->space_info;
@@ -2036,15 +2036,17 @@ static int btrfs_reclaim_block_group(struct btrfs_block_group *bg)
if (space_info->total_bytes < old_total)
btrfs_set_periodic_reclaim_ready(space_info, true);
spin_unlock(&space_info->lock);
+ (*reclaimed)++;
return ret;
}
-static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
+void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit)
{
struct btrfs_block_group *bg;
struct btrfs_space_info *space_info;
LIST_HEAD(retry_list);
+ int reclaimed = 0;
if (!btrfs_should_reclaim(fs_info))
return;
@@ -2080,7 +2082,7 @@ static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
space_info = bg->space_info;
spin_unlock(&fs_info->unused_bgs_lock);
- ret = btrfs_reclaim_block_group(bg);
+ ret = btrfs_reclaim_block_group(bg, &reclaimed);
if (ret && !READ_ONCE(space_info->periodic_reclaim))
btrfs_link_bg_list(bg, &retry_list);
@@ -2099,6 +2101,8 @@ static void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info)
if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
goto end;
spin_lock(&fs_info->unused_bgs_lock);
+ if (reclaimed >= limit)
+ break;
}
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
@@ -2114,7 +2118,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
struct btrfs_fs_info *fs_info =
container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
- btrfs_reclaim_block_groups(fs_info);
+ btrfs_reclaim_block_groups(fs_info, -1);
}
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index c03e04292900..0504cb357992 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -350,6 +350,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit);
void btrfs_reclaim_bgs_work(struct work_struct *work);
void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 0e5274c3b988..57b74d0608ae 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -918,8 +918,7 @@ static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes,
if (btrfs_is_zoned(fs_info)) {
btrfs_reclaim_sweep(fs_info);
btrfs_delete_unused_bgs(fs_info);
- btrfs_reclaim_bgs(fs_info);
- flush_work(&fs_info->reclaim_bgs_work);
+ btrfs_reclaim_block_groups(fs_info, 5);
ASSERT(current->journal_info == NULL);
ret = btrfs_commit_current_transaction(root);
} else {
--
2.53.0
^ permalink raw reply related [flat|nested] 8+ messages in thread