* [PATCH 0/3] btrfs: zoned: reduce lock contention in zoned extent allocator
@ 2025-06-23 16:56 Johannes Thumshirn
2025-06-23 16:56 ` [PATCH 1/3] btrfs: zoned: get rid of relocation_bg_lock Johannes Thumshirn
` (2 more replies)
0 siblings, 3 replies; 6+ messages in thread
From: Johannes Thumshirn @ 2025-06-23 16:56 UTC (permalink / raw)
To: linux-btrfs; +Cc: Damien Le Moal, Naohiro Aota, Johannes Thumshirn
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
While investigating performance issues on zoned btrfs, I analyzed
lockstat output from fio workloads.
Three locks consistently show up at the top: the space_info's ->lock, as
well as fs_info's ->treelog_bg_lock and ->relocation_bg_lock.
The first one is not as trivial to get rid of and is most likely
legitimate in most cases (see patch #3). The other two only protect a
single value in fs_info and can be replaced by READ_ONCE() and
WRITE_ONCE().
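The change boils down to dropping spinlocks that only guarded loads and
stores of a single naturally aligned u64. A minimal sketch of the pattern
(illustrative userspace stand-ins only, not the kernel code; the macro
definitions approximate the kernel's <linux/compiler.h> versions and the
struct/function names are made up):

#include <stdint.h>

#define READ_ONCE(x)      (*(const volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, v)  (*(volatile typeof(x) *)&(x) = (v))

struct fs_info_sketch {
        uint64_t data_reloc_bg; /* was: protected by relocation_bg_lock */
};

static void clear_data_reloc_bg(struct fs_info_sketch *fsi, uint64_t bg_start)
{
        /* One torn-free read, then act on the local snapshot. */
        uint64_t cur = READ_ONCE(fsi->data_reloc_bg);

        if (cur == bg_start)
                WRITE_ONCE(fsi->data_reloc_bg, 0);
}

On 64-bit architectures a naturally aligned u64 load or store is
single-copy atomic, so READ_ONCE()/WRITE_ONCE() are enough to avoid torn
accesses; what they do not provide is the multi-field atomicity a spinlock
would, which is why the patches operate on a local snapshot of the value.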
The series has been run through fstests on tcmu-runner emulated SMR
drives, as well as Damien's fio benchmarks on real ZNS hardware; no
regressions have been observed.
Johannes Thumshirn (3):
btrfs: zoned: get rid of relocation_bg_lock
btrfs: zoned: get rid of treelog_bg_lock
btrfs: zoned: don't hold space_info lock on zoned allocation
fs/btrfs/disk-io.c | 2 --
fs/btrfs/extent-tree.c | 76 ++++++++++++------------------------------
fs/btrfs/fs.h | 7 +---
fs/btrfs/zoned.c | 13 ++++----
fs/btrfs/zoned.h | 7 ++--
5 files changed, 32 insertions(+), 73 deletions(-)
--
2.49.0
* [PATCH 1/3] btrfs: zoned: get rid of relocation_bg_lock
2025-06-23 16:56 [PATCH 0/3] btrfs: zoned: reduce lock contention in zoned extent allocator Johannes Thumshirn
@ 2025-06-23 16:56 ` Johannes Thumshirn
2025-06-24 1:29 ` Damien Le Moal
2025-06-23 16:56 ` [PATCH 2/3] btrfs: zoned: get rid of treelog_bg_lock Johannes Thumshirn
2025-06-23 16:56 ` [PATCH 3/3] btrfs: zoned: don't hold space_info lock on zoned allocation Johannes Thumshirn
2 siblings, 1 reply; 6+ messages in thread
From: Johannes Thumshirn @ 2025-06-23 16:56 UTC (permalink / raw)
To: linux-btrfs; +Cc: Damien Le Moal, Naohiro Aota, Johannes Thumshirn
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Lockstat analysis of benchmark workloads shows a very high contention of
the relocation_bg_lock. But the relocation_bg_lock only protects a single
field in 'struct btrfs_fs_info', namely 'u64 data_reloc_bg'.
Use READ_ONCE()/WRITE_ONCE() to access 'btrfs_fs_info::data_reloc_bg'.
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/disk-io.c | 1 -
fs/btrfs/extent-tree.c | 28 +++++++++++-----------------
fs/btrfs/fs.h | 6 +-----
fs/btrfs/zoned.c | 11 +++++------
4 files changed, 17 insertions(+), 29 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ee3cdd7035cc..24896322376d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2791,7 +2791,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->unused_bgs_lock);
spin_lock_init(&fs_info->treelog_bg_lock);
spin_lock_init(&fs_info->zone_active_bgs_lock);
- spin_lock_init(&fs_info->relocation_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
rwlock_init(&fs_info->global_root_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 10f50c725313..a9bda68a1883 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3865,14 +3865,10 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
* Do not allow non-relocation blocks in the dedicated relocation block
* group, and vice versa.
*/
- spin_lock(&fs_info->relocation_bg_lock);
- data_reloc_bytenr = fs_info->data_reloc_bg;
+ data_reloc_bytenr = READ_ONCE(fs_info->data_reloc_bg);
if (data_reloc_bytenr &&
((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
(!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
- skip = true;
- spin_unlock(&fs_info->relocation_bg_lock);
- if (skip)
return 1;
/* Check RO and no space case before trying to activate it */
@@ -3899,7 +3895,6 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
spin_lock(&fs_info->treelog_bg_lock);
- spin_lock(&fs_info->relocation_bg_lock);
if (ret)
goto out;
@@ -3908,8 +3903,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->treelog_bg ||
fs_info->treelog_bg == 0);
ASSERT(!ffe_ctl->for_data_reloc ||
- block_group->start == fs_info->data_reloc_bg ||
- fs_info->data_reloc_bg == 0);
+ block_group->start == data_reloc_bytenr ||
+ data_reloc_bytenr == 0);
if (block_group->ro ||
(!ffe_ctl->for_data_reloc &&
@@ -3932,7 +3927,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
* Do not allow currently used block group to be the data relocation
* dedicated block group.
*/
- if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
+ if (ffe_ctl->for_data_reloc && data_reloc_bytenr == 0 &&
(block_group->used || block_group->reserved)) {
ret = 1;
goto out;
@@ -3957,8 +3952,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
fs_info->treelog_bg = block_group->start;
if (ffe_ctl->for_data_reloc) {
- if (!fs_info->data_reloc_bg)
- fs_info->data_reloc_bg = block_group->start;
+ if (READ_ONCE(fs_info->data_reloc_bg) == 0)
+ WRITE_ONCE(fs_info->data_reloc_bg, block_group->start);
/*
* Do not allow allocations from this block group, unless it is
* for data relocation. Compared to increasing the ->ro, setting
@@ -3994,8 +3989,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
if (ret && ffe_ctl->for_treelog)
fs_info->treelog_bg = 0;
if (ret && ffe_ctl->for_data_reloc)
- fs_info->data_reloc_bg = 0;
- spin_unlock(&fs_info->relocation_bg_lock);
+ WRITE_ONCE(fs_info->data_reloc_bg, 0);
spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
@@ -4304,10 +4298,10 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
ffe_ctl->hint_byte = fs_info->treelog_bg;
spin_unlock(&fs_info->treelog_bg_lock);
} else if (ffe_ctl->for_data_reloc) {
- spin_lock(&fs_info->relocation_bg_lock);
- if (fs_info->data_reloc_bg)
- ffe_ctl->hint_byte = fs_info->data_reloc_bg;
- spin_unlock(&fs_info->relocation_bg_lock);
+ u64 data_reloc_bg = READ_ONCE(fs_info->data_reloc_bg);
+
+ if (data_reloc_bg)
+ ffe_ctl->hint_byte = data_reloc_bg;
} else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
struct btrfs_block_group *block_group;
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index b239e4b8421c..570f4b85096c 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -849,11 +849,7 @@ struct btrfs_fs_info {
spinlock_t treelog_bg_lock;
u64 treelog_bg;
- /*
- * Start of the dedicated data relocation block group, protected by
- * relocation_bg_lock.
- */
- spinlock_t relocation_bg_lock;
+ /* Start of the dedicated data relocation block group */
u64 data_reloc_bg;
struct mutex zoned_data_reloc_io_lock;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index a4a309050b67..f58bd85ddf5a 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -2495,11 +2495,10 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 data_reloc_bg = READ_ONCE(fs_info->data_reloc_bg);
- spin_lock(&fs_info->relocation_bg_lock);
- if (fs_info->data_reloc_bg == bg->start)
- fs_info->data_reloc_bg = 0;
- spin_unlock(&fs_info->relocation_bg_lock);
+ if (data_reloc_bg == bg->start)
+ WRITE_ONCE(fs_info->data_reloc_bg, 0);
}
void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
@@ -2518,7 +2517,7 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
if (!btrfs_is_zoned(fs_info))
return;
- if (fs_info->data_reloc_bg)
+ if (READ_ONCE(fs_info->data_reloc_bg))
return;
if (sb_rdonly(fs_info->sb))
@@ -2539,7 +2538,7 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
continue;
}
- fs_info->data_reloc_bg = bg->start;
+ WRITE_ONCE(fs_info->data_reloc_bg, bg->start);
set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
btrfs_zone_activate(bg);
--
2.49.0
* [PATCH 2/3] btrfs: zoned: get rid of treelog_bg_lock
2025-06-23 16:56 [PATCH 0/3] btrfs: zoned: reduce lock contention in zoned extent allocator Johannes Thumshirn
2025-06-23 16:56 ` [PATCH 1/3] btrfs: zoned: get rid of relocation_bg_lock Johannes Thumshirn
@ 2025-06-23 16:56 ` Johannes Thumshirn
2025-06-24 1:33 ` Damien Le Moal
2025-06-23 16:56 ` [PATCH 3/3] btrfs: zoned: don't hold space_info lock on zoned allocation Johannes Thumshirn
2 siblings, 1 reply; 6+ messages in thread
From: Johannes Thumshirn @ 2025-06-23 16:56 UTC (permalink / raw)
To: linux-btrfs; +Cc: Damien Le Moal, Naohiro Aota, Johannes Thumshirn
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Lockstat analysis of benchmark workloads shows a very high contention of
the treelog_bg_lock. But the treelog_bg_lock only protects a single
field in 'struct btrfs_fs_info', namely 'u64 treelog_bg'.
Use READ_ONCE()/WRITE_ONCE() to access 'btrfs_fs_info::treelog_bg'.
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/disk-io.c | 1 -
fs/btrfs/extent-tree.c | 45 +++++++++++-------------------------------
fs/btrfs/fs.h | 1 -
fs/btrfs/zoned.c | 2 +-
fs/btrfs/zoned.h | 7 +++----
5 files changed, 15 insertions(+), 41 deletions(-)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 24896322376d..a54218717cb4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2789,7 +2789,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
- spin_lock_init(&fs_info->treelog_bg_lock);
spin_lock_init(&fs_info->zone_active_bgs_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
rwlock_init(&fs_info->global_root_lock);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a9bda68a1883..46358a555f78 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3809,22 +3809,6 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
return find_free_extent_unclustered(block_group, ffe_ctl);
}
-/*
- * Tree-log block group locking
- * ============================
- *
- * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
- * indicates the starting address of a block group, which is reserved only
- * for tree-log metadata.
- *
- * Lock nesting
- * ============
- *
- * space_info::lock
- * block_group::lock
- * fs_info::treelog_bg_lock
- */
-
/*
* Simple allocator for sequential-only block group. It only allows sequential
* allocation. No need to play with trees. This function also reserves the
@@ -3844,7 +3828,6 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
u64 log_bytenr;
u64 data_reloc_bytenr;
int ret = 0;
- bool skip = false;
ASSERT(btrfs_is_zoned(block_group->fs_info));
@@ -3852,13 +3835,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
* Do not allow non-tree-log blocks in the dedicated tree-log block
* group, and vice versa.
*/
- spin_lock(&fs_info->treelog_bg_lock);
- log_bytenr = fs_info->treelog_bg;
+ log_bytenr = READ_ONCE(fs_info->treelog_bg);
if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
(!ffe_ctl->for_treelog && bytenr == log_bytenr)))
- skip = true;
- spin_unlock(&fs_info->treelog_bg_lock);
- if (skip)
return 1;
/*
@@ -3894,14 +3873,13 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- spin_lock(&fs_info->treelog_bg_lock);
if (ret)
goto out;
ASSERT(!ffe_ctl->for_treelog ||
- block_group->start == fs_info->treelog_bg ||
- fs_info->treelog_bg == 0);
+ block_group->start == log_bytenr ||
+ log_bytenr == 0);
ASSERT(!ffe_ctl->for_data_reloc ||
block_group->start == data_reloc_bytenr ||
data_reloc_bytenr == 0);
@@ -3917,7 +3895,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
* Do not allow currently using block group to be tree-log dedicated
* block group.
*/
- if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
+ if (ffe_ctl->for_treelog && log_bytenr == 0 &&
(block_group->used || block_group->reserved)) {
ret = 1;
goto out;
@@ -3948,8 +3926,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
goto out;
}
- if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
- fs_info->treelog_bg = block_group->start;
+ if (ffe_ctl->for_treelog && READ_ONCE(fs_info->treelog_bg) == 0)
+ WRITE_ONCE(fs_info->treelog_bg, block_group->start);
if (ffe_ctl->for_data_reloc) {
if (READ_ONCE(fs_info->data_reloc_bg) == 0)
@@ -3987,10 +3965,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
out:
if (ret && ffe_ctl->for_treelog)
- fs_info->treelog_bg = 0;
+ WRITE_ONCE(fs_info->treelog_bg, 0);
if (ret && ffe_ctl->for_data_reloc)
WRITE_ONCE(fs_info->data_reloc_bg, 0);
- spin_unlock(&fs_info->treelog_bg_lock);
spin_unlock(&block_group->lock);
spin_unlock(&space_info->lock);
return ret;
@@ -4293,10 +4270,10 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl)
{
if (ffe_ctl->for_treelog) {
- spin_lock(&fs_info->treelog_bg_lock);
- if (fs_info->treelog_bg)
- ffe_ctl->hint_byte = fs_info->treelog_bg;
- spin_unlock(&fs_info->treelog_bg_lock);
+ u64 treelog_bg = READ_ONCE(fs_info->treelog_bg);
+
+ if (treelog_bg)
+ ffe_ctl->hint_byte = treelog_bg;
} else if (ffe_ctl->for_data_reloc) {
u64 data_reloc_bg = READ_ONCE(fs_info->data_reloc_bg);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 570f4b85096c..a388af40a251 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -846,7 +846,6 @@ struct btrfs_fs_info {
u64 max_zone_append_size;
struct mutex zoned_meta_io_lock;
- spinlock_t treelog_bg_lock;
u64 treelog_bg;
/* Start of the dedicated data relocation block group */
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index f58bd85ddf5a..d58bffb84b05 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1948,7 +1948,7 @@ static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
return true;
- if (fs_info->treelog_bg == block_group->start) {
+ if (READ_ONCE(fs_info->treelog_bg) == block_group->start) {
if (!btrfs_zone_activate(block_group)) {
int ret_fin = btrfs_zone_finish_one_bg(fs_info);
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 6e11533b8e14..c1b3a5c3a799 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -383,14 +383,13 @@ static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info)
static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
+ u64 treelog_bg = READ_ONCE(fs_info->treelog_bg);
if (!btrfs_is_zoned(fs_info))
return;
- spin_lock(&fs_info->treelog_bg_lock);
- if (fs_info->treelog_bg == bg->start)
- fs_info->treelog_bg = 0;
- spin_unlock(&fs_info->treelog_bg_lock);
+ if (treelog_bg == bg->start)
+ WRITE_ONCE(fs_info->treelog_bg, 0);
}
static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
--
2.49.0
* [PATCH 3/3] btrfs: zoned: don't hold space_info lock on zoned allocation
2025-06-23 16:56 [PATCH 0/3] btrfs: zoned: reduce lock contention in zoned extent allocator Johannes Thumshirn
2025-06-23 16:56 ` [PATCH 1/3] btrfs: zoned: get rid of relocation_bg_lock Johannes Thumshirn
2025-06-23 16:56 ` [PATCH 2/3] btrfs: zoned: get rid of treelog_bg_lock Johannes Thumshirn
@ 2025-06-23 16:56 ` Johannes Thumshirn
2 siblings, 0 replies; 6+ messages in thread
From: Johannes Thumshirn @ 2025-06-23 16:56 UTC (permalink / raw)
To: linux-btrfs; +Cc: Damien Le Moal, Naohiro Aota, Johannes Thumshirn
From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
The zoned extent allocator holds 'struct btrfs_space_info::lock' over
nearly the entirety of the allocation process, but nothing in
do_allocation_zoned() actually accesses fields of 'struct
btrfs_space_info'.
Furthermore, lock_stat snapshots taken during performance testing
consistently show space_info::lock as the most contended lock in the
entire system.
Stop taking the space_info lock in do_allocation_zoned() to reduce lock
contention.
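To picture the effect (a generic, hypothetical userspace sketch; the
struct and function names are made up and only illustrate the lock-scope
narrowing, they are not btrfs internals):

#include <pthread.h>
#include <stdint.h>

struct space_info_sketch { pthread_mutex_t lock; };
struct block_group_sketch { pthread_mutex_t lock; uint64_t reserved; };

/* Before: space_info->lock is held although only block-group fields
 * are touched, serializing every allocator behind one hot lock. */
static void reserve_coarse(struct space_info_sketch *si,
                           struct block_group_sketch *bg, uint64_t n)
{
        pthread_mutex_lock(&si->lock);
        pthread_mutex_lock(&bg->lock);
        bg->reserved += n;
        pthread_mutex_unlock(&bg->lock);
        pthread_mutex_unlock(&si->lock);
}

/* After: take only the lock whose fields are actually accessed. */
static void reserve_narrow(struct block_group_sketch *bg, uint64_t n)
{
        pthread_mutex_lock(&bg->lock);
        bg->reserved += n;
        pthread_mutex_unlock(&bg->lock);
}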
Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
---
fs/btrfs/extent-tree.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 46358a555f78..da731f6d4dad 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3819,7 +3819,6 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
struct btrfs_block_group **bg_ret)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct btrfs_space_info *space_info = block_group->space_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 start = block_group->start;
u64 num_bytes = ffe_ctl->num_bytes;
@@ -3871,7 +3870,6 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
*/
}
- spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
if (ret)
@@ -3969,7 +3967,6 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
if (ret && ffe_ctl->for_data_reloc)
WRITE_ONCE(fs_info->data_reloc_bg, 0);
spin_unlock(&block_group->lock);
- spin_unlock(&space_info->lock);
return ret;
}
--
2.49.0
* Re: [PATCH 1/3] btrfs: zoned: get rid of relocation_bg_lock
2025-06-23 16:56 ` [PATCH 1/3] btrfs: zoned: get rid of relocation_bg_lock Johannes Thumshirn
@ 2025-06-24 1:29 ` Damien Le Moal
0 siblings, 0 replies; 6+ messages in thread
From: Damien Le Moal @ 2025-06-24 1:29 UTC (permalink / raw)
To: Johannes Thumshirn, linux-btrfs; +Cc: Naohiro Aota, Johannes Thumshirn
On 6/24/25 1:56 AM, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> Lockstat analysis of benchmark workloads shows a very high contention of
> the relocation_bg_lock. But the relocation_bg_lock only protects a single
s/of the/on
s/But the/But or s/But the/But the lock
> field in 'struct btrfs_fs_info', namely 'u64 data_reloc_bg'.
>
> Use READ_ONCE()/WRITE_ONCE() to access 'btrfs_fs_info::data_reloc_bg'.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
[...]
> @@ -3908,8 +3903,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
> block_group->start == fs_info->treelog_bg ||
> fs_info->treelog_bg == 0);
> ASSERT(!ffe_ctl->for_data_reloc ||
> - block_group->start == fs_info->data_reloc_bg ||
> - fs_info->data_reloc_bg == 0);
> + block_group->start == data_reloc_bytenr ||
> + data_reloc_bytenr == 0);
Shouldn't all these uses of data_reloc_bytenr be replaced with
READ_ONCE(fs_info->data_reloc_bg)? That is, if data_reloc_bg can change
while this function is running.
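To make the trade-off concrete (an illustrative sketch only, reusing the
userspace READ_ONCE()/WRITE_ONCE() stand-ins and 'struct fs_info_sketch'
from the cover letter; check_dedicated() is a made-up name):

static int check_dedicated(struct fs_info_sketch *fsi, uint64_t bytenr,
                           int for_data_reloc)
{
        /* One snapshot: all comparisons in this invocation agree with
         * each other even if data_reloc_bg changes concurrently ... */
        uint64_t snap = READ_ONCE(fsi->data_reloc_bg);

        if (snap && ((for_data_reloc && bytenr != snap) ||
                     (!for_data_reloc && bytenr == snap)))
                return 1;       /* skip this block group */

        /* ... whereas a second READ_ONCE() here could observe a value
         * that contradicts the check above. */
        return 0;
}

Both options are torn-free; the question is whether the later checks
should stay self-consistent with the first read or see the freshest value.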
> @@ -3957,8 +3952,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
> fs_info->treelog_bg = block_group->start;
>
> if (ffe_ctl->for_data_reloc) {
> - if (!fs_info->data_reloc_bg)
> - fs_info->data_reloc_bg = block_group->start;
> + if (READ_ONCE(fs_info->data_reloc_bg) == 0)
> + WRITE_ONCE(fs_info->data_reloc_bg, block_group->start);
See ! You did it here :)
> @@ -4304,10 +4298,10 @@ static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
> ffe_ctl->hint_byte = fs_info->treelog_bg;
> spin_unlock(&fs_info->treelog_bg_lock);
> } else if (ffe_ctl->for_data_reloc) {
> - spin_lock(&fs_info->relocation_bg_lock);
> - if (fs_info->data_reloc_bg)
> - ffe_ctl->hint_byte = fs_info->data_reloc_bg;
> - spin_unlock(&fs_info->relocation_bg_lock);
> + u64 data_reloc_bg = READ_ONCE(fs_info->data_reloc_bg);
> +
> + if (data_reloc_bg)
> + ffe_ctl->hint_byte = data_reloc_bg;
Why not drop the data_reloc_bg variable and use
READ_ONCE(fs_info->data_reloc_bg) directly?
> diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
> index a4a309050b67..f58bd85ddf5a 100644
> --- a/fs/btrfs/zoned.c
> +++ b/fs/btrfs/zoned.c
> @@ -2495,11 +2495,10 @@ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
> void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
> {
> struct btrfs_fs_info *fs_info = bg->fs_info;
> + u64 data_reloc_bg = READ_ONCE(fs_info->data_reloc_bg);
>
> - spin_lock(&fs_info->relocation_bg_lock);
> - if (fs_info->data_reloc_bg == bg->start)
> - fs_info->data_reloc_bg = 0;
> - spin_unlock(&fs_info->relocation_bg_lock);
> + if (data_reloc_bg == bg->start)
> + WRITE_ONCE(fs_info->data_reloc_bg, 0);
data_reloc_bg variable is not needed.
--
Damien Le Moal
Western Digital Research
* Re: [PATCH 2/3] btrfs: zoned: get rid of treelog_bg_lock
2025-06-23 16:56 ` [PATCH 2/3] btrfs: zoned: get rid of treelog_bg_lock Johannes Thumshirn
@ 2025-06-24 1:33 ` Damien Le Moal
0 siblings, 0 replies; 6+ messages in thread
From: Damien Le Moal @ 2025-06-24 1:33 UTC (permalink / raw)
To: Johannes Thumshirn, linux-btrfs; +Cc: Naohiro Aota, Johannes Thumshirn
On 6/24/25 1:56 AM, Johannes Thumshirn wrote:
> From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
>
> Lockstat analysis of benchmark workloads shows a very high contention of
> the treelog_bg_lock. But the treelog_bg_lock only protects a single
> field in 'struct btrfs_fs_info', namely 'u64 treelog_bg'.
>
> Use READ_ONCE()/WRITE_ONCE() to access 'btrfs_fs_info::treelog_bg'.
>
> Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
> ---
> fs/btrfs/disk-io.c | 1 -
> fs/btrfs/extent-tree.c | 45 +++++++++++-------------------------------
> fs/btrfs/fs.h | 1 -
> fs/btrfs/zoned.c | 2 +-
> fs/btrfs/zoned.h | 7 +++----
> 5 files changed, 15 insertions(+), 41 deletions(-)
>
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 24896322376d..a54218717cb4 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2789,7 +2789,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
> spin_lock_init(&fs_info->defrag_inodes_lock);
> spin_lock_init(&fs_info->super_lock);
> spin_lock_init(&fs_info->unused_bgs_lock);
> - spin_lock_init(&fs_info->treelog_bg_lock);
> spin_lock_init(&fs_info->zone_active_bgs_lock);
> rwlock_init(&fs_info->tree_mod_log_lock);
> rwlock_init(&fs_info->global_root_lock);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index a9bda68a1883..46358a555f78 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -3809,22 +3809,6 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group,
> return find_free_extent_unclustered(block_group, ffe_ctl);
> }
>
> -/*
> - * Tree-log block group locking
> - * ============================
> - *
> - * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
> - * indicates the starting address of a block group, which is reserved only
> - * for tree-log metadata.
> - *
> - * Lock nesting
> - * ============
> - *
> - * space_info::lock
> - * block_group::lock
> - * fs_info::treelog_bg_lock
> - */
> -
> /*
> * Simple allocator for sequential-only block group. It only allows sequential
> * allocation. No need to play with trees. This function also reserves the
> @@ -3844,7 +3828,6 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
> u64 log_bytenr;
> u64 data_reloc_bytenr;
> int ret = 0;
> - bool skip = false;
>
> ASSERT(btrfs_is_zoned(block_group->fs_info));
>
> @@ -3852,13 +3835,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
> * Do not allow non-tree-log blocks in the dedicated tree-log block
> * group, and vice versa.
> */
> - spin_lock(&fs_info->treelog_bg_lock);
> - log_bytenr = fs_info->treelog_bg;
> + log_bytenr = READ_ONCE(fs_info->treelog_bg);
> if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
> (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
> - skip = true;
> - spin_unlock(&fs_info->treelog_bg_lock);
> - if (skip)
> return 1;
>
> /*
> @@ -3894,14 +3873,13 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
>
> spin_lock(&space_info->lock);
> spin_lock(&block_group->lock);
> - spin_lock(&fs_info->treelog_bg_lock);
>
> if (ret)
> goto out;
>
> ASSERT(!ffe_ctl->for_treelog ||
> - block_group->start == fs_info->treelog_bg ||
> - fs_info->treelog_bg == 0);
> + block_group->start == log_bytenr ||
> + log_bytenr == 0);
> ASSERT(!ffe_ctl->for_data_reloc ||
> block_group->start == data_reloc_bytenr ||
> data_reloc_bytenr == 0);
> @@ -3917,7 +3895,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
> * Do not allow currently using block group to be tree-log dedicated
> * block group.
> */
> - if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
> + if (ffe_ctl->for_treelog && log_bytenr == 0 &&
Similar comment as for the previous patch. It feels like all the
log_bytenr uses should really be replaced with
READ_ONCE(fs_info->treelog_bg). But I am not sure about the concurrency
with that field changing...
> (block_group->used || block_group->reserved)) {
> ret = 1;
> goto out;
> @@ -3948,8 +3926,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
> goto out;
> }
>
> - if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
> - fs_info->treelog_bg = block_group->start;
> + if (ffe_ctl->for_treelog && READ_ONCE(fs_info->treelog_bg) == 0)
> + WRITE_ONCE(fs_info->treelog_bg, block_group->start);
>
> if (ffe_ctl->for_data_reloc) {
> if (READ_ONCE(fs_info->data_reloc_bg) == 0)
[...]
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index 6e11533b8e14..c1b3a5c3a799 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -383,14 +383,13 @@ static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info)
> static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
> {
> struct btrfs_fs_info *fs_info = bg->fs_info;
> + u64 treelog_bg = READ_ONCE(fs_info->treelog_bg);
>
> if (!btrfs_is_zoned(fs_info))
> return;
>
> - spin_lock(&fs_info->treelog_bg_lock);
> - if (fs_info->treelog_bg == bg->start)
> - fs_info->treelog_bg = 0;
> - spin_unlock(&fs_info->treelog_bg_lock);
> + if (treelog_bg == bg->start)
Variable treelog_bg is not needed.
> + WRITE_ONCE(fs_info->treelog_bg, 0);
> }
>
> static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
--
Damien Le Moal
Western Digital Research