* [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
@ 2025-08-13 4:04 wangzijie
2025-08-18 2:21 ` Zhiguo Niu
2025-08-25 2:51 ` Chao Yu
0 siblings, 2 replies; 6+ messages in thread
From: wangzijie @ 2025-08-13 4:04 UTC (permalink / raw)
To: jaegeuk, chao
Cc: linux-f2fs-devel, linux-kernel, bintian.wang, feng.han, wangzijie
Sometimes I suffered the nat_tree_lock contention between f2fs_write_checkpoint
and f2fs_get_node_info. Commit a9419b6("f2fs: do not bother checkpoint by
f2fs_get_node_info") also mentioned that situation.
My idea is, when flush nat entries, we can use some structures to record nat
pages we may read, and readahead them before hold nat_tree_lock. Before
impletement code, I did some survey and found a submittion in community.
Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
This patch aims to improve nat entry set sort by using bucket.
I steal that structure and readahead nat pages contain nat entry set which cannot be moved
to journal according to dirty nat entry set bucket.
By doing this, I think there are two benefits to reducing nat_tree_lock hold time when
when flush nat entries.
1. avoid nat set tree lookup and sort
2. readahead nat pages before holding nat_tree_lock
Signed-off-by: wangzijie <wangzijie1@honor.com>
---
fs/f2fs/f2fs.h | 1 +
fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
fs/f2fs/node.h | 2 +-
3 files changed, 35 insertions(+), 38 deletions(-)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 46be75605..b27cc059f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -975,6 +975,7 @@ struct f2fs_nm_info {
struct radix_tree_root nat_set_root;/* root of the nat set cache */
struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
struct list_head nat_entries; /* cached nat entry list (clean) */
+ struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1]; /* store dirty nat set */
spinlock_t nat_list_lock; /* protect clean nat entry list */
unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
unsigned int nat_blocks; /* # of nat blocks */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27743b93e..87c975ee8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
__free_nat_entry(e);
}
+static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
+ struct nat_entry_set *set)
+{
+ list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
+}
+
static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
struct nat_entry *ne)
{
@@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
head->set = set;
head->entry_cnt = 0;
f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
+ __relocate_nat_entry_set(nm_i, head);
}
return head;
}
@@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
* 2. update old block address to new one;
*/
if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
- !get_nat_flag(ne, IS_DIRTY)))
+ !get_nat_flag(ne, IS_DIRTY))) {
head->entry_cnt++;
+ __relocate_nat_entry_set(nm_i, head);
+ }
set_nat_flag(ne, IS_PREALLOC, new_ne);
@@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
set_nat_flag(ne, IS_DIRTY, false);
set->entry_cnt--;
+ __relocate_nat_entry_set(nm_i, set);
nm_i->nat_cnt[DIRTY_NAT]--;
nm_i->nat_cnt[RECLAIMABLE_NAT]++;
}
@@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
up_write(&curseg->journal_rwsem);
}
-static void __adjust_nat_entry_set(struct nat_entry_set *nes,
- struct list_head *head, int max)
-{
- struct nat_entry_set *cur;
-
- if (nes->entry_cnt >= max)
- goto add_out;
-
- list_for_each_entry(cur, head, set_list) {
- if (cur->entry_cnt >= nes->entry_cnt) {
- list_add(&nes->set_list, cur->set_list.prev);
- return;
- }
- }
-add_out:
- list_add_tail(&nes->set_list, head);
-}
-
static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
const struct f2fs_nat_block *nat_blk)
{
@@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
/* Allow dirty nats by node block allocation in write_begin */
if (!set->entry_cnt) {
+ list_del(&set->set_list);
radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
kmem_cache_free(nat_entry_set_slab, set);
}
@@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_journal *journal = curseg->journal;
- struct nat_entry_set *setvec[NAT_VEC_SIZE];
struct nat_entry_set *set, *tmp;
- unsigned int found;
- nid_t set_idx = 0;
- LIST_HEAD(sets);
+ int i;
int err = 0;
/*
@@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (!nm_i->nat_cnt[DIRTY_NAT])
return 0;
+ /* readahead sets which cannot be moved to journal */
+ if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
+ for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
+ list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
+ f2fs_ra_meta_pages(sbi, set->set, 1,
+ META_NAT, true);
+ }
+ }
+ }
+
f2fs_down_write(&nm_i->nat_tree_lock);
/*
@@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
remove_nats_in_journal(sbi);
- while ((found = __gang_lookup_nat_set(nm_i,
- set_idx, NAT_VEC_SIZE, setvec))) {
- unsigned idx;
-
- set_idx = setvec[found - 1]->set + 1;
- for (idx = 0; idx < found; idx++)
- __adjust_nat_entry_set(setvec[idx], &sets,
- MAX_NAT_JENTRIES(journal));
- }
-
/* flush dirty nats in nat entry set */
- list_for_each_entry_safe(set, tmp, &sets, set_list) {
- err = __flush_nat_entry_set(sbi, set, cpc);
- if (err)
- break;
+ for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
+ list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
+ err = __flush_nat_entry_set(sbi, set, cpc);
+ if (err)
+ break;
+ }
}
f2fs_up_write(&nm_i->nat_tree_lock);
@@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
unsigned char *version_bitmap;
unsigned int nat_segs;
+ int i;
int err;
nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
@@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&nm_i->nat_entries);
spin_lock_init(&nm_i->nat_list_lock);
+ for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
+ INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
+
mutex_init(&nm_i->build_lock);
spin_lock_init(&nm_i->nid_list_lock);
init_f2fs_rwsem(&nm_i->nat_tree_lock);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 030390543..d805d4ce7 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -158,7 +158,7 @@ enum mem_type {
};
struct nat_entry_set {
- struct list_head set_list; /* link with other nat sets */
+ struct list_head set_list; /* link with nat sets which have same entry_cnt */
struct list_head entry_list; /* link with dirty nat entries */
nid_t set; /* set number*/
unsigned int entry_cnt; /* the # of nat entries in set */
--
2.25.1
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
2025-08-13 4:04 [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries wangzijie
@ 2025-08-18 2:21 ` Zhiguo Niu
2025-08-19 2:36 ` wangzijie
2025-08-25 2:51 ` Chao Yu
1 sibling, 1 reply; 6+ messages in thread
From: Zhiguo Niu @ 2025-08-18 2:21 UTC (permalink / raw)
To: wangzijie; +Cc: jaegeuk, chao, linux-kernel, feng.han, linux-f2fs-devel
wangzijie <wangzijie1@honor.com> 于2025年8月13日周三 12:06写道:
>
> Sometimes I suffered the nat_tree_lock contention between f2fs_write_checkpoint
> and f2fs_get_node_info. Commit a9419b6("f2fs: do not bother checkpoint by
> f2fs_get_node_info") also mentioned that situation.
>
> My idea is, when flush nat entries, we can use some structures to record nat
> pages we may read, and readahead them before hold nat_tree_lock. Before
> impletement code, I did some survey and found a submittion in community.
>
> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
> This patch aims to improve nat entry set sort by using bucket.
> I steal that structure and readahead nat pages contain nat entry set which cannot be moved
> to journal according to dirty nat entry set bucket.
>
> By doing this, I think there are two benefits to reducing nat_tree_lock hold time when
> when flush nat entries.
> 1. avoid nat set tree lookup and sort
> 2. readahead nat pages before holding nat_tree_lock
>
> Signed-off-by: wangzijie <wangzijie1@honor.com>
> ---
> fs/f2fs/f2fs.h | 1 +
> fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
> fs/f2fs/node.h | 2 +-
> 3 files changed, 35 insertions(+), 38 deletions(-)
>
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 46be75605..b27cc059f 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
> struct radix_tree_root nat_set_root;/* root of the nat set cache */
> struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
> struct list_head nat_entries; /* cached nat entry list (clean) */
> + struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1]; /* store dirty nat set */
> spinlock_t nat_list_lock; /* protect clean nat entry list */
> unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
> unsigned int nat_blocks; /* # of nat blocks */
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 27743b93e..87c975ee8 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
> __free_nat_entry(e);
> }
>
> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
> + struct nat_entry_set *set)
> +{
> + list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
> +}
Hi zijie,
Is there any lock protecting this list related operations in the
current process?
> +
> static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> struct nat_entry *ne)
> {
> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> head->set = set;
> head->entry_cnt = 0;
> f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
> + __relocate_nat_entry_set(nm_i, head);
> }
> return head;
> }
> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> * 2. update old block address to new one;
> */
> if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
> - !get_nat_flag(ne, IS_DIRTY)))
> + !get_nat_flag(ne, IS_DIRTY))) {
> head->entry_cnt++;
> + __relocate_nat_entry_set(nm_i, head);
> + }
>
> set_nat_flag(ne, IS_PREALLOC, new_ne);
>
> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>
> set_nat_flag(ne, IS_DIRTY, false);
> set->entry_cnt--;
> + __relocate_nat_entry_set(nm_i, set);
> nm_i->nat_cnt[DIRTY_NAT]--;
> nm_i->nat_cnt[RECLAIMABLE_NAT]++;
> }
> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> up_write(&curseg->journal_rwsem);
> }
>
> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> - struct list_head *head, int max)
> -{
> - struct nat_entry_set *cur;
> -
> - if (nes->entry_cnt >= max)
> - goto add_out;
> -
> - list_for_each_entry(cur, head, set_list) {
> - if (cur->entry_cnt >= nes->entry_cnt) {
> - list_add(&nes->set_list, cur->set_list.prev);
> - return;
> - }
> - }
> -add_out:
> - list_add_tail(&nes->set_list, head);
> -}
> -
> static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
> const struct f2fs_nat_block *nat_blk)
> {
> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>
> /* Allow dirty nats by node block allocation in write_begin */
> if (!set->entry_cnt) {
> + list_del(&set->set_list);
> radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
> kmem_cache_free(nat_entry_set_slab, set);
> }
> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> struct f2fs_nm_info *nm_i = NM_I(sbi);
> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> struct f2fs_journal *journal = curseg->journal;
> - struct nat_entry_set *setvec[NAT_VEC_SIZE];
> struct nat_entry_set *set, *tmp;
> - unsigned int found;
> - nid_t set_idx = 0;
> - LIST_HEAD(sets);
> + int i;
> int err = 0;
>
> /*
> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> if (!nm_i->nat_cnt[DIRTY_NAT])
> return 0;
>
> + /* readahead sets which cannot be moved to journal */
> + if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
> + for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
i am a little confused, why is there "i <= NAT_ENTRY_PER_BLOCK;"?
do we need to calculate according to the current nat block/entry number?
thanks!
> + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> + f2fs_ra_meta_pages(sbi, set->set, 1,
> + META_NAT, true);
> + }
> + }
> + }
> +
> f2fs_down_write(&nm_i->nat_tree_lock);
>
> /*
> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
> remove_nats_in_journal(sbi);
>
> - while ((found = __gang_lookup_nat_set(nm_i,
> - set_idx, NAT_VEC_SIZE, setvec))) {
> - unsigned idx;
> -
> - set_idx = setvec[found - 1]->set + 1;
> - for (idx = 0; idx < found; idx++)
> - __adjust_nat_entry_set(setvec[idx], &sets,
> - MAX_NAT_JENTRIES(journal));
> - }
> -
> /* flush dirty nats in nat entry set */
> - list_for_each_entry_safe(set, tmp, &sets, set_list) {
> - err = __flush_nat_entry_set(sbi, set, cpc);
> - if (err)
> - break;
> + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
> + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> + err = __flush_nat_entry_set(sbi, set, cpc);
> + if (err)
> + break;
> + }
> }
>
> f2fs_up_write(&nm_i->nat_tree_lock);
> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> struct f2fs_nm_info *nm_i = NM_I(sbi);
> unsigned char *version_bitmap;
> unsigned int nat_segs;
> + int i;
> int err;
>
> nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> INIT_LIST_HEAD(&nm_i->nat_entries);
> spin_lock_init(&nm_i->nat_list_lock);
>
> + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
> + INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
> +
> mutex_init(&nm_i->build_lock);
> spin_lock_init(&nm_i->nid_list_lock);
> init_f2fs_rwsem(&nm_i->nat_tree_lock);
> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> index 030390543..d805d4ce7 100644
> --- a/fs/f2fs/node.h
> +++ b/fs/f2fs/node.h
> @@ -158,7 +158,7 @@ enum mem_type {
> };
>
> struct nat_entry_set {
> - struct list_head set_list; /* link with other nat sets */
> + struct list_head set_list; /* link with nat sets which have same entry_cnt */
> struct list_head entry_list; /* link with dirty nat entries */
> nid_t set; /* set number*/
> unsigned int entry_cnt; /* the # of nat entries in set */
> --
> 2.25.1
>
>
>
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
2025-08-18 2:21 ` Zhiguo Niu
@ 2025-08-19 2:36 ` wangzijie
0 siblings, 0 replies; 6+ messages in thread
From: wangzijie @ 2025-08-19 2:36 UTC (permalink / raw)
To: niuzhiguo84
Cc: chao, feng.han, jaegeuk, linux-f2fs-devel, linux-kernel,
wangzijie1
>>
>> Sometimes I suffered the nat_tree_lock contention between f2fs_write_checkpoint
>> and f2fs_get_node_info. Commit a9419b6("f2fs: do not bother checkpoint by
>> f2fs_get_node_info") also mentioned that situation.
>>
>> My idea is, when flush nat entries, we can use some structures to record nat
>> pages we may read, and readahead them before hold nat_tree_lock. Before
>> impletement code, I did some survey and found a submittion in community.
>>
>> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
>> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
>> This patch aims to improve nat entry set sort by using bucket.
>> I steal that structure and readahead nat pages contain nat entry set which cannot be moved
>> to journal according to dirty nat entry set bucket.
>>
>> By doing this, I think there are two benefits to reducing nat_tree_lock hold time when
>> when flush nat entries.
>> 1. avoid nat set tree lookup and sort
>> 2. readahead nat pages before holding nat_tree_lock
>>
>> Signed-off-by: wangzijie <wangzijie1@honor.com>
>> ---
>> fs/f2fs/f2fs.h | 1 +
>> fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
>> fs/f2fs/node.h | 2 +-
>> 3 files changed, 35 insertions(+), 38 deletions(-)
>>
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 46be75605..b27cc059f 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
>> struct radix_tree_root nat_set_root;/* root of the nat set cache */
>> struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
>> struct list_head nat_entries; /* cached nat entry list (clean) */
>> + struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1]; /* store dirty nat set */
>> spinlock_t nat_list_lock; /* protect clean nat entry list */
>> unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
>> unsigned int nat_blocks; /* # of nat blocks */
>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>> index 27743b93e..87c975ee8 100644
>> --- a/fs/f2fs/node.c
>> +++ b/fs/f2fs/node.c
>> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
>> __free_nat_entry(e);
>> }
>>
>> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
>> + struct nat_entry_set *set)
>> +{
>> + list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
>> +}
>Hi zijie,
>Is there any lock protecting this list related operations in the
>current process?
Hi, Zhiguo
I think the lock protection needed is as same as __set_nat_cache_dirty().
>> +
>> static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>> struct nat_entry *ne)
>> {
>> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>> head->set = set;
>> head->entry_cnt = 0;
>> f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
>> + __relocate_nat_entry_set(nm_i, head);
>> }
>> return head;
>> }
>> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>> * 2. update old block address to new one;
>> */
>> if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
>> - !get_nat_flag(ne, IS_DIRTY)))
>> + !get_nat_flag(ne, IS_DIRTY))) {
>> head->entry_cnt++;
>> + __relocate_nat_entry_set(nm_i, head);
>> + }
>>
>> set_nat_flag(ne, IS_PREALLOC, new_ne);
>>
>> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>
>> set_nat_flag(ne, IS_DIRTY, false);
>> set->entry_cnt--;
>> + __relocate_nat_entry_set(nm_i, set);
>> nm_i->nat_cnt[DIRTY_NAT]--;
>> nm_i->nat_cnt[RECLAIMABLE_NAT]++;
>> }
>> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>> up_write(&curseg->journal_rwsem);
>> }
>>
>> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
>> - struct list_head *head, int max)
>> -{
>> - struct nat_entry_set *cur;
>> -
>> - if (nes->entry_cnt >= max)
>> - goto add_out;
>> -
>> - list_for_each_entry(cur, head, set_list) {
>> - if (cur->entry_cnt >= nes->entry_cnt) {
>> - list_add(&nes->set_list, cur->set_list.prev);
>> - return;
>> - }
>> - }
>> -add_out:
>> - list_add_tail(&nes->set_list, head);
>> -}
>> -
>> static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
>> const struct f2fs_nat_block *nat_blk)
>> {
>> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>
>> /* Allow dirty nats by node block allocation in write_begin */
>> if (!set->entry_cnt) {
>> + list_del(&set->set_list);
>> radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
>> kmem_cache_free(nat_entry_set_slab, set);
>> }
>> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> struct f2fs_nm_info *nm_i = NM_I(sbi);
>> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>> struct f2fs_journal *journal = curseg->journal;
>> - struct nat_entry_set *setvec[NAT_VEC_SIZE];
>> struct nat_entry_set *set, *tmp;
>> - unsigned int found;
>> - nid_t set_idx = 0;
>> - LIST_HEAD(sets);
>> + int i;
>> int err = 0;
>>
>> /*
>> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> if (!nm_i->nat_cnt[DIRTY_NAT])
>> return 0;
>>
>> + /* readahead sets which cannot be moved to journal */
>> + if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
>> + for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
>i am a little confused, why is there "i <= NAT_ENTRY_PER_BLOCK;"?
>do we need to calculate according to the current nat block/entry number?
>thanks!
Yes, you are right, we can calculate according to DIRTY_NAT count to end this loop early.
>> + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>> + f2fs_ra_meta_pages(sbi, set->set, 1,
>> + META_NAT, true);
>> + }
>> + }
>> + }
>> +
>> f2fs_down_write(&nm_i->nat_tree_lock);
>>
>> /*
>> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>> nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
>> remove_nats_in_journal(sbi);
>>
>> - while ((found = __gang_lookup_nat_set(nm_i,
>> - set_idx, NAT_VEC_SIZE, setvec))) {
>> - unsigned idx;
>> -
>> - set_idx = setvec[found - 1]->set + 1;
>> - for (idx = 0; idx < found; idx++)
>> - __adjust_nat_entry_set(setvec[idx], &sets,
>> - MAX_NAT_JENTRIES(journal));
>> - }
>> -
>> /* flush dirty nats in nat entry set */
>> - list_for_each_entry_safe(set, tmp, &sets, set_list) {
>> - err = __flush_nat_entry_set(sbi, set, cpc);
>> - if (err)
>> - break;
>> + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
>> + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>> + err = __flush_nat_entry_set(sbi, set, cpc);
>> + if (err)
>> + break;
>> + }
>> }
>>
>> f2fs_up_write(&nm_i->nat_tree_lock);
>> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>> struct f2fs_nm_info *nm_i = NM_I(sbi);
>> unsigned char *version_bitmap;
>> unsigned int nat_segs;
>> + int i;
>> int err;
>>
>> nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
>> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>> INIT_LIST_HEAD(&nm_i->nat_entries);
>> spin_lock_init(&nm_i->nat_list_lock);
>>
>> + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
>> + INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
>> +
>> mutex_init(&nm_i->build_lock);
>> spin_lock_init(&nm_i->nid_list_lock);
>> init_f2fs_rwsem(&nm_i->nat_tree_lock);
>> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
>> index 030390543..d805d4ce7 100644
>> --- a/fs/f2fs/node.h
>> +++ b/fs/f2fs/node.h
>> @@ -158,7 +158,7 @@ enum mem_type {
>> };
>>
>> struct nat_entry_set {
>> - struct list_head set_list; /* link with other nat sets */
>> + struct list_head set_list; /* link with nat sets which have same entry_cnt */
>> struct list_head entry_list; /* link with dirty nat entries */
>> nid_t set; /* set number*/
>> unsigned int entry_cnt; /* the # of nat entries in set */
>> --
>> 2.25.1
By the way, Chao and Zhiguo
I noticed that in f2fs_flush_nat_entries(), the check for nat_cnt[DIRTY_NAT] is out of
nat_tree_lock, and in f2fs_write_checkpoint(), there has below check without nat_tree_lock:
if (cpc->reason & CP_DISCARD) {
if (!f2fs_exist_trim_candidates(sbi, cpc)) {
unblock_operations(sbi);
goto out;
}
if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
SIT_I(sbi)->dirty_sentries == 0 &&
prefree_segments(sbi) == 0) {
f2fs_flush_sit_entries(sbi, cpc);
f2fs_clear_prefree_segments(sbi, cpc);
unblock_operations(sbi);
goto out;
}
}
Is there a missing or we don't need nat_tree_lock in these situations?
And do you think the way in this patch is appropriate for reducing nat_tree_lock
hold time?
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
2025-08-13 4:04 [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries wangzijie
2025-08-18 2:21 ` Zhiguo Niu
@ 2025-08-25 2:51 ` Chao Yu
2025-08-26 1:44 ` wangzijie
1 sibling, 1 reply; 6+ messages in thread
From: Chao Yu @ 2025-08-25 2:51 UTC (permalink / raw)
To: wangzijie, jaegeuk
Cc: chao, linux-f2fs-devel, linux-kernel, bintian.wang, feng.han
On 8/13/25 12:04, wangzijie wrote:
> Sometimes I suffered the nat_tree_lock contention between f2fs_write_checkpoint
> and f2fs_get_node_info. Commit a9419b6("f2fs: do not bother checkpoint by
> f2fs_get_node_info") also mentioned that situation.
>
> My idea is, when flush nat entries, we can use some structures to record nat
> pages we may read, and readahead them before hold nat_tree_lock. Before
> impletement code, I did some survey and found a submittion in community.
>
> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
> This patch aims to improve nat entry set sort by using bucket.
> I steal that structure and readahead nat pages contain nat entry set which cannot be moved
> to journal according to dirty nat entry set bucket.
>
> By doing this, I think there are two benefits to reducing nat_tree_lock hold time when
> when flush nat entries.
Zijie,
Can you please figure out some numbers for this patch? something like
checkpoint latency or average or extreme time to grab nat_tree_lock...?
> 1. avoid nat set tree lookup and sort
> 2. readahead nat pages before holding nat_tree_lock
It may cause performance regression if it races w/ drop_caches?
Thanks,
>
> Signed-off-by: wangzijie <wangzijie1@honor.com>
> ---
> fs/f2fs/f2fs.h | 1 +
> fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
> fs/f2fs/node.h | 2 +-
> 3 files changed, 35 insertions(+), 38 deletions(-)
>
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 46be75605..b27cc059f 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
> struct radix_tree_root nat_set_root;/* root of the nat set cache */
> struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
> struct list_head nat_entries; /* cached nat entry list (clean) */
> + struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1]; /* store dirty nat set */
> spinlock_t nat_list_lock; /* protect clean nat entry list */
> unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
> unsigned int nat_blocks; /* # of nat blocks */
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 27743b93e..87c975ee8 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
> __free_nat_entry(e);
> }
>
> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
> + struct nat_entry_set *set)
> +{
> + list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
> +}
> +
> static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> struct nat_entry *ne)
> {
> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> head->set = set;
> head->entry_cnt = 0;
> f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
> + __relocate_nat_entry_set(nm_i, head);
> }
> return head;
> }
> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> * 2. update old block address to new one;
> */
> if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
> - !get_nat_flag(ne, IS_DIRTY)))
> + !get_nat_flag(ne, IS_DIRTY))) {
> head->entry_cnt++;
> + __relocate_nat_entry_set(nm_i, head);
> + }
>
> set_nat_flag(ne, IS_PREALLOC, new_ne);
>
> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>
> set_nat_flag(ne, IS_DIRTY, false);
> set->entry_cnt--;
> + __relocate_nat_entry_set(nm_i, set);
> nm_i->nat_cnt[DIRTY_NAT]--;
> nm_i->nat_cnt[RECLAIMABLE_NAT]++;
> }
> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> up_write(&curseg->journal_rwsem);
> }
>
> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> - struct list_head *head, int max)
> -{
> - struct nat_entry_set *cur;
> -
> - if (nes->entry_cnt >= max)
> - goto add_out;
> -
> - list_for_each_entry(cur, head, set_list) {
> - if (cur->entry_cnt >= nes->entry_cnt) {
> - list_add(&nes->set_list, cur->set_list.prev);
> - return;
> - }
> - }
> -add_out:
> - list_add_tail(&nes->set_list, head);
> -}
> -
> static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
> const struct f2fs_nat_block *nat_blk)
> {
> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>
> /* Allow dirty nats by node block allocation in write_begin */
> if (!set->entry_cnt) {
> + list_del(&set->set_list);
> radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
> kmem_cache_free(nat_entry_set_slab, set);
> }
> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> struct f2fs_nm_info *nm_i = NM_I(sbi);
> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> struct f2fs_journal *journal = curseg->journal;
> - struct nat_entry_set *setvec[NAT_VEC_SIZE];
> struct nat_entry_set *set, *tmp;
> - unsigned int found;
> - nid_t set_idx = 0;
> - LIST_HEAD(sets);
> + int i;
> int err = 0;
>
> /*
> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> if (!nm_i->nat_cnt[DIRTY_NAT])
> return 0;
>
> + /* readahead sets which cannot be moved to journal */
> + if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
> + for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
> + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> + f2fs_ra_meta_pages(sbi, set->set, 1,
> + META_NAT, true);
> + }
> + }
> + }
> +
> f2fs_down_write(&nm_i->nat_tree_lock);
>
> /*
> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
> remove_nats_in_journal(sbi);
>
> - while ((found = __gang_lookup_nat_set(nm_i,
> - set_idx, NAT_VEC_SIZE, setvec))) {
> - unsigned idx;
> -
> - set_idx = setvec[found - 1]->set + 1;
> - for (idx = 0; idx < found; idx++)
> - __adjust_nat_entry_set(setvec[idx], &sets,
> - MAX_NAT_JENTRIES(journal));
> - }
> -
> /* flush dirty nats in nat entry set */
> - list_for_each_entry_safe(set, tmp, &sets, set_list) {
> - err = __flush_nat_entry_set(sbi, set, cpc);
> - if (err)
> - break;
> + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
> + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> + err = __flush_nat_entry_set(sbi, set, cpc);
> + if (err)
> + break;
> + }
> }
>
> f2fs_up_write(&nm_i->nat_tree_lock);
> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> struct f2fs_nm_info *nm_i = NM_I(sbi);
> unsigned char *version_bitmap;
> unsigned int nat_segs;
> + int i;
> int err;
>
> nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> INIT_LIST_HEAD(&nm_i->nat_entries);
> spin_lock_init(&nm_i->nat_list_lock);
>
> + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
> + INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
> +
> mutex_init(&nm_i->build_lock);
> spin_lock_init(&nm_i->nid_list_lock);
> init_f2fs_rwsem(&nm_i->nat_tree_lock);
> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> index 030390543..d805d4ce7 100644
> --- a/fs/f2fs/node.h
> +++ b/fs/f2fs/node.h
> @@ -158,7 +158,7 @@ enum mem_type {
> };
>
> struct nat_entry_set {
> - struct list_head set_list; /* link with other nat sets */
> + struct list_head set_list; /* link with nat sets which have same entry_cnt */
> struct list_head entry_list; /* link with dirty nat entries */
> nid_t set; /* set number*/
> unsigned int entry_cnt; /* the # of nat entries in set */
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
2025-08-25 2:51 ` Chao Yu
@ 2025-08-26 1:44 ` wangzijie
2025-08-28 3:20 ` Chao Yu
0 siblings, 1 reply; 6+ messages in thread
From: wangzijie @ 2025-08-26 1:44 UTC (permalink / raw)
To: linux-f2fs-devel; +Cc: chao, feng.han, jaegeuk, linux-kernel, wangzijie1
> On 8/13/25 12:04, wangzijie wrote:
> > Sometimes I suffered the nat_tree_lock contention between f2fs_write_checkpoint
> > and f2fs_get_node_info. Commit a9419b6("f2fs: do not bother checkpoint by
> > f2fs_get_node_info") also mentioned that situation.
> >
> > My idea is, when flush nat entries, we can use some structures to record nat
> > pages we may read, and readahead them before hold nat_tree_lock. Before
> > impletement code, I did some survey and found a submittion in community.
> >
> > Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
> > Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
> > This patch aims to improve nat entry set sort by using bucket.
> > I steal that structure and readahead nat pages contain nat entry set which cannot be moved
> > to journal according to dirty nat entry set bucket.
> >
> > By doing this, I think there are two benefits to reducing nat_tree_lock hold time when
> > when flush nat entries.
>
> Zijie,
>
> Can you please figure out some numbers for this patch? something like
> checkpoint latency or average or extreme time to grab nat_tree_lock...?
>
> > 1. avoid nat set tree lookup and sort
> > 2. readahead nat pages before holding nat_tree_lock
>
> It may cause performance regression if it races w/ drop_caches?
>
> Thanks,
Hi, Chao
Do you mean that it will race with f2fs_try_to_free_nats()?
In this patch, nat_tree_lock is not held when readahead nat pages, but
in fact we need it. Am I right?
> >
> > Signed-off-by: wangzijie <wangzijie1@honor.com>
> > ---
> > fs/f2fs/f2fs.h | 1 +
> > fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
> > fs/f2fs/node.h | 2 +-
> > 3 files changed, 35 insertions(+), 38 deletions(-)
> >
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 46be75605..b27cc059f 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -975,6 +975,7 @@ struct f2fs_nm_info {
> > struct radix_tree_root nat_set_root;/* root of the nat set cache */
> > struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
> > struct list_head nat_entries; /* cached nat entry list (clean) */
> > + struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1]; /* store dirty nat set */
> > spinlock_t nat_list_lock; /* protect clean nat entry list */
> > unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
> > unsigned int nat_blocks; /* # of nat blocks */
> > diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> > index 27743b93e..87c975ee8 100644
> > --- a/fs/f2fs/node.c
> > +++ b/fs/f2fs/node.c
> > @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
> > __free_nat_entry(e);
> > }
> >
> > +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
> > + struct nat_entry_set *set)
> > +{
> > + list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
> > +}
> > +
> > static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> > struct nat_entry *ne)
> > {
> > @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> > head->set = set;
> > head->entry_cnt = 0;
> > f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
> > + __relocate_nat_entry_set(nm_i, head);
> > }
> > return head;
> > }
> > @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> > * 2. update old block address to new one;
> > */
> > if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
> > - !get_nat_flag(ne, IS_DIRTY)))
> > + !get_nat_flag(ne, IS_DIRTY))) {
> > head->entry_cnt++;
> > + __relocate_nat_entry_set(nm_i, head);
> > + }
> >
> > set_nat_flag(ne, IS_PREALLOC, new_ne);
> >
> > @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> >
> > set_nat_flag(ne, IS_DIRTY, false);
> > set->entry_cnt--;
> > + __relocate_nat_entry_set(nm_i, set);
> > nm_i->nat_cnt[DIRTY_NAT]--;
> > nm_i->nat_cnt[RECLAIMABLE_NAT]++;
> > }
> > @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> > up_write(&curseg->journal_rwsem);
> > }
> >
> > -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> > - struct list_head *head, int max)
> > -{
> > - struct nat_entry_set *cur;
> > -
> > - if (nes->entry_cnt >= max)
> > - goto add_out;
> > -
> > - list_for_each_entry(cur, head, set_list) {
> > - if (cur->entry_cnt >= nes->entry_cnt) {
> > - list_add(&nes->set_list, cur->set_list.prev);
> > - return;
> > - }
> > - }
> > -add_out:
> > - list_add_tail(&nes->set_list, head);
> > -}
> > -
> > static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
> > const struct f2fs_nat_block *nat_blk)
> > {
> > @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> >
> > /* Allow dirty nats by node block allocation in write_begin */
> > if (!set->entry_cnt) {
> > + list_del(&set->set_list);
> > radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
> > kmem_cache_free(nat_entry_set_slab, set);
> > }
> > @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> > struct f2fs_nm_info *nm_i = NM_I(sbi);
> > struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> > struct f2fs_journal *journal = curseg->journal;
> > - struct nat_entry_set *setvec[NAT_VEC_SIZE];
> > struct nat_entry_set *set, *tmp;
> > - unsigned int found;
> > - nid_t set_idx = 0;
> > - LIST_HEAD(sets);
> > + int i;
> > int err = 0;
> >
> > /*
> > @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> > if (!nm_i->nat_cnt[DIRTY_NAT])
> > return 0;
> >
> > + /* readahead sets which cannot be moved to journal */
> > + if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
> > + for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
> > + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> > + f2fs_ra_meta_pages(sbi, set->set, 1,
> > + META_NAT, true);
> > + }
> > + }
> > + }
> > +
> > f2fs_down_write(&nm_i->nat_tree_lock);
> >
> > /*
> > @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> > nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
> > remove_nats_in_journal(sbi);
> >
> > - while ((found = __gang_lookup_nat_set(nm_i,
> > - set_idx, NAT_VEC_SIZE, setvec))) {
> > - unsigned idx;
> > -
> > - set_idx = setvec[found - 1]->set + 1;
> > - for (idx = 0; idx < found; idx++)
> > - __adjust_nat_entry_set(setvec[idx], &sets,
> > - MAX_NAT_JENTRIES(journal));
> > - }
> > -
> > /* flush dirty nats in nat entry set */
> > - list_for_each_entry_safe(set, tmp, &sets, set_list) {
> > - err = __flush_nat_entry_set(sbi, set, cpc);
> > - if (err)
> > - break;
> > + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
> > + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> > + err = __flush_nat_entry_set(sbi, set, cpc);
> > + if (err)
> > + break;
> > + }
> > }
> >
> > f2fs_up_write(&nm_i->nat_tree_lock);
> > @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> > struct f2fs_nm_info *nm_i = NM_I(sbi);
> > unsigned char *version_bitmap;
> > unsigned int nat_segs;
> > + int i;
> > int err;
> >
> > nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
> > @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> > INIT_LIST_HEAD(&nm_i->nat_entries);
> > spin_lock_init(&nm_i->nat_list_lock);
> >
> > + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
> > + INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
> > +
> > mutex_init(&nm_i->build_lock);
> > spin_lock_init(&nm_i->nid_list_lock);
> > init_f2fs_rwsem(&nm_i->nat_tree_lock);
> > diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> > index 030390543..d805d4ce7 100644
> > --- a/fs/f2fs/node.h
> > +++ b/fs/f2fs/node.h
> > @@ -158,7 +158,7 @@ enum mem_type {
> > };
> >
> > struct nat_entry_set {
> > - struct list_head set_list; /* link with other nat sets */
> > + struct list_head set_list; /* link with nat sets which have same entry_cnt */
> > struct list_head entry_list; /* link with dirty nat entries */
> > nid_t set; /* set number*/
> > unsigned int entry_cnt; /* the # of nat entries in set */
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
2025-08-26 1:44 ` wangzijie
@ 2025-08-28 3:20 ` Chao Yu
0 siblings, 0 replies; 6+ messages in thread
From: Chao Yu @ 2025-08-28 3:20 UTC (permalink / raw)
To: wangzijie, linux-f2fs-devel; +Cc: chao, feng.han, jaegeuk, linux-kernel
On 8/26/25 09:44, wangzijie wrote:
>> On 8/13/25 12:04, wangzijie wrote:
>>> Sometimes I suffered the nat_tree_lock contention between f2fs_write_checkpoint
>>> and f2fs_get_node_info. Commit a9419b6("f2fs: do not bother checkpoint by
>>> f2fs_get_node_info") also mentioned that situation.
>>>
>>> My idea is, when flush nat entries, we can use some structures to record nat
>>> pages we may read, and readahead them before hold nat_tree_lock. Before
>>> impletement code, I did some survey and found a submittion in community.
>>>
>>> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
>>> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
>>> This patch aims to improve nat entry set sort by using bucket.
>>> I steal that structure and readahead nat pages contain nat entry set which cannot be moved
>>> to journal according to dirty nat entry set bucket.
>>>
>>> By doing this, I think there are two benefits to reducing nat_tree_lock hold time when
>>> when flush nat entries.
>>
>> Zijie,
>>
>> Can you please figure out some numbers for this patch? something like
>> checkpoint latency or average or extreme time to grab nat_tree_lock...?
>>
>>> 1. avoid nat set tree lookup and sort
>>> 2. readahead nat pages before holding nat_tree_lock
>>
>> It may cause performance regression if it races w/ drop_caches?
>>
>> Thanks,
>
> Hi, Chao
> Do you mean that it will race with f2fs_try_to_free_nats()?
No, I meant get_current_nat_folio() readahead may race with memory reclaim? e.g.
all uptodate nat pages are reclaimed before late updates? Seems a corner case.
Thanks,
> In this patch, nat_tree_lock is not held when readahead nat pages, but
> in fact we need it. Am I right?
>
>>>
>>> Signed-off-by: wangzijie <wangzijie1@honor.com>
>>> ---
>>> fs/f2fs/f2fs.h | 1 +
>>> fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
>>> fs/f2fs/node.h | 2 +-
>>> 3 files changed, 35 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>> index 46be75605..b27cc059f 100644
>>> --- a/fs/f2fs/f2fs.h
>>> +++ b/fs/f2fs/f2fs.h
>>> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
>>> struct radix_tree_root nat_set_root;/* root of the nat set cache */
>>> struct f2fs_rwsem nat_tree_lock; /* protect nat entry tree */
>>> struct list_head nat_entries; /* cached nat entry list (clean) */
>>> + struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1]; /* store dirty nat set */
>>> spinlock_t nat_list_lock; /* protect clean nat entry list */
>>> unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
>>> unsigned int nat_blocks; /* # of nat blocks */
>>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>>> index 27743b93e..87c975ee8 100644
>>> --- a/fs/f2fs/node.c
>>> +++ b/fs/f2fs/node.c
>>> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
>>> __free_nat_entry(e);
>>> }
>>>
>>> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
>>> + struct nat_entry_set *set)
>>> +{
>>> + list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
>>> +}
>>> +
>>> static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>>> struct nat_entry *ne)
>>> {
>>> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>>> head->set = set;
>>> head->entry_cnt = 0;
>>> f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
>>> + __relocate_nat_entry_set(nm_i, head);
>>> }
>>> return head;
>>> }
>>> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>> * 2. update old block address to new one;
>>> */
>>> if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
>>> - !get_nat_flag(ne, IS_DIRTY)))
>>> + !get_nat_flag(ne, IS_DIRTY))) {
>>> head->entry_cnt++;
>>> + __relocate_nat_entry_set(nm_i, head);
>>> + }
>>>
>>> set_nat_flag(ne, IS_PREALLOC, new_ne);
>>>
>>> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>>
>>> set_nat_flag(ne, IS_DIRTY, false);
>>> set->entry_cnt--;
>>> + __relocate_nat_entry_set(nm_i, set);
>>> nm_i->nat_cnt[DIRTY_NAT]--;
>>> nm_i->nat_cnt[RECLAIMABLE_NAT]++;
>>> }
>>> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>>> up_write(&curseg->journal_rwsem);
>>> }
>>>
>>> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
>>> - struct list_head *head, int max)
>>> -{
>>> - struct nat_entry_set *cur;
>>> -
>>> - if (nes->entry_cnt >= max)
>>> - goto add_out;
>>> -
>>> - list_for_each_entry(cur, head, set_list) {
>>> - if (cur->entry_cnt >= nes->entry_cnt) {
>>> - list_add(&nes->set_list, cur->set_list.prev);
>>> - return;
>>> - }
>>> - }
>>> -add_out:
>>> - list_add_tail(&nes->set_list, head);
>>> -}
>>> -
>>> static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
>>> const struct f2fs_nat_block *nat_blk)
>>> {
>>> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>>
>>> /* Allow dirty nats by node block allocation in write_begin */
>>> if (!set->entry_cnt) {
>>> + list_del(&set->set_list);
>>> radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
>>> kmem_cache_free(nat_entry_set_slab, set);
>>> }
>>> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>> struct f2fs_nm_info *nm_i = NM_I(sbi);
>>> struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>>> struct f2fs_journal *journal = curseg->journal;
>>> - struct nat_entry_set *setvec[NAT_VEC_SIZE];
>>> struct nat_entry_set *set, *tmp;
>>> - unsigned int found;
>>> - nid_t set_idx = 0;
>>> - LIST_HEAD(sets);
>>> + int i;
>>> int err = 0;
>>>
>>> /*
>>> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>> if (!nm_i->nat_cnt[DIRTY_NAT])
>>> return 0;
>>>
>>> + /* readahead sets which cannot be moved to journal */
>>> + if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
>>> + for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
>>> + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>>> + f2fs_ra_meta_pages(sbi, set->set, 1,
>>> + META_NAT, true);
>>> + }
>>> + }
>>> + }
>>> +
>>> f2fs_down_write(&nm_i->nat_tree_lock);
>>>
>>> /*
>>> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>> nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
>>> remove_nats_in_journal(sbi);
>>>
>>> - while ((found = __gang_lookup_nat_set(nm_i,
>>> - set_idx, NAT_VEC_SIZE, setvec))) {
>>> - unsigned idx;
>>> -
>>> - set_idx = setvec[found - 1]->set + 1;
>>> - for (idx = 0; idx < found; idx++)
>>> - __adjust_nat_entry_set(setvec[idx], &sets,
>>> - MAX_NAT_JENTRIES(journal));
>>> - }
>>> -
>>> /* flush dirty nats in nat entry set */
>>> - list_for_each_entry_safe(set, tmp, &sets, set_list) {
>>> - err = __flush_nat_entry_set(sbi, set, cpc);
>>> - if (err)
>>> - break;
>>> + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
>>> + list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>>> + err = __flush_nat_entry_set(sbi, set, cpc);
>>> + if (err)
>>> + break;
>>> + }
>>> }
>>>
>>> f2fs_up_write(&nm_i->nat_tree_lock);
>>> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>> struct f2fs_nm_info *nm_i = NM_I(sbi);
>>> unsigned char *version_bitmap;
>>> unsigned int nat_segs;
>>> + int i;
>>> int err;
>>>
>>> nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
>>> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>> INIT_LIST_HEAD(&nm_i->nat_entries);
>>> spin_lock_init(&nm_i->nat_list_lock);
>>>
>>> + for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
>>> + INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
>>> +
>>> mutex_init(&nm_i->build_lock);
>>> spin_lock_init(&nm_i->nid_list_lock);
>>> init_f2fs_rwsem(&nm_i->nat_tree_lock);
>>> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
>>> index 030390543..d805d4ce7 100644
>>> --- a/fs/f2fs/node.h
>>> +++ b/fs/f2fs/node.h
>>> @@ -158,7 +158,7 @@ enum mem_type {
>>> };
>>>
>>> struct nat_entry_set {
>>> - struct list_head set_list; /* link with other nat sets */
>>> + struct list_head set_list; /* link with nat sets which have same entry_cnt */
>>> struct list_head entry_list; /* link with dirty nat entries */
>>> nid_t set; /* set number*/
>>> unsigned int entry_cnt; /* the # of nat entries in set */
>
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-08-28 3:20 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-13 4:04 [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries wangzijie
2025-08-18 2:21 ` Zhiguo Niu
2025-08-19 2:36 ` wangzijie
2025-08-25 2:51 ` Chao Yu
2025-08-26 1:44 ` wangzijie
2025-08-28 3:20 ` Chao Yu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).