* [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
@ 2025-08-13  4:04 wangzijie
  2025-08-18  2:21 ` Zhiguo Niu
  2025-08-25  2:51 ` Chao Yu
  0 siblings, 2 replies; 6+ messages in thread
From: wangzijie @ 2025-08-13  4:04 UTC
  To: jaegeuk, chao
  Cc: linux-f2fs-devel, linux-kernel, bintian.wang, feng.han, wangzijie

Sometimes I have suffered from nat_tree_lock contention between f2fs_write_checkpoint
and f2fs_get_node_info. Commit a9419b6 ("f2fs: do not bother checkpoint by
f2fs_get_node_info") also mentioned that situation.

My idea is, when flushing nat entries, we can use some structure to record the nat
pages we may read, and readahead them before holding nat_tree_lock. Before
implementing the code, I did some survey and found a prior submission in the community.

Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
That patch aims to improve the nat entry set sort by using buckets.
I borrowed that structure, and readahead the nat pages containing nat entry sets which
cannot be moved to the journal, according to the dirty nat entry set buckets.

By doing this, I think there are two benefits that reduce nat_tree_lock hold time
when flushing nat entries.
1. avoid nat set tree lookup and sort
2. readahead nat pages before holding nat_tree_lock

Signed-off-by: wangzijie <wangzijie1@honor.com>
---
 fs/f2fs/f2fs.h |  1 +
 fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
 fs/f2fs/node.h |  2 +-
 3 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 46be75605..b27cc059f 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -975,6 +975,7 @@ struct f2fs_nm_info {
 	struct radix_tree_root nat_set_root;/* root of the nat set cache */
 	struct f2fs_rwsem nat_tree_lock;	/* protect nat entry tree */
 	struct list_head nat_entries;	/* cached nat entry list (clean) */
+	struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1];	/* store dirty nat set */
 	spinlock_t nat_list_lock;	/* protect clean nat entry list */
 	unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
 	unsigned int nat_blocks;	/* # of nat blocks */
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27743b93e..87c975ee8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
 	__free_nat_entry(e);
 }
 
+static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
+							struct nat_entry_set *set)
+{
+	list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
+}
+
 static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
 							struct nat_entry *ne)
 {
@@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
 		head->set = set;
 		head->entry_cnt = 0;
 		f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
+		__relocate_nat_entry_set(nm_i, head);
 	}
 	return head;
 }
@@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
 	 * 2. update old block address to new one;
 	 */
 	if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
-				!get_nat_flag(ne, IS_DIRTY)))
+				!get_nat_flag(ne, IS_DIRTY))) {
 		head->entry_cnt++;
+		__relocate_nat_entry_set(nm_i, head);
+	}
 
 	set_nat_flag(ne, IS_PREALLOC, new_ne);
 
@@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
 
 	set_nat_flag(ne, IS_DIRTY, false);
 	set->entry_cnt--;
+	__relocate_nat_entry_set(nm_i, set);
 	nm_i->nat_cnt[DIRTY_NAT]--;
 	nm_i->nat_cnt[RECLAIMABLE_NAT]++;
 }
@@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
 	up_write(&curseg->journal_rwsem);
 }
 
-static void __adjust_nat_entry_set(struct nat_entry_set *nes,
-						struct list_head *head, int max)
-{
-	struct nat_entry_set *cur;
-
-	if (nes->entry_cnt >= max)
-		goto add_out;
-
-	list_for_each_entry(cur, head, set_list) {
-		if (cur->entry_cnt >= nes->entry_cnt) {
-			list_add(&nes->set_list, cur->set_list.prev);
-			return;
-		}
-	}
-add_out:
-	list_add_tail(&nes->set_list, head);
-}
-
 static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
 		const struct f2fs_nat_block *nat_blk)
 {
@@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 
 	/* Allow dirty nats by node block allocation in write_begin */
 	if (!set->entry_cnt) {
+		list_del(&set->set_list);
 		radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
 		kmem_cache_free(nat_entry_set_slab, set);
 	}
@@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
 	struct f2fs_journal *journal = curseg->journal;
-	struct nat_entry_set *setvec[NAT_VEC_SIZE];
 	struct nat_entry_set *set, *tmp;
-	unsigned int found;
-	nid_t set_idx = 0;
-	LIST_HEAD(sets);
+	int i;
 	int err = 0;
 
 	/*
@@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	if (!nm_i->nat_cnt[DIRTY_NAT])
 		return 0;
 
+	/* readahead sets which cannot be moved to journal */
+	if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
+		for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
+			list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
+				f2fs_ra_meta_pages(sbi, set->set, 1,
+								META_NAT, true);
+			}
+		}
+	}
+
 	f2fs_down_write(&nm_i->nat_tree_lock);
 
 	/*
@@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 			nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
 		remove_nats_in_journal(sbi);
 
-	while ((found = __gang_lookup_nat_set(nm_i,
-					set_idx, NAT_VEC_SIZE, setvec))) {
-		unsigned idx;
-
-		set_idx = setvec[found - 1]->set + 1;
-		for (idx = 0; idx < found; idx++)
-			__adjust_nat_entry_set(setvec[idx], &sets,
-						MAX_NAT_JENTRIES(journal));
-	}
-
 	/* flush dirty nats in nat entry set */
-	list_for_each_entry_safe(set, tmp, &sets, set_list) {
-		err = __flush_nat_entry_set(sbi, set, cpc);
-		if (err)
-			break;
+	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
+		list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
+			err = __flush_nat_entry_set(sbi, set, cpc);
+			if (err)
+				break;
+		}
 	}
 
 	f2fs_up_write(&nm_i->nat_tree_lock);
@@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	struct f2fs_nm_info *nm_i = NM_I(sbi);
 	unsigned char *version_bitmap;
 	unsigned int nat_segs;
+	int i;
 	int err;
 
 	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
@@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
 	INIT_LIST_HEAD(&nm_i->nat_entries);
 	spin_lock_init(&nm_i->nat_list_lock);
 
+	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
+		INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
+
 	mutex_init(&nm_i->build_lock);
 	spin_lock_init(&nm_i->nid_list_lock);
 	init_f2fs_rwsem(&nm_i->nat_tree_lock);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 030390543..d805d4ce7 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -158,7 +158,7 @@ enum mem_type {
 };
 
 struct nat_entry_set {
-	struct list_head set_list;	/* link with other nat sets */
+	struct list_head set_list;	/* link with nat sets which have same entry_cnt */
 	struct list_head entry_list;	/* link with dirty nat entries */
 	nid_t set;			/* set number*/
 	unsigned int entry_cnt;		/* the # of nat entries in set */
-- 
2.25.1



* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
  2025-08-13  4:04 [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries wangzijie
@ 2025-08-18  2:21 ` Zhiguo Niu
  2025-08-19  2:36   ` wangzijie
  2025-08-25  2:51 ` Chao Yu
  1 sibling, 1 reply; 6+ messages in thread
From: Zhiguo Niu @ 2025-08-18  2:21 UTC
  To: wangzijie; +Cc: jaegeuk, chao, linux-kernel, feng.han, linux-f2fs-devel

wangzijie <wangzijie1@honor.com> wrote on Wed, 13 Aug 2025 at 12:06:
>
> Sometimes I have suffered from nat_tree_lock contention between f2fs_write_checkpoint
> and f2fs_get_node_info. Commit a9419b6 ("f2fs: do not bother checkpoint by
> f2fs_get_node_info") also mentioned that situation.
>
> My idea is, when flushing nat entries, we can use some structure to record the nat
> pages we may read, and readahead them before holding nat_tree_lock. Before
> implementing the code, I did some survey and found a prior submission in the community.
>
> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
> That patch aims to improve the nat entry set sort by using buckets.
> I borrowed that structure, and readahead the nat pages containing nat entry sets which
> cannot be moved to the journal, according to the dirty nat entry set buckets.
>
> By doing this, I think there are two benefits that reduce nat_tree_lock hold time
> when flushing nat entries.
> 1. avoid nat set tree lookup and sort
> 2. readahead nat pages before holding nat_tree_lock
>
> Signed-off-by: wangzijie <wangzijie1@honor.com>
> ---
>  fs/f2fs/f2fs.h |  1 +
>  fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
>  fs/f2fs/node.h |  2 +-
>  3 files changed, 35 insertions(+), 38 deletions(-)
>
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 46be75605..b27cc059f 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
>         struct radix_tree_root nat_set_root;/* root of the nat set cache */
>         struct f2fs_rwsem nat_tree_lock;        /* protect nat entry tree */
>         struct list_head nat_entries;   /* cached nat entry list (clean) */
> +       struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1];        /* store dirty nat set */
>         spinlock_t nat_list_lock;       /* protect clean nat entry list */
>         unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
>         unsigned int nat_blocks;        /* # of nat blocks */
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 27743b93e..87c975ee8 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
>         __free_nat_entry(e);
>  }
>
> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
> +                                                       struct nat_entry_set *set)
> +{
> +       list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
> +}
Hi zijie,
Is there any lock protecting these list-related operations in the
current process?
> +
>  static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>                                                         struct nat_entry *ne)
>  {
> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>                 head->set = set;
>                 head->entry_cnt = 0;
>                 f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
> +               __relocate_nat_entry_set(nm_i, head);
>         }
>         return head;
>  }
> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>          * 2. update old block address to new one;
>          */
>         if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
> -                               !get_nat_flag(ne, IS_DIRTY)))
> +                               !get_nat_flag(ne, IS_DIRTY))) {
>                 head->entry_cnt++;
> +               __relocate_nat_entry_set(nm_i, head);
> +       }
>
>         set_nat_flag(ne, IS_PREALLOC, new_ne);
>
> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>
>         set_nat_flag(ne, IS_DIRTY, false);
>         set->entry_cnt--;
> +       __relocate_nat_entry_set(nm_i, set);
>         nm_i->nat_cnt[DIRTY_NAT]--;
>         nm_i->nat_cnt[RECLAIMABLE_NAT]++;
>  }
> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>         up_write(&curseg->journal_rwsem);
>  }
>
> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> -                                               struct list_head *head, int max)
> -{
> -       struct nat_entry_set *cur;
> -
> -       if (nes->entry_cnt >= max)
> -               goto add_out;
> -
> -       list_for_each_entry(cur, head, set_list) {
> -               if (cur->entry_cnt >= nes->entry_cnt) {
> -                       list_add(&nes->set_list, cur->set_list.prev);
> -                       return;
> -               }
> -       }
> -add_out:
> -       list_add_tail(&nes->set_list, head);
> -}
> -
>  static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
>                 const struct f2fs_nat_block *nat_blk)
>  {
> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>
>         /* Allow dirty nats by node block allocation in write_begin */
>         if (!set->entry_cnt) {
> +               list_del(&set->set_list);
>                 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
>                 kmem_cache_free(nat_entry_set_slab, set);
>         }
> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>         struct f2fs_nm_info *nm_i = NM_I(sbi);
>         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>         struct f2fs_journal *journal = curseg->journal;
> -       struct nat_entry_set *setvec[NAT_VEC_SIZE];
>         struct nat_entry_set *set, *tmp;
> -       unsigned int found;
> -       nid_t set_idx = 0;
> -       LIST_HEAD(sets);
> +       int i;
>         int err = 0;
>
>         /*
> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>         if (!nm_i->nat_cnt[DIRTY_NAT])
>                 return 0;
>
> +       /* readahead sets which cannot be moved to journal */
> +       if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
> +               for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
I am a little confused: why is there "i <= NAT_ENTRY_PER_BLOCK;"?
Do we need to calculate according to the current nat block/entry number?
Thanks!
> +                       list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> +                               f2fs_ra_meta_pages(sbi, set->set, 1,
> +                                                               META_NAT, true);
> +                       }
> +               }
> +       }
> +
>         f2fs_down_write(&nm_i->nat_tree_lock);
>
>         /*
> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>                         nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
>                 remove_nats_in_journal(sbi);
>
> -       while ((found = __gang_lookup_nat_set(nm_i,
> -                                       set_idx, NAT_VEC_SIZE, setvec))) {
> -               unsigned idx;
> -
> -               set_idx = setvec[found - 1]->set + 1;
> -               for (idx = 0; idx < found; idx++)
> -                       __adjust_nat_entry_set(setvec[idx], &sets,
> -                                               MAX_NAT_JENTRIES(journal));
> -       }
> -
>         /* flush dirty nats in nat entry set */
> -       list_for_each_entry_safe(set, tmp, &sets, set_list) {
> -               err = __flush_nat_entry_set(sbi, set, cpc);
> -               if (err)
> -                       break;
> +       for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
> +               list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> +                       err = __flush_nat_entry_set(sbi, set, cpc);
> +                       if (err)
> +                               break;
> +               }
>         }
>
>         f2fs_up_write(&nm_i->nat_tree_lock);
> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>         struct f2fs_nm_info *nm_i = NM_I(sbi);
>         unsigned char *version_bitmap;
>         unsigned int nat_segs;
> +       int i;
>         int err;
>
>         nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>         INIT_LIST_HEAD(&nm_i->nat_entries);
>         spin_lock_init(&nm_i->nat_list_lock);
>
> +       for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
> +               INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
> +
>         mutex_init(&nm_i->build_lock);
>         spin_lock_init(&nm_i->nid_list_lock);
>         init_f2fs_rwsem(&nm_i->nat_tree_lock);
> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> index 030390543..d805d4ce7 100644
> --- a/fs/f2fs/node.h
> +++ b/fs/f2fs/node.h
> @@ -158,7 +158,7 @@ enum mem_type {
>  };
>
>  struct nat_entry_set {
> -       struct list_head set_list;      /* link with other nat sets */
> +       struct list_head set_list;      /* link with nat sets which have same entry_cnt */
>         struct list_head entry_list;    /* link with dirty nat entries */
>         nid_t set;                      /* set number*/
>         unsigned int entry_cnt;         /* the # of nat entries in set */
> --
> 2.25.1
>


* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
  2025-08-18  2:21 ` Zhiguo Niu
@ 2025-08-19  2:36   ` wangzijie
  0 siblings, 0 replies; 6+ messages in thread
From: wangzijie @ 2025-08-19  2:36 UTC
  To: niuzhiguo84
  Cc: chao, feng.han, jaegeuk, linux-f2fs-devel, linux-kernel,
	wangzijie1

>>
>> Sometimes I have suffered from nat_tree_lock contention between f2fs_write_checkpoint
>> and f2fs_get_node_info. Commit a9419b6 ("f2fs: do not bother checkpoint by
>> f2fs_get_node_info") also mentioned that situation.
>>
>> My idea is, when flushing nat entries, we can use some structure to record the nat
>> pages we may read, and readahead them before holding nat_tree_lock. Before
>> implementing the code, I did some survey and found a prior submission in the community.
>>
>> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
>> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
>> That patch aims to improve the nat entry set sort by using buckets.
>> I borrowed that structure, and readahead the nat pages containing nat entry sets which
>> cannot be moved to the journal, according to the dirty nat entry set buckets.
>>
>> By doing this, I think there are two benefits that reduce nat_tree_lock hold time
>> when flushing nat entries.
>> 1. avoid nat set tree lookup and sort
>> 2. readahead nat pages before holding nat_tree_lock
>>
>> Signed-off-by: wangzijie <wangzijie1@honor.com>
>> ---
>>  fs/f2fs/f2fs.h |  1 +
>>  fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
>>  fs/f2fs/node.h |  2 +-
>>  3 files changed, 35 insertions(+), 38 deletions(-)
>>
>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>> index 46be75605..b27cc059f 100644
>> --- a/fs/f2fs/f2fs.h
>> +++ b/fs/f2fs/f2fs.h
>> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
>>         struct radix_tree_root nat_set_root;/* root of the nat set cache */
>>         struct f2fs_rwsem nat_tree_lock;        /* protect nat entry tree */
>>         struct list_head nat_entries;   /* cached nat entry list (clean) */
>> +       struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1];        /* store dirty nat set */
>>         spinlock_t nat_list_lock;       /* protect clean nat entry list */
>>         unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
>>         unsigned int nat_blocks;        /* # of nat blocks */
>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>> index 27743b93e..87c975ee8 100644
>> --- a/fs/f2fs/node.c
>> +++ b/fs/f2fs/node.c
>> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
>>         __free_nat_entry(e);
>>  }
>>
>> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
>> +                                                       struct nat_entry_set *set)
>> +{
>> +       list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
>> +}
>Hi zijie,
>Is there any lock protecting these list-related operations in the
>current process?

Hi, Zhiguo
I think the lock protection needed is the same as for __set_nat_cache_dirty().
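
For example, from my reading of node.c, the callers that dirty the nat cache
already take the writer lock, roughly like this (a minimal sketch of the call
pattern, not a quote of the actual code):

	f2fs_down_write(&nm_i->nat_tree_lock);
	/* ... update the cached nat entry ... */
	__set_nat_cache_dirty(nm_i, ne);	/* may call __relocate_nat_entry_set() */
	f2fs_up_write(&nm_i->nat_tree_lock);

So the new buckets are only touched under nat_tree_lock, just like the nat
set radix tree.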

>> +
>>  static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>>                                                         struct nat_entry *ne)
>>  {
>> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>>                 head->set = set;
>>                 head->entry_cnt = 0;
>>                 f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
>> +               __relocate_nat_entry_set(nm_i, head);
>>         }
>>         return head;
>>  }
>> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>          * 2. update old block address to new one;
>>          */
>>         if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
>> -                               !get_nat_flag(ne, IS_DIRTY)))
>> +                               !get_nat_flag(ne, IS_DIRTY))) {
>>                 head->entry_cnt++;
>> +               __relocate_nat_entry_set(nm_i, head);
>> +       }
>>
>>         set_nat_flag(ne, IS_PREALLOC, new_ne);
>>
>> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>
>>         set_nat_flag(ne, IS_DIRTY, false);
>>         set->entry_cnt--;
>> +       __relocate_nat_entry_set(nm_i, set);
>>         nm_i->nat_cnt[DIRTY_NAT]--;
>>         nm_i->nat_cnt[RECLAIMABLE_NAT]++;
>>  }
>> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>>         up_write(&curseg->journal_rwsem);
>>  }
>>
>> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
>> -                                               struct list_head *head, int max)
>> -{
>> -       struct nat_entry_set *cur;
>> -
>> -       if (nes->entry_cnt >= max)
>> -               goto add_out;
>> -
>> -       list_for_each_entry(cur, head, set_list) {
>> -               if (cur->entry_cnt >= nes->entry_cnt) {
>> -                       list_add(&nes->set_list, cur->set_list.prev);
>> -                       return;
>> -               }
>> -       }
>> -add_out:
>> -       list_add_tail(&nes->set_list, head);
>> -}
>> -
>>  static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
>>                 const struct f2fs_nat_block *nat_blk)
>>  {
>> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>
>>         /* Allow dirty nats by node block allocation in write_begin */
>>         if (!set->entry_cnt) {
>> +               list_del(&set->set_list);
>>                 radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
>>                 kmem_cache_free(nat_entry_set_slab, set);
>>         }
>> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>         struct f2fs_nm_info *nm_i = NM_I(sbi);
>>         struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>>         struct f2fs_journal *journal = curseg->journal;
>> -       struct nat_entry_set *setvec[NAT_VEC_SIZE];
>>         struct nat_entry_set *set, *tmp;
>> -       unsigned int found;
>> -       nid_t set_idx = 0;
>> -       LIST_HEAD(sets);
>> +       int i;
>>         int err = 0;
>>
>>         /*
>> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>         if (!nm_i->nat_cnt[DIRTY_NAT])
>>                 return 0;
>>
>> +       /* readahead sets which cannot be moved to journal */
>> +       if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
>> +               for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
>I am a little confused: why is there "i <= NAT_ENTRY_PER_BLOCK;"?
>Do we need to calculate according to the current nat block/entry number?
>Thanks!

Yes, you are right; we can calculate according to the DIRTY_NAT count to end this loop early.
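
Something like this, maybe (an untested sketch; the remaining-entry
bookkeeping is my rough idea, not code from this patch):

	/* stop scanning buckets once all dirty entries are accounted for */
	unsigned int remaining = nm_i->nat_cnt[DIRTY_NAT];

	for (i = MAX_NAT_JENTRIES(journal);
			i <= NAT_ENTRY_PER_BLOCK && remaining; i++) {
		list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
			f2fs_ra_meta_pages(sbi, set->set, 1, META_NAT, true);
			remaining -= min(remaining, set->entry_cnt);
		}
	}

This stays conservative: entries sitting in buckets below
MAX_NAT_JENTRIES(journal) keep "remaining" non-zero, so the early exit only
kicks in when every dirty entry lives in the buckets we actually scan.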


>> +                       list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>> +                               f2fs_ra_meta_pages(sbi, set->set, 1,
>> +                                                               META_NAT, true);
>> +                       }
>> +               }
>> +       }
>> +
>>         f2fs_down_write(&nm_i->nat_tree_lock);
>>
>>         /*
>> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>                         nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
>>                 remove_nats_in_journal(sbi);
>>
>> -       while ((found = __gang_lookup_nat_set(nm_i,
>> -                                       set_idx, NAT_VEC_SIZE, setvec))) {
>> -               unsigned idx;
>> -
>> -               set_idx = setvec[found - 1]->set + 1;
>> -               for (idx = 0; idx < found; idx++)
>> -                       __adjust_nat_entry_set(setvec[idx], &sets,
>> -                                               MAX_NAT_JENTRIES(journal));
>> -       }
>> -
>>         /* flush dirty nats in nat entry set */
>> -       list_for_each_entry_safe(set, tmp, &sets, set_list) {
>> -               err = __flush_nat_entry_set(sbi, set, cpc);
>> -               if (err)
>> -                       break;
>> +       for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
>> +               list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>> +                       err = __flush_nat_entry_set(sbi, set, cpc);
>> +                       if (err)
>> +                               break;
>> +               }
>>         }
>>
>>         f2fs_up_write(&nm_i->nat_tree_lock);
>> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>         struct f2fs_nm_info *nm_i = NM_I(sbi);
>>         unsigned char *version_bitmap;
>>         unsigned int nat_segs;
>> +       int i;
>>         int err;
>>
>>         nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
>> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>         INIT_LIST_HEAD(&nm_i->nat_entries);
>>         spin_lock_init(&nm_i->nat_list_lock);
>>
>> +       for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
>> +               INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
>> +
>>         mutex_init(&nm_i->build_lock);
>>         spin_lock_init(&nm_i->nid_list_lock);
>>         init_f2fs_rwsem(&nm_i->nat_tree_lock);
>> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
>> index 030390543..d805d4ce7 100644
>> --- a/fs/f2fs/node.h
>> +++ b/fs/f2fs/node.h
>> @@ -158,7 +158,7 @@ enum mem_type {
>>  };
>>
>>  struct nat_entry_set {
>> -       struct list_head set_list;      /* link with other nat sets */
>> +       struct list_head set_list;      /* link with nat sets which have same entry_cnt */
>>         struct list_head entry_list;    /* link with dirty nat entries */
>>         nid_t set;                      /* set number*/
>>         unsigned int entry_cnt;         /* the # of nat entries in set */
>> --
>> 2.25.1

By the way, Chao and Zhiguo,
I noticed that in f2fs_flush_nat_entries(), the check for nat_cnt[DIRTY_NAT] is done
outside nat_tree_lock, and in f2fs_write_checkpoint(), there is the below check without
nat_tree_lock:

if (cpc->reason & CP_DISCARD) {
	if (!f2fs_exist_trim_candidates(sbi, cpc)) {
		unblock_operations(sbi);
		goto out;
	}

	if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 &&
			SIT_I(sbi)->dirty_sentries == 0 &&
			prefree_segments(sbi) == 0) {
		f2fs_flush_sit_entries(sbi, cpc);
		f2fs_clear_prefree_segments(sbi, cpc);
		unblock_operations(sbi);
		goto out;
	}
}

Is there a missing lock, or do we not need nat_tree_lock in these situations?

And do you think the approach in this patch is appropriate for reducing
nat_tree_lock hold time?


* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
  2025-08-13  4:04 [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries wangzijie
  2025-08-18  2:21 ` Zhiguo Niu
@ 2025-08-25  2:51 ` Chao Yu
  2025-08-26  1:44   ` wangzijie
  1 sibling, 1 reply; 6+ messages in thread
From: Chao Yu @ 2025-08-25  2:51 UTC
  To: wangzijie, jaegeuk
  Cc: chao, linux-f2fs-devel, linux-kernel, bintian.wang, feng.han

On 8/13/25 12:04, wangzijie wrote:
> Sometimes I have suffered from nat_tree_lock contention between f2fs_write_checkpoint
> and f2fs_get_node_info. Commit a9419b6 ("f2fs: do not bother checkpoint by
> f2fs_get_node_info") also mentioned that situation.
>
> My idea is, when flushing nat entries, we can use some structure to record the nat
> pages we may read, and readahead them before holding nat_tree_lock. Before
> implementing the code, I did some survey and found a prior submission in the community.
>
> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
> That patch aims to improve the nat entry set sort by using buckets.
> I borrowed that structure, and readahead the nat pages containing nat entry sets which
> cannot be moved to the journal, according to the dirty nat entry set buckets.
>
> By doing this, I think there are two benefits that reduce nat_tree_lock hold time
> when flushing nat entries.

Zijie,

Can you please figure out some numbers for this patch? Something like
checkpoint latency, or the average or worst-case time to grab nat_tree_lock?
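
For instance, a rough sketch of one way to grab such a number (illustrative
only; the log format here is made up):

	/* time how long checkpoint holds the writer lock */
	ktime_t start = ktime_get();

	f2fs_down_write(&nm_i->nat_tree_lock);
	/* ... nat flush work ... */
	f2fs_up_write(&nm_i->nat_tree_lock);

	pr_info("f2fs: nat_tree_lock held for %lld us\n",
		ktime_us_delta(ktime_get(), start));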

> 1. avoid nat set tree lookup and sort
> 2. readahead nat pages before holding nat_tree_lock

It may cause a performance regression if it races w/ drop_caches?

Thanks,

> 
> Signed-off-by: wangzijie <wangzijie1@honor.com>
> ---
>  fs/f2fs/f2fs.h |  1 +
>  fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
>  fs/f2fs/node.h |  2 +-
>  3 files changed, 35 insertions(+), 38 deletions(-)
> 
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 46be75605..b27cc059f 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
>  	struct radix_tree_root nat_set_root;/* root of the nat set cache */
>  	struct f2fs_rwsem nat_tree_lock;	/* protect nat entry tree */
>  	struct list_head nat_entries;	/* cached nat entry list (clean) */
> +	struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1];	/* store dirty nat set */
>  	spinlock_t nat_list_lock;	/* protect clean nat entry list */
>  	unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
>  	unsigned int nat_blocks;	/* # of nat blocks */
> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> index 27743b93e..87c975ee8 100644
> --- a/fs/f2fs/node.c
> +++ b/fs/f2fs/node.c
> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
>  	__free_nat_entry(e);
>  }
>  
> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
> +							struct nat_entry_set *set)
> +{
> +	list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
> +}
> +
>  static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>  							struct nat_entry *ne)
>  {
> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>  		head->set = set;
>  		head->entry_cnt = 0;
>  		f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
> +		__relocate_nat_entry_set(nm_i, head);
>  	}
>  	return head;
>  }
> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>  	 * 2. update old block address to new one;
>  	 */
>  	if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
> -				!get_nat_flag(ne, IS_DIRTY)))
> +				!get_nat_flag(ne, IS_DIRTY))) {
>  		head->entry_cnt++;
> +		__relocate_nat_entry_set(nm_i, head);
> +	}
>  
>  	set_nat_flag(ne, IS_PREALLOC, new_ne);
>  
> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>  
>  	set_nat_flag(ne, IS_DIRTY, false);
>  	set->entry_cnt--;
> +	__relocate_nat_entry_set(nm_i, set);
>  	nm_i->nat_cnt[DIRTY_NAT]--;
>  	nm_i->nat_cnt[RECLAIMABLE_NAT]++;
>  }
> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>  	up_write(&curseg->journal_rwsem);
>  }
>  
> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> -						struct list_head *head, int max)
> -{
> -	struct nat_entry_set *cur;
> -
> -	if (nes->entry_cnt >= max)
> -		goto add_out;
> -
> -	list_for_each_entry(cur, head, set_list) {
> -		if (cur->entry_cnt >= nes->entry_cnt) {
> -			list_add(&nes->set_list, cur->set_list.prev);
> -			return;
> -		}
> -	}
> -add_out:
> -	list_add_tail(&nes->set_list, head);
> -}
> -
>  static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
>  		const struct f2fs_nat_block *nat_blk)
>  {
> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>  
>  	/* Allow dirty nats by node block allocation in write_begin */
>  	if (!set->entry_cnt) {
> +		list_del(&set->set_list);
>  		radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
>  		kmem_cache_free(nat_entry_set_slab, set);
>  	}
> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>  	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>  	struct f2fs_journal *journal = curseg->journal;
> -	struct nat_entry_set *setvec[NAT_VEC_SIZE];
>  	struct nat_entry_set *set, *tmp;
> -	unsigned int found;
> -	nid_t set_idx = 0;
> -	LIST_HEAD(sets);
> +	int i;
>  	int err = 0;
>  
>  	/*
> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  	if (!nm_i->nat_cnt[DIRTY_NAT])
>  		return 0;
>  
> +	/* readahead sets which cannot be moved to journal */
> +	if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
> +		for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
> +			list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> +				f2fs_ra_meta_pages(sbi, set->set, 1,
> +								META_NAT, true);
> +			}
> +		}
> +	}
> +
>  	f2fs_down_write(&nm_i->nat_tree_lock);
>  
>  	/*
> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>  			nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
>  		remove_nats_in_journal(sbi);
>  
> -	while ((found = __gang_lookup_nat_set(nm_i,
> -					set_idx, NAT_VEC_SIZE, setvec))) {
> -		unsigned idx;
> -
> -		set_idx = setvec[found - 1]->set + 1;
> -		for (idx = 0; idx < found; idx++)
> -			__adjust_nat_entry_set(setvec[idx], &sets,
> -						MAX_NAT_JENTRIES(journal));
> -	}
> -
>  	/* flush dirty nats in nat entry set */
> -	list_for_each_entry_safe(set, tmp, &sets, set_list) {
> -		err = __flush_nat_entry_set(sbi, set, cpc);
> -		if (err)
> -			break;
> +	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
> +		list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> +			err = __flush_nat_entry_set(sbi, set, cpc);
> +			if (err)
> +				break;
> +		}
>  	}
>  
>  	f2fs_up_write(&nm_i->nat_tree_lock);
> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>  	unsigned char *version_bitmap;
>  	unsigned int nat_segs;
> +	int i;
>  	int err;
>  
>  	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>  	INIT_LIST_HEAD(&nm_i->nat_entries);
>  	spin_lock_init(&nm_i->nat_list_lock);
>  
> +	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
> +		INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
> +
>  	mutex_init(&nm_i->build_lock);
>  	spin_lock_init(&nm_i->nid_list_lock);
>  	init_f2fs_rwsem(&nm_i->nat_tree_lock);
> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> index 030390543..d805d4ce7 100644
> --- a/fs/f2fs/node.h
> +++ b/fs/f2fs/node.h
> @@ -158,7 +158,7 @@ enum mem_type {
>  };
>  
>  struct nat_entry_set {
> -	struct list_head set_list;	/* link with other nat sets */
> +	struct list_head set_list;	/* link with nat sets which have same entry_cnt */
>  	struct list_head entry_list;	/* link with dirty nat entries */
>  	nid_t set;			/* set number*/
>  	unsigned int entry_cnt;		/* the # of nat entries in set */



* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
  2025-08-25  2:51 ` Chao Yu
@ 2025-08-26  1:44   ` wangzijie
  2025-08-28  3:20     ` Chao Yu
  0 siblings, 1 reply; 6+ messages in thread
From: wangzijie @ 2025-08-26  1:44 UTC
  To: linux-f2fs-devel; +Cc: chao, feng.han, jaegeuk, linux-kernel, wangzijie1

> On 8/13/25 12:04, wangzijie wrote:
> > Sometimes I have suffered from nat_tree_lock contention between f2fs_write_checkpoint
> > and f2fs_get_node_info. Commit a9419b6 ("f2fs: do not bother checkpoint by
> > f2fs_get_node_info") also mentioned that situation.
> >
> > My idea is, when flushing nat entries, we can use some structure to record the nat
> > pages we may read, and readahead them before holding nat_tree_lock. Before
> > implementing the code, I did some survey and found a prior submission in the community.
> >
> > Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
> > Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
> > That patch aims to improve the nat entry set sort by using buckets.
> > I borrowed that structure, and readahead the nat pages containing nat entry sets which
> > cannot be moved to the journal, according to the dirty nat entry set buckets.
> >
> > By doing this, I think there are two benefits that reduce nat_tree_lock hold time
> > when flushing nat entries.
> 
> Zijie,
> 
> Can you please figure out some numbers for this patch? Something like
> checkpoint latency, or the average or worst-case time to grab nat_tree_lock?
> 
> > 1. avoid nat set tree lookup and sort
> > 2. readahead nat pages before holding nat_tree_lock
> 
> It may cause a performance regression if it races w/ drop_caches?
> 
> Thanks,

Hi, Chao
Do you mean that it will race with f2fs_try_to_free_nats()?
In this patch, nat_tree_lock is not held when reading ahead nat pages, but
in fact we need it. Am I right?

> > 
> > Signed-off-by: wangzijie <wangzijie1@honor.com>
> > ---
> >  fs/f2fs/f2fs.h |  1 +
> >  fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
> >  fs/f2fs/node.h |  2 +-
> >  3 files changed, 35 insertions(+), 38 deletions(-)
> > 
> > diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> > index 46be75605..b27cc059f 100644
> > --- a/fs/f2fs/f2fs.h
> > +++ b/fs/f2fs/f2fs.h
> > @@ -975,6 +975,7 @@ struct f2fs_nm_info {
> >  	struct radix_tree_root nat_set_root;/* root of the nat set cache */
> >  	struct f2fs_rwsem nat_tree_lock;	/* protect nat entry tree */
> >  	struct list_head nat_entries;	/* cached nat entry list (clean) */
> > +	struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1];	/* store dirty nat set */
> >  	spinlock_t nat_list_lock;	/* protect clean nat entry list */
> >  	unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
> >  	unsigned int nat_blocks;	/* # of nat blocks */
> > diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
> > index 27743b93e..87c975ee8 100644
> > --- a/fs/f2fs/node.c
> > +++ b/fs/f2fs/node.c
> > @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
> >  	__free_nat_entry(e);
> >  }
> >  
> > +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
> > +							struct nat_entry_set *set)
> > +{
> > +	list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
> > +}
> > +
> >  static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> >  							struct nat_entry *ne)
> >  {
> > @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
> >  		head->set = set;
> >  		head->entry_cnt = 0;
> >  		f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
> > +		__relocate_nat_entry_set(nm_i, head);
> >  	}
> >  	return head;
> >  }
> > @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> >  	 * 2. update old block address to new one;
> >  	 */
> >  	if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
> > -				!get_nat_flag(ne, IS_DIRTY)))
> > +				!get_nat_flag(ne, IS_DIRTY))) {
> >  		head->entry_cnt++;
> > +		__relocate_nat_entry_set(nm_i, head);
> > +	}
> >  
> >  	set_nat_flag(ne, IS_PREALLOC, new_ne);
> >  
> > @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
> >  
> >  	set_nat_flag(ne, IS_DIRTY, false);
> >  	set->entry_cnt--;
> > +	__relocate_nat_entry_set(nm_i, set);
> >  	nm_i->nat_cnt[DIRTY_NAT]--;
> >  	nm_i->nat_cnt[RECLAIMABLE_NAT]++;
> >  }
> > @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
> >  	up_write(&curseg->journal_rwsem);
> >  }
> >  
> > -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
> > -						struct list_head *head, int max)
> > -{
> > -	struct nat_entry_set *cur;
> > -
> > -	if (nes->entry_cnt >= max)
> > -		goto add_out;
> > -
> > -	list_for_each_entry(cur, head, set_list) {
> > -		if (cur->entry_cnt >= nes->entry_cnt) {
> > -			list_add(&nes->set_list, cur->set_list.prev);
> > -			return;
> > -		}
> > -	}
> > -add_out:
> > -	list_add_tail(&nes->set_list, head);
> > -}
> > -
> >  static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
> >  		const struct f2fs_nat_block *nat_blk)
> >  {
> > @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
> >  
> >  	/* Allow dirty nats by node block allocation in write_begin */
> >  	if (!set->entry_cnt) {
> > +		list_del(&set->set_list);
> >  		radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
> >  		kmem_cache_free(nat_entry_set_slab, set);
> >  	}
> > @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> >  	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
> >  	struct f2fs_journal *journal = curseg->journal;
> > -	struct nat_entry_set *setvec[NAT_VEC_SIZE];
> >  	struct nat_entry_set *set, *tmp;
> > -	unsigned int found;
> > -	nid_t set_idx = 0;
> > -	LIST_HEAD(sets);
> > +	int i;
> >  	int err = 0;
> >  
> >  	/*
> > @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  	if (!nm_i->nat_cnt[DIRTY_NAT])
> >  		return 0;
> >  
> > +	/* readahead sets which cannot be moved to journal */
> > +	if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
> > +		for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
> > +			list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> > +				f2fs_ra_meta_pages(sbi, set->set, 1,
> > +								META_NAT, true);
> > +			}
> > +		}
> > +	}
> > +
> >  	f2fs_down_write(&nm_i->nat_tree_lock);
> >  
> >  	/*
> > @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
> >  			nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
> >  		remove_nats_in_journal(sbi);
> >  
> > -	while ((found = __gang_lookup_nat_set(nm_i,
> > -					set_idx, NAT_VEC_SIZE, setvec))) {
> > -		unsigned idx;
> > -
> > -		set_idx = setvec[found - 1]->set + 1;
> > -		for (idx = 0; idx < found; idx++)
> > -			__adjust_nat_entry_set(setvec[idx], &sets,
> > -						MAX_NAT_JENTRIES(journal));
> > -	}
> > -
> >  	/* flush dirty nats in nat entry set */
> > -	list_for_each_entry_safe(set, tmp, &sets, set_list) {
> > -		err = __flush_nat_entry_set(sbi, set, cpc);
> > -		if (err)
> > -			break;
> > +	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
> > +		list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
> > +			err = __flush_nat_entry_set(sbi, set, cpc);
> > +			if (err)
> > +				break;
> > +		}
> >  	}
> >  
> >  	f2fs_up_write(&nm_i->nat_tree_lock);
> > @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> >  	struct f2fs_nm_info *nm_i = NM_I(sbi);
> >  	unsigned char *version_bitmap;
> >  	unsigned int nat_segs;
> > +	int i;
> >  	int err;
> >  
> >  	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
> > @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
> >  	INIT_LIST_HEAD(&nm_i->nat_entries);
> >  	spin_lock_init(&nm_i->nat_list_lock);
> >  
> > +	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
> > +		INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
> > +
> >  	mutex_init(&nm_i->build_lock);
> >  	spin_lock_init(&nm_i->nid_list_lock);
> >  	init_f2fs_rwsem(&nm_i->nat_tree_lock);
> > diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
> > index 030390543..d805d4ce7 100644
> > --- a/fs/f2fs/node.h
> > +++ b/fs/f2fs/node.h
> > @@ -158,7 +158,7 @@ enum mem_type {
> >  };
> >  
> >  struct nat_entry_set {
> > -	struct list_head set_list;	/* link with other nat sets */
> > +	struct list_head set_list;	/* link with nat sets which have same entry_cnt */
> >  	struct list_head entry_list;	/* link with dirty nat entries */
> >  	nid_t set;			/* set number*/
> >  	unsigned int entry_cnt;		/* the # of nat entries in set */



* Re: [f2fs-dev] [PATCH] f2fs: reduce nat_tree_lock hold time when flush nat entries
  2025-08-26  1:44   ` wangzijie
@ 2025-08-28  3:20     ` Chao Yu
  0 siblings, 0 replies; 6+ messages in thread
From: Chao Yu @ 2025-08-28  3:20 UTC
  To: wangzijie, linux-f2fs-devel; +Cc: chao, feng.han, jaegeuk, linux-kernel

On 8/26/25 09:44, wangzijie wrote:
>> On 8/13/25 12:04, wangzijie wrote:
>>> Sometimes I have suffered from nat_tree_lock contention between f2fs_write_checkpoint
>>> and f2fs_get_node_info. Commit a9419b6 ("f2fs: do not bother checkpoint by
>>> f2fs_get_node_info") also mentioned that situation.
>>>
>>> My idea is, when flushing nat entries, we can use some structure to record the nat
>>> pages we may read, and readahead them before holding nat_tree_lock. Before
>>> implementing the code, I did some survey and found a prior submission in the community.
>>>
>>> Subject: f2fs: use bucket sort to avoid tree lookup and list sort when nat flushing
>>> Link: https://lore.kernel.org/linux-f2fs-devel/20170520122435.17574-2-houpengyang@huawei.com/
>>> That patch aims to improve the nat entry set sort by using buckets.
>>> I borrowed that structure, and readahead the nat pages containing nat entry sets which
>>> cannot be moved to the journal, according to the dirty nat entry set buckets.
>>>
>>> By doing this, I think there are two benefits that reduce nat_tree_lock hold time
>>> when flushing nat entries.
>>
>> Zijie,
>>
>> Can you please figure out some numbers for this patch? Something like
>> checkpoint latency, or the average or worst-case time to grab nat_tree_lock?
>>
>>> 1. avoid nat set tree lookup and sort
>>> 2. readahead nat pages before holding nat_tree_lock
>>
>> It may cause a performance regression if it races w/ drop_caches?
>>
>> Thanks,
> 
> Hi, Chao
> Do you mean that it will race with f2fs_try_to_free_nats()?

No, I meant the get_current_nat_folio() readahead may race with memory reclaim, e.g.
all uptodate nat pages could be reclaimed before the later updates? Seems like a corner case.
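
Something like this interleaving (a hypothetical timing, just to illustrate
the concern):

	checkpoint				memory reclaim
	----------				--------------
	f2fs_ra_meta_pages(..., META_NAT, true)
						reclaims the uptodate nat folios
	f2fs_down_write(&nat_tree_lock)
	get_current_nat_folio()			/* re-reads the same blocks from disk */

In that window the earlier readahead just becomes wasted I/O.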

Thanks,

> In this patch, nat_tree_lock is not held when reading ahead nat pages, but
> in fact we need it. Am I right?
> 
>>>
>>> Signed-off-by: wangzijie <wangzijie1@honor.com>
>>> ---
>>>  fs/f2fs/f2fs.h |  1 +
>>>  fs/f2fs/node.c | 70 ++++++++++++++++++++++++--------------------------
>>>  fs/f2fs/node.h |  2 +-
>>>  3 files changed, 35 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>> index 46be75605..b27cc059f 100644
>>> --- a/fs/f2fs/f2fs.h
>>> +++ b/fs/f2fs/f2fs.h
>>> @@ -975,6 +975,7 @@ struct f2fs_nm_info {
>>>  	struct radix_tree_root nat_set_root;/* root of the nat set cache */
>>>  	struct f2fs_rwsem nat_tree_lock;	/* protect nat entry tree */
>>>  	struct list_head nat_entries;	/* cached nat entry list (clean) */
>>> +	struct list_head nat_dirty_set[NAT_ENTRY_PER_BLOCK + 1];	/* store dirty nat set */
>>>  	spinlock_t nat_list_lock;	/* protect clean nat entry list */
>>>  	unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */
>>>  	unsigned int nat_blocks;	/* # of nat blocks */
>>> diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
>>> index 27743b93e..87c975ee8 100644
>>> --- a/fs/f2fs/node.c
>>> +++ b/fs/f2fs/node.c
>>> @@ -244,6 +244,12 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
>>>  	__free_nat_entry(e);
>>>  }
>>>  
>>> +static void __relocate_nat_entry_set(struct f2fs_nm_info *nm_i,
>>> +							struct nat_entry_set *set)
>>> +{
>>> +	list_move_tail(&set->set_list, &nm_i->nat_dirty_set[set->entry_cnt]);
>>> +}
>>> +
>>>  static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>>>  							struct nat_entry *ne)
>>>  {
>>> @@ -260,6 +266,7 @@ static struct nat_entry_set *__grab_nat_entry_set(struct f2fs_nm_info *nm_i,
>>>  		head->set = set;
>>>  		head->entry_cnt = 0;
>>>  		f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head);
>>> +		__relocate_nat_entry_set(nm_i, head);
>>>  	}
>>>  	return head;
>>>  }
>>> @@ -279,8 +286,10 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>>  	 * 2. update old block address to new one;
>>>  	 */
>>>  	if (!new_ne && (get_nat_flag(ne, IS_PREALLOC) ||
>>> -				!get_nat_flag(ne, IS_DIRTY)))
>>> +				!get_nat_flag(ne, IS_DIRTY))) {
>>>  		head->entry_cnt++;
>>> +		__relocate_nat_entry_set(nm_i, head);
>>> +	}
>>>  
>>>  	set_nat_flag(ne, IS_PREALLOC, new_ne);
>>>  
>>> @@ -309,6 +318,7 @@ static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i,
>>>  
>>>  	set_nat_flag(ne, IS_DIRTY, false);
>>>  	set->entry_cnt--;
>>> +	__relocate_nat_entry_set(nm_i, set);
>>>  	nm_i->nat_cnt[DIRTY_NAT]--;
>>>  	nm_i->nat_cnt[RECLAIMABLE_NAT]++;
>>>  }
>>> @@ -2976,24 +2986,6 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
>>>  	up_write(&curseg->journal_rwsem);
>>>  }
>>>  
>>> -static void __adjust_nat_entry_set(struct nat_entry_set *nes,
>>> -						struct list_head *head, int max)
>>> -{
>>> -	struct nat_entry_set *cur;
>>> -
>>> -	if (nes->entry_cnt >= max)
>>> -		goto add_out;
>>> -
>>> -	list_for_each_entry(cur, head, set_list) {
>>> -		if (cur->entry_cnt >= nes->entry_cnt) {
>>> -			list_add(&nes->set_list, cur->set_list.prev);
>>> -			return;
>>> -		}
>>> -	}
>>> -add_out:
>>> -	list_add_tail(&nes->set_list, head);
>>> -}
>>> -
>>>  static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
>>>  		const struct f2fs_nat_block *nat_blk)
>>>  {
>>> @@ -3095,6 +3087,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi,
>>>  
>>>  	/* Allow dirty nats by node block allocation in write_begin */
>>>  	if (!set->entry_cnt) {
>>> +		list_del(&set->set_list);
>>>  		radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
>>>  		kmem_cache_free(nat_entry_set_slab, set);
>>>  	}
>>> @@ -3109,11 +3102,8 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>>  	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
>>>  	struct f2fs_journal *journal = curseg->journal;
>>> -	struct nat_entry_set *setvec[NAT_VEC_SIZE];
>>>  	struct nat_entry_set *set, *tmp;
>>> -	unsigned int found;
>>> -	nid_t set_idx = 0;
>>> -	LIST_HEAD(sets);
>>> +	int i;
>>>  	int err = 0;
>>>  
>>>  	/*
>>> @@ -3129,6 +3119,16 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  	if (!nm_i->nat_cnt[DIRTY_NAT])
>>>  		return 0;
>>>  
>>> +	/* readahead sets which cannot be moved to journal */
>>> +	if (!__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) {
>>> +		for (i = MAX_NAT_JENTRIES(journal); i <= NAT_ENTRY_PER_BLOCK; i++) {
>>> +			list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>>> +				f2fs_ra_meta_pages(sbi, set->set, 1,
>>> +								META_NAT, true);
>>> +			}
>>> +		}
>>> +	}
>>> +
>>>  	f2fs_down_write(&nm_i->nat_tree_lock);
>>>  
>>>  	/*
>>> @@ -3141,21 +3141,13 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
>>>  			nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL))
>>>  		remove_nats_in_journal(sbi);
>>>  
>>> -	while ((found = __gang_lookup_nat_set(nm_i,
>>> -					set_idx, NAT_VEC_SIZE, setvec))) {
>>> -		unsigned idx;
>>> -
>>> -		set_idx = setvec[found - 1]->set + 1;
>>> -		for (idx = 0; idx < found; idx++)
>>> -			__adjust_nat_entry_set(setvec[idx], &sets,
>>> -						MAX_NAT_JENTRIES(journal));
>>> -	}
>>> -
>>>  	/* flush dirty nats in nat entry set */
>>> -	list_for_each_entry_safe(set, tmp, &sets, set_list) {
>>> -		err = __flush_nat_entry_set(sbi, set, cpc);
>>> -		if (err)
>>> -			break;
>>> +	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++) {
>>> +		list_for_each_entry_safe(set, tmp, &nm_i->nat_dirty_set[i], set_list) {
>>> +			err = __flush_nat_entry_set(sbi, set, cpc);
>>> +			if (err)
>>> +				break;
>>> +		}
>>>  	}
>>>  
>>>  	f2fs_up_write(&nm_i->nat_tree_lock);
>>> @@ -3249,6 +3241,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>>  	struct f2fs_nm_info *nm_i = NM_I(sbi);
>>>  	unsigned char *version_bitmap;
>>>  	unsigned int nat_segs;
>>> +	int i;
>>>  	int err;
>>>  
>>>  	nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr);
>>> @@ -3275,6 +3268,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
>>>  	INIT_LIST_HEAD(&nm_i->nat_entries);
>>>  	spin_lock_init(&nm_i->nat_list_lock);
>>>  
>>> +	for (i = 0; i <= NAT_ENTRY_PER_BLOCK; i++)
>>> +		INIT_LIST_HEAD(&nm_i->nat_dirty_set[i]);
>>> +
>>>  	mutex_init(&nm_i->build_lock);
>>>  	spin_lock_init(&nm_i->nid_list_lock);
>>>  	init_f2fs_rwsem(&nm_i->nat_tree_lock);
>>> diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
>>> index 030390543..d805d4ce7 100644
>>> --- a/fs/f2fs/node.h
>>> +++ b/fs/f2fs/node.h
>>> @@ -158,7 +158,7 @@ enum mem_type {
>>>  };
>>>  
>>>  struct nat_entry_set {
>>> -	struct list_head set_list;	/* link with other nat sets */
>>> +	struct list_head set_list;	/* link with nat sets which have same entry_cnt */
>>>  	struct list_head entry_list;	/* link with dirty nat entries */
>>>  	nid_t set;			/* set number*/
>>>  	unsigned int entry_cnt;		/* the # of nat entries in set */
> 


