From: Baokun Li <libaokun1@huawei.com>
To: <linux-ext4@vger.kernel.org>
Cc: <tytso@mit.edu>, <jack@suse.cz>, <adilger.kernel@dilger.ca>,
	<ojaswin@linux.ibm.com>, <linux-kernel@vger.kernel.org>,
	<yi.zhang@huawei.com>, <yangerkun@huawei.com>,
	<libaokun1@huawei.com>
Subject: [PATCH v2 16/16] ext4: ensure global ordered traversal across all free groups xarrays
Date: Mon, 23 Jun 2025 15:33:04 +0800	[thread overview]
Message-ID: <20250623073304.3275702-17-libaokun1@huawei.com> (raw)
In-Reply-To: <20250623073304.3275702-1-libaokun1@huawei.com>

Although we now perform an ordered traversal within each xarray (scanning
the groups to the right of the start group first, then wrapping around to
those on the left), this ordering only holds within a single xarray. Since
there are multiple such xarrays, we cannot guarantee a linear-like
traversal in which all groups to the right of the start group are visited
before any group to its left.

Therefore, this change modifies the traversal to first iterate over the
right-hand groups in all xarrays, and only then over the left-hand groups
in all xarrays. This yields a linear-like scan order and mitigates
contention between the block allocation and block freeing paths.
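
To make the intended order concrete, here is a minimal userspace sketch
(not kernel code): the array contents and the scan_range() helper are
illustrative only. It scans the [start, ngroups) range of every list
first and only then wraps around to [0, start), i.e. the same two-pass
pattern the per-criteria loops adopt in this patch once the wrap-around
handling is lifted out of ext4_mb_scan_groups_xa_range():

  #include <stdio.h>

  #define NUM_LISTS 3
  #define LIST_LEN  4

  /* Groups present in each "xarray", kept in ascending order. */
  static const int lists[NUM_LISTS][LIST_LEN] = {
      { 1, 5,  9, 13 },
      { 2, 6, 10, 14 },
      { 3, 7, 11, 15 },
  };

  /* Visit every group in [lo, hi) that the given list contains. */
  static void scan_range(int idx, int lo, int hi)
  {
      for (int i = 0; i < LIST_LEN; i++)
          if (lists[idx][i] >= lo && lists[idx][i] < hi)
              printf("list %d: group %d\n", idx, lists[idx][i]);
  }

  int main(void)
  {
      int ngroups = 16, start = 9;

      /* Pass 1: the right-hand range of every list ... */
      for (int idx = 0; idx < NUM_LISTS; idx++)
          scan_range(idx, start, ngroups);

      /* Pass 2: ... then wrap around to the left-hand range. */
      for (int idx = 0; idx < NUM_LISTS; idx++)
          scan_range(idx, 0, start);

      return 0;
  }

With start = 9, every group at or to the right of group 9 is visited in
all three lists before any group to the left of it, which is the
linear-like order described above.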

Performance test data follows:

CPU: Kunpeng 920   |          P80            |            P1           |
Memory: 512GB      |-------------------------|-------------------------|
Disk: 960GB SSD    | base  |    patched      | base  |    patched      |
-------------------|-------|-----------------|-------|-----------------|
mb_optimize_scan=0 | 20976 | 20619  (-1.7%)  | 319396| 299238 (-6.3%)  |
mb_optimize_scan=1 | 14580 | 20119  (+37.9%) | 319237| 315268 (-1.2%)  |

CPU: AMD 9654 * 2  |          P96            |            P1           |
Memory: 1536GB     |-------------------------|-------------------------|
Disk: 960GB SSD    | base  |    patched      | base  |    patched      |
-------------------|-------|-----------------|-------|-----------------|
mb_optimize_scan=0 | 51713 | 51983 (+0.5%)   | 206655| 207033 (+0.18%) |
mb_optimize_scan=1 | 35527 | 48486 (+36.4%)  | 212574| 202415 (-4.7%)  |

Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 fs/ext4/mballoc.c | 69 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 22 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d8372a649a0c..d26a0e8e3f7e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -877,22 +877,20 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
 		     grp->bb_group, new, ret);
 }
 
-static int ext4_mb_scan_groups_xarray(struct ext4_allocation_context *ac,
-				      struct xarray *xa, ext4_group_t start)
+static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
+					struct xarray *xa,
+					ext4_group_t start, ext4_group_t end)
 {
 	struct super_block *sb = ac->ac_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	enum criteria cr = ac->ac_criteria;
 	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	unsigned long group = start;
-	ext4_group_t end;
 	struct ext4_group_info *grp;
 
-	if (WARN_ON_ONCE(start >= ngroups))
+	if (WARN_ON_ONCE(end >= ngroups || start > end))
 		return 0;
-	end = ngroups - 1;
 
-wrap_around:
 	xa_for_each_range(xa, group, grp, start, end) {
 		int err;
 
@@ -906,28 +904,23 @@ static int ext4_mb_scan_groups_xarray(struct ext4_allocation_context *ac,
 		cond_resched();
 	}
 
-	if (start) {
-		end = start - 1;
-		start = 0;
-		goto wrap_around;
-	}
-
 	return 0;
 }
 
 /*
  * Find a suitable group of given order from the largest free orders xarray.
  */
-static int
-ext4_mb_scan_groups_largest_free_order(struct ext4_allocation_context *ac,
-				       int order, ext4_group_t start)
+static inline int
+ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac,
+					     int order, ext4_group_t start,
+					     ext4_group_t end)
 {
 	struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order];
 
 	if (xa_empty(xa))
 		return 0;
 
-	return ext4_mb_scan_groups_xarray(ac, xa, start);
+	return ext4_mb_scan_groups_xa_range(ac, xa, start, end - 1);
 }
 
 /*
@@ -940,13 +933,23 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	int i;
 	int ret = 0;
+	ext4_group_t start, end;
 
 	ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
+	start = group;
+	end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
 	for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
-		ret = ext4_mb_scan_groups_largest_free_order(ac, i, group);
+		ret = ext4_mb_scan_groups_largest_free_order_range(ac, i,
+								   start, end);
 		if (ret || ac->ac_status != AC_STATUS_CONTINUE)
 			goto out;
 	}
+	if (start) {
+		end = start;
+		start = 0;
+		goto wrap_around;
+	}
 
 	if (sbi->s_mb_stats)
 		atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
@@ -961,15 +964,17 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
 /*
  * Find a suitable group of given order from the average fragments xarray.
  */
-static int ext4_mb_scan_groups_avg_frag_order(struct ext4_allocation_context *ac,
-					      int order, ext4_group_t start)
+static int
+ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac,
+					 int order, ext4_group_t start,
+					 ext4_group_t end)
 {
 	struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order];
 
 	if (xa_empty(xa))
 		return 0;
 
-	return ext4_mb_scan_groups_xarray(ac, xa, start);
+	return ext4_mb_scan_groups_xa_range(ac, xa, start, end - 1);
 }
 
 /*
@@ -981,14 +986,24 @@ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	int i, ret = 0;
+	ext4_group_t start, end;
 
 	ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
+	start = group;
+	end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
 	i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
 	for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
-		ret = ext4_mb_scan_groups_avg_frag_order(ac, i, group);
+		ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i,
+							       start, end);
 		if (ret || ac->ac_status != AC_STATUS_CONTINUE)
 			goto out;
 	}
+	if (start) {
+		end = start;
+		start = 0;
+		goto wrap_around;
+	}
 
 	if (sbi->s_mb_stats)
 		atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
@@ -1025,6 +1040,7 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	int i, order, min_order;
 	unsigned long num_stripe_clusters = 0;
+	ext4_group_t start, end;
 
 	/*
 	 * mb_avg_fragment_size_order() returns order in a way that makes
@@ -1057,6 +1073,9 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
 		min_order = fls(ac->ac_o_ex.fe_len);
 
 	ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
+	start = group;
+	end = ext4_get_groups_count(ac->ac_sb);
+wrap_around:
 	for (i = order; i >= min_order; i--) {
 		int frag_order;
 		/*
@@ -1079,10 +1098,16 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
 		frag_order = mb_avg_fragment_size_order(ac->ac_sb,
 							ac->ac_g_ex.fe_len);
 
-		ret = ext4_mb_scan_groups_avg_frag_order(ac, frag_order, group);
+		ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order,
+							       start, end);
 		if (ret || ac->ac_status != AC_STATUS_CONTINUE)
 			goto out;
 	}
+	if (start) {
+		end = start;
+		start = 0;
+		goto wrap_around;
+	}
 
 	/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
 	ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
-- 
2.46.1


Thread overview: 51+ messages
2025-06-23  7:32 [PATCH v2 00/16] ext4: better scalability for ext4 block allocation Baokun Li
2025-06-23  7:32 ` [PATCH v2 01/16] ext4: add ext4_try_lock_group() to skip busy groups Baokun Li
2025-06-27 18:06   ` Jan Kara
2025-07-14  6:53   ` Ojaswin Mujoo
2025-06-23  7:32 ` [PATCH v2 02/16] ext4: remove unnecessary s_mb_last_start Baokun Li
2025-06-27 18:15   ` Jan Kara
2025-06-30  3:32     ` Baokun Li
2025-06-30  7:31       ` Jan Kara
2025-06-30  7:52         ` Baokun Li
2025-07-14  7:00           ` Ojaswin Mujoo
2025-06-23  7:32 ` [PATCH v2 03/16] ext4: remove unnecessary s_md_lock on update s_mb_last_group Baokun Li
2025-06-27 18:19   ` Jan Kara
2025-06-30  3:48     ` Baokun Li
2025-06-30  7:47       ` Jan Kara
2025-06-30  9:21         ` Baokun Li
2025-06-30 16:32           ` Jan Kara
2025-07-01  2:39             ` Baokun Li
2025-07-01 12:21               ` Jan Kara
2025-07-01 13:17                 ` Baokun Li
2025-07-08 13:08                 ` Baokun Li
2025-07-10 14:38                   ` Jan Kara
2025-07-14  3:01                     ` Theodore Ts'o
2025-07-14  7:00                       ` Baokun Li
2025-07-01  2:57   ` kernel test robot
2025-06-23  7:32 ` [PATCH v2 04/16] ext4: utilize multiple global goals to reduce contention Baokun Li
2025-06-27 18:31   ` Jan Kara
2025-06-30  6:50     ` Baokun Li
2025-06-30  8:38       ` Jan Kara
2025-06-30 10:02         ` Baokun Li
2025-06-30 17:41           ` Jan Kara
2025-07-01  3:32             ` Baokun Li
2025-07-01 11:53               ` Jan Kara
2025-07-01 12:12                 ` Baokun Li
2025-06-23  7:32 ` [PATCH v2 05/16] ext4: get rid of some obsolete EXT4_MB_HINT flags Baokun Li
2025-06-23  7:32 ` [PATCH v2 06/16] ext4: fix typo in CR_GOAL_LEN_SLOW comment Baokun Li
2025-06-23  7:32 ` [PATCH v2 07/16] ext4: convert sbi->s_mb_free_pending to atomic_t Baokun Li
2025-06-27 18:33   ` Jan Kara
2025-06-23  7:32 ` [PATCH v2 08/16] ext4: merge freed extent with existing extents before insertion Baokun Li
2025-06-27 19:11   ` Jan Kara
2025-06-23  7:32 ` [PATCH v2 09/16] ext4: fix zombie groups in average fragment size lists Baokun Li
2025-06-27 19:14   ` Jan Kara
2025-06-30  6:53     ` Baokun Li
2025-06-23  7:32 ` [PATCH v2 10/16] ext4: fix largest free orders lists corruption on mb_optimize_scan switch Baokun Li
2025-06-27 19:34   ` Jan Kara
2025-06-30  7:34     ` Baokun Li
2025-06-23  7:32 ` [PATCH v2 11/16] ext4: factor out __ext4_mb_scan_group() Baokun Li
2025-06-23  7:33 ` [PATCH v2 12/16] ext4: factor out ext4_mb_might_prefetch() Baokun Li
2025-06-23  7:33 ` [PATCH v2 13/16] ext4: factor out ext4_mb_scan_group() Baokun Li
2025-06-23  7:33 ` [PATCH v2 14/16] ext4: convert free group lists to ordered xarrays Baokun Li
2025-06-23  7:33 ` [PATCH v2 15/16] ext4: refactor choose group to scan group Baokun Li
2025-06-23  7:33 ` [PATCH v2 16/16] ext4: ensure global ordered traversal across all free groups xarrays Baokun Li [this message]
