* [PATCH 1/2] ext4: refactor size prediction into helper functions
2025-12-08 8:32 [PATCH 0/2] ext4: align preallocation size to stripe width Yu Kuai
@ 2025-12-08 8:32 ` Yu Kuai
2025-12-08 8:32 ` [PATCH 2/2] ext4: align preallocation size to stripe width Yu Kuai
1 sibling, 0 replies; 5+ messages in thread
From: Yu Kuai @ 2025-12-08 8:32 UTC
To: tytso, adilger.kernel, linux-ext4; +Cc: linux-kernel, yukuai
The ext4_mb_normalize_request() function contains a large if-else
ladder for predicting file size and uses the NRL_CHECK_SIZE macro.
Factor these out into proper helper functions to improve code
readability and maintainability.
This patch introduces:
- ext4_mb_check_size(): a static inline replacement for the NRL_CHECK_SIZE macro
- ext4_mb_predict_file_size(): extracts the size prediction logic
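The helper keeps the macro's semantics: a request falls into a given
bucket either because it fits within that bucket's size or because the
filesystem's maximum free chunk is no larger than the bucket's chunk
size. Call sites therefore convert mechanically, e.g. (taken from the
hunk below):
    - } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
    + } else if (ext4_mb_check_size(size, 4 * 1024 * 1024, max, 2 * 1024)) {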
No functional change.
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
fs/ext4/mballoc.c | 101 +++++++++++++++++++++++++++-------------------
1 file changed, 60 insertions(+), 41 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9087183602e4..eb46a4f5fb4f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4489,6 +4489,63 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
*end = new_end;
}
+/*
+ * Check if request size allows for chunk-based allocation
+ */
+static inline bool ext4_mb_check_size(loff_t req, loff_t size,
+ int max, int chunk_size)
+{
+ return (req <= size) || (max <= chunk_size);
+}
+
+/*
+ * Predict file size for preallocation. Returns the predicted size
+ * in bytes and sets start_off if alignment is needed for large files.
+ */
+static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
+ struct ext4_allocation_context *ac,
+ loff_t size, loff_t *start_off)
+{
+ int bsbits = ac->ac_sb->s_blocksize_bits;
+ int max = 2 << bsbits;
+
+ *start_off = 0;
+
+ if (size <= 16 * 1024) {
+ size = 16 * 1024;
+ } else if (size <= 32 * 1024) {
+ size = 32 * 1024;
+ } else if (size <= 64 * 1024) {
+ size = 64 * 1024;
+ } else if (size <= 128 * 1024) {
+ size = 128 * 1024;
+ } else if (size <= 256 * 1024) {
+ size = 256 * 1024;
+ } else if (size <= 512 * 1024) {
+ size = 512 * 1024;
+ } else if (size <= 1024 * 1024) {
+ size = 1024 * 1024;
+ } else if (ext4_mb_check_size(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+ *start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+ (21 - bsbits)) << 21;
+ size = 2 * 1024 * 1024;
+ } else if (ext4_mb_check_size(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+ *start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+ (22 - bsbits)) << 22;
+ size = 4 * 1024 * 1024;
+ } else if (ext4_mb_check_size(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
+ (8<<20)>>bsbits, max, 8 * 1024)) {
+ *start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+ (23 - bsbits)) << 23;
+ size = 8 * 1024 * 1024;
+ } else {
+ *start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+ size = (loff_t)EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits;
+ }
+
+ return size;
+}
+
/*
* Normalization means making request better in terms of
* size and alignment
@@ -4500,7 +4557,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_super_block *es = sbi->s_es;
int bsbits, max;
- loff_t size, start_off, end;
+ loff_t size, start_off = 0, end;
loff_t orig_size __maybe_unused;
ext4_lblk_t start;
@@ -4533,47 +4590,9 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
size = i_size_read(ac->ac_inode);
orig_size = size;
- /* max size of free chunks */
- max = 2 << bsbits;
+ /* Predict file size for preallocation */
+ size = ext4_mb_predict_file_size(sbi, ac, size, &start_off);
-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \
- (req <= (size) || max <= (chunk_size))
-
- /* first, try to predict filesize */
- /* XXX: should this table be tunable? */
- start_off = 0;
- if (size <= 16 * 1024) {
- size = 16 * 1024;
- } else if (size <= 32 * 1024) {
- size = 32 * 1024;
- } else if (size <= 64 * 1024) {
- size = 64 * 1024;
- } else if (size <= 128 * 1024) {
- size = 128 * 1024;
- } else if (size <= 256 * 1024) {
- size = 256 * 1024;
- } else if (size <= 512 * 1024) {
- size = 512 * 1024;
- } else if (size <= 1024 * 1024) {
- size = 1024 * 1024;
- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
- (21 - bsbits)) << 21;
- size = 2 * 1024 * 1024;
- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
- (22 - bsbits)) << 22;
- size = 4 * 1024 * 1024;
- } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
- (8<<20)>>bsbits, max, 8 * 1024)) {
- start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
- (23 - bsbits)) << 23;
- size = 8 * 1024 * 1024;
- } else {
- start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
- size = (loff_t) EXT4_C2B(sbi,
- ac->ac_o_ex.fe_len) << bsbits;
- }
size = size >> bsbits;
start = start_off >> bsbits;
--
2.51.0
* [PATCH 2/2] ext4: align preallocation size to stripe width
2025-12-08 8:32 [PATCH 0/2] ext4: align preallocation size to stripe width Yu Kuai
2025-12-08 8:32 ` [PATCH 1/2] ext4: refactor size prediction into helper functions Yu Kuai
@ 2025-12-08 8:32 ` Yu Kuai
2025-12-10 15:49 ` kernel test robot
2025-12-11 23:08 ` kernel test robot
1 sibling, 2 replies; 5+ messages in thread
From: Yu Kuai @ 2025-12-08 8:32 UTC
To: tytso, adilger.kernel, linux-ext4; +Cc: linux-kernel, yukuai
When stripe width (io_opt) is configured, align the predicted
preallocation size to stripe boundaries. This improves I/O performance
on RAID and other striped storage devices by avoiding partial-stripe
writes.
The current implementation uses hardcoded size predictions (16KB, 32KB,
64KB, etc.) that are not stripe-aware. This causes physical block
offsets on disk to be misaligned with stripe boundaries, leading to
read-modify-write penalties on RAID arrays and reduced performance.
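(As background: on RAID5/6, a write that covers only part of a stripe
forces the array to read the unmodified chunks, or the old data and
parity, before it can recompute and write the new parity, so each
misaligned extent boundary can turn a purely sequential write into
extra reads.)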
This patch makes size prediction stripe-aware: when s_stripe is set,
the predicted size is a power-of-two multiple of the stripe size (1x,
2x, 4x, 8x, 16x or 32x), and anything larger is rounded up to a stripe
multiple. Additionally, the start offset is aligned to a stripe
boundary using rounddown(), which works correctly for both power-of-2
and non-power-of-2 stripe sizes. For devices without a stripe
configuration, the original behavior is preserved.
The predicted size is limited to the maximum free chunk size
(2 << bsbits blocks) to keep allocation requests reasonable, with the
limit rounded down so that stripe alignment is maintained.
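As a worked example, assuming 4 KiB blocks (bsbits = 12, implied by the
64 KiB chunk / 16 blocks-per-disk figures in the test case below) and
s_stripe = 496:
    stripe_bytes   = 496 << 12          = 1984 KiB
    max free chunk = (2 << 12) blocks   = 32 MiB
    a 3 MiB file   -> fits in 2 stripes -> predict 3968 KiB
    a 100 MiB file -> rounded up to a stripe multiple, then capped to
                      rounddown(32 MiB, 1984 KiB) = 31744 KiB (16 stripes)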
Test case:
Device: 32-disk RAID5, 64KB chunk size
Stripe: 496 blocks (31 data disks × 16 blocks/disk)
Before patch (misaligned physical offsets):
ext: logical_offset: physical_offset: length:
0: 0.. 63487: 34816.. 98303: 63488
1: 63488..126975: 100352..163839: 63488
2: 126976..190463: 165888..229375: 63488
3: 190464..253951: 231424..294911: 63488
4: 253952..262143: 296960..305151: 8192
Physical offsets: 34816 % 496 = 96 (misaligned)
100352 % 496 = 160 (misaligned)
165888 % 496 = 224 (misaligned)
→ Causes partial stripe writes on RAID
After patch (aligned physical offsets):
ext: logical_offset: physical_offset: length:
0: 0.. 17855: 9920.. 27775: 17856
1: 17856.. 42159: 34224.. 58527: 24304
2: 42160.. 73407: 65968.. 97215: 31248
3: 73408.. 97711: 99696..123999: 24304
... (all extents aligned until EOF)
Physical offsets: 9920 % 496 = 0 (aligned)
34224 % 496 = 0 (aligned)
65968 % 496 = 0 (aligned)
Extent lengths: 17856=496×36, 24304=496×49, 31248=496×63
→ Optimal RAID performance, no partial stripe writes
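(Extent listings in this format can be obtained with a tool such as
filefrag -v; alignment can then be verified by taking each
physical_offset modulo the stripe size, 496 here, as shown above.)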
Benefits:
- Avoids read-modify-write cycles on RAID arrays caused by misaligned preallocations
- Improves sequential write performance on striped devices
- Maintains alignment throughout the file's lifetime
- Works with any stripe size (power-of-2 or not)
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
fs/ext4/mballoc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 58 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index eb46a4f5fb4f..dbd0b239cc96 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4500,7 +4500,10 @@ static inline bool ext4_mb_check_size(loff_t req, loff_t size,
/*
* Predict file size for preallocation. Returns the predicted size
- * in bytes and sets start_off if alignment is needed for large files.
+ * in bytes. When stripe width (io_opt) is configured, returns sizes
+ * that are multiples of stripe for optimal RAID performance.
+ *
+ * Sets start_off if alignment is needed for large files.
*/
static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
struct ext4_allocation_context *ac,
@@ -4511,6 +4514,59 @@ static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
*start_off = 0;
+ /*
+ * For RAID/striped devices, align preallocation size to stripe
+ * width (io_opt) for optimal I/O performance. Use power-of-2
+ * multiples of stripe size for size prediction.
+ */
+ if (sbi->s_stripe) {
+ loff_t stripe_bytes = (loff_t)sbi->s_stripe << bsbits;
+ loff_t max_size = (loff_t)max << bsbits;
+
+ /*
+ * TODO: If stripe is larger than max chunk size, we can't
+ * do stripe-aligned allocation. Fall back to traditional
+ * size prediction. This can happen with very large stripe
+ * configurations on small block sizes.
+ */
+ if (stripe_bytes > max_size)
+ goto no_stripe;
+
+ if (size <= stripe_bytes) {
+ size = stripe_bytes;
+ } else if (size <= stripe_bytes * 2) {
+ size = stripe_bytes * 2;
+ } else if (size <= stripe_bytes * 4) {
+ size = stripe_bytes * 4;
+ } else if (size <= stripe_bytes * 8) {
+ size = stripe_bytes * 8;
+ } else if (size <= stripe_bytes * 16) {
+ size = stripe_bytes * 16;
+ } else if (size <= stripe_bytes * 32) {
+ size = stripe_bytes * 32;
+ } else {
+ size = roundup(size, stripe_bytes);
+ }
+
+ /*
+ * Limit size to max free chunk size, rounded down to
+ * stripe alignment.
+ */
+ if (size > max_size)
+ size = rounddown(max_size, stripe_bytes);
+
+ /*
+ * Align start offset to stripe boundary for large allocations
+ * to ensure both start and size are stripe-aligned.
+ */
+ *start_off = rounddown((loff_t)ac->ac_o_ex.fe_logical << bsbits,
+ stripe_bytes);
+
+ return size;
+ }
+
+no_stripe:
+ /* No stripe: use traditional hardcoded size prediction */
if (size <= 16 * 1024) {
size = 16 * 1024;
} else if (size <= 32 * 1024) {
@@ -4556,7 +4612,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_super_block *es = sbi->s_es;
- int bsbits, max;
+ int bsbits;
loff_t size, start_off = 0, end;
loff_t orig_size __maybe_unused;
ext4_lblk_t start;
--
2.51.0