public inbox for linux-ext4@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] ext4: make mballoc max prealloc size configurable
@ 2026-04-10  3:56 guzebing
  0 siblings, 0 replies; only message in thread
From: guzebing @ 2026-04-10  3:56 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, guzebing
  Cc: linux-kernel, linux-ext4

From: Guzebing <guzebing@bytedance.com>

Add a per-superblock sysfs knob, mb_max_prealloc_kb (minimum 8 MiB, rounded
up to a power of two), and use it in request normalization.

When multiple tasks write to different files on the same filesystem
concurrently, their allocations interleave and each file ends up with
8 MiB extents, which is the current hard-coded preallocation size. If
the preallocation size is increased, the resulting extent size grows
accordingly. Due to the readahead mechanism on NVMe SSDs, files with
larger extents achieve higher sequential read throughput.

On an ext4 filesystem on an NVMe Gen4 data drive, dd read throughput
for a file with 8 MiB extents is 455 MB/s, while for a file with
32 MiB extents it reaches 702 MB/s.

Steps to reproduce:
1. Configure the maximum preallocation size to 8 MiB or 32 MiB:
echo 8192 > /sys/fs/ext4/nvme13n1/mb_max_prealloc_kb
echo 32768 > /sys/fs/ext4/nvme13n1/mb_max_prealloc_kb

2. Run the following commands simultaneously so that the extents of
the two files are physically interleaved, resulting in 8 MiB or 32 MiB
extents:
dd if=/dev/zero of=/mnt/store1/501.txt bs=128K count=80K oflag=direct
dd if=/dev/zero of=/mnt/store1/502.txt bs=128K count=80K oflag=direct

3. Read back the file and measure the read throughput:
dd if=/mnt/store1/501.txt of=/dev/null bs=128K count=80K iflag=direct

Signed-off-by: Guzebing <guzebing@bytedance.com>
---
 Documentation/ABI/testing/sysfs-fs-ext4 |  8 +++++++
 fs/ext4/ext4.h                          |  1 +
 fs/ext4/mballoc.c                       |  2 +-
 fs/ext4/super.c                         |  1 +
 fs/ext4/sysfs.c                         | 28 ++++++++++++++++++++++++-
 5 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index 2edd0a6672d3a..316ae1d1ec18b 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -48,6 +48,14 @@ Description:
 		will have its blocks allocated out of its own unique
 		preallocation pool.
 
+What:		/sys/fs/ext4/<disk>/mb_max_prealloc_kb
+Date:		April 2026
+Contact:	"Linux Ext4 Development List" <linux-ext4@vger.kernel.org>
+Description:
+		Maximum size (in kilobytes) used by the multiblock allocator's
+		normalized request preallocation heuristic. Values are rounded
+		up to a power of two and clamped to a minimum of 8192 (8MiB).
+
 What:		/sys/fs/ext4/<disk>/inode_readahead_blks
 Date:		March 2008
 Contact:	"Theodore Ts'o" <tytso@mit.edu>
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7617e2d454ea5..bce99740740f5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1634,6 +1634,7 @@ struct ext4_sb_info {
 	unsigned int s_mb_best_avail_max_trim_order;
 	unsigned int s_sb_update_sec;
 	unsigned int s_sb_update_kb;
+	unsigned int s_mb_max_prealloc_kb;
 
 	/* where last allocation was done - for stream allocation */
 	ext4_group_t *s_mb_last_groups;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bb58eafb87bcd..f5f63c56fcdac 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4589,7 +4589,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 					(8<<20)>>bsbits, max, 8 * 1024)) {
 		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
 							(23 - bsbits)) << 23;
-		size = 8 * 1024 * 1024;
+		size = (loff_t)sbi->s_mb_max_prealloc_kb << 10;
 	} else {
 		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
 		size	  = (loff_t) EXT4_C2B(sbi,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a34efb44e73d7..f815e31657cc9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5447,6 +5447,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		sbi->s_stripe = 0;
 	}
 	sbi->s_extent_max_zeroout_kb = 32;
+	sbi->s_mb_max_prealloc_kb = 8 * 1024;
 
 	/*
 	 * set up enough so that it can read an inode
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 923b375e017fa..6339492eb2fa7 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -10,6 +10,8 @@
 
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/log2.h>
+#include <linux/limits.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
@@ -41,6 +43,7 @@ typedef enum {
 	attr_pointer_atomic,
 	attr_journal_task,
 	attr_err_report_sec,
+	attr_mb_max_prealloc_kb,
 } attr_id_t;
 
 typedef enum {
@@ -115,6 +118,25 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi,
 	return count;
 }
 
+/* Set s_mb_max_prealloc_kb from @buf; returns @count or a negative errno. */
+static ssize_t mb_max_prealloc_kb_store(struct ext4_sb_info *sbi,
+					const char *buf, size_t count)
+{
+	unsigned int v;
+	int ret = kstrtouint(skip_spaces(buf), 0, &v);
+
+	if (ret)
+		return ret;
+	/* Above 2^31 the rounded value overflows unsigned int (and 32-bit long). */
+	if (v > (1U << 31))
+		return -EINVAL;
+	/* Floor at 8192 KiB (8 MiB), then round up to a power of two. */
+	if (v < 8192)
+		v = 8192;
+	sbi->s_mb_max_prealloc_kb = (unsigned int)roundup_pow_of_two(v);
+	return count;
+}
+
 static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
 				  const char *buf, size_t count)
 {
@@ -288,6 +310,7 @@ EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
 EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
 EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec);
 EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb);
+EXT4_ATTR_OFFSET(mb_max_prealloc_kb, 0644, mb_max_prealloc_kb, ext4_sb_info, s_mb_max_prealloc_kb);
 
 static unsigned int old_bump_val = 128;
 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
@@ -341,6 +364,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(last_trim_minblks),
 	ATTR_LIST(sb_update_sec),
 	ATTR_LIST(sb_update_kb),
+	ATTR_LIST(mb_max_prealloc_kb),
 	ATTR_LIST(err_report_sec),
 	NULL,
 };
@@ -431,6 +455,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
 	case attr_mb_order:
 	case attr_pointer_pi:
 	case attr_pointer_ui:
+	case attr_mb_max_prealloc_kb:
 		if (a->attr_ptr == ptr_ext4_super_block_offset)
 			return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
 		return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
@@ -557,6 +582,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
 		return reserved_clusters_store(sbi, buf, len);
 	case attr_inode_readahead:
 		return inode_readahead_blks_store(sbi, buf, len);
+	case attr_mb_max_prealloc_kb:
+		return mb_max_prealloc_kb_store(sbi, buf, len);
 	case attr_trigger_test_error:
 		return trigger_test_error(sbi, buf, len);
 	case attr_err_report_sec:
@@ -695,4 +722,3 @@ void ext4_exit_sysfs(void)
 	remove_proc_entry(proc_dirname, NULL);
 	ext4_proc_root = NULL;
 }
-
-- 
2.20.1


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2026-04-10  3:56 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-10  3:56 [PATCH] ext4: make mballoc max prealloc size configurable guzebing

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox