* [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
[not found] <20260225201520.220071-1-mario_lohajner.ref@rocketmail.com>
@ 2026-02-25 20:15 ` Mario Lohajner
2026-02-25 23:49 ` Andreas Dilger
0 siblings, 1 reply; 11+ messages in thread
From: Mario Lohajner @ 2026-02-25 20:15 UTC (permalink / raw)
To: tytso
Cc: libaokun1, adilger.kernel, linux-ext4, linux-kernel, yangerkun,
libaokun9, Mario Lohajner
V2 patch incorporating feedback from previous discussion:
- per-inode atomic cursors to enforce stream sequentiality
- per-CPU starting points to reduce contention
- allocator isolation maintained; regular allocator untouched
- name changed to rralloc to avoid confusion with "rotational"
- preliminary tests confirm expected performance
Files modified:
- fs/ext4/ext4.h
rralloc policy declared, per-CPU cursors & allocator vector
- fs/ext4/ialloc.c
initialize (zero) per-inode cursor
- fs/ext4/mballoc.h
expose allocator functions for vectoring in super.c
- fs/ext4/super.c
parse rralloc option, init per-CPU cursors and allocator vector
- fs/ext4/mballoc.c
add rotating allocator, vectored allocator
Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
---
fs/ext4/ext4.h | 10 +++-
fs/ext4/ialloc.c | 3 +-
fs/ext4/mballoc.c | 115 ++++++++++++++++++++++++++++++++++++++++++++--
fs/ext4/mballoc.h | 3 ++
fs/ext4/super.c | 33 ++++++++++++-
5 files changed, 157 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 293f698b7042..210332affd47 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -229,6 +229,9 @@ struct ext4_allocation_request {
unsigned int flags;
};
+/* rralloc show pointer type to compiler */
+struct ext4_allocation_context;
+
/*
* Logical to physical block mapping, used by ext4_map_blocks()
*
@@ -1032,7 +1035,8 @@ struct ext4_inode_info {
__le32 i_data[15]; /* unconverted */
__u32 i_dtime;
ext4_fsblk_t i_file_acl;
-
+ /* rralloc per inode cursor */
+ atomic_t cursor;
/*
* i_block_group is the number of the block group which contains
* this file's inode. Constant across the lifetime of the inode,
@@ -1217,6 +1221,7 @@ struct ext4_inode_info {
* Mount flags set via mount options or defaults
*/
#define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */
+#define EXT4_MOUNT_RRALLOC 0x00002 /* Use round-robin policy/allocator */
#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */
@@ -1546,6 +1551,9 @@ struct ext4_sb_info {
unsigned long s_mount_flags;
unsigned int s_def_mount_opt;
unsigned int s_def_mount_opt2;
+ /* rralloc per-cpu cursors and allocator vector */
+ ext4_group_t __percpu *s_rralloc_cursor;
+ int (*s_vectored_allocator)(struct ext4_allocation_context *ac);
ext4_fsblk_t s_sb_block;
atomic64_t s_resv_clusters;
kuid_t s_resuid;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index b20a1bf866ab..c72cee642eca 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -962,7 +962,8 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
if (!inode)
return ERR_PTR(-ENOMEM);
ei = EXT4_I(inode);
-
+ /* Zero the rralloc per-inode cursor */
+ atomic_set(&ei->cursor, 0);
/*
* Initialize owners and quota early so that we don't have to account
* for quota initialization worst case in standard inode creating
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 20e9fdaf4301..df3805bb4a2f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2266,9 +2266,19 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
folio_get(ac->ac_buddy_folio);
/* store last allocated for subsequent stream allocation */
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
- int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+ /* update global goals */
+ if (!test_opt(ac->ac_sb, RRALLOC)) {
+ int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals;
+
+ WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group);
+ } else {
+ /* update inode cursor and current per-cpu cursor */
+ ext4_group_t cursor = ac->ac_f_ex.fe_group;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
- WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group);
+ atomic_set(&ei->cursor, cursor);
+ *this_cpu_ptr(sbi->s_rralloc_cursor) = cursor;
+ }
}
/*
@@ -2991,7 +3001,7 @@ static int ext4_mb_scan_group(struct ext4_allocation_context *ac,
return ret;
}
-static noinline_for_stack int
+noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t i;
@@ -3111,6 +3121,102 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
return err;
}
+/* Rotating allocator (round-robin) */
+noinline_for_stack int
+ext4_mb_rotating_allocator(struct ext4_allocation_context *ac)
+{
+ ext4_group_t goal;
+ int err = 0;
+ struct super_block *sb = ac->ac_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_buddy e4b;
+ struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+ ext4_group_t start = *this_cpu_ptr(sbi->s_rralloc_cursor);
+
+ /* if inode cursor=0, use per-cpu cursor */
+ goal = atomic_cmpxchg(&ei->cursor, 0, start);
+ if (!goal)
+ goal = start;
+
+ ac->ac_g_ex.fe_group = goal;
+
+ /* first, try the goal */
+ err = ext4_mb_find_by_goal(ac, &e4b);
+ if (err || ac->ac_status == AC_STATUS_FOUND)
+ goto out;
+
+ /* RRallocation promotes stream behavior */
+ ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
+ ac->ac_flags |= EXT4_MB_HINT_FIRST;
+ ac->ac_flags &= ~EXT4_MB_HINT_GOAL_ONLY;
+ ac->ac_g_ex.fe_group = goal;
+ ac->ac_g_ex.fe_start = -1;
+ ac->ac_2order = 0;
+ ac->ac_criteria = CR_ANY_FREE;
+ ac->ac_e4b = &e4b;
+ ac->ac_prefetch_ios = 0;
+ ac->ac_first_err = 0;
+repeat:
+ while (ac->ac_criteria < EXT4_MB_NUM_CRS) {
+ err = ext4_mb_scan_groups(ac);
+ if (err)
+ goto out;
+
+ if (ac->ac_status != AC_STATUS_CONTINUE)
+ break;
+ }
+
+ if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
+ !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
+ /*
+ * We've been searching too long. Let's try to allocate
+ * the best chunk we've found so far
+ */
+ ext4_mb_try_best_found(ac, &e4b);
+ if (ac->ac_status != AC_STATUS_FOUND) {
+ int lost;
+
+ /*
+ * Someone more lucky has already allocated it.
+ * The only thing we can do is just take first
+ * found block(s)
+ */
+ lost = atomic_inc_return(&sbi->s_mb_lost_chunks);
+ mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n",
+ ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start,
+ ac->ac_b_ex.fe_len, lost);
+
+ ac->ac_b_ex.fe_group = 0;
+ ac->ac_b_ex.fe_start = 0;
+ ac->ac_b_ex.fe_len = 0;
+ ac->ac_status = AC_STATUS_CONTINUE;
+ ac->ac_flags |= EXT4_MB_HINT_FIRST;
+ ac->ac_criteria = CR_ANY_FREE;
+ goto repeat;
+ }
+ }
+
+ if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
+ atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
+ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
+ ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
+ atomic_inc(&sbi->s_bal_stream_goals);
+ }
+
+out:
+ if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err)
+ err = ac->ac_first_err;
+
+ mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n",
+ ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status,
+ ac->ac_flags, ac->ac_criteria, err);
+
+ if (ac->ac_prefetch_nr)
+ ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr);
+
+ return err;
+}
+
static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
{
struct super_block *sb = pde_data(file_inode(seq->file));
@@ -6313,7 +6419,8 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
goto errout;
repeat:
/* allocate space in core */
- *errp = ext4_mb_regular_allocator(ac);
+ /* use vector separation for rralloc allocator */
+ *errp = sbi->s_vectored_allocator(ac);
/*
* pa allocated above is added to grp->bb_prealloc_list only
* when we were able to allocate some block i.e. when
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 15a049f05d04..27d7a7dd7044 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -270,4 +270,7 @@ ext4_mballoc_query_range(
ext4_mballoc_query_range_fn formatter,
void *priv);
+/* Expose rotating & regular allocator for vectoring */
+int ext4_mb_rotating_allocator(struct ext4_allocation_context *ac);
+int ext4_mb_regular_allocator(struct ext4_allocation_context *ac);
#endif
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 43f680c750ae..1e4cf6a40c88 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1284,6 +1284,10 @@ static void ext4_put_super(struct super_block *sb)
int aborted = 0;
int err;
+ /* free per cpu cursors */
+ if (sbi->s_rralloc_cursor)
+ free_percpu(sbi->s_rralloc_cursor);
+
/*
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
@@ -1683,7 +1687,7 @@ enum {
Opt_dioread_nolock, Opt_dioread_lock,
Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
- Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+ Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, Opt_rralloc,
Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
#ifdef CONFIG_EXT4_DEBUG
Opt_fc_debug_max_replay, Opt_fc_debug_force
@@ -1805,6 +1809,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
fsparam_u32 ("init_itable", Opt_init_itable),
fsparam_flag ("init_itable", Opt_init_itable),
fsparam_flag ("noinit_itable", Opt_noinit_itable),
+ fsparam_flag ("rralloc", Opt_rralloc),
#ifdef CONFIG_EXT4_DEBUG
fsparam_flag ("fc_debug_force", Opt_fc_debug_force),
fsparam_u32 ("fc_debug_max_replay", Opt_fc_debug_max_replay),
@@ -1886,6 +1891,7 @@ static const struct mount_opts {
{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+ {Opt_rralloc, EXT4_MOUNT_RRALLOC, MOPT_SET},
{Opt_dax_type, 0, MOPT_EXT4_ONLY},
{Opt_journal_dev, 0, MOPT_NO_EXT2},
{Opt_journal_path, 0, MOPT_NO_EXT2},
@@ -2272,6 +2278,9 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->s_li_wait_mult = result.uint_32;
ctx->spec |= EXT4_SPEC_s_li_wait_mult;
return 0;
+ case Opt_rralloc:
+ ctx_set_mount_opt(ctx, EXT4_MOUNT_RRALLOC);
+ return 0;
case Opt_max_dir_size_kb:
ctx->s_max_dir_size_kb = result.uint_32;
ctx->spec |= EXT4_SPEC_s_max_dir_size_kb;
@@ -5311,6 +5320,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
struct ext4_fs_context *ctx = fc->fs_private;
int silent = fc->sb_flags & SB_SILENT;
+ /* Unconditional default regular allocator (rralloc separation) */
+ sbi->s_vectored_allocator = ext4_mb_regular_allocator;
+
/* Set defaults for the variables that will be set during parsing */
if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
@@ -5522,6 +5534,25 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
}
}
+ /* rralloc: initialize per-cpu cursors and rotational allocator */
+ if (test_opt(sb, RRALLOC)) {
+ sbi->s_rralloc_cursor = alloc_percpu(ext4_group_t);
+ if (!sbi->s_rralloc_cursor)
+ return -ENOMEM;
+
+ int ncpus = num_possible_cpus();
+ ext4_group_t total_groups = ext4_get_groups_count(sb);
+ ext4_group_t groups_per_cpu = total_groups / ncpus;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ *per_cpu_ptr(sbi->s_rralloc_cursor, cpu) = cpu * groups_per_cpu;
+ }
+
+ /* Vectored allocator to round-robin allocator */
+ sbi->s_vectored_allocator = ext4_mb_rotating_allocator;
+ }
+
/*
* Get the # of file system overhead blocks from the
* superblock if present.
--
2.53.0
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-02-25 20:15 ` [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy Mario Lohajner
@ 2026-02-25 23:49 ` Andreas Dilger
2026-02-26 2:48 ` Theodore Tso
0 siblings, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2026-02-25 23:49 UTC (permalink / raw)
To: Mario Lohajner
Cc: tytso, libaokun1, adilger.kernel, linux-ext4, linux-kernel,
yangerkun, libaokun9
On Feb 25, 2026, at 13:15, Mario Lohajner <mario_lohajner@rocketmail.com> wrote:
>
> V2 patch incorporating feedback from previous discussion:
>
> - per-inode atomic cursors to enforce stream sequentiality
> - per-CPU starting points to reduce contention
> - allocator isolation maintained; regular allocator untouched
> - name changed to rralloc to avoid confusion with "rotational"
> - preliminary tests confirm expected performance
Mario, can you please include a summary of the performance test
results into the commit message so that the effectiveness of the
patch can be evaluated. This should include test(s) run and
their arguments, along with table of before/after numbers.
Cheers, Andreas
> Files modified:
> - fs/ext4/ext4.h
> rralloc policy declared, per-CPU cursors & allocator vector
>
> - fs/ext4/ialloc.c
> initialize (zero) per-inode cursor
>
> - fs/ext4/mballoc.h
> expose allocator functions for vectoring in super.c
>
> - fs/ext4/super.c
> parse rralloc option, init per-CPU cursors and allocator vector
>
> - fs/ext4/mballoc.c
> add rotating allocator, vectored allocator
>
> Signed-off-by: Mario Lohajner <mario_lohajner@rocketmail.com>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-02-25 23:49 ` Andreas Dilger
@ 2026-02-26 2:48 ` Theodore Tso
2026-02-26 21:50 ` Mario Lohajner
0 siblings, 1 reply; 11+ messages in thread
From: Theodore Tso @ 2026-02-26 2:48 UTC (permalink / raw)
To: Andreas Dilger
Cc: Mario Lohajner, libaokun1, adilger.kernel, linux-ext4,
linux-kernel, yangerkun, libaokun9
On Wed, Feb 25, 2026 at 04:49:30PM -0700, Andreas Dilger wrote:
>
> Mario, can you please include a summary of the performance test
> results into the commit message so that the effectiveness of the
> patch can be evaluated. This should include test(s) run and
> their arguments, along with table of before/after numbers.
The tests should also include an explanation of the hardware that you
ran the test on. Some examples of cover letters that include
performance improvement results:
https://lore.kernel.org/all/20251025032221.2905818-1-libaokun@huaweicloud.com/
https://lore.kernel.org/all/20260105014522.1937690-1-yi.zhang@huaweicloud.com/
Cheers,
- Ted
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-02-26 2:48 ` Theodore Tso
@ 2026-02-26 21:50 ` Mario Lohajner
2026-02-27 1:12 ` Theodore Tso
0 siblings, 1 reply; 11+ messages in thread
From: Mario Lohajner @ 2026-02-26 21:50 UTC (permalink / raw)
To: Theodore Tso, Andreas Dilger
Cc: libaokun1, adilger.kernel, linux-ext4, linux-kernel, yangerkun,
libaokun9
On 26. 02. 2026. 03:48, Theodore Tso wrote:
> On Wed, Feb 25, 2026 at 04:49:30PM -0700, Andreas Dilger wrote:
>>
>> Mario, can you please include a summary of the performance test
>> results into the commit message so that the effectiveness of the
>> patch can be evaluated. This should include test(s) run and
>> their arguments, along with table of before/after numbers.
>
> The tests should also include an explanation of the hardware that you
> ran the test on. Some examples of cover letters that include
> performance improvement results:
>
> https://lore.kernel.org/all/20251025032221.2905818-1-libaokun@huaweicloud.com/
> https://lore.kernel.org/all/20260105014522.1937690-1-yi.zhang@huaweicloud.com/
>
> Cheers,
>
> - Ted
Hello Andreas, hello Theodore!
These are the results of synthetic tests designed to evaluate whether
the round-robin allocator (rralloc) maintains its allocation behavior
without degrading performance, and to examine its behavior under high
concurrency and stress conditions.
The primary purpose of rralloc is to improve allocation distribution
and avoid hotspotting. Performance improvements are not the goal here,
throughput similar to regular allocator is considered favorable.
# Workloads
*** Large Sequential Files
(Evaluates sequential write throughput with multi-job concurrency)
fio --name=seqwrite --directory=. --rw=write --bs=1M --numjobs=6 \
--size=4G --runtime=300 --time_based --group_reporting
*** Small Files
(Validates allocator behavior and latency under small, random writes)
fio --name=smallfiles --directory=. --rw=write --bs=4k --numjobs=6 \
--size=1G --ioengine=psync --time_based --runtime=180 --group_reporting
*** Multi-core / Multi-task Stress
(Tests high concurrency and stress conditions)
fio --name=allocstress --directory=. --rw=write --bs=4k --numjobs=48 \
--size=2G --ioengine=psync --time_based --runtime=180 --group_reporting
# Results summary:
| Test | Allocator | BW (MiB/s) | IOPS | Avg lat |
| ----------- | --------- | ---------- | ----- | ------- |
| seqwrite | Regular | 497 | 496 | 9.19 ms |
| | rralloc | 538 | 537 | 9.37 ms |
| smallfiles | Regular | 707 | 181k | 2.03 µs |
| | rralloc | 586 | 150k | 2.18 µs |
| allocstress | Regular | 166 | 42.4k | 1.13 ms |
| | rralloc | 283 | 72.5k | 0.66 ms |
The results indicate that a round-robin allocation strategy can be
introduced without compromising baseline I/O characteristics.
Across tested workloads, rralloc preserves expected I/O behavior and
does not introduce performance regressions of practical significance.
| Test | Hardware |
| -------- | ------------------------------------------ |
| OS | Fedora Linux 42 Workstation x86_64 |
| Host | 90HU007QGE ideacentre 510-15ICB |
| Kernel | 6.18.9-dirty |
| CPU | Intel i5-8400 (6 cores) @ 2.801GHz |
| GPU1 | AMD Radeon RX 560 |
| GPU2 | Intel CoffeeLake-S GT2 / UHD Graphics 630 |
| Memory | 1379 MiB / 31,958 MiB |
| NVMe | INTEL SSDPEKNW512G8H (HPS1) |
| SATA SSD | Samsung SSD 860 PRO 256GB (RVM02B6Q) |
Note:
These results reflect synthetic tests on the described hardware.
Independent reproduction and verification of these findings are welcome.
Variations in hardware, kernel version, or workload configuration may
affect absolute numbers, but the qualitative observation—that rralloc
preserves baseline I/O behavior—remains valid.
For more details (raw outputs) please refer to:
https://github.com/mlohajner/RRALLOC
Regards,
Mario Lohajner (manjo)
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-02-26 21:50 ` Mario Lohajner
@ 2026-02-27 1:12 ` Theodore Tso
2026-02-27 14:46 ` Mario Lohajner
0 siblings, 1 reply; 11+ messages in thread
From: Theodore Tso @ 2026-02-27 1:12 UTC (permalink / raw)
To: Mario Lohajner
Cc: Andreas Dilger, libaokun1, adilger.kernel, linux-ext4,
linux-kernel, yangerkun, libaokun9
On Thu, Feb 26, 2026 at 10:50:29PM +0100, Mario Lohajner wrote:
> The primary purpose of rralloc is to improve allocation distribution
> and avoid hotspotting. Performance improvements are not the goal here...
You haven't explained *why* allocation distribution and avoiding
hotspotting is something we should care about.
If it's not performance, then why? How does reducing hotspotting
improve things for the user? Why should we care about this goal that
apparently is so important to you?
- Ted
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-02-27 1:12 ` Theodore Tso
@ 2026-02-27 14:46 ` Mario Lohajner
2026-02-27 16:43 ` Theodore Tso
0 siblings, 1 reply; 11+ messages in thread
From: Mario Lohajner @ 2026-02-27 14:46 UTC (permalink / raw)
To: Theodore Tso
Cc: Andreas Dilger, libaokun1, adilger.kernel, linux-ext4,
linux-kernel, yangerkun, libaokun9
On 27. 02. 2026. 02:12, Theodore Tso wrote:
> On Thu, Feb 26, 2026 at 10:50:29PM +0100, Mario Lohajner wrote:
>> The primary purpose of rralloc is to improve allocation distribution
>> and avoid hotspotting. Performance improvements are not the goal here...
>
> You haven't explained *why* allocation distribution and avoiding
> hotspotting is something we should care about.
>
> If it's not performance, then why? How does reducing hotspotting
> improve things for the user? Why should we care about this goal that
> apparently is so important to you?
>
> - Ted
Hello Ted,
The motivation behind rralloc is to promote even allocation across the
available LBA under overwrite-heavy workloads.
With the regular allocator, repeated allocations can concentrate the
pressure in specific regions (e.g., in-place overwrites or LBA start).
rralloc spreads allocations across the LBA, reducing localized
contention while:
* promoting existing stream allocation behavior
* distributing LBA space per CPU
* preserving intra-file locality and heuristics
* using the entire LBA in a round-robin manner
* minimizing contention and races
* keeping the regular allocator isolated and intact
Block group usage analysis confirms that rralloc distributes
allocations evenly without degrading baseline throughput:
* small/medium/large file fragmentation experiments
* synthetic tests
* real-world tests (kernel source tree copies)
https://github.com/mlohajner/RRALLOC
Why it matters:
Concentrated allocation can create contention, write amplification, and
uneven LBA utilization even on modern NVMe/SSD devices.
rralloc promotes round-robin allocation across the entire LBA,
with per-CPU zones, ensuring more even allocation distribution while
leaving throughput and existing heuristics unchanged.
Workloads include (but not limited to):
* media files processing and rendering
* builds/compilations
* database workloads
End user impact:
Users can enable rralloc at mount to take advantage of this alternative
allocation policy.
Regular allocator behavior remains unchanged for those who prefer linear
or traditional allocation.
This approach is backward-compatible, non-intrusive, and preserves
on-disk format and existing heuristics.
Preliminary observations under heavy multi-threaded workloads suggest
reduced contention effects, but this has not yet been fully characterized.
Regards,
Mario Lohajner (manjo)
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-02-27 14:46 ` Mario Lohajner
@ 2026-02-27 16:43 ` Theodore Tso
2026-03-02 20:04 ` Mario Lohajner
0 siblings, 1 reply; 11+ messages in thread
From: Theodore Tso @ 2026-02-27 16:43 UTC (permalink / raw)
To: Mario Lohajner
Cc: Andreas Dilger, libaokun1, adilger.kernel, linux-ext4,
linux-kernel, yangerkun, libaokun9
On Fri, Feb 27, 2026 at 03:46:59PM +0100, Mario Lohajner wrote:
>
> Concentrated allocation can create contention, write amplification, and
> uneven LBA utilization even on modern NVMe/SSD devices.
Uneven LBA utilization is the thing where I'm asking, "why should we care".
In terms of how this would cause contention and write amplification,
<<citation needed>>. What is your benchmarks where you can
demonstrate this, and how common is this across NVMe/SSD devices?
That is, if it's just one trashy product, maybe it should just be
avoided --- especially if it has other problems.
- Ted
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-02-27 16:43 ` Theodore Tso
@ 2026-03-02 20:04 ` Mario Lohajner
2026-03-03 1:33 ` Theodore Tso
0 siblings, 1 reply; 11+ messages in thread
From: Mario Lohajner @ 2026-03-02 20:04 UTC (permalink / raw)
To: Theodore Tso
Cc: Andreas Dilger, libaokun1, adilger.kernel, linux-ext4,
linux-kernel, yangerkun, libaokun9
On 27. 02. 2026. 17:43, Theodore Tso wrote:
> On Fri, Feb 27, 2026 at 03:46:59PM +0100, Mario Lohajner wrote:
>>
>> Concentrated allocation can create contention, write amplification, and
>> uneven LBA utilization even on modern NVMe/SSD devices.
>
> Uneven LBA utilization is the thing where I'm asking, "why should we care".
>
> In terms of how this would cause contention and write amplification,
> <<citation needed>>. What is your benchmarks where you can
> demonstrate this, and how common is this across NVMe/SSD devices?
> That is, if it's just one trashy product, maybe it should just be
> avoided --- especially if it has other problems.
>
> - Ted
RRALLOC spreads allocation starting points across block groups to avoid
repeated concentration under parallel load.
This reduces short-term allocation concentration in the same regions
when multiple CPUs allocate concurrently.
In high-concurrency testing, performance is consistently comparable to
or occasionally better than the regular allocator. No regressions have
been observed across tested configurations.
Under sustained parallel allocation pressure, testing shows improved
tail-latency stability compared to the current allocator.
Additional high-concurrency test results are available at:
https://github.com/mlohajner/RRALLOC
Regards,
Mario Lohajner
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-03-02 20:04 ` Mario Lohajner
@ 2026-03-03 1:33 ` Theodore Tso
2026-03-03 13:28 ` Mario Lohajner
0 siblings, 1 reply; 11+ messages in thread
From: Theodore Tso @ 2026-03-03 1:33 UTC (permalink / raw)
To: Mario Lohajner
Cc: Andreas Dilger, libaokun1, adilger.kernel, linux-ext4,
linux-kernel, yangerkun, libaokun9
On Mon, Mar 02, 2026 at 09:04:44PM +0100, Mario Lohajner wrote:
> RRALLOC spreads allocation starting points across block groups to avoid
> repeated concentration under parallel load.
There are already other ways in which we spread allocations across
block groups. You need to tell explain a specific workload where this
actually makes a difference.
Also note that in most use cases, files are written once, and read
multiple times. So spreading blocks across different block groups
can often be actively harmful.
> In high-concurrency testing, performance is consistently comparable to
> or occasionally better than the regular allocator. No regressions have
> been observed across tested configurations.
No regressions, and only "occasionally better", is not enough of a justification.
What is your real life workload which is motivating your efforts?
- Ted
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-03-03 1:33 ` Theodore Tso
@ 2026-03-03 13:28 ` Mario Lohajner
2026-03-05 2:47 ` Theodore Tso
0 siblings, 1 reply; 11+ messages in thread
From: Mario Lohajner @ 2026-03-03 13:28 UTC (permalink / raw)
To: Theodore Tso
Cc: Andreas Dilger, libaokun1, adilger.kernel, linux-ext4,
linux-kernel, yangerkun, libaokun9
On 03. 03. 2026. 02:33, Theodore Tso wrote:
> On Mon, Mar 02, 2026 at 09:04:44PM +0100, Mario Lohajner wrote:
>> RRALLOC spreads allocation starting points across block groups to avoid
>> repeated concentration under parallel load.
>
> There are already other ways in which we spread allocations across
> block groups. You need to tell explain a specific workload where this
> actually makes a difference.
>
> Also note that in most use cases, files are written once, and read
> multiple times. So spreading blocks across different block groups is
> can often be actively harmful.
>
>> In high-concurrency testing, performance is consistently comparable to
>> or occasionally better than the regular allocator. No regressions have
>> been observed across tested configurations.
>
> No regressions, and only "occasionally better" not enough of a justifiation.
>
> What is your real life workload which is motivating your efforts?
>
> - Ted
RRALLOC targets sustained parallel overwrite-heavy workloads such as
scratch disks, rendering outputs, database storage and VM image storage.
It introduces a round-robin allocation policy across block groups to
reduce short-term allocation concentration under high concurrency.
It is not intended to improve write-once/read-many workloads and remains
disabled by default.
I do understand that without clearly measurable workload-specific
improvement, this is likely not sufficient justification for upstream
inclusion.
I will continue evaluating and refining the allocator out-of-tree.
If I am able to demonstrate concrete and reproducible benefits beyond
allocation geometry and occasional contention-related effects,
I will revisit the discussion with additional data.
Thank you for the review and valuable feedback.
Regards,
Mario Lohajner
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy
2026-03-03 13:28 ` Mario Lohajner
@ 2026-03-05 2:47 ` Theodore Tso
0 siblings, 0 replies; 11+ messages in thread
From: Theodore Tso @ 2026-03-05 2:47 UTC (permalink / raw)
To: Mario Lohajner
Cc: Andreas Dilger, libaokun1, adilger.kernel, linux-ext4,
linux-kernel, yangerkun, libaokun9
On Tue, Mar 03, 2026 at 02:28:47PM +0100, Mario Lohajner wrote:
> RRALLOC targets sustained parallel overwrite-heavy workloads such as
> scratch disks, rendering outputs, database storage and VM image storage.
> ....
> It is not intended to improve write-once/read-many workloads and remains
> disabled by default.
First of all, databases and VM images are use cases which are almost
the definition of write-once / read-many workloads.
As far as your first two examples, I've been part of teams that have
built storage systems for scratch disks and rendering outputs at an
extremely large scale, at *extremely* large scale. (A public estimate
from 2013[1], for which I make no comments about how accurate it was
back then, but it's fair to say that there have been at least a few more
data centers built since then; also, disks and SSD have gotten
somewhat more efficient from storage density since then. :-)
[1] https://what-if.xkcd.com/63/
Having built and supported systems for these first two use cases, I
can quite confidentially tell you that the problem that you are
trying to solve for weren't even *close* to real world issues that we
had to overcome.
Now, it may be that you are doing something very different (or perhaps very
dumb; I can't say given how few details you've given). But what
you've described is so vague and scatter-shot that it could have come
from the output of a very Large Language Model given a very sloppily
written prompt. (In other words, what is commonly called "AI Slop".)
If you want to be convincing, you'll need to give a lot more specific
detail about the nature of the workloads. How many Petabytes (or
whatever the appropriate unit in your case) per hour of data is being
written? What kind of storage devices are you using? How many are
you using? Attached to how many servers? How many files are being
written in parallel? At what throughput rate?
When you use stock ext4 for this workload, what are you seeing? What
sort of benchmarking did you use to convince yourself that the
bottleneck is indeed the block allocation algorithm. What kind of
percentage increase did your replacement algorithm have for this
specific workload?
If you want to see examples of well-written papers of various
performance improvements, I will refer you to papers from Usenix's
File System and Storage Technologies conference[2] for examples of how
to write a convincing paper when you're not free to share *all* of the
details of the workload, or the specific storage devices that you are
using. The problem is right now, you've shared nothing about your
specific workload.
[2] https://www.usenix.org/conferences/byname/146
Cheers,
- Ted
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2026-03-05 2:48 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
[not found] <20260225201520.220071-1-mario_lohajner.ref@rocketmail.com>
2026-02-25 20:15 ` [PATCH] ext4: rralloc - (former rotalloc) improved round-robin allocation policy Mario Lohajner
2026-02-25 23:49 ` Andreas Dilger
2026-02-26 2:48 ` Theodore Tso
2026-02-26 21:50 ` Mario Lohajner
2026-02-27 1:12 ` Theodore Tso
2026-02-27 14:46 ` Mario Lohajner
2026-02-27 16:43 ` Theodore Tso
2026-03-02 20:04 ` Mario Lohajner
2026-03-03 1:33 ` Theodore Tso
2026-03-03 13:28 ` Mario Lohajner
2026-03-05 2:47 ` Theodore Tso
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox