Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* [PATCH v1] ext4: add mb_stats_clear for mballoc statistics
From: Baolin Liu @ 2026-04-14 10:02 UTC (permalink / raw)
  To: tytso, adilger.kernel
  Cc: liubaolin12138, linux-ext4, linux-kernel, wangguanyu, Baolin Liu

From: Baolin Liu <liubaolin@kylinos.cn>

Add a write-only mb_stats_clear sysfs knob to reset ext4 mballoc
runtime statistics. This makes it easier to inspect allocator
activity for a specific workload instead of using counters
accumulated since mount.

Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
 fs/ext4/ext4.h    |  1 +
 fs/ext4/mballoc.c | 31 +++++++++++++++++++++++++++++++
 fs/ext4/sysfs.c   | 24 ++++++++++++++++++++++++
 3 files changed, 56 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 7617e2d454ea..3a32e1a515dd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2995,6 +2995,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
 extern const struct seq_operations ext4_mb_seq_groups_ops;
 extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
 extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
+extern void ext4_mb_stats_clear(struct ext4_sb_info *sbi);
 extern int ext4_mb_init(struct super_block *);
 extern void ext4_mb_release(struct super_block *);
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bb58eafb87bc..382c91586b26 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3219,6 +3219,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
 	}
 	seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
 	seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
+	seq_printf(seq, "\tblocks_allocated: %u\n",
+		   atomic_read(&sbi->s_bal_allocated));
 
 	seq_printf(seq, "\tgroups_scanned: %u\n",
 		   atomic_read(&sbi->s_bal_groups_scanned));
@@ -4721,6 +4723,35 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 		trace_ext4_mballoc_prealloc(ac);
 }
 
+void ext4_mb_stats_clear(struct ext4_sb_info *sbi)
+{
+	int i;
+
+	atomic_set(&sbi->s_bal_reqs, 0);
+	atomic_set(&sbi->s_bal_success, 0);
+	atomic_set(&sbi->s_bal_allocated, 0);
+	atomic_set(&sbi->s_bal_groups_scanned, 0);
+
+	for (i = 0; i < EXT4_MB_NUM_CRS; i++) {
+		atomic64_set(&sbi->s_bal_cX_hits[i], 0);
+		atomic64_set(&sbi->s_bal_cX_groups_considered[i], 0);
+		atomic_set(&sbi->s_bal_cX_ex_scanned[i], 0);
+		atomic64_set(&sbi->s_bal_cX_failed[i], 0);
+	}
+
+	atomic_set(&sbi->s_bal_ex_scanned, 0);
+	atomic_set(&sbi->s_bal_goals, 0);
+	atomic_set(&sbi->s_bal_stream_goals, 0);
+	atomic_set(&sbi->s_bal_len_goals, 0);
+	atomic_set(&sbi->s_bal_2orders, 0);
+	atomic_set(&sbi->s_bal_breaks, 0);
+	atomic_set(&sbi->s_mb_lost_chunks, 0);
+	atomic_set(&sbi->s_mb_buddies_generated, 0);
+	atomic64_set(&sbi->s_mb_generation_time, 0);
+	atomic_set(&sbi->s_mb_preallocated, 0);
+	atomic_set(&sbi->s_mb_discarded, 0);
+}
+
 /*
  * Called on failure; free up any blocks from the inode PA for this
  * context.  We don't need this for MB_GROUP_PA because we only change
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 923b375e017f..a5bd88a99f22 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -41,6 +41,7 @@ typedef enum {
 	attr_pointer_atomic,
 	attr_journal_task,
 	attr_err_report_sec,
+	attr_mb_stats_clear,
 } attr_id_t;
 
 typedef enum {
@@ -161,6 +162,25 @@ static ssize_t err_report_sec_store(struct ext4_sb_info *sbi,
 	return count;
 }
 
+static ssize_t mb_stats_clear_store(struct ext4_sb_info *sbi,
+				    const char *buf, size_t count)
+{
+	int val;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = kstrtoint(skip_spaces(buf), 0, &val);
+	if (ret)
+		return ret;
+	if (val != 1)
+		return -EINVAL;
+
+	ext4_mb_stats_clear(sbi);
+	return count;
+}
+
 static ssize_t journal_task_show(struct ext4_sb_info *sbi, char *buf)
 {
 	if (!sbi->s_journal)
@@ -251,6 +271,7 @@ EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
 EXT4_ATTR_OFFSET(err_report_sec, 0644, err_report_sec, ext4_sb_info, s_err_report_sec);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
 EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_ATTR(mb_stats_clear, 0200, mb_stats_clear);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
@@ -301,6 +322,7 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
+	ATTR_LIST(mb_stats_clear),
 	ATTR_LIST(mb_max_to_scan),
 	ATTR_LIST(mb_min_to_scan),
 	ATTR_LIST(mb_order2_req),
@@ -561,6 +583,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
 		return trigger_test_error(sbi, buf, len);
 	case attr_err_report_sec:
 		return err_report_sec_store(sbi, buf, len);
+	case attr_mb_stats_clear:
+		return mb_stats_clear_store(sbi, buf, len);
 	default:
 		return ext4_generic_attr_store(a, sbi, buf, len);
 	}
-- 
2.51.0


^ permalink raw reply related

* Re: [PATCH v7 07/22] iomap: introduce IOMAP_F_FSVERITY and teach writeback to handle fsverity
From: Christoph Hellwig @ 2026-04-14  8:13 UTC (permalink / raw)
  To: Andrey Albershteyn
  Cc: linux-xfs, fsverity, linux-fsdevel, ebiggers, hch, linux-ext4,
	linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-8-aalbersh@kernel.org>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply

* Re: [PATCH v7 03/22] ovl: use core fsverity ensure info interface
From: Christoph Hellwig @ 2026-04-14  8:13 UTC (permalink / raw)
  To: Andrey Albershteyn
  Cc: linux-xfs, fsverity, linux-fsdevel, ebiggers, hch, linux-ext4,
	linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-4-aalbersh@kernel.org>

On Thu, Apr 09, 2026 at 03:13:35PM +0200, Andrey Albershteyn wrote:
> -	if (!fsverity_active(inode) && IS_VERITY(inode)) {
> -		/*
> -		 * If this inode was not yet opened, the verity info hasn't been
> -		 * loaded yet, so we need to do that here to force it into memory.
> -		 */
> -		filp = kernel_file_open(datapath, O_RDONLY, current_cred());
> -		if (IS_ERR(filp))
> -			return PTR_ERR(filp);
> -		fput(filp);
> -	}
> +	if (fsverity_active(inode))
> +		fsverity_ensure_verity_info(inode);

fsverity_ensure_verity_info already is a no-op for !fsverity_active,
so the check could be removed.  Also we should probably propagate the
error return from fsverity_ensure_verity_info here.


^ permalink raw reply

* Re: [PATCH v7 02/22] fsverity: expose ensure_fsverity_info()
From: Christoph Hellwig @ 2026-04-14  8:11 UTC (permalink / raw)
  To: Andrey Albershteyn
  Cc: linux-xfs, fsverity, linux-fsdevel, ebiggers, hch, linux-ext4,
	linux-f2fs-devel, linux-btrfs, djwong
In-Reply-To: <20260409131404.1545834-3-aalbersh@kernel.org>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply

* Re: [PATCH 4/6] generic/765: Ignore mkfs warning
From: Darrick J. Wong @ 2026-04-13 23:28 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Ojaswin Mujoo, Zorro Lang, fstests, fdmanana, ritesh.list,
	naohiro.aota, wqu, Disha Goel, linux-ext4
In-Reply-To: <20260413204215.GA5461@macsyma-wired.lan>

On Mon, Apr 13, 2026 at 04:42:15PM -0400, Theodore Tso wrote:
> > > > > > The output can get corrupted with warnings like below because clustersize
> > > > > > more than 16xbs is experimental:
> > > > > > 
> > > > > > + 16 times the block size is considered experimental
> > > > > > 
> > > > > > Hence pipe these to seqres.full to avoid false negatives.
> 
> You could also suppress the warnings using the -q option, for example:
> 
> mke2fs -Fq -t ext4 -O bigalloc,quota -b 4096 -C 131072 /tmp/foo.img 4G
> 
> > > Further, mke2fs has multiple instances where we print warnings to stderr,
> > > should we go and fix all of them as well?
> > 
> > "stderr" meaning "standard error", I'd say that errors are anything that
> > prohibits the format from completing, and only errors should go there.
> 
> Sure, I'll accept those changes.  But adding -q will allow the test to
> pass using older versions of e2fsprogs, while still allowing stderr to
> go out the expected output.

Aha, maybe that's why I've never run into this when QA'ing fstests.

--D

^ permalink raw reply

* [PATCH] jbd2: enforce power-of-two default revoke hash size at compile time
From: Milos Nikic @ 2026-04-13 21:27 UTC (permalink / raw)
  To: jack, tytso, linux-ext4; +Cc: linux-kernel, Milos Nikic

The jbd2 revoke table relies on bitwise AND operations for fast hash
indexing, which requires the hash table size to be a strict power of two.

Currently, this requirement is only enforced at runtime via a J_ASSERT
in jbd2_journal_init_revoke(). While this successfully catches invalid
dynamic allocations, it means a developer accidentally modifying the
hardcoded JOURNAL_REVOKE_DEFAULT_HASH macro will experience a system
panic upon mounting the filesystem during testing.

Add a BUILD_BUG_ON() in journal_init_common() to validate the default
macro at compile time. This acts as an immediate, zero-overhead
safeguard, preventing compilation entirely if the default hash size is
mathematically invalid.

Signed-off-by: Milos Nikic <nikic.milos@gmail.com>
---
 fs/jbd2/journal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 4f397fcdb13c..62b36a2fc4e2 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1565,6 +1565,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JBD2_ABORT;

+	BUILD_BUG_ON(!is_power_of_2(JOURNAL_REVOKE_DEFAULT_HASH));
 	/* Set up a default-sized revoke table for the new mount. */
 	err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH);
 	if (err)
-- 
2.53.0

^ permalink raw reply related

* Re: [PATCH 4/6] generic/765: Ignore mkfs warning
From: Theodore Tso @ 2026-04-13 20:42 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Ojaswin Mujoo, Zorro Lang, fstests, fdmanana, ritesh.list,
	naohiro.aota, wqu, Disha Goel, linux-ext4
In-Reply-To: <20260413162120.GV6212@frogsfrogsfrogs>

> > > > > The output can get corrupted with warnings like below because clustersize
> > > > > more than 16xbs is experimental:
> > > > > 
> > > > > + 16 times the block size is considered experimental
> > > > > 
> > > > > Hence pipe these to seqres.full to avoid false negatives.

You could also suppress the warnings using the -q option, for example:

mke2fs -Fq -t ext4 -O bigalloc,quota -b 4096 -C 131072 /tmp/foo.img 4G

> > Further, mke2fs has multiple instances where we print warnings to stderr,
> > should we go and fix all of them as well?
> 
> "stderr" meaning "standard error", I'd say that errors are anything that
> prohibits the format from completing, and only errors should go there.

Sure, I'll accept those changes.  But adding -q will allow the test to
pass using older versions of e2fsprogs, while still allowing stderr to
go out to the expected output.

					- Ted

^ permalink raw reply

* Re: [patch 15/38] ptp: ptp_vmclock: Replace get_cycles() usage
From: Arnd Bergmann @ 2026-04-13 19:30 UTC (permalink / raw)
  To: David Woodhouse, Thomas Gleixner, LKML
  Cc: x86, Baolu Lu, iommu, Michael Grzeschik, Netdev, linux-wireless,
	Herbert Xu, linux-crypto, Vlastimil Babka (SUSE), linux-mm,
	Bernie Thompson, linux-fbdev, Theodore Ts'o, linux-ext4,
	Andrew Morton, Uladzislau Rezki (Sony), Marco Elver,
	Dmitry Vyukov, kasan-dev, Andrey Ryabinin, Thomas Sailer,
	linux-hams, Jason A . Donenfeld, Richard Henderson, linux-alpha,
	Russell King, linux-arm-kernel, Catalin Marinas, Huacai Chen,
	loongarch, Geert Uytterhoeven, linux-m68k, Dinh Nguyen,
	Jonas Bonn, linux-openrisc@vger.kernel.org, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S . Miller,
	sparclinux
In-Reply-To: <7a48b636cb3146f4f7134c6d4fe42070ac2edb43.camel@infradead.org>

On Mon, Apr 13, 2026, at 17:33, David Woodhouse wrote:
> On Fri, 2026-04-10 at 14:19 +0200, Thomas Gleixner wrote:
>
> ... depend on TSC_RELIABLE¹, since if the guest doesn't believe that it
> is, then the guest shouldn't be trying to use it as the basis for
> precise timing.
>
> ¹ (Or... one of the other zoo of TSC flags for the gradually reducing
> brokenness over the years...)

It looks like this is sufficiently handled in the caller:

static int vmclock_get_crosststamp(struct vmclock_state *st,
                                   struct ptp_system_timestamp *sts,
                                   struct system_counterval_t *system_counter,
                                   struct timespec64 *tspec)
{
....
#ifdef CONFIG_X86
        /*
         * We'd expect the hypervisor to know this and to report the clock
         * status as VMCLOCK_STATUS_UNRELIABLE. But be paranoid.
         */
        if (check_tsc_unstable())
                return -EINVAL;
#endif

With 486 and ELAN out of the way, Winchip6 seems to be the only
one without X86_FEATURE_TSC, so I think the next logical step would
be to turn off Winchip6 as well and remove all X86_FEATURE_TSC
and CONFIG_X86_TSC checks.

      Arnd

^ permalink raw reply

* [PATCH] jbd2: validate transaction state before dropping from journal
From: Milos Nikic @ 2026-04-13 18:08 UTC (permalink / raw)
  To: jack, tytso, linux-ext4; +Cc: linux-kernel, Milos Nikic

Currently, __jbd2_journal_drop_transaction() unlinks the transaction
from the journal's checkpoint lists and only then proceeds to validate the
transaction's internal state using a series of J_ASSERTs.

There is no need to 'mutate before validate'. If we are going to halt the
system, that makes manipulating corrupted pointers in memory irrelevant.

Move the state validation block above the pointer manipulation. This
ensures the transaction is entirely valid before modifying the journal's
internal lists, modernizing the function's logic and paving the way
for future graceful degradation of these assertions.

Signed-off-by: Milos Nikic <nikic.milos@gmail.com>
---
 fs/jbd2/checkpoint.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1508e2f54462..c82b6bedd27b 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -703,6 +703,15 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 {
 	assert_spin_locked(&journal->j_list_lock);
 
+	J_ASSERT(transaction->t_state == T_FINISHED);
+	J_ASSERT(transaction->t_buffers == NULL);
+	J_ASSERT(transaction->t_forget == NULL);
+	J_ASSERT(transaction->t_shadow_list == NULL);
+	J_ASSERT(transaction->t_checkpoint_list == NULL);
+	J_ASSERT(atomic_read(&transaction->t_updates) == 0);
+	J_ASSERT(journal->j_committing_transaction != transaction);
+	J_ASSERT(journal->j_running_transaction != transaction);
+
 	journal->j_shrink_transaction = NULL;
 	if (transaction->t_cpnext) {
 		transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
@@ -714,15 +723,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
 			journal->j_checkpoint_transactions = NULL;
 	}
 
-	J_ASSERT(transaction->t_state == T_FINISHED);
-	J_ASSERT(transaction->t_buffers == NULL);
-	J_ASSERT(transaction->t_forget == NULL);
-	J_ASSERT(transaction->t_shadow_list == NULL);
-	J_ASSERT(transaction->t_checkpoint_list == NULL);
-	J_ASSERT(atomic_read(&transaction->t_updates) == 0);
-	J_ASSERT(journal->j_committing_transaction != transaction);
-	J_ASSERT(journal->j_running_transaction != transaction);
-
 	trace_jbd2_drop_transaction(journal, transaction);
 
 	jbd2_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH 4/6] generic/765: Ignore mkfs warning
From: Darrick J. Wong @ 2026-04-13 16:21 UTC (permalink / raw)
  To: Ojaswin Mujoo
  Cc: Zorro Lang, fstests, fdmanana, ritesh.list, naohiro.aota, wqu,
	Disha Goel, linux-ext4, Theodore Ts'o
In-Reply-To: <adyxi7156VAbL-G1@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>

On Mon, Apr 13, 2026 at 02:34:11PM +0530, Ojaswin Mujoo wrote:
> On Sun, Apr 12, 2026 at 10:38:18PM +0530, Ojaswin Mujoo wrote:
> > On Fri, Apr 10, 2026 at 09:54:35AM -0700, Darrick J. Wong wrote:
> > > On Fri, Apr 10, 2026 at 12:06:04PM +0530, Ojaswin Mujoo wrote:
> > > > This test validates atomic writes for all possible block sizes. In ext4, for
> > > > smaller block sizes with configurations like:
> > > > 
> > > > export MKFS_OPTIONS="-O bigalloc,quota -b 65536 -C 131072"
> > > > 
> > > > The output can get corrupted with warnings like below because clustersize
> > > > more than 16xbs is experimental:
> > > > 
> > > > + 16 times the block size is considered experimental
> > > > 
> > > > Hence pipe these to seqres.full to avoid false negatives.
> > > > 
> > > > Reported-by: Disha Goel <disgoel@linux.ibm.com>
> > > > Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> > > 
> > > Seems fine, though I think I'd rather mke2fs get patched to fix this.
> > > 
> > > I'd let the maintainer decide if he wants this, but ... I'm not sure if
> > > he's actually going to receive this message. :/
> > 
> > Hi Darrick, yes I'm preparing a quick patch for this. We might still
> > need this so people on older distros who won't have the e2fsprogs fix don't
> > see the failure.
> 
> Hi Darrick,
> 
> Looking at it again, I am just wondering if it's really an issue that
> mke2fs is printing this warning to stderr. Isn't stderr the right place
> for errors and warnings (even non-critical ones)?
> 
> Further, mke2fs has multiple instances where we print warnings to stderr,
> should we go and fix all of them as well?

"stderr" meaning "standard error", I'd say that errors are anything that
prohibits the format from completing, and only errors should go there.

We ought to kick this to the ext4 list though...

--D

> Regards,
> ojaswin
> 
> > 
> > Regards,
> > ojaswin
> > 
> > > 
> > > --D
> > > 
> > > > ---
> > > >  tests/generic/765 | 3 ++-
> > > >  1 file changed, 2 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/tests/generic/765 b/tests/generic/765
> > > > index 09be53db..b7cc70f1 100755
> > > > --- a/tests/generic/765
> > > > +++ b/tests/generic/765
> > > > @@ -64,7 +64,8 @@ test_atomic_writes()
> > > >      local bsize=$1
> > > >  
> > > >      get_mkfs_opts $bsize
> > > > -    _scratch_mkfs $mkfs_opts >> $seqres.full
> > > > +    _scratch_mkfs $mkfs_opts &>> $seqres.full || \
> > > > +        echo "mkfs $mkfs_opts failed"
> > > >      _scratch_mount
> > > >  
> > > >      test "$FSTYP" = "xfs" && _xfs_force_bdev data $SCRATCH_MNT
> > > > -- 
> > > > 2.53.0
> > > > 
> > > > 
> 

^ permalink raw reply

* Re: [patch 15/38] ptp: ptp_vmclock: Replace get_cycles() usage
From: David Woodhouse @ 2026-04-13 15:33 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik, netdev,
	linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, Bernie Thompson, linux-fbdev, Theodore Tso, linux-ext4,
	Andrew Morton, Uladzislau Rezki, Marco Elver, Dmitry Vyukov,
	kasan-dev, Andrey Ryabinin, Thomas Sailer, linux-hams,
	Jason A. Donenfeld, Richard Henderson, linux-alpha, Russell King,
	linux-arm-kernel, Catalin Marinas, Huacai Chen, loongarch,
	Geert Uytterhoeven, linux-m68k, Dinh Nguyen, Jonas Bonn,
	linux-openrisc, Helge Deller, linux-parisc, Michael Ellerman,
	linuxppc-dev, Paul Walmsley, linux-riscv, Heiko Carstens,
	linux-s390, David S. Miller, sparclinux
In-Reply-To: <20260410120318.592237447@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 994 bytes --]

On Fri, 2026-04-10 at 14:19 +0200, Thomas Gleixner wrote:
> get_cycles() is not really well defined and similar to other usage of the
> underlying hardware CPU counters the PTP vmclock should use an explicit
> interface as well.
> 
> Implement ptp_vmclock_read_cpu_counter() in arm64 and x86 and simplify the
> Kconfig selection while at it.
> 
> No functional change.
> 
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Cc: David Woodhouse <dwmw2@infradead.org>

Acked-by: David Woodhouse <dwmw@amazon.co.uk>

Although I might follow up with a change to make this...

> +static inline u64 ptp_vmclock_read_cpu_counter(void)
> +{
> +	return cpu_feature_enabled(X86_FEATURE_TSC) ? rdtsc() : 0;
> +}
> +

... depend on TSC_RELIABLE¹, since if the guest doesn't believe that it
is, then the guest shouldn't be trying to use it as the basis for
precise timing.

¹ (Or... one of the other zoo of TSC flags for the gradually reducing
brokenness over the years...)

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [patch 10/38] arcnet: Remove function timing code
From: David Woodhouse @ 2026-04-13 15:29 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: Michael Grzeschik, netdev, Arnd Bergmann, x86, Lu Baolu, iommu,
	linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, Bernie Thompson, linux-fbdev, Theodore Tso, linux-ext4,
	Andrew Morton, Uladzislau Rezki, Marco Elver, Dmitry Vyukov,
	kasan-dev, Andrey Ryabinin, Thomas Sailer, linux-hams,
	Jason A. Donenfeld, Richard Henderson, linux-alpha, Russell King,
	linux-arm-kernel, Catalin Marinas, Huacai Chen, loongarch,
	Geert Uytterhoeven, linux-m68k, Dinh Nguyen, Jonas Bonn,
	linux-openrisc, Helge Deller, linux-parisc, Michael Ellerman,
	linuxppc-dev, Paul Walmsley, linux-riscv, Heiko Carstens,
	linux-s390, David S. Miller, sparclinux
In-Reply-To: <20260410120318.253872322@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 778 bytes --]

On Fri, 2026-04-10 at 14:19 +0200, Thomas Gleixner wrote:
> ARCNET is a museum piece and the function timing can be done with
> ftrace. Remove the cruft.
> 
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Cc: Michael Grzeschik <m.grzeschik@pengutronix.de>
> Cc: netdev@vger.kernel.org
> ---
>  drivers/net/arcnet/arc-rimi.c  |    4 ++--
>  drivers/net/arcnet/arcdevice.h |   20 +-------------------
>  drivers/net/arcnet/com20020.c  |    6 ++----
>  drivers/net/arcnet/com90io.c   |    6 ++----
>  drivers/net/arcnet/com90xx.c   |    4 ++--
>  5 files changed, 9 insertions(+), 31 deletions(-)

Acked-by: David Woodhouse <dwmw2@infradead.org>

By coincidence, I took the last of my ARCNET cards to the tip just this
morning...

[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]

^ permalink raw reply

* Re: [patch 17/38] ext4: Replace get_cycles() usage with ktime_get()
From: Arnd Bergmann @ 2026-04-13 14:46 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: Theodore Ts'o, linux-ext4, x86, Baolu Lu, iommu,
	Michael Grzeschik, Netdev, linux-wireless, Herbert Xu,
	linux-crypto, Vlastimil Babka (SUSE), linux-mm, David Woodhouse,
	Bernie Thompson, linux-fbdev, Andrew Morton,
	Uladzislau Rezki (Sony), Marco Elver, Dmitry Vyukov, kasan-dev,
	Andrey Ryabinin, Thomas Sailer, linux-hams, Jason A . Donenfeld,
	Richard Henderson, linux-alpha, Russell King, linux-arm-kernel,
	Catalin Marinas, Huacai Chen, loongarch, Geert Uytterhoeven,
	linux-m68k, Dinh Nguyen, Jonas Bonn,
	linux-openrisc@vger.kernel.org, Helge Deller, linux-parisc,
	Michael Ellerman, linuxppc-dev, Paul Walmsley, linux-riscv,
	Heiko Carstens, linux-s390, David S . Miller, sparclinux
In-Reply-To: <20260410120318.727211419@kernel.org>

On Fri, Apr 10, 2026, at 14:19, Thomas Gleixner wrote:
> get_cycles() is not guaranteed to be functional on all systems/platforms
> and the values returned are unitless and not easy to map to something
> useful.
>
> Use ktime_get() instead, which provides nanosecond timestamps and is
> functional everywhere.
>
> This is part of a larger effort to limit get_cycles() usage to low level
> architecture code.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Cc: "Theodore Ts'o" <tytso@mit.edu>
> Cc: linux-ext4@vger.kernel.org

I think this is technically an ABI change, since the time
difference gets exported through procfs, but the new version
is clearly the right thing to do since it replaces a hardware
specific value with a portable one.

       Arnd

^ permalink raw reply

* Re: [patch 38/38] treewide: Remove asm/timex.h includes from generic code
From: Arnd Bergmann @ 2026-04-13 14:45 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: x86, Baolu Lu, iommu, Michael Grzeschik, Netdev, linux-wireless,
	Herbert Xu, linux-crypto, Vlastimil Babka (SUSE), linux-mm,
	David Woodhouse, Bernie Thompson, linux-fbdev, Theodore Ts'o,
	linux-ext4, Andrew Morton, Uladzislau Rezki (Sony), Marco Elver,
	Dmitry Vyukov, kasan-dev, Andrey Ryabinin, Thomas Sailer,
	linux-hams, Jason A . Donenfeld, Richard Henderson, linux-alpha,
	Russell King, linux-arm-kernel, Catalin Marinas, Huacai Chen,
	loongarch, Geert Uytterhoeven, linux-m68k, Dinh Nguyen,
	Jonas Bonn, linux-openrisc@vger.kernel.org, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S . Miller,
	sparclinux
In-Reply-To: <20260410120320.163559629@kernel.org>

On Fri, Apr 10, 2026, at 14:21, Thomas Gleixner wrote:
> asm/timex.h does not provide any functionality for non-architecture code
> anymore.
>
> Remove the asm-generic fallback and all references in include and source
> files along with the random_get_entropy() #ifdeffery in timex.h.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> ---
>  include/asm-generic/Kbuild  |    1 -
>  include/asm-generic/timex.h |   15 ---------------
>  include/linux/random.h      |    3 +++
>  include/linux/timex.h       |   26 --------------------------

Acked-by: Arnd Bergmann <arnd@arndb.de>

^ permalink raw reply

* Re: [patch 32/38] powerpc/spufs: Use mftb() directly
From: Arnd Bergmann @ 2026-04-13 14:43 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: Michael Ellerman, linuxppc-dev, x86, Baolu Lu, iommu,
	Michael Grzeschik, Netdev, linux-wireless, Herbert Xu,
	linux-crypto, Vlastimil Babka (SUSE), linux-mm, David Woodhouse,
	Bernie Thompson, linux-fbdev, Theodore Ts'o, linux-ext4,
	Andrew Morton, Uladzislau Rezki (Sony), Marco Elver,
	Dmitry Vyukov, kasan-dev, Andrey Ryabinin, Thomas Sailer,
	linux-hams, Jason A . Donenfeld, Richard Henderson, linux-alpha,
	Russell King, linux-arm-kernel, Catalin Marinas, Huacai Chen,
	loongarch, Geert Uytterhoeven, linux-m68k, Dinh Nguyen,
	Jonas Bonn, linux-openrisc@vger.kernel.org, Helge Deller,
	linux-parisc, Paul Walmsley, linux-riscv, Heiko Carstens,
	linux-s390, David S . Miller, sparclinux
In-Reply-To: <20260410120319.723429844@kernel.org>

On Fri, Apr 10, 2026, at 14:21, Thomas Gleixner wrote:
> There is no reason to indirect via get_cycles(), which is about to be
> removed.
>
> Use mftb() directly.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: linuxppc-dev@lists.ozlabs.org

Acked-by: Arnd Bergmann <arnd@arndb.de>

^ permalink raw reply

* Re: [patch 14/38] slub: Use prandom instead of get_cycles()
From: Vlastimil Babka (SUSE) @ 2026-04-13 13:45 UTC (permalink / raw)
  To: hu.shengming, harry
  Cc: tglx, linux-kernel, linux-mm, arnd, x86, baolu.lu, iommu,
	m.grzeschik, netdev, linux-wireless, herbert, linux-crypto, dwmw2,
	bernie, linux-fbdev, tytso, linux-ext4, akpm, urezki, elver,
	dvyukov, kasan-dev, ryabinin.a.a, t.sailer, linux-hams, Jason,
	richard.henderson, linux-alpha, linux, linux-arm-kernel,
	catalin.marinas, chenhuacai, loongarch, geert, linux-m68k,
	dinguyen, jonas, linux-openrisc, deller, linux-parisc, mpe,
	linuxppc-dev, pjw, linux-riscv, hca, linux-s390, davem,
	sparclinux, hao.li, cl, rientjes, roman.gushchin
In-Reply-To: <20260413210252672ZfdcegJLJtyvlYdFAUBlr@zte.com.cn>

On 4/13/26 15:02, hu.shengming@zte.com.cn wrote:
> Harry wrote:
>> [Resending after fixing broken email headers]
>> 
>> On Fri, Apr 10, 2026 at 02:19:37PM +0200, Thomas Gleixner wrote:
>> > The decision whether to scan remote nodes is based on a 'random' number
>> > retrieved via get_cycles(). get_cycles() is about to be removed.
>> > 
>> > There is already prandom state in the code, so use that instead.
>> > 
>> > Signed-off-by: Thomas Gleixner <tglx@kernel.org>
>> > Cc: Vlastimil Babka <vbabka@kernel.org>
>> > Cc: linux-mm@kvack.org
>> > ---
>> 
>> Acked-by: Harry Yoo (Oracle) <harry@kernel.org>
>> 
>> Is this for this merge window?

I'd say it's not intended for 7.1 as it's not in -next and v1 was posted
just before the merge window.

>> This may conflict with upcoming changes on freelist shuffling [1]
>> (not queued for slab/for-next yet though), but it should be easy to
>> resolve.

Indeed, it's a simple conflict.

> 
> Hi Harry,
> 
> Would you like me to wait for this patch to land in linux-next and then
> rebase and send v6 on top?

Just send it now based the same as previously so we can finish the reviews, and
we'll deal with it after rc1.


^ permalink raw reply

* Re: [RFC v4 0/7] ext4: fast commit: snapshot inode state for FC log
From: Theodore Tso @ 2026-04-13 13:12 UTC (permalink / raw)
  To: Li Chen
  Cc: Zhang Yi, Andreas Dilger, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-trace-kernel, linux-kernel
In-Reply-To: <19d86eec635.f7072461135455.4960134919814592320@linux.beauty>

On Mon, Apr 13, 2026 at 09:01:28PM +0800, Li Chen wrote:
> Absolutely! It's great to learn about the Sashiko development site.
> I will address the real issues in the next version.

Note that Sashiko will sometimes report a pre-existing issue as if it
were a problem with the commit.  If that happens, feel free to ignore
its complaint; what I consider best practice is to either (a) fix it
in a subsequent patch or patch series, or (b) leave a TODO in the
code.

I've asked the Sashiko folks to add a way to get URIs for each issue that
is identified by Sashiko, so we can put a URL in the TODO comment for
someone who wants to fix it later, and to make it easier for Sashiko
to identify pre-existing issues so it doesn't comment on the same
issue across multiple commit reviews (and perhaps save some of the LLM
token budget :-).

In the next few days, for patches sent to linux-ext4, Sashiko will
start e-mailing its reviews to the patch submitter and to me as the
maintainer.  Once we can reduce the false positive rate, I'll ask that
the reviews be cc'ed to the linux-ext4 mailing list.  But it seems
good enough to send e-mails to the patch submitter and the
maintainer --- but that's a decision that each subsystem maintainer
will be making on their own.

Cheers,

						- Ted

^ permalink raw reply

* Re: [patch 14/38] slub: Use prandom instead of get_cycles()
From: hu.shengming @ 2026-04-13 13:02 UTC (permalink / raw)
  To: harry
  Cc: tglx, linux-kernel, vbabka, linux-mm, arnd, x86, baolu.lu, iommu,
	m.grzeschik, netdev, linux-wireless, herbert, linux-crypto, dwmw2,
	bernie, linux-fbdev, tytso, linux-ext4, akpm, urezki, elver,
	dvyukov, kasan-dev, ryabinin.a.a, t.sailer, linux-hams, Jason,
	richard.henderson, linux-alpha, linux, linux-arm-kernel,
	catalin.marinas, chenhuacai, loongarch, geert, linux-m68k,
	dinguyen, jonas, linux-openrisc, deller, linux-parisc, mpe,
	linuxppc-dev, pjw, linux-riscv, hca, linux-s390, davem,
	sparclinux, hao.li, cl, rientjes, roman.gushchin
In-Reply-To: <adyyNeVTkXQlnh_2@hyeyoo>

Harry wrote:
> [Resending after fixing broken email headers]
> 
> On Fri, Apr 10, 2026 at 02:19:37PM +0200, Thomas Gleixner wrote:
> > The decision whether to scan remote nodes is based on a 'random' number
> > retrieved via get_cycles(). get_cycles() is about to be removed.
> > 
> > There is already prandom state in the code, so use that instead.
> > 
> > Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> > Cc: Vlastimil Babka <vbabka@kernel.org>
> > Cc: linux-mm@kvack.org
> > ---
> 
> Acked-by: Harry Yoo (Oracle) <harry@kernel.org>
> 
> Is this for this merge window?
> 
> This may conflict with upcoming changes on freelist shuffling [1]
> (not queued for slab/for-next yet though), but it should be easy to
> resolve.
> 

Hi Harry,

Would you like me to wait for this patch to land in linux-next and then
rebase and send v6 on top?

Thanks,

--
With Best Regards,
Shengming

> [Cc'ing Shengming and SLAB ALLOCATOR folks]
> [1] https://lore.kernel.org/linux-mm/20260409204352095kKWVYKtZImN59ybO6iRNj@zte.com.cn
> 
> -- 
> Cheers,
> Harry / Hyeonggon
> 
> >  mm/slub.c |   37 +++++++++++++++++++++++--------------
> >  1 file changed, 23 insertions(+), 14 deletions(-)
> > 
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -3302,6 +3302,25 @@ static inline struct slab *alloc_slab_pa
> >      return slab;
> >  }
> >  
> > +#if defined(CONFIG_SLAB_FREELIST_RANDOM) || defined(CONFIG_NUMA)
> > +static DEFINE_PER_CPU(struct rnd_state, slab_rnd_state);
> > +
> > +static unsigned int slab_get_prandom_state(unsigned int limit)
> > +{
> > +    struct rnd_state *state;
> > +    unsigned int res;
> > +
> > +    /*
> > +     * An interrupt or NMI handler might interrupt and change
> > +     * the state in the middle, but that's safe.
> > +     */
> > +    state = &get_cpu_var(slab_rnd_state);
> > +    res = prandom_u32_state(state) % limit;
> > +    put_cpu_var(slab_rnd_state);
> > +    return res;
> > +}
> > +#endif
> > +
> >  #ifdef CONFIG_SLAB_FREELIST_RANDOM
> >  /* Pre-initialize the random sequence cache */
> >  static int init_cache_random_seq(struct kmem_cache *s)
> > @@ -3365,8 +3384,6 @@ static void *next_freelist_entry(struct
> >      return (char *)start + idx;
> >  }
> >  
> > -static DEFINE_PER_CPU(struct rnd_state, slab_rnd_state);
> > -
> >  /* Shuffle the single linked freelist based on a random pre-computed sequence */
> >  static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
> >                   bool allow_spin)
> > @@ -3383,15 +3400,7 @@ static bool shuffle_freelist(struct kmem
> >      if (allow_spin) {
> >          pos = get_random_u32_below(freelist_count);
> >      } else {
> > -        struct rnd_state *state;
> > -
> > -        /*
> > -         * An interrupt or NMI handler might interrupt and change
> > -         * the state in the middle, but that's safe.
> > -         */
> > -        state = &get_cpu_var(slab_rnd_state);
> > -        pos = prandom_u32_state(state) % freelist_count;
> > -        put_cpu_var(slab_rnd_state);
> > +        pos = slab_get_prandom_state(freelist_count);
> >      }
> >  
> >      page_limit = slab->objects * s->size;
> > @@ -3882,7 +3891,7 @@ static void *get_from_any_partial(struct
> >       * with available objects.
> >       */
> >      if (!s->remote_node_defrag_ratio ||
> > -            get_cycles() % 1024 > s->remote_node_defrag_ratio)
> > +        slab_get_prandom_state(1024) > s->remote_node_defrag_ratio)
> >          return NULL;
> >  
> >      do {
> > @@ -7102,7 +7111,7 @@ static unsigned int
> >  
> >      /* see get_from_any_partial() for the defrag ratio description */
> >      if (!s->remote_node_defrag_ratio ||
> > -            get_cycles() % 1024 > s->remote_node_defrag_ratio)
> > +        slab_get_prandom_state(1024) > s->remote_node_defrag_ratio)
> >          return 0;
> >  
> >      do {
> > @@ -8421,7 +8430,7 @@ void __init kmem_cache_init_late(void)
> >      flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM | WQ_PERCPU,
> >                    0);
> >      WARN_ON(!flushwq);
> > -#ifdef CONFIG_SLAB_FREELIST_RANDOM
> > +#if defined(CONFIG_SLAB_FREELIST_RANDOM) || defined(CONFIG_NUMA)
> >      prandom_init_once(&slab_rnd_state);
> >  #endif
> >  }
> > 
> >

^ permalink raw reply

* Re: [RFC v4 0/7] ext4: fast commit: snapshot inode state for FC log
From: Li Chen @ 2026-04-13 13:01 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Zhang Yi, Andreas Dilger, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-trace-kernel, linux-kernel
In-Reply-To: <20260410011843.GD99725@macsyma-wired.lan>

Hi Ted,

 ---- On Fri, 10 Apr 2026 09:18:43 +0800  Theodore Tso <tytso@mit.edu> wrote --- 
 > On Tue, Jan 20, 2026 at 07:25:29PM +0800, Li Chen wrote:
 > > Hi,
 > > 
 > > (This RFC v4 series is based on linux-next tag next-20260106, plus the
 > > prerequisite patch "ext4: fast commit: make s_fc_lock reclaim-safe" posted at:
 > > https://lore.kernel.org/all/20260106120621.440126-1-me@linux.beauty/)
 > 
 > Can you take a look at the Sashiko reviews here:
 > 
 >     https://sashiko.dev/#/patchset/20260408112020.716706-1-me%40linux.beauty
 > 
 > There seems to be at least one legitimate concern, which is the
 > potential cur_lblk overflow.  There are a couple of others which I
 > think is real; could you please look at their review comments?

Absolutely! It's great to learn about the Sashiko development site.
I will address the real issues in the next version.

Regards,
Li


^ permalink raw reply

* Re: [RFC v6 6/7] ext4: fast commit: add lock_updates tracepoint
From: Li Chen @ 2026-04-13 12:58 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-kernel, linux-trace-kernel
In-Reply-To: <20260408160405.45a5ee09@gandalf.local.home>

Hi Steven,

 ---- On Thu, 09 Apr 2026 04:02:56 +0800  Steven Rostedt <rostedt@goodmis.org> wrote --- 
 > On Wed,  8 Apr 2026 19:20:17 +0800
 > Li Chen <me@linux.beauty> wrote:
 > 
 > > Commit-time fast commit snapshots run under jbd2_journal_lock_updates(),
 > > so it is useful to quantify the time spent with updates locked and to
 > > understand why snapshotting can fail.
 > > 
 > > Add a new tracepoint, ext4_fc_lock_updates, reporting the time spent in
 > > the updates-locked window along with the number of snapshotted inodes
 > > and ranges. Record the first snapshot failure reason in a stable snap_err
 > > field for tooling.
 > > 
 > 
 > [..]
 > 
 > > @@ -1338,13 +1375,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      if (ret)
 > >          return ret;
 > >  
 > > -
 > >      ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
 > >      if (ret)
 > >          return ret;
 > >  
 > >      /* Step 4: Mark all inodes as being committed. */
 > >      jbd2_journal_lock_updates(journal);
 > > +    lock_start = ktime_get();
 > 
 > ktime_get() is rather quick but if you care about micro-optimizations, you
 > could have:
 > 
 >     if (trace_ext4_fc_lock_updates_enabled())
 >         lock_start = ktime_get();
 >     else    
 >         lock_start = 0;
 > 
 > >      /*
 > >       * The journal is now locked. No more handles can start and all the
 > >       * previous handles are now drained. Snapshotting happens in this
 > > @@ -1358,8 +1395,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      }
 > >      ext4_fc_unlock(sb, alloc_ctx);
 > >  
 > > -    ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
 > > +    ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
 > > +                      &snap_inodes, &snap_ranges, &snap_err);
 > >      jbd2_journal_unlock_updates(journal);
 > > +    if (trace_ext4_fc_lock_updates_enabled()) {
 > 
 >     if (trace_ext4_fc_lock_updates_enabled() && lock_start) {
 > 
 > But feel free to ignore this if the overhead of always calling ktime_get()
 > is not an issue.
 > 
 > 
 > > +        locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
 > > +        trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns,
 > > +                       snap_inodes, snap_ranges, ret,
 > > +                       snap_err);
 > > +    }
 > >      kvfree(inodes);
 > >      if (ret)
 > >          return ret;
 > > @@ -1564,7 +1608,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 > >          journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
 > >      set_task_ioprio(current, journal_ioprio);
 > >      fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
 > > -    ret = ext4_fc_perform_commit(journal);
 > > +    ret = ext4_fc_perform_commit(journal, commit_tid);
 > >      if (ret < 0) {
 > >          if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
 > >              status = EXT4_FC_STATUS_INELIGIBLE;
 > > diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
 > > index f493642cf121..7028a28316fa 100644
 > > --- a/include/trace/events/ext4.h
 > > +++ b/include/trace/events/ext4.h
 > > @@ -107,6 +107,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
 > >  TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
 > >  TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
 > >  
 > > +#undef EM
 > > +#undef EMe
 > > +#define EM(a)    TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
 > > +#define EMe(a)    TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
 > > +
 > > +#define TRACE_SNAP_ERR                        \
 > > +    EM(NONE)                        \
 > > +    EM(ES_MISS)                        \
 > > +    EM(ES_DELAYED)                        \
 > > +    EM(ES_OTHER)                        \
 > > +    EM(INODES_CAP)                        \
 > > +    EM(RANGES_CAP)                        \
 > > +    EM(NOMEM)                        \
 > > +    EMe(INODE_LOC)
 > > +
 > > +TRACE_SNAP_ERR
 > > +
 > > +#undef EM
 > > +#undef EMe
 > > +
 > >  #define show_fc_reason(reason)                        \
 > >      __print_symbolic(reason,                    \
 > >          { EXT4_FC_REASON_XATTR,        "XATTR"},        \
 > > @@ -2818,6 +2838,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
 > >            __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
 > >  );
 > >  
 > > +#define EM(a)    { EXT4_FC_SNAP_ERR_##a, #a },
 > > +#define EMe(a)    { EXT4_FC_SNAP_ERR_##a, #a }
 > > +
 > > +TRACE_EVENT(ext4_fc_lock_updates,
 > > +        TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
 > > +             unsigned int nr_inodes, unsigned int nr_ranges, int err,
 > > +             int snap_err),
 > > +
 > > +    TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
 > > +
 > > +    TP_STRUCT__entry(/* entry */
 > > +        __field(dev_t, dev)
 > > +        __field(tid_t, tid)
 > > +        __field(u64, locked_ns)
 > > +        __field(unsigned int, nr_inodes)
 > > +        __field(unsigned int, nr_ranges)
 > > +        __field(int, err)
 > > +        __field(int, snap_err)
 > > +    ),
 > > +
 > > +    TP_fast_assign(/* assign */
 > > +        __entry->dev = sb->s_dev;
 > > +        __entry->tid = commit_tid;
 > > +        __entry->locked_ns = locked_ns;
 > > +        __entry->nr_inodes = nr_inodes;
 > > +        __entry->nr_ranges = nr_ranges;
 > > +        __entry->err = err;
 > > +        __entry->snap_err = snap_err;
 > > +    ),
 > > +
 > > +    TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
 > > +          MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
 > > +          __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
 > > +          __entry->err, __print_symbolic(__entry->snap_err,
 > > +                         TRACE_SNAP_ERR))
 > > +);
 > > +
 > > +#undef EM
 > > +#undef EMe
 > > +#undef TRACE_SNAP_ERR
 > > +
 > >  #define FC_REASON_NAME_STAT(reason)                    \
 > >      show_fc_reason(reason),                        \
 > >      __entry->fc_ineligible_rc[reason]
 > 
 > As for the rest:
 > 
 > Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
 > 
 > [ Please add this reviewed-by to any new versions so I remember I already
 >   looked at it. ]

Sure, thanks a lot for your thoughtful review!

Regards,
Li


^ permalink raw reply

* Re: [RFC v2 0/1] ext4: fail fast on repeated buffer_head reads after IO failure
From: Theodore Tso @ 2026-04-13 12:47 UTC (permalink / raw)
  To: Diangang Li
  Cc: adilger.kernel, linux-ext4, linux-fsdevel, linux-kernel,
	changfengnan, yizhang089, willy, Diangang Li
In-Reply-To: <20260413062500.1380307-1-diangangli@gmail.com>

On Mon, Apr 13, 2026 at 02:24:59PM +0800, Diangang Li wrote:
> From: Diangang Li <lidiangang@bytedance.com>
> 
> A production system reported hung tasks blocked for 300s+ in ext4
> buffer_head paths....
> 
>   [Tue Mar 24 14:16:24 2026] blk_update_request: I/O error, dev sdi,
>       sector 10704150288 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
>   [Tue Mar 24 14:16:25 2026] blk_update_request: I/O error, dev sdi,
>       sector 10704488160 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0
>   [Tue Mar 24 14:16:26 2026] blk_update_request: I/O error, dev sdi,
>       sector 10704382912 op 0x0:(READ) flags 0x0 phys_seg 1 prio class 0

I wonder whether the ext4 layer is the right place to handle this
sort of issue.  For example, it could be handled by having a subsystem
scanning dmesg (or by wiring up notifications so block device errors
get sent to a userspace daemon), and when certain criteria are met, the
machine is automatically sent to hardware operations to run
diagnostics and (most likely) replace the failing disk.

It could also be handled in the driver or SCSI layer so the "fail
fast" semantics are handled there, so that it supports all file
systems, not just ext4.  The SCSI layer also has more information
about the type of error; you might want to handle things like media
errors differently from Fibre Channel or iSCSI timeouts (which might
be something where "fail fast" is not appropriate).

By the time the error gets propagated up to the buffer head, we lose a
lot of detail about why the error took place.  Also, in the long term
we will hopefully be moving away from using the buffer cache.

   		     	    	      	    - Ted

^ permalink raw reply

* Re: [patch 07/38] treewide: Consolidate cycles_t
From: Ojaswin Mujoo @ 2026-04-13  9:15 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik,
	netdev, linux-wireless, Herbert Xu, linux-crypto, Vlastimil Babka,
	linux-mm, David Woodhouse, Bernie Thompson, linux-fbdev,
	Theodore Tso, linux-ext4, Andrew Morton, Uladzislau Rezki,
	Marco Elver, Dmitry Vyukov, kasan-dev, Andrey Ryabinin,
	Thomas Sailer, linux-hams, Jason A. Donenfeld, Richard Henderson,
	linux-alpha, Russell King, linux-arm-kernel, Catalin Marinas,
	Huacai Chen, loongarch, Geert Uytterhoeven, linux-m68k,
	Dinh Nguyen, Jonas Bonn, linux-openrisc, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S. Miller,
	sparclinux
In-Reply-To: <20260410120318.045532623@kernel.org>

On Fri, Apr 10, 2026 at 02:19:03PM +0200, Thomas Gleixner wrote:
> Most architectures define cycles_t as unsigned long execpt:
> 
>  - x86 requires it to be 64-bit independent of the 32-bit/64-bit build.
> 
>  - parisc and mips define it as unsigned int
> 
>    parisc has no real reason to do so as there are only a few usage sites
>    which either expand it to a 64-bit value or utilize only the lower
>    32bits.
> 
>    mips has no real requirement either.
> 
> Move the typedef to types.h and provide a config switch to enforce the
> 64-bit type for x86.
> 
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> ---
>  arch/Kconfig                       |    4 ++++
>  arch/alpha/include/asm/timex.h     |    3 ---
>  arch/arm/include/asm/timex.h       |    1 -
>  arch/loongarch/include/asm/timex.h |    2 --
>  arch/m68k/include/asm/timex.h      |    2 --
>  arch/mips/include/asm/timex.h      |    2 --
>  arch/nios2/include/asm/timex.h     |    2 --
>  arch/parisc/include/asm/timex.h    |    2 --
>  arch/powerpc/include/asm/timex.h   |    4 +---
>  arch/riscv/include/asm/timex.h     |    2 --
>  arch/s390/include/asm/timex.h      |    2 --
>  arch/sparc/include/asm/timex_64.h  |    1 -
>  arch/x86/Kconfig                   |    1 +
>  arch/x86/include/asm/tsc.h         |    2 --
>  include/asm-generic/timex.h        |    1 -
>  include/linux/types.h              |    6 ++++++
>  16 files changed, 12 insertions(+), 25 deletions(-)
> 
<...>

> --- a/arch/powerpc/include/asm/timex.h
> +++ b/arch/powerpc/include/asm/timex.h
> @@ -11,9 +11,7 @@
>  #include <asm/cputable.h>
>  #include <asm/vdso/timebase.h>
>  
> -typedef unsigned long cycles_t;
> -
> -static inline cycles_t get_cycles(void)
> +ostatic inline cycles_t get_cycles(void)

Hi Thomas, I'm in the middle of testing this series on powerpc. In the meantime I
noticed that there's probably a small typo here (although this is fixed
later).

Regards,
ojaswin
>  {
>  	return mftb();
>  }

^ permalink raw reply

* Re: [patch 14/38] slub: Use prandom instead of get_cycles()
From: Harry Yoo (Oracle) @ 2026-04-13  9:07 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Vlastimil Babka, linux-mm, Arnd Bergmann, x86, Lu Baolu,
	iommu, Michael Grzeschik, netdev, linux-wireless, Herbert Xu,
	linux-crypto, David Woodhouse, Bernie Thompson, linux-fbdev,
	Theodore Tso, linux-ext4, Andrew Morton, Uladzislau Rezki,
	Marco Elver, Dmitry Vyukov, kasan-dev, Andrey Ryabinin,
	Thomas Sailer, linux-hams, Jason A. Donenfeld, Richard Henderson,
	linux-alpha, Russell King, linux-arm-kernel, Catalin Marinas,
	Huacai Chen, loongarch, Geert Uytterhoeven, linux-m68k,
	Dinh Nguyen, Jonas Bonn, linux-openrisc, Helge Deller,
	linux-parisc, Michael Ellerman, linuxppc-dev, Paul Walmsley,
	linux-riscv, Heiko Carstens, linux-s390, David S. Miller,
	sparclinux, Hao Li, Christoph Lameter, David Rientjes,
	Roman Gushchin, Shengming Hu
In-Reply-To: <20260410120318.525653921@kernel.org>

[Resending after fixing broken email headers]

On Fri, Apr 10, 2026 at 02:19:37PM +0200, Thomas Gleixner wrote:
> The decision whether to scan remote nodes is based on a 'random' number
> retrieved via get_cycles(). get_cycles() is about to be removed.
> 
> There is already prandom state in the code, so use that instead.
> 
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Cc: Vlastimil Babka <vbabka@kernel.org>
> Cc: linux-mm@kvack.org
> ---

Acked-by: Harry Yoo (Oracle) <harry@kernel.org>

Is this for this merge window?

This may conflict with upcoming changes on freelist shuffling [1]
(not queued for slab/for-next yet though), but it should be easy to
resolve.

[Cc'ing Shengming and SLAB ALLOCATOR folks]
[1] https://lore.kernel.org/linux-mm/20260409204352095kKWVYKtZImN59ybO6iRNj@zte.com.cn

-- 
Cheers,
Harry / Hyeonggon

>  mm/slub.c |   37 +++++++++++++++++++++++--------------
>  1 file changed, 23 insertions(+), 14 deletions(-)
> 
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -3302,6 +3302,25 @@ static inline struct slab *alloc_slab_pa
>  	return slab;
>  }
>  
> +#if defined(CONFIG_SLAB_FREELIST_RANDOM) || defined(CONFIG_NUMA)
> +static DEFINE_PER_CPU(struct rnd_state, slab_rnd_state);
> +
> +static unsigned int slab_get_prandom_state(unsigned int limit)
> +{
> +	struct rnd_state *state;
> +	unsigned int res;
> +
> +	/*
> +	 * An interrupt or NMI handler might interrupt and change
> +	 * the state in the middle, but that's safe.
> +	 */
> +	state = &get_cpu_var(slab_rnd_state);
> +	res = prandom_u32_state(state) % limit;
> +	put_cpu_var(slab_rnd_state);
> +	return res;
> +}
> +#endif
> +
>  #ifdef CONFIG_SLAB_FREELIST_RANDOM
>  /* Pre-initialize the random sequence cache */
>  static int init_cache_random_seq(struct kmem_cache *s)
> @@ -3365,8 +3384,6 @@ static void *next_freelist_entry(struct
>  	return (char *)start + idx;
>  }
>  
> -static DEFINE_PER_CPU(struct rnd_state, slab_rnd_state);
> -
>  /* Shuffle the single linked freelist based on a random pre-computed sequence */
>  static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab,
>  			     bool allow_spin)
> @@ -3383,15 +3400,7 @@ static bool shuffle_freelist(struct kmem
>  	if (allow_spin) {
>  		pos = get_random_u32_below(freelist_count);
>  	} else {
> -		struct rnd_state *state;
> -
> -		/*
> -		 * An interrupt or NMI handler might interrupt and change
> -		 * the state in the middle, but that's safe.
> -		 */
> -		state = &get_cpu_var(slab_rnd_state);
> -		pos = prandom_u32_state(state) % freelist_count;
> -		put_cpu_var(slab_rnd_state);
> +		pos = slab_get_prandom_state(freelist_count);
>  	}
>  
>  	page_limit = slab->objects * s->size;
> @@ -3882,7 +3891,7 @@ static void *get_from_any_partial(struct
>  	 * with available objects.
>  	 */
>  	if (!s->remote_node_defrag_ratio ||
> -			get_cycles() % 1024 > s->remote_node_defrag_ratio)
> +	    slab_get_prandom_state(1024) > s->remote_node_defrag_ratio)
>  		return NULL;
>  
>  	do {
> @@ -7102,7 +7111,7 @@ static unsigned int
>  
>  	/* see get_from_any_partial() for the defrag ratio description */
>  	if (!s->remote_node_defrag_ratio ||
> -			get_cycles() % 1024 > s->remote_node_defrag_ratio)
> +	    slab_get_prandom_state(1024) > s->remote_node_defrag_ratio)
>  		return 0;
>  
>  	do {
> @@ -8421,7 +8430,7 @@ void __init kmem_cache_init_late(void)
>  	flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM | WQ_PERCPU,
>  				  0);
>  	WARN_ON(!flushwq);
> -#ifdef CONFIG_SLAB_FREELIST_RANDOM
> +#if defined(CONFIG_SLAB_FREELIST_RANDOM) || defined(CONFIG_NUMA)
>  	prandom_init_once(&slab_rnd_state);
>  #endif
>  }
> 
> 

^ permalink raw reply

* Re: [patch 14/38] slub: Use prandom instead of get_cycles()
From: Vlastimil Babka (SUSE) @ 2026-04-13  9:00 UTC (permalink / raw)
  To: Thomas Gleixner, LKML
  Cc: linux-mm, Arnd Bergmann, x86, Lu Baolu, iommu, Michael Grzeschik,
	netdev, linux-wireless, Herbert Xu, linux-crypto, David Woodhouse,
	Bernie Thompson, linux-fbdev, Theodore Tso, linux-ext4,
	Andrew Morton, Uladzislau Rezki, Marco Elver, Dmitry Vyukov,
	kasan-dev, Andrey Ryabinin, Thomas Sailer, linux-hams,
	Jason A. Donenfeld, Richard Henderson, linux-alpha, Russell King,
	linux-arm-kernel, Catalin Marinas, Huacai Chen, loongarch,
	Geert Uytterhoeven, linux-m68k, Dinh Nguyen, Jonas Bonn,
	linux-openrisc, Helge Deller, linux-parisc, Michael Ellerman,
	linuxppc-dev, Paul Walmsley, linux-riscv, Heiko Carstens,
	linux-s390, David S. Miller, sparclinux, Harry Yoo (Oracle),
	Hao Li
In-Reply-To: <20260410120318.525653921@kernel.org>

On 4/10/26 14:19, Thomas Gleixner wrote:
> The decision whether to scan remote nodes is based on a 'random' number
> retrieved via get_cycles(). get_cycles() is about to be removed.
> 
> There is already prandom state in the code, so use that instead.
> 
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Cc: Vlastimil Babka <vbabka@kernel.org>
> Cc: linux-mm@kvack.org

LGTM.

Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>


^ permalink raw reply

* Re: [PATCH] ext4: make mballoc max prealloc size configurable
From: Jan Kara @ 2026-04-13  8:37 UTC (permalink / raw)
  To: guzebing
  Cc: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, guzebing, linux-kernel, linux-ext4
In-Reply-To: <20260410035635.1381920-1-guzebing1612@gmail.com>

On Fri 10-04-26 11:56:35, guzebing wrote:
> From: Guzebing <guzebing@bytedance.com>
> 
> Add per-superblock sysfs knob mb_max_prealloc_kb (min 8MiB, roundup
> pow2) and use it in request normalization.
> 
> When multiple tasks write to different files on the same filesystem
> concurrently, each file ends up with 8 MiB extents. If the preallocation
> size is increased, the resulting extent size grows accordingly. Due
> to the readahead mechanism on NVMe SSDs, files with larger extents
> achieve higher sequential read throughput.
> 
> On an ext4 filesystem on an NVMe Gen4 data drive, dd read throughput
> for a file with 8 MiB extents is 455 MB/s, while for a file with
> 32 MiB extents it reaches 702 MB/s.

Hum, I think you are not speaking about general Linux readahead code here..

> Steps to reproduce:
> 1.Configure the maximum preallocation size to 8 MiB or 32 MiB:
> echo 8192 > /sys/fs/ext4/nvme13n1/mb_max_prealloc_kb
> echo 32768 > /sys/fs/ext4/nvme13n1/mb_max_prealloc_kb
> 
> 2.Run the following commands simultaneously so that the extents of
> the two files are physically interleaved, resulting in 8 MiB or 32 MiB
> extents:
> dd if=/dev/zero of=/mnt/store1/501.txt bs=128K count=80K oflag=direct
> dd if=/dev/zero of=/mnt/store1/502.txt bs=128K count=80K oflag=direct
> 
> 3.Read back the file and measure the read throughput:
> dd if=/mnt/store1/501.txt of=/dev/null bs=128K count=80K iflag=direct

OK, seeing that you are using direct IO here you are likely speaking about
some internal mechanism within the SSD that is happier when the IO is more
contiguous in the LBA space?

In general I find the example you show with dd not very performance
relevant. If you care about performance, then you should be running
multiple direct IO requests in parallel (either with AIO/DIO or with
iouring). Or you should be using buffered IO to do this for you behind the
scenes. So do you have a more realistic usecase where the extent allocation
size matters so much or is this mostly a benchmarking exercise?

								Honza
> 
> Signed-off-by: Guzebing <guzebing@bytedance.com>
> ---
>  Documentation/ABI/testing/sysfs-fs-ext4 |  8 +++++++
>  fs/ext4/ext4.h                          |  1 +
>  fs/ext4/mballoc.c                       |  2 +-
>  fs/ext4/super.c                         |  1 +
>  fs/ext4/sysfs.c                         | 28 ++++++++++++++++++++++++-
>  5 files changed, 38 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
> index 2edd0a6672d3a..316ae1d1ec18b 100644
> --- a/Documentation/ABI/testing/sysfs-fs-ext4
> +++ b/Documentation/ABI/testing/sysfs-fs-ext4
> @@ -48,6 +48,14 @@ Description:
>  		will have its blocks allocated out of its own unique
>  		preallocation pool.
>  
> +What:		/sys/fs/ext4/<disk>/mb_max_prealloc_kb
> +Date:		April 2026
> +Contact:	"Linux Ext4 Development List" <linux-ext4@vger.kernel.org>
> +Description:
> +		Maximum size (in kilobytes) used by the multiblock allocator's
> +		normalized request preallocation heuristic. Values are rounded
> +		up to a power of two and clamped to a minimum of 8192 (8MiB).
> +
>  What:		/sys/fs/ext4/<disk>/inode_readahead_blks
>  Date:		March 2008
>  Contact:	"Theodore Ts'o" <tytso@mit.edu>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 7617e2d454ea5..bce99740740f5 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1634,6 +1634,7 @@ struct ext4_sb_info {
>  	unsigned int s_mb_best_avail_max_trim_order;
>  	unsigned int s_sb_update_sec;
>  	unsigned int s_sb_update_kb;
> +	unsigned int s_mb_max_prealloc_kb;
>  
>  	/* where last allocation was done - for stream allocation */
>  	ext4_group_t *s_mb_last_groups;
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index bb58eafb87bcd..f5f63c56fcdac 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4589,7 +4589,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
>  					(8<<20)>>bsbits, max, 8 * 1024)) {
>  		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
>  							(23 - bsbits)) << 23;
> -		size = 8 * 1024 * 1024;
> +		size = (loff_t)sbi->s_mb_max_prealloc_kb << 10;
>  	} else {
>  		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
>  		size	  = (loff_t) EXT4_C2B(sbi,
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index a34efb44e73d7..f815e31657cc9 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -5447,6 +5447,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
>  		sbi->s_stripe = 0;
>  	}
>  	sbi->s_extent_max_zeroout_kb = 32;
> +	sbi->s_mb_max_prealloc_kb = 8 * 1024;
>  
>  	/*
>  	 * set up enough so that it can read an inode
> diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
> index 923b375e017fa..6339492eb2fa7 100644
> --- a/fs/ext4/sysfs.c
> +++ b/fs/ext4/sysfs.c
> @@ -10,6 +10,8 @@
>  
>  #include <linux/time.h>
>  #include <linux/fs.h>
> +#include <linux/log2.h>
> +#include <linux/limits.h>
>  #include <linux/seq_file.h>
>  #include <linux/slab.h>
>  #include <linux/proc_fs.h>
> @@ -41,6 +43,7 @@ typedef enum {
>  	attr_pointer_atomic,
>  	attr_journal_task,
>  	attr_err_report_sec,
> +	attr_mb_max_prealloc_kb,
>  } attr_id_t;
>  
>  typedef enum {
> @@ -115,6 +118,25 @@ static ssize_t reserved_clusters_store(struct ext4_sb_info *sbi,
>  	return count;
>  }
>  
> +static ssize_t mb_max_prealloc_kb_store(struct ext4_sb_info *sbi,
> +					const char *buf, size_t count)
> +{
> +	unsigned int v;
> +	int ret;
> +	unsigned long rounded;
> +
> +	ret = kstrtouint(skip_spaces(buf), 0, &v);
> +	if (ret)
> +		return ret;
> +	if (v < 8192)
> +		v = 8192;
> +	rounded = roundup_pow_of_two((unsigned long)v);
> +	if (rounded > UINT_MAX)
> +		return -EINVAL;
> +	sbi->s_mb_max_prealloc_kb = (unsigned int)rounded;
> +	return count;
> +}
> +
>  static ssize_t trigger_test_error(struct ext4_sb_info *sbi,
>  				  const char *buf, size_t count)
>  {
> @@ -288,6 +310,7 @@ EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit);
>  EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks);
>  EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec);
>  EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb);
> +EXT4_ATTR_OFFSET(mb_max_prealloc_kb, 0644, mb_max_prealloc_kb, ext4_sb_info, s_mb_max_prealloc_kb);
>  
>  static unsigned int old_bump_val = 128;
>  EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
> @@ -341,6 +364,7 @@ static struct attribute *ext4_attrs[] = {
>  	ATTR_LIST(last_trim_minblks),
>  	ATTR_LIST(sb_update_sec),
>  	ATTR_LIST(sb_update_kb),
> +	ATTR_LIST(mb_max_prealloc_kb),
>  	ATTR_LIST(err_report_sec),
>  	NULL,
>  };
> @@ -431,6 +455,7 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
>  	case attr_mb_order:
>  	case attr_pointer_pi:
>  	case attr_pointer_ui:
> +	case attr_mb_max_prealloc_kb:
>  		if (a->attr_ptr == ptr_ext4_super_block_offset)
>  			return sysfs_emit(buf, "%u\n", le32_to_cpup(ptr));
>  		return sysfs_emit(buf, "%u\n", *((unsigned int *) ptr));
> @@ -557,6 +582,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
>  		return reserved_clusters_store(sbi, buf, len);
>  	case attr_inode_readahead:
>  		return inode_readahead_blks_store(sbi, buf, len);
> +	case attr_mb_max_prealloc_kb:
> +		return mb_max_prealloc_kb_store(sbi, buf, len);
>  	case attr_trigger_test_error:
>  		return trigger_test_error(sbi, buf, len);
>  	case attr_err_report_sec:
> @@ -695,4 +722,3 @@ void ext4_exit_sysfs(void)
>  	remove_proc_entry(proc_dirname, NULL);
>  	ext4_proc_root = NULL;
>  }
> -
> -- 
> 2.20.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox