Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* [PATCH 02/17] proc: replace __get_free_page() with kmalloc()
From: Mike Rapoport (Microsoft) @ 2026-05-23 17:54 UTC (permalink / raw)
  To: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian
  Cc: linux-kernel, linux-fsdevel, ocfs2-devel, linux-nilfs, linux-nfs,
	jfs-discussion, linux-ext4, linux-mm, Mike Rapoport (Microsoft)
In-Reply-To: <20260523-b4-fs-v1-0-275e36a83f0e@kernel.org>

A few functions in fs/proc/base.c use __get_free_page() to allocate a
temporary buffer.

kmalloc() is a better API for such use and it also provides better
scalability and more debugging possibilities.

Replace use of __get_free_page() with kmalloc().

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 fs/proc/base.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index d9acfa89c894..e129dc509b79 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -261,7 +261,7 @@ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
 	if (pos >= PAGE_SIZE)
 		return 0;
 
-	page = (char *)__get_free_page(GFP_KERNEL);
+	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!page)
 		return -ENOMEM;
 
@@ -284,7 +284,7 @@ static ssize_t get_mm_proctitle(struct mm_struct *mm, char __user *buf,
 			ret = len;
 		}
 	}
-	free_page((unsigned long)page);
+	kfree(page);
 	return ret;
 }
 
@@ -347,7 +347,7 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
 	if (count > arg_end - pos)
 		count = arg_end - pos;
 
-	page = (char *)__get_free_page(GFP_KERNEL);
+	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!page)
 		return -ENOMEM;
 
@@ -371,7 +371,7 @@ static ssize_t get_mm_cmdline(struct mm_struct *mm, char __user *buf,
 		count -= got;
 	}
 
-	free_page((unsigned long)page);
+	kfree(page);
 	return len;
 }
 
@@ -908,7 +908,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
 	if (!mm)
 		return 0;
 
-	page = (char *)__get_free_page(GFP_KERNEL);
+	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!page)
 		return -ENOMEM;
 
@@ -949,7 +949,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
 
 	mmput(mm);
 free:
-	free_page((unsigned long) page);
+	kfree(page);
 	return copied;
 }
 
@@ -1016,7 +1016,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 	if (!mm || !mm->env_end)
 		return 0;
 
-	page = (char *)__get_free_page(GFP_KERNEL);
+	page = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!page)
 		return -ENOMEM;
 
@@ -1062,7 +1062,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 	mmput(mm);
 
 free:
-	free_page((unsigned long) page);
+	kfree(page);
 	return ret;
 }
 

-- 
2.53.0


^ permalink raw reply related

* [PATCH 01/17] quota: allocate dquot_hash with kmalloc()
From: Mike Rapoport (Microsoft) @ 2026-05-23 17:54 UTC (permalink / raw)
  To: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian
  Cc: linux-kernel, linux-fsdevel, ocfs2-devel, linux-nilfs, linux-nfs,
	jfs-discussion, linux-ext4, linux-mm, Mike Rapoport (Microsoft)
In-Reply-To: <20260523-b4-fs-v1-0-275e36a83f0e@kernel.org>

dquot_init() allocates a single page for dquot_hash with
__get_free_pages().

kmalloc() is a better API for such use and it also provides better
scalability and more debugging possibilities.

Replace use of __get_free_pages() with kmalloc() and get rid of the order
variable that remained 0 for more than 20 years.

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 fs/quota/dquot.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 64cf42721496..9850de3955d3 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -3022,7 +3022,7 @@ static const struct ctl_table fs_dqstats_table[] = {
 static int __init dquot_init(void)
 {
 	int i, ret;
-	unsigned long nr_hash, order;
+	unsigned long nr_hash;
 	struct shrinker *dqcache_shrinker;
 
 	printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -3035,8 +3035,7 @@ static int __init dquot_init(void)
 				SLAB_PANIC),
 			NULL);
 
-	order = 0;
-	dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
+	dquot_hash = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!dquot_hash)
 		panic("Cannot create dquot hash table");
 
@@ -3046,7 +3045,7 @@ static int __init dquot_init(void)
 		panic("Cannot create dquot stat counters");
 
 	/* Find power-of-two hlist_heads which can fit into allocation */
-	nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
+	nr_hash = PAGE_SIZE / sizeof(struct hlist_head);
 	dq_hash_bits = ilog2(nr_hash);
 
 	nr_hash = 1UL << dq_hash_bits;
@@ -3054,8 +3053,8 @@ static int __init dquot_init(void)
 	for (i = 0; i < nr_hash; i++)
 		INIT_HLIST_HEAD(dquot_hash + i);
 
-	pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld,"
-		" %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order));
+	pr_info("VFS: Dquot-cache hash table entries: %ld (%ld bytes)\n",
+		nr_hash, PAGE_SIZE);
 
 	dqcache_shrinker = shrinker_alloc(0, "dquota-cache");
 	if (!dqcache_shrinker)

-- 
2.53.0


^ permalink raw reply related

* [PATCH 00/17] fs: replace __get_free_pages() call with kmalloc()
From: Mike Rapoport (Microsoft) @ 2026-05-23 17:54 UTC (permalink / raw)
  To: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Christian Brauner, Jan Kara, Dave Kleikamp,
	Theodore Ts'o, Miklos Szeredi, Andreas Hindborg, Breno Leitao,
	Kees Cook, Tigran A. Aivazian
  Cc: linux-kernel, linux-fsdevel, ocfs2-devel, linux-nilfs, linux-nfs,
	jfs-discussion, linux-ext4, linux-mm, Mike Rapoport (Microsoft)

This is a (small) part of a larger effort to replace page allocator calls
with kmalloc().

Also in git:
https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git gfp-to-kmalloc/fs

Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
Mike Rapoport (Microsoft) (17):
      quota: allocate dquot_hash with kmalloc()
      proc: replace __get_free_page() with kmalloc()
      ocfs2/dlm: replace __get_free_page() with kmalloc()
      nilfs2: replace get_zeroed_page() with kzalloc()
      NFS: replace __get_free_page() with kmalloc() in nfs_show_devname()
      NFS: remove unused page and page2 in nfs4_replace_transport()
      NFSD: replace __get_free_page() with kmalloc() in nfsd_buffered_readdir()
      libfs: simple_transaction_get(): replace get_zeroed_page() with kzalloc()
      jfs: replace __get_free_page() with kmalloc()
      jbd2: replace __get_free_pages() with kmalloc()
      isofs: replace __get_free_page() with kmalloc()
      fuse: replace __get_free_page() with kmalloc()
      fs/select: replace __get_free_page() with kmalloc()
      fs/namespace: use __getname() to allocate mntpath buffer
      configfs: replace __get_free_pages() with kzalloc()
      binfmt_misc: replace __get_free_page() with kmalloc()
      bfs: replace get_zeroed_page() with kzalloc()

 fs/bfs/inode.c             |  4 ++--
 fs/binfmt_misc.c           |  4 ++--
 fs/configfs/file.c         |  7 +++----
 fs/fuse/ioctl.c            |  5 +++--
 fs/isofs/dir.c             |  5 +++--
 fs/jbd2/journal.c          |  7 ++-----
 fs/jfs/jfs_dtree.c         | 16 ++++++++--------
 fs/libfs.c                 |  6 +++---
 fs/namespace.c             |  4 ++--
 fs/nfs/nfs4namespace.c     | 15 +--------------
 fs/nfs/super.c             |  4 ++--
 fs/nfsd/vfs.c              |  4 ++--
 fs/nilfs2/ioctl.c          |  4 ++--
 fs/ocfs2/dlm/dlmdebug.c    | 24 +++++++++---------------
 fs/ocfs2/dlm/dlmdomain.c   |  8 +++++---
 fs/ocfs2/dlm/dlmmaster.c   |  5 ++---
 fs/ocfs2/dlm/dlmrecovery.c |  4 ++--
 fs/proc/base.c             | 16 ++++++++--------
 fs/quota/dquot.c           | 11 +++++------
 fs/select.c                |  4 ++--
 20 files changed, 68 insertions(+), 89 deletions(-)
---
base-commit: 5d6919055dec134de3c40167a490f33c74c12581
change-id: 20260522-b4-fs-5e5c70f31664

Best regards,
--  
Sincerely yours,
Mike.


^ permalink raw reply

* Re: DEPT (the dependency tracker) as AI review prompt?
From: Yunseong Kim @ 2026-05-23 15:04 UTC (permalink / raw)
  To: Harry Yoo, Yunseong Kim
  Cc: Byungchul Park, linux-kernel, kernel_team, torvalds,
	damien.lemoal, linux-ide, adilger.kernel, linux-ext4, mingo,
	peterz, will, tglx, rostedt, joel, sashal, daniel.vetter,
	duyuyang, johannes.berg, tj, tytso, willy, david, amir73il,
	gregkh, kernel-team, linux-mm, akpm, mhocko, minchan, hannes,
	vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes, vbabka,
	ngupta, linux-block, linux-fsdevel, jack, jlayton, dan.j.williams,
	hch, djwong, dri-devel, rodrigosiqueiramelo, melissa.srw,
	hamohammed.sa, harry.yoo, chris.p.wilson, gwan-gyeong.mun,
	max.byungchul.park, boqun.feng, longman, yunseong.kim,
	yeoreum.yun, netdev, matthew.brost, her0gyugyu, corbet,
	catalin.marinas, bp, x86, hpa, luto, sumit.semwal, gustavo,
	christian.koenig, andi.shyti, arnd, lorenzo.stoakes, Liam.Howlett,
	rppt, surenb, mcgrof, petr.pavlu, da.gomez, samitolvanen, paulmck,
	frederic, neeraj.upadhyay, joelagnelf, josh, urezki,
	mathieu.desnoyers, jiangshanlai, qiang.zhang, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, vschneid,
	chuck.lever, neil, okorniev, Dai.Ngo, tom, trondmy, anna, kees,
	bigeasy, clrkwllms, mark.rutland, ada.coupriediaz,
	kristina.martsenko, wangkefeng.wang, broonie, kevin.brodsky, dwmw,
	shakeel.butt, ast, ziy, yuzhao, baolin.wang, usamaarif642,
	joel.granados, richard.weiyang, geert+renesas, tim.c.chen, linux,
	alexander.shishkin, lillian, chenhuacai, francesco,
	guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
	thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
	linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
	linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel,
	2407018371, dakr, miguel.ojeda.sandonis, neilb, bagasdotme,
	wsa+renesas, dave.hansen, geert, ojeda, alex.gaynor, gary,
	bjorn3_gh, lossin, a.hindborg, aliceryhl, tmgross, rust-for-linux,
	Chris Mason, Roman Gushchin, Josef Bacik
In-Reply-To: <0592b09b-a084-4d9d-bcbf-1b77e45226cf@kernel.org>

Hi Harry,

On 5/23/26 16:34, Harry Yoo wrote:
> 
> 
> On 5/23/26 11:00 PM, Yunseong Kim wrote:
>> I've previously experimented with running DEPT alongside syzkaller fuzzing,
>> and many hung tasks missed by lockdep are caught by DEPT, but the resulting
>> high volume of reports makes it easy for issues to get lost in the massive
>> log output. Sorting through that output manually is a huge bottleneck, so
>> leveraging a well-crafted AI prompt to triage the warnings and filter out
>> the false positives would be incredibly valuable.
> 
> I mean both 1) detection of deadlock issues AND 2) false positive elimination with AI.

I completely agree.  Implanting DEPT's model into an AI review prompt
is a great idea. As you suggested, the patterns we develop for the AI
could provide valuable feedback to enhance DEPT itself.

> If the review prompt is only used to eliminate DEPT's false positives, I think that would be quite hard to get broad use.
> 
> Someone would have to build out-of-tree DEPT, collect the reports, and then feed those back into the AI. I don't think building that kind of pipeline would actually work well in practice.

I also have a huge pile of DEPT reports, and manually
reviewing all of them makes me sigh. The constant kernel rebuilds
required for each round of lockup testing are also quite expensive.

Thanks for the summary!

Best Regards,
Yunseong

^ permalink raw reply

* Re: DEPT (the dependency tracker) as AI review prompt?
From: Harry Yoo @ 2026-05-23 14:34 UTC (permalink / raw)
  To: Yunseong Kim
  Cc: Byungchul Park, linux-kernel, kernel_team, torvalds,
	damien.lemoal, linux-ide, adilger.kernel, linux-ext4, mingo,
	peterz, will, tglx, rostedt, joel, sashal, daniel.vetter,
	duyuyang, johannes.berg, tj, tytso, willy, david, amir73il,
	gregkh, kernel-team, linux-mm, akpm, mhocko, minchan, hannes,
	vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes, vbabka,
	ngupta, linux-block, linux-fsdevel, jack, jlayton, dan.j.williams,
	hch, djwong, dri-devel, rodrigosiqueiramelo, melissa.srw,
	hamohammed.sa, harry.yoo, chris.p.wilson, gwan-gyeong.mun,
	max.byungchul.park, boqun.feng, longman, yunseong.kim,
	yeoreum.yun, netdev, matthew.brost, her0gyugyu, corbet,
	catalin.marinas, bp, x86, hpa, luto, sumit.semwal, gustavo,
	christian.koenig, andi.shyti, arnd, lorenzo.stoakes, Liam.Howlett,
	rppt, surenb, mcgrof, petr.pavlu, da.gomez, samitolvanen, paulmck,
	frederic, neeraj.upadhyay, joelagnelf, josh, urezki,
	mathieu.desnoyers, jiangshanlai, qiang.zhang, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, vschneid,
	chuck.lever, neil, okorniev, Dai.Ngo, tom, trondmy, anna, kees,
	bigeasy, clrkwllms, mark.rutland, ada.coupriediaz,
	kristina.martsenko, wangkefeng.wang, broonie, kevin.brodsky, dwmw,
	shakeel.butt, ast, ziy, yuzhao, baolin.wang, usamaarif642,
	joel.granados, richard.weiyang, geert+renesas, tim.c.chen, linux,
	alexander.shishkin, lillian, chenhuacai, francesco,
	guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
	thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
	linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
	linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel,
	2407018371, dakr, miguel.ojeda.sandonis, neilb, bagasdotme,
	wsa+renesas, dave.hansen, geert, ojeda, alex.gaynor, gary,
	bjorn3_gh, lossin, a.hindborg, aliceryhl, tmgross, rust-for-linux,
	Chris Mason, Roman Gushchin, Josef Bacik, Yunseong Kim
In-Reply-To: <CA+7O06GxeDLR9RcKDN2i-Rgc4kgzz6BfF4b0XAH4tFx=A723Nw@mail.gmail.com>



On 5/23/26 11:00 PM, Yunseong Kim wrote:
> I've previously experimented with running DEPT alongside syzkaller fuzzing,
> and many hung tasks missed by lockdep are caught by DEPT, but the resulting
> high volume of reports makes it easy for issues to get lost in the massive
> log output. Sorting through that output manually is a huge bottleneck, so
> leveraging a well-crafted AI prompt to triage the warnings and filter out
> the false positives would be incredibly valuable.

I mean both 1) detection of deadlock issues AND 2) false positive 
elimination with AI.

If the review prompt is only used to eliminate DEPT's false positives, I 
think that would be quite hard to get broad use.

Someone would have to build out-of-tree DEPT, collect the reports, and 
then feed those back into the AI. I don't think building that kind of 
pipeline would actually work well in practice.

-- 
Cheers,
Harry / Hyeonggon


^ permalink raw reply

* Re: DEPT (the dependency tracker) as AI review prompt? (was: DEPT v18)
From: Yunseong Kim @ 2026-05-23 14:00 UTC (permalink / raw)
  To: Harry Yoo
  Cc: Byungchul Park, linux-kernel, kernel_team, torvalds,
	damien.lemoal, linux-ide, adilger.kernel, linux-ext4, mingo,
	peterz, will, tglx, rostedt, joel, sashal, daniel.vetter,
	duyuyang, johannes.berg, tj, tytso, willy, david, amir73il,
	gregkh, kernel-team, linux-mm, akpm, mhocko, minchan, hannes,
	vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes, vbabka,
	ngupta, linux-block, linux-fsdevel, jack, jlayton, dan.j.williams,
	hch, djwong, dri-devel, rodrigosiqueiramelo, melissa.srw,
	hamohammed.sa, harry.yoo, chris.p.wilson, gwan-gyeong.mun,
	max.byungchul.park, boqun.feng, longman, yunseong.kim, ysk,
	yeoreum.yun, netdev, matthew.brost, her0gyugyu, corbet,
	catalin.marinas, bp, x86, hpa, luto, sumit.semwal, gustavo,
	christian.koenig, andi.shyti, arnd, lorenzo.stoakes, Liam.Howlett,
	rppt, surenb, mcgrof, petr.pavlu, da.gomez, samitolvanen, paulmck,
	frederic, neeraj.upadhyay, joelagnelf, josh, urezki,
	mathieu.desnoyers, jiangshanlai, qiang.zhang, juri.lelli,
	vincent.guittot, dietmar.eggemann, bsegall, mgorman, vschneid,
	chuck.lever, neil, okorniev, Dai.Ngo, tom, trondmy, anna, kees,
	bigeasy, clrkwllms, mark.rutland, ada.coupriediaz,
	kristina.martsenko, wangkefeng.wang, broonie, kevin.brodsky, dwmw,
	shakeel.butt, ast, ziy, yuzhao, baolin.wang, usamaarif642,
	joel.granados, richard.weiyang, geert+renesas, tim.c.chen, linux,
	alexander.shishkin, lillian, chenhuacai, francesco,
	guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
	thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
	linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
	linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel,
	2407018371, dakr, miguel.ojeda.sandonis, neilb, bagasdotme,
	wsa+renesas, dave.hansen, geert, ojeda, alex.gaynor, gary,
	bjorn3_gh, lossin, a.hindborg, aliceryhl, tmgross, rust-for-linux,
	Chris Mason, Roman Gushchin, Josef Bacik, Yunseong Kim
In-Reply-To: <6b2a816f-eb3b-4e0c-a024-ee2e3743eb04@kernel.org>

Hi Harry,

On Sat, May 23, 2026 at 2:33 PM Harry Yoo <harry@kernel.org> wrote:
>
> Can we start DEPT as an AI review prompt, by documenting DEPT's
> dependency tracking model and false positive elimination rules as a
> carefully crafted prompt?
>
> While DEPT can identify deadlock issues beyond lockdep's capabilities,
> it is hard to enable in automated testing; without fine-grained
> annotations it can produce a high rate of false positives, and verifying
> them requires significant human effort.
>
> The open source AI Review Prompt has locking.md file [1] that teaches
> the AI how to review locks and detect misuse.
>
> If we can write a review prompt for DEPT in a similar manner and have
> the AI do the deadlock detection and false positive elimination, I think
> we could identify those problems more effectively with much less human
> effort.
>
> [1]
> https://github.com/masoncl/review-prompts/blob/main/kernel/subsystem/locking.md
>
> --
> Cheers,
> Harry / Hyeonggon

I think this is an excellent idea, Harry.

I've previously experimented with running DEPT alongside syzkaller fuzzing,
and many hung tasks missed by lockdep are caught by DEPT, but the resulting
high volume of reports makes it easy for issues to get lost in the massive
log output. Sorting through that output manually is a huge bottleneck, so
leveraging a well-crafted AI prompt to triage the warnings and filter out
the false positives would be incredibly valuable.

Leveraging an AI prompt to triage these warnings would be incredibly valuable.
I'd be happy to help translate DEPT's tracking model into specific rules for
reducing false positives and establishing solid filtering patterns.

> On 12/5/25 4:18 PM, Byungchul Park wrote:
> > I'm happy to see that DEPT reported real problems in practice:
> >
> >     https://lore.kernel.org/lkml/6383cde5-cf4b-facf-6e07-1378a485657d@I-love.SAKURA.ne.jp/
> >     https://lore.kernel.org/lkml/1674268856-31807-1-git-send-email-byungchul.park@lge.com/
> >     https://lore.kernel.org/all/b6e00e77-4a8c-4e05-ab79-266bf05fcc2d@igalia.com/
> >
> > I’ve added documentation describing DEPT — this should help you
> > understand what DEPT is and how it works.  You can use DEPT simply by
> > enabling CONFIG_DEPT and checking dmesg at runtime.
> > ---
> >
> > Hi Linus and folks,
> >
> > I’ve been developing a tool to detect deadlock possibilities by tracking
> > waits/events — rather than lock acquisition order — to cover all the
> > synchronization mechanisms.  To summarize the design rationale, starting
> > from the problem statement, through analysis, to the solution:
> >
> >     CURRENT STATUS
> >     --------------
> >     Lockdep tracks lock acquisition order to identify deadlock conditions.
> >     Additionally, it tracks IRQ state changes — via {en,dis}able — to
> >     detect cases where locks are acquired unintentionally during
> >     interrupt handling.
> >
> >     PROBLEM
> >     -------
> >     Waits and their associated events that are never reachable can
> >     eventually lead to deadlocks.  However, since Lockdep focuses solely
> >     on lock acquisition order, it has inherent limitations when handling
> >     waits and events.
> >
> >     Moreover, by tracking only lock acquisition order, Lockdep cannot
> >     properly handle read locks or cross-event scenarios — such as
> >     wait_for_completion() and complete() — making it increasingly
> >     inadequate as a general-purpose deadlock detection tool.
> >
> >     SOLUTION
> >     --------
> >     Once again, waits and their associated events that are never
> >     reachable can eventually lead to deadlocks.  The new solution, DEPT,
> >     focuses directly on waits and events.  DEPT monitors waits and events,
> >     and reports them when any become unreachable.
> >
> > DEPT provides:
> >
> >     * Correct handling of read locks.
> >     * Support for general waits and events.
> >     * Continuous operation, even after multiple reports.
> >     * Simple, intuitive annotation APIs.
> >
> > There are still false positives, and some are already being worked on
> > for suppression.  Especially splitting the folio class into several
> > appropriate classes e.g. block device mapping class and regular file
> > mapping class, is currently under active development by me and Yeoreum
> > Yun.
> > Anyway, these efforts will need to continue for a while, as we’ve seen
> > with lockdep over two decades.  DEPT is tagged as EXPERIMENTAL in
> > Kconfig — meaning it’s not yet suitable for use as an automation tool.
> >
> > However, for those who are interested in using DEPT to analyze complex
> > synchronization patterns and extract dependency insights, DEPT would be
> > a great tool for the purpose.

Best regards,
Yunseong

^ permalink raw reply

* DEPT (the dependency tracker) as AI review prompt? (was: DEPT v18)
From: Harry Yoo @ 2026-05-23 12:32 UTC (permalink / raw)
  To: Byungchul Park, linux-kernel
  Cc: kernel_team, torvalds, damien.lemoal, linux-ide, adilger.kernel,
	linux-ext4, mingo, peterz, will, tglx, rostedt, joel, sashal,
	daniel.vetter, duyuyang, johannes.berg, tj, tytso, willy, david,
	amir73il, gregkh, kernel-team, linux-mm, akpm, mhocko, minchan,
	hannes, vdavydov.dev, sj, jglisse, dennis, cl, penberg, rientjes,
	vbabka, ngupta, linux-block, josef, linux-fsdevel, jack, jlayton,
	dan.j.williams, hch, djwong, dri-devel, rodrigosiqueiramelo,
	melissa.srw, hamohammed.sa, harry.yoo, chris.p.wilson,
	gwan-gyeong.mun, max.byungchul.park, boqun.feng, longman,
	yunseong.kim, ysk, yeoreum.yun, netdev, matthew.brost, her0gyugyu,
	corbet, catalin.marinas, bp, x86, hpa, luto, sumit.semwal,
	gustavo, christian.koenig, andi.shyti, arnd, lorenzo.stoakes,
	Liam.Howlett, rppt, surenb, mcgrof, petr.pavlu, da.gomez,
	samitolvanen, paulmck, frederic, neeraj.upadhyay, joelagnelf,
	josh, urezki, mathieu.desnoyers, jiangshanlai, qiang.zhang,
	juri.lelli, vincent.guittot, dietmar.eggemann, bsegall, mgorman,
	vschneid, chuck.lever, neil, okorniev, Dai.Ngo, tom, trondmy,
	anna, kees, bigeasy, clrkwllms, mark.rutland, ada.coupriediaz,
	kristina.martsenko, wangkefeng.wang, broonie, kevin.brodsky, dwmw,
	shakeel.butt, ast, ziy, yuzhao, baolin.wang, usamaarif642,
	joel.granados, richard.weiyang, geert+renesas, tim.c.chen, linux,
	alexander.shishkin, lillian, chenhuacai, francesco,
	guoweikang.kernel, link, jpoimboe, masahiroy, brauner,
	thomas.weissschuh, oleg, mjguzik, andrii, wangfushuai, linux-doc,
	linux-arm-kernel, linux-media, linaro-mm-sig, linux-i2c,
	linux-arch, linux-modules, rcu, linux-nfs, linux-rt-devel,
	2407018371, dakr, miguel.ojeda.sandonis, neilb, bagasdotme,
	wsa+renesas, dave.hansen, geert, ojeda, alex.gaynor, gary,
	bjorn3_gh, lossin, a.hindborg, aliceryhl, tmgross, rust-for-linux,
	Chris Mason, Roman Gushchin, Josef Bacik
In-Reply-To: <20251205071855.72743-1-byungchul@sk.com>

Can we start DEPT as an AI review prompt, by documenting DEPT's 
dependency tracking model and false positive elimination rules as a 
carefully crafted prompt?

While DEPT can identify deadlock issues beyond lockdep's capabilities, 
it is hard to enable in automated testing; without fine-grained 
annotations it can produce a high rate of false positives, and verifying 
them requires significant human effort.

The open source AI Review Prompt has a locking.md file [1] that teaches 
the AI how to review locks and detect misuse.

If we can write a review prompt for DEPT in a similar manner and have 
the AI do the deadlock detection and false positive elimination, I think 
we could identify those problems more effectively with much less human 
effort.

[1] 
https://github.com/masoncl/review-prompts/blob/main/kernel/subsystem/locking.md

-- 
Cheers,
Harry / Hyeonggon

On 12/5/25 4:18 PM, Byungchul Park wrote:
> I'm happy to see that DEPT reported real problems in practice:
> 
>     https://lore.kernel.org/lkml/6383cde5-cf4b-facf-6e07-1378a485657d@I-love.SAKURA.ne.jp/
>     https://lore.kernel.org/lkml/1674268856-31807-1-git-send-email-byungchul.park@lge.com/
>     https://lore.kernel.org/all/b6e00e77-4a8c-4e05-ab79-266bf05fcc2d@igalia.com/
> 
> I’ve added documentation describing DEPT — this should help you
> understand what DEPT is and how it works.  You can use DEPT simply by
> enabling CONFIG_DEPT and checking dmesg at runtime.
> ---
> 
> Hi Linus and folks,
> 
> I’ve been developing a tool to detect deadlock possibilities by tracking
> waits/events — rather than lock acquisition order — to cover all the
> synchronization mechanisms.  To summarize the design rationale, starting
> from the problem statement, through analysis, to the solution:
> 
>     CURRENT STATUS
>     --------------
>     Lockdep tracks lock acquisition order to identify deadlock conditions.
>     Additionally, it tracks IRQ state changes — via {en,dis}able — to
>     detect cases where locks are acquired unintentionally during
>     interrupt handling.
>     
>     PROBLEM
>     -------
>     Waits and their associated events that are never reachable can
>     eventually lead to deadlocks.  However, since Lockdep focuses solely
>     on lock acquisition order, it has inherent limitations when handling
>     waits and events.
>     
>     Moreover, by tracking only lock acquisition order, Lockdep cannot
>     properly handle read locks or cross-event scenarios — such as
>     wait_for_completion() and complete() — making it increasingly
>     inadequate as a general-purpose deadlock detection tool.
>     
>     SOLUTION
>     --------
>     Once again, waits and their associated events that are never
>     reachable can eventually lead to deadlocks.  The new solution, DEPT,
>     focuses directly on waits and events.  DEPT monitors waits and events,
>     and reports them when any become unreachable.
> 
> DEPT provides:
> 
>     * Correct handling of read locks.
>     * Support for general waits and events.
>     * Continuous operation, even after multiple reports.
>     * Simple, intuitive annotation APIs.
> 
> There are still false positives, and some are already being worked on
> for suppression.  Especially splitting the folio class into several
> appropriate classes e.g. block device mapping class and regular file
> mapping class, is currently under active development by me and Yeoreum
> Yun.
> Anyway, these efforts will need to continue for a while, as we’ve seen
> with lockdep over two decades.  DEPT is tagged as EXPERIMENTAL in
> Kconfig — meaning it’s not yet suitable for use as an automation tool.
> 
> However, for those who are interested in using DEPT to analyze complex
> synchronization patterns and extract dependency insights, DEPT would be
> a great tool for the purpose.


^ permalink raw reply

* [PATCH] ext2: Remove deprecated DAX support
From: Ashwin Gundarapu @ 2026-05-23 12:20 UTC (permalink / raw)
  To: jack; +Cc: linux-ext4, linux-kernel

DAX support in ext2 was deprecated in commit d5a2693f93e4
("ext2: Deprecate DAX") with a removal deadline of end of 2025.
Remove all DAX code from ext2 as scheduled.

This removes the DAX mount option, IOMAP DAX support, DAX file
operations, DAX address_space_operations, and the DAX fault handler.

Signed-off-by: Ashwin Gundarapu <linuxuser509@zohomail.in>
---
 fs/ext2/ext2.h  |   5 +-
 fs/ext2/file.c  | 118 ++----------------------------------------------
 fs/ext2/inode.c |  60 ++----------------------
 fs/ext2/super.c |  45 +++---------------
 4 files changed, 16 insertions(+), 212 deletions(-)

diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3eb1f342645c..3a26308ec841 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -114,8 +114,6 @@ struct ext2_sb_info {
 	 */
 	spinlock_t s_lock;
 	struct mb_cache *s_ea_block_cache;
-	struct dax_device *s_daxdev;
-	u64 s_dax_part_off;
 };

 static inline spinlock_t *
@@ -373,11 +371,10 @@ struct ext2_inode {
 #define EXT2_MOUNT_NO_UID32		0x000200  /* Disable 32-bit UIDs */
 #define EXT2_MOUNT_XATTR_USER		0x004000  /* Extended user attributes */
 #define EXT2_MOUNT_POSIX_ACL		0x008000  /* POSIX Access Control Lists */
-#define EXT2_MOUNT_XIP			0x010000  /* Obsolete, use DAX */
+#define EXT2_MOUNT_XIP			0x010000  /* Obsolete*/
 #define EXT2_MOUNT_USRQUOTA		0x020000  /* user quota */
 #define EXT2_MOUNT_GRPQUOTA		0x040000  /* group quota */
 #define EXT2_MOUNT_RESERVATION		0x080000  /* Preallocation */
-#define EXT2_MOUNT_DAX			0x100000  /* Direct Access */
 
 
 #define clear_opt(o, opt)		o &= ~EXT2_MOUNT_##opt
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index d9b1eb34694a..0fd9208af062 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -21,7 +21,6 @@

 #include <linux/time.h>
 #include <linux/pagemap.h>
-#include <linux/dax.h>
 #include <linux/filelock.h>
 #include <linux/quotaops.h>
 #include <linux/iomap.h>
@@ -32,111 +31,6 @@
 #include "acl.h"
 #include "trace.h"

-#ifdef CONFIG_FS_DAX
-static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
-{
-	struct inode *inode = iocb->ki_filp->f_mapping->host;
-	ssize_t ret;
-
-	if (!iov_iter_count(to))
-		return 0; /* skip atime */
-
-	inode_lock_shared(inode);
-	ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops);
-	inode_unlock_shared(inode);
-
-	file_accessed(iocb->ki_filp);
-	return ret;
-}
-
-static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
-	ssize_t ret;
-
-	inode_lock(inode);
-	ret = generic_write_checks(iocb, from);
-	if (ret <= 0)
-		goto out_unlock;
-	ret = file_remove_privs(file);
-	if (ret)
-		goto out_unlock;
-	ret = file_update_time(file);
-	if (ret)
-		goto out_unlock;
-
-	ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops);
-	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
-		i_size_write(inode, iocb->ki_pos);
-		mark_inode_dirty(inode);
-	}
-
-out_unlock:
-	inode_unlock(inode);
-	if (ret > 0)
-		ret = generic_write_sync(iocb, ret);
-	return ret;
-}
-
-/*
- * The lock ordering for ext2 DAX fault paths is:
- *
- * mmap_lock (MM)
- *   sb_start_pagefault (vfs, freeze)
- *     address_space->invalidate_lock
- *       address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
- *         ext2_inode_info->truncate_mutex
- *
- * The default page_lock and i_size verification done by non-DAX fault paths
- * is sufficient because ext2 doesn't support hole punching.
- */
-static vm_fault_t ext2_dax_fault(struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vmf->vma->vm_file);
-	vm_fault_t ret;
-	bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
-		(vmf->vma->vm_flags & VM_SHARED);
-
-	if (write) {
-		sb_start_pagefault(inode->i_sb);
-		file_update_time(vmf->vma->vm_file);
-	}
-	filemap_invalidate_lock_shared(inode->i_mapping);
-
-	ret = dax_iomap_fault(vmf, 0, NULL, NULL, &ext2_iomap_ops);
-
-	filemap_invalidate_unlock_shared(inode->i_mapping);
-	if (write)
-		sb_end_pagefault(inode->i_sb);
-	return ret;
-}
-
-static const struct vm_operations_struct ext2_dax_vm_ops = {
-	.fault		= ext2_dax_fault,
-	/*
-	 * .huge_fault is not supported for DAX because allocation in ext2
-	 * cannot be reliably aligned to huge page sizes and so pmd faults
-	 * will always fail and fail back to regular faults.
-	 */
-	.page_mkwrite	= ext2_dax_fault,
-	.pfn_mkwrite	= ext2_dax_fault,
-};
-
-static int ext2_file_mmap_prepare(struct vm_area_desc *desc)
-{
-	struct file *file = desc->file;
-
-	if (!IS_DAX(file_inode(file)))
-		return generic_file_mmap_prepare(desc);
-
-	file_accessed(file);
-	desc->vm_ops = &ext2_dax_vm_ops;
-	return 0;
-}
-#else
-#define ext2_file_mmap_prepare	generic_file_mmap_prepare
-#endif
 
 /*
  * Called when filp is released. This happens when all file descriptors
@@ -285,10 +179,7 @@ static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
-#ifdef CONFIG_FS_DAX
-	if (IS_DAX(iocb->ki_filp->f_mapping->host))
-		return ext2_dax_read_iter(iocb, to);
-#endif
+
 	if (iocb->ki_flags & IOCB_DIRECT)
 		return ext2_dio_read_iter(iocb, to);

@@ -297,10 +188,7 @@ static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
-#ifdef CONFIG_FS_DAX
-	if (IS_DAX(iocb->ki_filp->f_mapping->host))
-		return ext2_dax_write_iter(iocb, from);
-#endif
+
 	if (iocb->ki_flags & IOCB_DIRECT)
 		return ext2_dio_write_iter(iocb, from);

@@ -321,7 +209,7 @@ const struct file_operations ext2_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.mmap_prepare	= ext2_file_mmap_prepare,
+	.mmap_prepare = generic_file_mmap_prepare,
 	.open		= ext2_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 74aca5eb572d..91fc4709836f 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -26,7 +26,6 @@
 #include <linux/time.h>
 #include <linux/highuid.h>
 #include <linux/pagemap.h>
-#include <linux/dax.h>
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/writeback.h>
@@ -741,27 +740,6 @@ static int ext2_get_blocks(struct inode *inode,
 		goto cleanup;
 	}
 
-	if (IS_DAX(inode)) {
-		/*
-		 * We must unmap blocks before zeroing so that writeback cannot
-		 * overwrite zeros with stale data from block device page cache.
-		 */
-		clean_bdev_aliases(inode->i_sb->s_bdev,
-				   le32_to_cpu(chain[depth-1].key),
-				   count);
-		/*
-		 * block must be initialised before we put it in the tree
-		 * so that it's not found by another thread before it's
-		 * initialised
-		 */
-		err = sb_issue_zeroout(inode->i_sb,
-				le32_to_cpu(chain[depth-1].key), count,
-				GFP_KERNEL);
-		if (err) {
-			mutex_unlock(&ei->truncate_mutex);
-			goto cleanup;
-		}
-	}
 	*new = true;
 
 	ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
@@ -841,10 +819,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 
 	iomap->flags = 0;
 	iomap->offset = (u64)first_block << blkbits;
-	if (flags & IOMAP_DAX)
-		iomap->dax_dev = sbi->s_daxdev;
-	else
-		iomap->bdev = inode->i_sb->s_bdev;
+        iomap->bdev = inode->i_sb->s_bdev;
 
 	if (ret == 0) {
 		/*
@@ -859,8 +834,6 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
 	} else {
 		iomap->type = IOMAP_MAPPED;
 		iomap->addr = (u64)bno << blkbits;
-		if (flags & IOMAP_DAX)
-			iomap->addr += sbi->s_dax_part_off;
 		iomap->length = (u64)ret << blkbits;
 		iomap->flags |= IOMAP_F_MERGED;
 	}
@@ -962,13 +935,6 @@ ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
-static int
-ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc)
-{
-	struct ext2_sb_info *sbi = EXT2_SB(mapping->host->i_sb);
-
-	return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
-}

 const struct address_space_operations ext2_aops = {
 	.dirty_folio		= block_dirty_folio,
@@ -984,10 +950,6 @@ const struct address_space_operations ext2_aops = {
 	.error_remove_folio	= generic_error_remove_folio,
 };
 
-static const struct address_space_operations ext2_dax_aops = {
-	.writepages		= ext2_dax_writepages,
-	.dirty_folio		= noop_dirty_folio,
-};
 
 /*
  * Probably it should be a library function... search for first non-zero word
@@ -1186,9 +1148,6 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 	blocksize = inode->i_sb->s_blocksize;
 	iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);

-#ifdef CONFIG_FS_DAX
-	WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock));
-#endif
 
 	n = ext2_block_to_path(inode, iblock, offsets, NULL);
 	if (n == 0)
@@ -1290,12 +1249,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 
 	inode_dio_wait(inode);
 
-	if (IS_DAX(inode))
-		error = dax_truncate_page(inode, newsize, NULL,
-					  &ext2_iomap_ops);
-	else
-		error = block_truncate_page(inode->i_mapping,
-				newsize, ext2_get_block);
+        error = block_truncate_page(inode->i_mapping,
+                                newsize, ext2_get_block);
 	if (error)
 		return error;
 
@@ -1363,7 +1318,7 @@ void ext2_set_inode_flags(struct inode *inode)
 	unsigned int flags = EXT2_I(inode)->i_flags;

 	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
-				S_DIRSYNC | S_DAX);
+				S_DIRSYNC);
 	if (flags & EXT2_SYNC_FL)
 		inode->i_flags |= S_SYNC;
 	if (flags & EXT2_APPEND_FL)
@@ -1374,18 +1329,13 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_NOATIME;
 	if (flags & EXT2_DIRSYNC_FL)
 		inode->i_flags |= S_DIRSYNC;
-	if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
-		inode->i_flags |= S_DAX;
 }
 
 void ext2_set_file_ops(struct inode *inode)
 {
 	inode->i_op = &ext2_file_inode_operations;
 	inode->i_fop = &ext2_file_operations;
-	if (IS_DAX(inode))
-		inode->i_mapping->a_ops = &ext2_dax_aops;
-	else
-		inode->i_mapping->a_ops = &ext2_aops;
+    inode->i_mapping->a_ops = &ext2_aops;
 }
 
 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 19f76d8cb473..ede4ad84828c 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -34,7 +34,6 @@
 #include <linux/log2.h>
 #include <linux/quotaops.h>
 #include <linux/uaccess.h>
-#include <linux/dax.h>
 #include <linux/iversion.h>
 #include "ext2.h"
 #include "xattr.h"
@@ -198,7 +197,6 @@ static void ext2_put_super (struct super_block * sb)
 	brelse (sbi->s_sbh);
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
-	fs_put_dax(sbi->s_daxdev, NULL);
 	kfree(sbi);
 }
 
@@ -332,8 +330,6 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
 	if (test_opt(sb, XIP))
 		seq_puts(seq, ",xip");
 
-	if (test_opt(sb, DAX))
-		seq_puts(seq, ",dax");
 
 	if (!test_opt(sb, RESERVATION))
 		seq_puts(seq, ",noreservation");
@@ -432,7 +428,7 @@ static const struct export_operations ext2_export_ops = {
 enum {
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid,
 	Opt_sb, Opt_errors, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
-	Opt_nobh, Opt_user_xattr, Opt_acl, Opt_xip, Opt_dax, Opt_ignore,
+	Opt_nobh, Opt_user_xattr, Opt_acl, Opt_xip, Opt_ignore,
 	Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation,
 };
 
@@ -462,7 +458,6 @@ static const struct fs_parameter_spec ext2_param_spec[] = {
 	fsparam_flag_no	("user_xattr", Opt_user_xattr),
 	fsparam_flag_no	("acl", Opt_acl),
 	fsparam_flag	("xip", Opt_xip),
-	fsparam_flag	("dax", Opt_dax),
 	fsparam_flag	("grpquota", Opt_grpquota),
 	fsparam_flag	("noquota", Opt_ignore),
 	fsparam_flag	("quota", Opt_quota),
@@ -595,20 +590,9 @@ static int ext2_parse_param(struct fs_context *fc, struct fs_parameter *param)
 		ext2_msg_fc(fc, KERN_INFO, "(no)acl options not supported");
 		break;
 #endif
-	case Opt_xip:
-		ext2_msg_fc(fc, KERN_INFO, "use dax instead of xip");
-		ctx_set_mount_opt(ctx, EXT2_MOUNT_XIP);
-		fallthrough;
-	case Opt_dax:
-#ifdef CONFIG_FS_DAX
-		ext2_msg_fc(fc, KERN_WARNING,
-		    "DAX enabled. Warning: DAX support in ext2 driver is deprecated"
-		    " and will be removed at the end of 2025. Please use ext4 driver instead.");
-		ctx_set_mount_opt(ctx, EXT2_MOUNT_DAX);
-#else
-		ext2_msg_fc(fc, KERN_INFO, "dax option not supported");
-#endif
-		break;
+        case Opt_xip:
+                ext2_msg_fc(fc, KERN_ERR, "DAX support has been removed. Please use ext4 instead.");
+                return -EINVAL;
 
 #if defined(CONFIG_QUOTA)
 	case Opt_quota:
@@ -906,8 +890,6 @@ static int ext2_fill_super(struct super_block *sb, struct fs_context *fc)
 	}
 	sb->s_fs_info = sbi;
 	sbi->s_sb_block = sb_block;
-	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
-					   NULL, NULL);
 
 	spin_lock_init(&sbi->s_lock);
 	ret = -EINVAL;
@@ -992,16 +974,8 @@ static int ext2_fill_super(struct super_block *sb, struct fs_context *fc)
 	}
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
 
-	if (test_opt(sb, DAX)) {
-		if (!sbi->s_daxdev) {
-			ext2_msg(sb, KERN_ERR,
-				"DAX unsupported by block device. Turning off DAX.");
-			clear_opt(sbi->s_mount_opt, DAX);
-		} else if (blocksize != PAGE_SIZE) {
-			ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n");
-			clear_opt(sbi->s_mount_opt, DAX);
-		}
-	}
+
+

 	/* If the blocksize doesn't match, re-read the thing.. */
 	if (sb->s_blocksize != blocksize) {
@@ -1252,7 +1226,6 @@ static int ext2_fill_super(struct super_block *sb, struct fs_context *fc)
 failed_mount:
 	brelse(bh);
 failed_sbi:
-	fs_put_dax(sbi->s_daxdev, NULL);
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
@@ -1379,11 +1352,7 @@ static int ext2_reconfigure(struct fs_context *fc)

 	spin_lock(&sbi->s_lock);
 	es = sbi->s_es;
-	if ((sbi->s_mount_opt ^ new_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
-		ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
-			 "dax flag with busy inodes while remounting");
-		new_opts.s_mount_opt ^= EXT2_MOUNT_DAX;
-	}
+
 	if ((bool)(flags & SB_RDONLY) == sb_rdonly(sb))
 		goto out_set;
 	if (flags & SB_RDONLY) {
--
2.43.0


^ permalink raw reply related

* Re: [PATCH 0/2] fs: refactor code to use clear_and_wake_up_bit()
From: Christian Brauner @ 2026-05-22 13:14 UTC (permalink / raw)
  To: linux-fsdevel, linux-ext4, linux-kernel, Jan Kara, shuo chen,
	Theodore Ts'o, linux-kernel-mentees, shuah, patch-reply,
	Agatha Isabelle Moreira
  Cc: Christian Brauner
In-Reply-To: <ag4PEP52c8rxrYPc@guidai>

On Wed, 20 May 2026 16:45:35 -0300, Agatha Isabelle Moreira wrote:
> Refactor code to use `clear_and_wake_up_bit()` instead of manual calls
> to:
>        	clear_bit_unlock();
> 	smp_mb__after_atomic();
> 	wake_up_bit();
> 
> The helper function `clear_and_wake_up_bit()` was introduced in
> 'commit 8236b0ae31c83 ("bdi: wake up concurrent wb_shutdown()
> callers.")' as a generic way of doing the same sequence of operations,
> but several pieces of code still remain.
> 
> [...]

Applied to the vfs-7.2.misc branch of the vfs/vfs.git tree.
Patches in the vfs-7.2.misc branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review of the original patch series, allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.2.misc

[1/2] fs: buffer: use clear_and_wake_up_bit() in unlock_buffer()
      https://git.kernel.org/vfs/vfs/c/89aafbd6d52b
[2/2] fs: jbd2: use clear_and_wake_up_bit() in journal_end_buffer_io_sync()
      https://git.kernel.org/vfs/vfs/c/d7a7e95e5f50

^ permalink raw reply

* Re: [PATCH v10 00/22] fs-verity support for XFS with post EOF merkle tree
From: Christoph Hellwig @ 2026-05-22 12:07 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Carlos Maiolino, Andrey Albershteyn, Christoph Hellwig,
	Andrey Albershteyn, linux-xfs, fsverity, linux-fsdevel, ebiggers,
	linux-ext4, linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong,
	david
In-Reply-To: <20260522-unbehagen-baumethode-erlesen-811a4065eeac@brauner>

On Fri, May 22, 2026 at 12:03:20PM +0200, Christian Brauner wrote:
> > I was expecting this to come through xfs tree too if Eric and Christian
> > agree.
> 
> You may take it through the xfs tree if there are no conflicts with
> vfs-7.2.iomap. If there are I want to add the iomap changes into
> vfs-7.2.iomap that you can pull in.

Merging the iomap bits through the iomap branch might make sense, given
that iomap usually tends to see quite a bit of activity.

^ permalink raw reply

* Re: [PATCH v10 00/22] fs-verity support for XFS with post EOF merkle tree
From: Christian Brauner @ 2026-05-22 10:03 UTC (permalink / raw)
  To: Carlos Maiolino
  Cc: Andrey Albershteyn, Christoph Hellwig, Andrey Albershteyn,
	linux-xfs, fsverity, linux-fsdevel, ebiggers, linux-ext4,
	linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong, david
In-Reply-To: <ag7uW7KQ5mVyCGEv@nidhogg.toxiclabs.cc>

On Thu, May 21, 2026 at 01:38:38PM +0200, Carlos Maiolino wrote:
> On Thu, May 21, 2026 at 11:42:13AM +0200, Andrey Albershteyn wrote:
> > On 2026-05-21 11:07:05, Christoph Hellwig wrote:
> > > On Wed, May 20, 2026 at 02:36:58PM +0200, Andrey Albershteyn wrote:
> > > > This series based on v7.1-rc4.
> > > 
> > > > How are we going to merge this?  It touches at least three subsystem trees
> > > (fsverity, vfs/iomap, xfs) so some coordination will be needed.
> > 
> > > As most of the patches are xfs, it probably makes sense to go
> > > through the xfs tree
> > 
> > Carlos, what do you think?
> 
> I was expecting this to come through xfs tree too if Eric and Christian
> agree.

You may take it through the xfs tree if there are no conflicts with
vfs-7.2.iomap. If there are I want to add the iomap changes into
vfs-7.2.iomap that you can pull in.

^ permalink raw reply

* [PATCH v5 v5 3/3] ext4: allow controlling mballoc stats through proc mb_stats
From: Baolin Liu @ 2026-05-22  3:59 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, corbet, skhan
  Cc: linux-ext4, linux-kernel, Baolin Liu, Andreas Dilger
In-Reply-To: <20260522035905.1145743-1-liubaolin12138@163.com>

From: Baolin Liu <liubaolin@kylinos.cn>

Make /proc/fs/ext4/<dev>/mb_stats writable. Writing 0 disables mballoc
statistics collection, writing 1 enables it, and writing -1 clears the
current statistics before enabling collection.
Update the ext4 documentation for proc mb_stats, document that the
sysfs mb_stats entry is deprecated, and point proc.rst to
Documentation/admin-guide/ext4.rst for ext4-specific /proc entries.

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>
Reviewed-by: Ted Tso <tytso@mit.edu>
Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
 Documentation/ABI/testing/sysfs-fs-ext4 |  3 +-
 Documentation/admin-guide/ext4.rst      |  9 ++++-
 Documentation/filesystems/proc.rst      | 13 +------
 fs/ext4/ext4.h                          |  1 +
 fs/ext4/mballoc.c                       | 31 +++++++++++++++-
 fs/ext4/sysfs.c                         | 48 +++++++++++++++++++++++--
 6 files changed, 88 insertions(+), 17 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index 2edd0a6672d3..7bf06c533343 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -5,7 +5,8 @@ Description:
 		 Controls whether the multiblock allocator should
 		 collect statistics, which are shown during the unmount.
 		 1 means to collect statistics, 0 means not to collect
-		 statistics
+		 statistics. This sysfs entry is deprecated, and users
+		 should prefer /proc/fs/ext4/<disk>/mb_stats.
 
 What:		/sys/fs/ext4/<disk>/mb_group_prealloc
 Date:		March 2008
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst
index ac0c709ea9e7..ca76e981b2aa 100644
--- a/Documentation/admin-guide/ext4.rst
+++ b/Documentation/admin-guide/ext4.rst
@@ -436,6 +436,12 @@ Files in /proc/fs/ext4/<devname>
   mb_groups
         details of multiblock allocator buddy cache of free blocks
 
+  mb_stats
+        reports runtime statistics from the multiblock allocator
+        (mballoc). Writing 0 disables statistics collection, writing
+        1 enables statistics collection, and writing -1 clears the
+        current statistics and enables statistics collection.
+
 /sys entries
 ============
 
@@ -493,7 +499,8 @@ Files in /sys/fs/ext4/<devname>:
   mb_stats
         Controls whether the multiblock allocator should collect statistics,
         which are shown during the unmount. 1 means to collect statistics, 0
-        means not to collect statistics.
+        means not to collect statistics. This sysfs entry is deprecated, and
+        users should prefer /proc/fs/ext4/<devname>/mb_stats.
 
   mb_stream_req
         Files which have fewer blocks than this tunable parameter will have
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index b0c0d1b45b99..dd487004b862 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1623,18 +1623,7 @@ softirq.
 1.8 Ext4 file system parameters
 -------------------------------
 
-Information about mounted ext4 file systems can be found in
-/proc/fs/ext4.  Each mounted filesystem will have a directory in
-/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
-/proc/fs/ext4/sda9 or /proc/fs/ext4/dm-0).   The files in each per-device
-directory are shown in Table 1-12, below.
-
-.. table:: Table 1-12: Files in /proc/fs/ext4/<devname>
-
- ==============  ==========================================================
- File            Content
- mb_groups       details of multiblock allocator buddy cache of free blocks
- ==============  ==========================================================
+See Documentation/admin-guide/ext4.rst for ext4-specific /proc entries.
 
 1.9 /proc/consoles
 -------------------
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index df96bcd53a59..483438d5742b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2995,6 +2995,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
 extern const struct seq_operations ext4_mb_seq_groups_ops;
 extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
 extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
+extern void ext4_mb_stats_clear(struct ext4_sb_info *sbi);
 extern int ext4_mb_init(struct super_block *);
 extern void ext4_mb_release(struct super_block *);
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index fed6d854877b..3f6454ada587 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3214,7 +3214,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
 		seq_puts(seq, "\tmb stats collection turned off.\n");
 		seq_puts(
 			seq,
-			"\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+			"\tTo enable, please write \"1\" to proc file mb_stats.\n");
 		return 0;
 	}
 	seq_printf(seq, "\tblocks_allocated: %u\n",
@@ -4723,6 +4723,35 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 		trace_ext4_mballoc_prealloc(ac);
 }
 
+void ext4_mb_stats_clear(struct ext4_sb_info *sbi)
+{
+	int i;
+
+	atomic_set(&sbi->s_bal_reqs, 0);
+	atomic_set(&sbi->s_bal_success, 0);
+	atomic_set(&sbi->s_bal_allocated, 0);
+	atomic_set(&sbi->s_bal_groups_scanned, 0);
+
+	for (i = 0; i < EXT4_MB_NUM_CRS; i++) {
+		atomic64_set(&sbi->s_bal_cX_hits[i], 0);
+		atomic64_set(&sbi->s_bal_cX_groups_considered[i], 0);
+		atomic_set(&sbi->s_bal_cX_ex_scanned[i], 0);
+		atomic64_set(&sbi->s_bal_cX_failed[i], 0);
+	}
+
+	atomic_set(&sbi->s_bal_ex_scanned, 0);
+	atomic_set(&sbi->s_bal_goals, 0);
+	atomic_set(&sbi->s_bal_stream_goals, 0);
+	atomic_set(&sbi->s_bal_len_goals, 0);
+	atomic_set(&sbi->s_bal_2orders, 0);
+	atomic_set(&sbi->s_bal_breaks, 0);
+	atomic_set(&sbi->s_mb_lost_chunks, 0);
+	atomic_set(&sbi->s_mb_buddies_generated, 0);
+	atomic64_set(&sbi->s_mb_generation_time, 0);
+	atomic_set(&sbi->s_mb_preallocated, 0);
+	atomic_set(&sbi->s_mb_discarded, 0);
+}
+
 /*
  * Called on failure; free up any blocks from the inode PA for this
  * context.  We don't need this for MB_GROUP_PA because we only change
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 47e06c32c6fb..76e6c346231a 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -53,6 +53,50 @@ typedef enum {
 static const char proc_dirname[] = "fs/ext4";
 static struct proc_dir_entry *ext4_proc_root;
 
+static int ext4_mb_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ext4_seq_mb_stats_show, pde_data(inode));
+}
+
+static ssize_t ext4_mb_stats_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
+{
+	struct super_block *sb = pde_data(file_inode(file));
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	int val;
+	int ret;
+
+	ret = kstrtoint_from_user(buf, count, 0, &val);
+	if (ret)
+		return ret;
+
+	switch (val) {
+	case -1:
+		WRITE_ONCE(sbi->s_mb_stats, 0);
+		ext4_mb_stats_clear(sbi);
+		WRITE_ONCE(sbi->s_mb_stats, 1);
+		break;
+	case 1:
+		WRITE_ONCE(sbi->s_mb_stats, 1);
+		break;
+	case 0:
+		WRITE_ONCE(sbi->s_mb_stats, 0);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+static const struct proc_ops ext4_mb_stats_proc_ops = {
+	.proc_open	= ext4_mb_stats_open,
+	.proc_read	= seq_read,
+	.proc_lseek	= seq_lseek,
+	.proc_release	= single_release,
+	.proc_write	= ext4_mb_stats_write,
+};
+
 struct ext4_attr {
 	struct attribute attr;
 	short attr_id;
@@ -666,8 +710,8 @@ int ext4_register_sysfs(struct super_block *sb)
 					ext4_fc_info_show, sb);
 		proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
 				&ext4_mb_seq_groups_ops, sb);
-		proc_create_single_data("mb_stats", 0444, sbi->s_proc,
-				ext4_seq_mb_stats_show, sb);
+		proc_create_data("mb_stats", 0644, sbi->s_proc,
+				 &ext4_mb_stats_proc_ops, sb);
 		proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc,
 				&ext4_mb_seq_structs_summary_ops, sb);
 	}
-- 
2.51.0


^ permalink raw reply related

* [PATCH v5 v5 0/3] ext4: improve mballoc statistics reporting and control
From: Baolin Liu @ 2026-05-22  3:59 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, corbet, skhan
  Cc: linux-ext4, linux-kernel, Baolin Liu

This series improves mballoc statistics reporting and control by adding
blocks_allocated to /proc/fs/ext4/<dev>/mb_stats, using
READ_ONCE()/WRITE_ONCE() for concurrent accesses to s_mb_stats, and 
allowing mballoc stats to be controlled through
/proc/fs/ext4/<dev>/mb_stats.

Changes in v5: 
 - Use READ_ONCE()/WRITE_ONCE() for s_mb_stats accesses instead of
   converting s_mb_stats to atomic_t, as suggested in review.
 - For proc mb_stats writes of -1, disable stats collection before
   clearing the counters, then re-enable it.

Baolin Liu (3):
  ext4: add blocks_allocated to mb_stats output
  ext4: use READ_ONCE/WRITE_ONCE for s_mb_stats
  ext4: allow controlling mballoc stats through proc mb_stats

 Documentation/ABI/testing/sysfs-fs-ext4 |  3 +-
 Documentation/admin-guide/ext4.rst      |  9 ++-
 Documentation/filesystems/proc.rst      | 13 +----
 fs/ext4/ext4.h                          |  1 +
 fs/ext4/mballoc.c                       | 57 ++++++++++++++-----
 fs/ext4/sysfs.c                         | 73 ++++++++++++++++++++++++-
 6 files changed, 126 insertions(+), 30 deletions(-)

-- 
2.51.0


^ permalink raw reply

* [PATCH v5 v5 2/3] ext4: use READ_ONCE/WRITE_ONCE for s_mb_stats
From: Baolin Liu @ 2026-05-22  3:59 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, corbet, skhan
  Cc: linux-ext4, linux-kernel, Baolin Liu
In-Reply-To: <20260522035905.1145743-1-liubaolin12138@163.com>

From: Baolin Liu <liubaolin@kylinos.cn>

Use READ_ONCE()/WRITE_ONCE() for concurrent accesses to
s_mb_stats.

Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
 fs/ext4/mballoc.c | 24 ++++++++++++------------
 fs/ext4/sysfs.c   | 25 ++++++++++++++++++++++++-
 2 files changed, 36 insertions(+), 13 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d36b0f7b5d7d..fed6d854877b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -924,7 +924,7 @@ static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
 	xa_for_each_range(xa, group, grp, start, end - 1) {
 		int err;
 
-		if (sbi->s_mb_stats)
+		if (READ_ONCE(sbi->s_mb_stats))
 			atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
 
 		err = ext4_mb_scan_group(ac, grp->bb_group);
@@ -980,7 +980,7 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
 		goto wrap_around;
 	}
 
-	if (sbi->s_mb_stats)
+	if (READ_ONCE(sbi->s_mb_stats))
 		atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
 
 	/* Increment cr and search again if no group is found */
@@ -1031,7 +1031,7 @@ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
 		goto wrap_around;
 	}
 
-	if (sbi->s_mb_stats)
+	if (READ_ONCE(sbi->s_mb_stats))
 		atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
 	/*
 	 * CR_BEST_AVAIL_LEN works based on the concept that we have
@@ -1135,7 +1135,7 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
 
 	/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
 	ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
-	if (sbi->s_mb_stats)
+	if (READ_ONCE(sbi->s_mb_stats))
 		atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
 	ac->ac_criteria = CR_GOAL_LEN_SLOW;
 
@@ -1184,7 +1184,7 @@ static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac,
 		ac->ac_criteria++;
 
 	/* Processed all groups and haven't found blocks */
-	if (sbi->s_mb_stats && i == ngroups)
+	if (READ_ONCE(sbi->s_mb_stats) && i == ngroups)
 		atomic64_inc(&sbi->s_bal_cX_failed[cr]);
 
 	return 0;
@@ -2541,7 +2541,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 
 		BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
 
-		if (EXT4_SB(sb)->s_mb_stats)
+		if (READ_ONCE(EXT4_SB(sb)->s_mb_stats))
 			atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
 
 		break;
@@ -2786,7 +2786,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
 
 	if (!grp)
 		return -EFSCORRUPTED;
-	if (sbi->s_mb_stats)
+	if (READ_ONCE(sbi->s_mb_stats))
 		atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
 	if (should_lock) {
 		ext4_lock_group(sb, group);
@@ -3097,7 +3097,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		}
 	}
 
-	if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
+	if (READ_ONCE(sbi->s_mb_stats) && ac->ac_status == AC_STATUS_FOUND) {
 		atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
 		if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
 		    ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
@@ -3210,7 +3210,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	seq_puts(seq, "mballoc:\n");
-	if (!sbi->s_mb_stats) {
+	if (!READ_ONCE(sbi->s_mb_stats)) {
 		seq_puts(seq, "\tmb stats collection turned off.\n");
 		seq_puts(
 			seq,
@@ -3787,7 +3787,7 @@ int ext4_mb_init(struct super_block *sb)
 
 	sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
 	sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
-	sbi->s_mb_stats = MB_DEFAULT_STATS;
+	WRITE_ONCE(sbi->s_mb_stats, MB_DEFAULT_STATS);
 	sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
 	sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
 	sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
@@ -3929,7 +3929,7 @@ void ext4_mb_release(struct super_block *sb)
 	kfree(sbi->s_mb_offsets);
 	kfree(sbi->s_mb_maxs);
 	iput(sbi->s_buddy_cache);
-	if (sbi->s_mb_stats) {
+	if (READ_ONCE(sbi->s_mb_stats)) {
 		ext4_msg(sb, KERN_INFO,
 		       "mballoc: %u blocks %u reqs (%u success)",
 				atomic_read(&sbi->s_bal_allocated),
@@ -4694,7 +4694,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 
-	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
+	if (READ_ONCE(sbi->s_mb_stats) && ac->ac_g_ex.fe_len >= 1) {
 		atomic_inc(&sbi->s_bal_reqs);
 		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
 		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index afe12bcc1603..47e06c32c6fb 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -41,6 +41,7 @@ typedef enum {
 	attr_pointer_atomic,
 	attr_journal_task,
 	attr_err_report_sec,
+	attr_mb_stats,
 } attr_id_t;
 
 typedef enum {
@@ -241,6 +242,7 @@ EXT4_ATTR_FUNC(session_write_kbytes, 0444);
 EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
 EXT4_ATTR_FUNC(reserved_clusters, 0644);
 EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
+EXT4_ATTR_FUNC(mb_stats, 0644);
 
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
 		 ext4_sb_info, s_inode_readahead_blks);
@@ -250,7 +252,6 @@ EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
 		 ext4_sb_info, s_mb_best_avail_max_trim_order);
 EXT4_ATTR_OFFSET(err_report_sec, 0644, err_report_sec, ext4_sb_info, s_err_report_sec);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
-EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
 EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
 EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
@@ -451,6 +452,24 @@ static ssize_t ext4_generic_attr_show(struct ext4_attr *a,
 	return 0;
 }
 
+static ssize_t mb_stats_show(struct ext4_sb_info *sbi, char *buf)
+{
+	return sysfs_emit(buf, "%u\n", READ_ONCE(sbi->s_mb_stats));
+}
+
+static ssize_t mb_stats_store(struct ext4_sb_info *sbi,
+			      const char *buf, size_t len)
+{
+	unsigned int t;
+	int ret;
+
+	ret = kstrtouint(skip_spaces(buf), 0, &t);
+	if (ret)
+		return ret;
+	WRITE_ONCE(sbi->s_mb_stats, t);
+	return len;
+}
+
 static ssize_t ext4_attr_show(struct kobject *kobj,
 			      struct attribute *attr, char *buf)
 {
@@ -475,6 +494,8 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
 		return sysfs_emit(buf, "%llu\n",
 				(unsigned long long)
 			percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
+	case attr_mb_stats:
+		return mb_stats_show(sbi, buf);
 	case attr_feature:
 		return sysfs_emit(buf, "supported\n");
 	case attr_first_error_time:
@@ -559,6 +580,8 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
 		return inode_readahead_blks_store(sbi, buf, len);
 	case attr_trigger_test_error:
 		return trigger_test_error(sbi, buf, len);
+	case attr_mb_stats:
+		return mb_stats_store(sbi, buf, len);
 	case attr_err_report_sec:
 		return err_report_sec_store(sbi, buf, len);
 	default:
-- 
2.51.0


^ permalink raw reply related

* [PATCH v5 v5 1/3] ext4: add blocks_allocated to mb_stats output
From: Baolin Liu @ 2026-05-22  3:59 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, corbet, skhan
  Cc: linux-ext4, linux-kernel, Baolin Liu, Andreas Dilger
In-Reply-To: <20260522035905.1145743-1-liubaolin12138@163.com>

From: Baolin Liu <liubaolin@kylinos.cn>

Add blocks_allocated to /proc/fs/ext4/<dev>/mb_stats so that the
reported statistics match the mballoc summary printed at unmount time.

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
 fs/ext4/mballoc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bb58eafb87bc..d36b0f7b5d7d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3217,6 +3217,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
 			"\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
 		return 0;
 	}
+	seq_printf(seq, "\tblocks_allocated: %u\n",
+		   atomic_read(&sbi->s_bal_allocated));
 	seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
 	seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
 
-- 
2.51.0


^ permalink raw reply related

* [PATCH] jbd2: update outdated comment for jbd2_journal_try_to_free_buffers()
From: Zhang Yi @ 2026-05-22  3:05 UTC (permalink / raw)
  To: linux-ext4
  Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, libaokun,
	jack, ojaswin, ritesh.list, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai

From: Zhang Yi <yi.zhang@huawei.com>

jbd2_journal_try_to_free_buffers() currently only tries to remove
checkpointed data buffers from the checkpoint list for data=journal
mode, and bails out if any buffer is still attached to a transaction.
For data=ordered and writeback modes, data buffers never have
journal_heads, so the function degenerates to a plain
try_to_free_buffers() call.

Besides, the release of metadata buffers has been delegated to the jbd2
journal shrinker in commit 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to
release checkpointed buffers"). jbd2_journal_try_to_free_buffers() is
not used for handling metadata buffers now.

However, the comment above the function still references
jbd2_journal_dirty_data(), __jbd2_journal_unfile_buffer(), t_datalist,
BKL, and BUF_CLEAN, all of which were removed in commit 87c89c232c8f
("jbd2: Remove data=ordered mode support using jbd buffer heads").

Replace it with a description of what the function actually does now.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/jbd2/transaction.c | 39 ++++++++++++---------------------------
 1 file changed, 12 insertions(+), 27 deletions(-)

diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4885903bbd10..239bcf88ed1c 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -2139,38 +2139,23 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
 }
 
 /**
- * jbd2_journal_try_to_free_buffers() - try to free page buffers.
+ * jbd2_journal_try_to_free_buffers() - try to free folio buffers.
  * @journal: journal for operation
  * @folio: Folio to detach data from.
  *
- * For all the buffers on this page,
- * if they are fully written out ordered data, move them onto BUF_CLEAN
- * so try_to_free_buffers() can reap them.
+ * For each buffer_head on @folio, if the buffer has a journal_head but
+ * is not attached to a running or committing transaction, try to remove
+ * it from the checkpoint list.  This is needed for data=journal mode
+ * where data buffers are journaled: once they are checkpointed, the
+ * journal_head can be detached and the buffer freed.  If any buffer is
+ * still attached to a transaction, the folio cannot be released and we
+ * bail out.  Otherwise we call try_to_free_buffers() to detach all
+ * buffer_heads from the folio.
  *
- * This function returns non-zero if we wish try_to_free_buffers()
- * to be called. We do this if the page is releasable by try_to_free_buffers().
- * We also do it if the page has locked or dirty buffers and the caller wants
- * us to perform sync or async writeout.
+ * For data=ordered and writeback modes, data buffers never have
+ * journal_heads, so this degenerates to a plain try_to_free_buffers().
  *
- * This complicates JBD locking somewhat.  We aren't protected by the
- * BKL here.  We wish to remove the buffer from its committing or
- * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
- *
- * This may *change* the value of transaction_t->t_datalist, so anyone
- * who looks at t_datalist needs to lock against this function.
- *
- * Even worse, someone may be doing a jbd2_journal_dirty_data on this
- * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
- * will come out of the lock with the buffer dirty, which makes it
- * ineligible for release here.
- *
- * Who else is affected by this?  hmm...  Really the only contender
- * is do_get_write_access() - it could be looking at the buffer while
- * journal_try_to_free_buffer() is changing its state.  But that
- * cannot happen because we never reallocate freed data as metadata
- * while the data is part of a transaction.  Yes?
- *
- * Return false on failure, true on success
+ * Return: true if the folio's buffers were freed, false otherwise
  */
 bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
 {
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH v10 00/22] fs-verity support for XFS with post EOF merkle tree
From: Eric Biggers @ 2026-05-22  2:06 UTC (permalink / raw)
  To: Carlos Maiolino
  Cc: Andrey Albershteyn, Christoph Hellwig, Andrey Albershteyn,
	linux-xfs, fsverity, linux-fsdevel, linux-ext4, linux-f2fs-devel,
	linux-btrfs, linux-unionfs, djwong, david, brauner, amir73il,
	miklos
In-Reply-To: <ag7vc9gaHo3XtsBJ@nidhogg.toxiclabs.cc>

On Thu, May 21, 2026 at 01:42:42PM +0200, Carlos Maiolino wrote:
> On Thu, May 21, 2026 at 01:38:45PM +0200, Carlos Maiolino wrote:
> > On Thu, May 21, 2026 at 11:42:13AM +0200, Andrey Albershteyn wrote:
> > > On 2026-05-21 11:07:05, Christoph Hellwig wrote:
> > > > On Wed, May 20, 2026 at 02:36:58PM +0200, Andrey Albershteyn wrote:
> > > > > This series based on v7.1-rc4.
> > > > 
> > > > How are we going to merge this?  It touches at three subsystem trees
> > > > (fsverity, vfs/iomap, xfs) so some coordination will be needed.
> > > 
> > > As most of the patches are xfs, it probably makes sense to go
> > > through the xfs tree
> > > 
> > > Carlos, what do you think?
> > 
> > I was expecting this to come through xfs tree too if Eric and Christian
> > agree.
> > FWIW I'm adding Christian to the Cc
> 
> Whoops... Also adding Amir and Miklos to the Cc list due to the overlayfs
> patch:

Please go ahead and take it through the XFS tree for 7.2 if you think
it's ready.

- Eric

^ permalink raw reply

* Re: [PATCH v5 09/10] fstests: add pre_clone_tune_uuid() helper
From: Anand Jain @ 2026-05-21 13:10 UTC (permalink / raw)
  To: fstests, zlang, Zorro Lang
  Cc: linux-btrfs, linux-ext4, linux-xfs, linux-f2fs, amir73il, hch,
	Anand Jain
In-Reply-To: <0d9f455d20145103faaf4e45262ee44418cbaca1.1779367627.git.asj@kernel.org>



On 21/5/26 20:54, Anand Jain wrote:
> pre_clone_tune_uuid() changes the UUID of the golden filesystem before it
> is cloned.
> 
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  common/rc | 20 ++++++++++++++++++++
>  1 file changed, 20 insertions(+)
> 
> diff --git a/common/rc b/common/rc
> index 7ae9877918c8..1b3a32d9ea9b 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -1517,6 +1517,26 @@ _scratch_resvblks()
>  	esac
>  }
>  
> +pre_clone_tune_uuid()


I missed the "_" prefix for pre_clone_tune_uuid() to match
the common/rc style.

Zorro,

If there are no other rerolls, can you please fix it at merge
or should I resend?

Thanks


> +{
> +	local temp_mnt=$TEST_DIR/${seq}_mnt
> +	local dev=$1
> +
> +	case $FSTYP in
> +	xfs)
> +		_require_command "$XFS_ADMIN_PROG" "xfs_admin"
> +		$XFS_ADMIN_PROG -U generate $dev >> $seqres.full
> +		;;
> +	btrfs)
> +		_require_command "$BTRFS_TUNE_PROG" "btrfstune"
> +		$BTRFS_TUNE_PROG -m $dev
> +		;;
> +	*)
> +		_notrun "Require filesystem with metadata_uuid feature"
> +		;;
> +	esac
> +}
> +
>  _loop_image_create_clone()
>  {
>  	local -n _ret=$1


^ permalink raw reply

* [PATCH 2/2] ext4: get ext4_group_desc in ext4_mb_prefetch only when necessary
From: Bohdan Trach @ 2026-05-21 12:59 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi
  Cc: mchehab+huawei, lilith.oberhauser, Bohdan Trach, linux-ext4,
	linux-kernel
In-Reply-To: <20260521125931.16474-1-bohdan.trach@huaweicloud.com>

Getting ext4_group_desc structure can contribute to the cost of
ext4_mb_prefetch() without any need, as most groups fail the
!EXT4_MB_GRP_TEST_AND_SET_READ check.

Optimize ext4_mb_prefetch by getting the group description only when
necessary.

The result is a further increase in performance of the fallocate() system
call path that triggers ext4_mb_prefetch() via a linear group scan.

Signed-off-by: Bohdan Trach <bohdan.trach@huaweicloud.com>
---
 fs/ext4/mballoc.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 25e3d9204233..e60499fb5a14 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2861,8 +2861,6 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
 
 	blk_start_plug(&plug);
 	while (nr-- > 0) {
-		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
-								  NULL);
 		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 
 		/*
@@ -2872,14 +2870,17 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
 		 * prefetch once, so we avoid getblk() call, which can
 		 * be expensive.
 		 */
-		if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
-		    EXT4_MB_GRP_NEED_INIT(grp) &&
-		    ext4_free_group_clusters(sb, gdp) > 0 ) {
-			bh = ext4_read_block_bitmap_nowait(sb, group, true);
-			if (!IS_ERR_OR_NULL(bh)) {
-				if (!buffer_uptodate(bh) && cnt)
-					(*cnt)++;
-				brelse(bh);
+		if (group < ngroups && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+		    EXT4_MB_GRP_NEED_INIT(grp)) {
+			struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+			if (gdp && ext4_free_group_clusters(sb, gdp) > 0) {
+				bh = ext4_read_block_bitmap_nowait(sb, group, true);
+				if (!IS_ERR_OR_NULL(bh)) {
+					if (!buffer_uptodate(bh) && cnt)
+						(*cnt)++;
+					brelse(bh);
+				}
 			}
 		}
 		if (++group >= ngroups)
-- 
2.43.0


^ permalink raw reply related

* [PATCH 1/2] ext4: avoid RMW atomic in EXT4_MB_GRP_TEST_AND_SET_READ
From: Bohdan Trach @ 2026-05-21 12:59 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi
  Cc: mchehab+huawei, lilith.oberhauser, Bohdan Trach, linux-ext4,
	linux-kernel
In-Reply-To: <20260521125931.16474-1-bohdan.trach@huaweicloud.com>

EXT4_MB_GRP_TEST_AND_SET_READ uses the test_and_set_bit() function, which
issues an atomic write. This can cause high overhead due to cache
contention when multiple threads iterate over groups in a tight loop,
as is the case for ext4_mb_prefetch(). We have seen this to be a
problem on Kunpeng 920b CPUs, which use a single ARM LSE instruction
for this purpose.

This change significantly reduces costs of fallocate() operations which
trigger linear group scans on large multicore machines where
test_and_set_bit issues an atomic write operation unconditionally.

Signed-off-by: Bohdan Trach <bohdan.trach@huaweicloud.com>
---
 fs/ext4/ext4.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56b82d4a15d7..0713207811a6 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3551,7 +3551,17 @@ struct ext4_group_info {
 #define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
 	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
 #define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
-	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
+	(ext4_mb_grp_test_and_set_read((grp)))
+
+static inline int ext4_mb_grp_test_and_set_read(struct ext4_group_info *grp)
+{
+	int r = test_bit_acquire(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state);
+
+	if (!r)
+		return test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state);
+	else
+		return r;
+}
 
 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2
-- 
2.43.0


^ permalink raw reply related

* [PATCH 0/2] ext4: optimize ext4_mb_prefetch
From: Bohdan Trach @ 2026-05-21 12:59 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi
  Cc: mchehab+huawei, lilith.oberhauser, Bohdan Trach, linux-ext4,
	linux-kernel

Dear Ted,

We have been profiling scalability of some rocksdb-related workloads on
ext4 file system and have found a case where significant time ends up
being spent in ext4_mb_prefetch() function. This happens because
ext4_mb_scan_groups_linear() path is triggered in ext4_mb_scan_groups().
We have noticed that on larger, filled disks, this function can take
lots of time.

We have added a test for this issue to our fork of will-it-scale [1],
which you can use to reproduce the issue (the actual workload does a few
writes after fallocate; they have been dropped to better illustrate the
issue).
[1] https://github.com/open-s4c/will-it-scale/blob/master/tests/fallocate3.c

In this series, we optimize this code path:
Patch 1: change EXT4_MB_GRP_TEST_AND_SET_READ() to reduce the rate of
         atomic RMW operation via test_and_set_bit, which has quite
         high cost on large multicore CPUs, especially under
         contention for the group's flag cache lines.
         As this bit is only ever set, but never unset, it should be
         possible to reduce the cost of this check by calling
         test_bit[_acquire]() first.
Patch 2: restructure the ext4_mb_prefetch loop operations such that
         ext4_group_desc is fetched only after the checks based on
         ext4_group_info succeed.

This series has been tested with
        kvm-xfstests -c ext4/all -g auto
and did not introduce any new issues.

Performance test: we have used the will-it-scale drop-in test
provided above, and used three machines for running it:
- Kunpeng 920 (arm64, 96 CPUs * 1 socket, 128G RAM, SAS HDD: Seagate
  Exos 10E2400 1.2TB)
- Kunpeng 920b (arm64, 80 CPUs * 2 sockets, 502G RAM, SATA SSD: Huawei
  ES3000 V6 0.96TB)
- AMD 9654 (x86_64, 96 CPUs * 2 sockets, 1.5T RAM, NVME SSD: Samsung SSD
  970 EVO Plus 1TB)
We have performed tests with existing file systems, as well as more limited
tests with fixed-size file systems.

Benchmark on an existing file system for Kunpeng 920 (842G FS, 31% space
used) with the patch based on kernel 7.0.6:
| thr. | base | patched |      improv. |
|      | perf |    perf |              |
|------|------|---------|--------------|
|    1 | 1286 |    1608 |  +25.0388802 |
|    2 | 1673 |    1680 |   +0.4184100 |
|    4 | 1698 |    1712 |   +0.8244994 |
|    8 | 1721 |    1730 |   +0.5229518 |
|   16 | 1739 |    2313 |  +33.0074756 |
|   32 | 1742 |    3571 | +104.9942595 |
|   64 | 1735 |    3427 |  +97.5216138 |
|   96 | 1688 |    1814 |   +7.4644550 |

Benchmark on an existing file system for Kunpeng 920b (802G ext4 FS, 68%
space used) with the patch based on kernel 6.6:
| thr. | base | patched |  improv. |
|      | perf |    perf |          |
|------|------|---------|----------|
|    1 | 1613 |   1625  |   +0.74% |
|    2 | 1620 |   2603  |  +60.67% |
|    4 | 1624 |   4894  | +201.35% |
|    8 | 2505 |   8328  | +232.45% |
|   16 | 4736 |  11632  | +145.60% |
|   32 | 7784 |  13124  |  +68.60% |
|   64 | 8094 |   8636  |   +6.69% |
|  128 | 6914 |   7890  |  +14.11% |

Benchmark on an existing file system for AMD 9654 (15T FS, 6% space
used), kernel 7.1-rc3. This shows the performance impact on a mostly
free file system.
| thr. |  base | patched |    improv. |
|      |  perf |    perf |            |
|------|-------|---------|------------|
|    1 | 30901 |   31191 | +0.9384810 |
|    2 | 50874 |   50504 | -0.7272870 |
|    4 | 66068 |   64108 | -2.9666404 |
|    8 | 63963 |   61927 | -3.1830902 |
|   16 | 47809 |   47044 | -1.6001171 |
|   32 | 42441 |   42326 | -0.2709644 |
|   64 | 39773 |   39929 | +0.3922259 |
|  128 | 37065 |   36413 | -1.7590719 |

We have also performed the test with kernel 6.6 on both Kunpeng 920b and
AMD 9654 with a much smaller FS image (133G) to have a more controlled
benchmarking environment, although this reduces the measured benefits as
well compared to a bigger FS with more groups to iterate over:

AMD 9654 performance:
| thr. |  base | patched |  improv. |
|      |  perf |    perf |          |
|------|----------------------------|
| 25% full file system:             |
|------|----------------------------|
|    1 |  5964 |    6778 |  +13.64% |
|    2 | 11811 |   13415 |  +13.58% |
|    4 | 20111 |   23570 |  +17.19% |
|    8 | 30083 |   36296 |  +20.65% |
|   16 | 27781 |   38302 |  +37.87% |
|   32 | 28325 |   36930 |  +30.37% |
|   64 | 26044 |   29952 |  +15.00% |
|  128 | 19969 |   20882 |   +4.57% |
|------|----------------------------|
| 50% full file system:             |
|------|----------------------------|
|    1 |  4093 |    7380 |  +80.30% |
|    2 | 13168 |   13906 |   +5.60% |
|    4 | 21440 |   22623 |   +5.51% |
|    8 | 30523 |   32360 |   +6.01% |
|   16 | 27502 |   34017 |  +23.68% |
|   32 | 27189 |   32480 |  +19.46% |
|   64 | 24146 |   26463 |   +9.59% |
|  128 | 18386 |   18631 |   +1.33% |
|------|----------------------------|
| 75% full file system:             |
|------|----------------------------|
|    1 |  5738 |    7208 |  +25.61% |
|    2 | 13869 |   15309 |  +10.38% |
|    4 | 21803 |   23447 |   +7.54% |
|    8 | 29004 |   30766 |   +6.07% |
|   16 | 25542 |   30584 |  +19.74% |
|   32 | 24242 |   28631 |  +18.10% |
|   64 | 20631 |   22833 |  +10.67% |
|  128 | 14603 |   15086 |   +3.30% |

Kunpeng 920b performance:
| thr. |  base | patched | improv. |
|      |  perf |    perf |         |
|------|---------------------------|
| 25% full file system:            |
|------|---------------------------|
|    1 |  5398 |    7025 | +30.14% |
|    2 |  7451 |   12299 | +65.06% |
|    4 | 12574 |   20899 | +66.20% |
|    8 | 18645 |   27694 | +48.53% |
|   16 | 25088 |   31739 | +26.51% |
|   32 | 26699 |   27632 |  +3.49% |
|   64 | 14943 |   19547 | +30.81% |
|  128 | 13047 |   14544 | +11.47% |
|------|---------------------------|
| 50% full file system:            |
|------|---------------------------|
|    1 |  4881 |    6618 | +35.58% |
|    2 |  6544 |   11660 | +78.17% |
|    4 | 11156 |   19506 | +74.84% |
|    8 | 16842 |   25835 | +53.39% |
|   16 | 23305 |   29260 | +25.55% |
|   32 | 24622 |   25303 |  +2.76% |
|   64 | 13814 |   17707 | +28.18% |
|  128 | 12061 |   13180 |  +9.27% |
|------|---------------------------|
| 75% full file system:            |
|------|---------------------------|
|    1 |  7037 |   10580 | +50.34% |
|    2 |  9216 |    9075 |  -1.52% |
|    4 | 14534 |   22076 | +51.89% |
|    8 | 19341 |   25936 | +34.09% |
|   16 | 23592 |   27409 | +16.17% |
|   32 | 23680 |   23078 |  -2.54% |
|   64 | 12836 |   15902 | +23.88% |
|  128 |  9614 |   10341 |  +7.56% |

Thanks,
Bohdan.

Bohdan Trach (2):
  ext4: avoid RMW atomic in EXT4_MB_GRP_TEST_AND_SET_READ
  ext4: get ext4_group_desc in ext4_mb_prefetch only when necessary

 fs/ext4/ext4.h    | 12 +++++++++++-
 fs/ext4/mballoc.c | 21 +++++++++++----------
 2 files changed, 22 insertions(+), 11 deletions(-)

-- 
2.43.0


^ permalink raw reply

* [PATCH v5 10/10] fstests: test UUID consistency for clones with metadata_uuid
From: Anand Jain @ 2026-05-21 12:55 UTC (permalink / raw)
  To: fstests
  Cc: linux-btrfs, linux-ext4, linux-xfs, linux-f2fs, amir73il, zlang,
	hch
In-Reply-To: <cover.1779367627.git.asj@kernel.org>

Btrfs and xfs use the metadata_uuid superblock feature to change the
on-disk UUID without rewriting every block header. This patch adds a
sanity check to ensure UUID consistency when a filesystem with
metadata_uuid enabled is cloned.

Signed-off-by: Anand Jain <asj@kernel.org>
---
 tests/generic/806     | 78 +++++++++++++++++++++++++++++++++++++++++++
 tests/generic/806.out | 19 +++++++++++
 2 files changed, 97 insertions(+)
 create mode 100644 tests/generic/806
 create mode 100644 tests/generic/806.out

diff --git a/tests/generic/806 b/tests/generic/806
new file mode 100644
index 000000000000..222d138976d3
--- /dev/null
+++ b/tests/generic/806
@@ -0,0 +1,78 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2026 Anand Jain <asj@kernel.org>.  All Rights Reserved.
+#
+# FS QA Test 806
+#
+# Verify that the cloned filesystem UUID remains consistent, even when the
+# `metadata_uuid` feature is enabled.
+#
+
+. ./common/preamble
+. ./common/filter
+
+_begin_fstest auto quick mount clone
+
+_require_test
+_require_block_device $TEST_DEV
+_require_loop
+
+_cleanup()
+{
+	cd /
+	rm -r -f $tmp.*
+	umount $mnt1 $mnt2 2>/dev/null
+	_loop_image_destroy "${devs[@]}" 2> /dev/null
+}
+
+filter_pool()
+{
+	sed -e "s|${devs[0]}|DEV1|g" -e "s|${mnt1}|MNT1|g" \
+	    -e "s|${devs[1]}|DEV2|g" -e "s|${mnt2}|MNT2|g" | _filter_spaces
+}
+
+print_info()
+{
+	local mntpt=$1
+	local tgt=$(findmnt -no SOURCE $mntpt)
+	local fsuuid=$(blkid -s UUID -o value $tgt)
+
+	echo "mntpt=$mntpt tgt=$tgt fsuuid=$fsuuid" >> $seqres.full
+	echo
+	findmnt -o SOURCE,TARGET,UUID "$tgt" | tail -n +2 | \
+				sed -e "s/${fsuuid}/FSUUID/g" | filter_pool
+	awk -v dev="$tgt" '$1 == dev { print $1, $2 }' /proc/self/mounts | \
+								filter_pool
+	df --all --output=source,target "$tgt" | tail -n +2 | filter_pool
+}
+
+devs=()
+_loop_image_create_clone devs pre_clone_tune_uuid
+mkdir -p $TEST_DIR/$seq
+mnt1=$TEST_DIR/$seq/mnt1
+mnt2=$TEST_DIR/$seq/mnt2
+mkdir -p $mnt1
+mkdir -p $mnt2
+
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
+						_fail "Failed to mount dev1"
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
+						_fail "Failed to mount dev2"
+
+print_info $mnt1
+print_info $mnt2
+
+echo
+echo "**** mount cycle ****"
+_unmount $mnt1
+_unmount $mnt2
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
+						_fail "Failed to mount dev2"
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
+						_fail "Failed to mount dev1"
+
+print_info $mnt1
+print_info $mnt2
+
+status=0
+exit
diff --git a/tests/generic/806.out b/tests/generic/806.out
new file mode 100644
index 000000000000..7315e791ba51
--- /dev/null
+++ b/tests/generic/806.out
@@ -0,0 +1,19 @@
+QA output created by 806
+
+DEV1 MNT1 FSUUID
+DEV1 MNT1
+DEV1 MNT1
+
+DEV2 MNT2 FSUUID
+DEV2 MNT2
+DEV2 MNT2
+
+**** mount cycle ****
+
+DEV1 MNT1 FSUUID
+DEV1 MNT1
+DEV1 MNT1
+
+DEV2 MNT2 FSUUID
+DEV2 MNT2
+DEV2 MNT2
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 09/10] fstests: add pre_clone_tune_uuid() helper
From: Anand Jain @ 2026-05-21 12:54 UTC (permalink / raw)
  To: fstests
  Cc: linux-btrfs, linux-ext4, linux-xfs, linux-f2fs, amir73il, zlang,
	hch
In-Reply-To: <cover.1779367627.git.asj@kernel.org>

pre_clone_tune_uuid() changes the UUID of the golden filesystem before it
is cloned.

Signed-off-by: Anand Jain <asj@kernel.org>
---
 common/rc | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/common/rc b/common/rc
index 7ae9877918c8..1b3a32d9ea9b 100644
--- a/common/rc
+++ b/common/rc
@@ -1517,6 +1517,26 @@ _scratch_resvblks()
 	esac
 }
 
+pre_clone_tune_uuid()
+{
+	local temp_mnt=$TEST_DIR/${seq}_mnt
+	local dev=$1
+
+	case $FSTYP in
+	xfs)
+		_require_command "$XFS_ADMIN_PROG" "xfs_admin"
+		$XFS_ADMIN_PROG -U generate $dev >> $seqres.full
+		;;
+	btrfs)
+		_require_command "$BTRFS_TUNE_PROG" "btrfstune"
+		$BTRFS_TUNE_PROG -m $dev
+		;;
+	*)
+		_notrun "Require filesystem with metadata_uuid feature"
+		;;
+	esac
+}
+
 _loop_image_create_clone()
 {
 	local -n _ret=$1
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 08/10] fstests: verify exportfs file handles on cloned filesystems
From: Anand Jain @ 2026-05-21 12:54 UTC (permalink / raw)
  To: fstests
  Cc: linux-btrfs, linux-ext4, linux-xfs, linux-f2fs, amir73il, zlang,
	hch
In-Reply-To: <cover.1779367627.git.asj@kernel.org>

Ensure that exportfs can correctly decode file handles on a cloned
filesystem across a mount cycle, i.e. that file handles generated on a
cloned device remain valid after a mount cycle.

Signed-off-by: Anand Jain <asj@kernel.org>
---
 tests/generic/805     | 73 +++++++++++++++++++++++++++++++++++++++++++
 tests/generic/805.out |  2 ++
 2 files changed, 75 insertions(+)
 create mode 100644 tests/generic/805
 create mode 100644 tests/generic/805.out

diff --git a/tests/generic/805 b/tests/generic/805
new file mode 100644
index 000000000000..98e7172e141f
--- /dev/null
+++ b/tests/generic/805
@@ -0,0 +1,73 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2026 Anand Jain <asj@kernel.org>.  All Rights Reserved.
+#
+# FS QA Test No. 805
+
+. ./common/preamble
+
+_begin_fstest auto quick exportfs clone
+
+_require_test
+_require_block_device $TEST_DEV
+_require_exportfs
+_require_loop
+_require_test_program "open_by_handle"
+
+_cleanup()
+{
+	cd /
+	rm -r -f $tmp.*
+	_unmount $mnt1 2>/dev/null
+	_unmount $mnt2 2>/dev/null
+	_loop_image_destroy "${devs[@]}" 2> /dev/null
+}
+
+# Create test dir and test files, encode file handles and store to tmp file
+create_test_files()
+{
+	rm -rf $testdir
+	mkdir -p $testdir
+	$here/src/open_by_handle -cwp -o $tmp.handles_file $testdir $NUMFILES
+}
+
+# Decode file handles loaded from tmp file
+test_file_handles()
+{
+	local opt=$1
+	local when=$2
+
+	echo test_file_handles after $when
+	$here/src/open_by_handle $opt -i $tmp.handles_file $mnt2 $NUMFILES
+}
+
+devs=()
+_loop_image_create_clone devs
+mkdir -p $TEST_DIR/$seq
+mnt1=$TEST_DIR/$seq/mnt1
+mnt2=$TEST_DIR/$seq/mnt2
+mkdir -p $mnt1
+mkdir -p $mnt2
+
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
+						_fail "Failed to mount dev1"
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
+						_fail "Failed to mount dev2"
+
+NUMFILES=1
+testdir=$mnt2/testdir
+
+# Decode file handles of files/dir after cycle mount
+create_test_files
+
+_unmount $mnt1
+_unmount $mnt2
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
+						_fail "Failed to mount dev2"
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
+						_fail "Failed to mount dev1"
+
+test_file_handles -rp "cycle mount"
+
+status=0
+exit
diff --git a/tests/generic/805.out b/tests/generic/805.out
new file mode 100644
index 000000000000..29b11ec77ffb
--- /dev/null
+++ b/tests/generic/805.out
@@ -0,0 +1,2 @@
+QA output created by 805
+test_file_handles after cycle mount
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 07/10] fstests: verify IMA isolation on cloned filesystems
From: Anand Jain @ 2026-05-21 12:54 UTC (permalink / raw)
  To: fstests
  Cc: linux-btrfs, linux-ext4, linux-xfs, linux-f2fs, amir73il, zlang,
	hch
In-Reply-To: <cover.1779367627.git.asj@kernel.org>

Add testcase to verify IMA measurement isolation when multiple devices
share the same FSUUID.

Signed-off-by: Anand Jain <asj@kernel.org>
---
 tests/generic/804     | 103 ++++++++++++++++++++++++++++++++++++++++++
 tests/generic/804.out |  10 ++++
 2 files changed, 113 insertions(+)
 create mode 100644 tests/generic/804
 create mode 100644 tests/generic/804.out

diff --git a/tests/generic/804 b/tests/generic/804
new file mode 100644
index 000000000000..9f3459015422
--- /dev/null
+++ b/tests/generic/804
@@ -0,0 +1,103 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2026 Anand Jain <asj@kernel.org>.  All Rights Reserved.
+#
+# FS QA Test 804
+# Verify IMA isolation on cloned filesystems:
+# . Mount two devices sharing the same FSUUID (cloned).
+# . Apply an IMA policy to measure files based on that FSUUID.
+# . Create unique files on each mount point to trigger measurements.
+# . Confirm the IMA log correctly attributes events to the respective mounts.
+
+. ./common/preamble
+. ./common/filter
+
+_begin_fstest auto quick clone
+
+_require_test
+_require_block_device $TEST_DEV
+_require_loop
+
+[ "$FSTYP" = "btrfs" ] && _fixed_by_kernel_commit xxxxxxxxxxxx \
+	"btrfs: use on-disk uuid for s_uuid in temp_fsid mounts"
+[ "$FSTYP" = "btrfs" ] && _fixed_by_kernel_commit xxxxxxxxxxxx \
+	"btrfs: derive f_fsid from on-disk fsuuid and dev_t"
+
+_cleanup()
+{
+	cd /
+	rm -r -f $tmp.*
+	_unmount $mnt1 2>/dev/null
+	_unmount $mnt2 2>/dev/null
+	_loop_image_destroy "${devs[@]}" 2> /dev/null
+}
+
+filter_pool()
+{
+	sed -e "s|${devs[0]}|DEV1|g" -e "s|$mnt1|MNT1|g" \
+	    -e "s|${devs[1]}|DEV2|g" -e "s|$mnt2|MNT2|g" | _filter_spaces
+}
+
+do_ima()
+{
+	local ima_policy="/sys/kernel/security/ima/policy"
+	local ima_log="/sys/kernel/security/ima/ascii_runtime_measurements"
+	local fsuuid
+	local mnt=$1
+	local enable=$2
+
+	# Since the in-memory IMA audit log is only cleared upon reboot,
+	# use unique random filenames to avoid log collisions.
+	local foofile=$(mktemp --dry-run foobar_XXXXX)
+
+	echo $mnt $enable | filter_pool
+
+	[ -w "$ima_policy" ] || _notrun "IMA policy not writable"
+
+	fsuuid=$(blkid -s UUID -o value ${devs[0]})
+
+	# Load IMA policy to measure file access specifically for this
+	# filesystem UUID.
+	if [[ $enable -eq 1 ]]; then
+		echo "measure func=FILE_CHECK fsuuid=$fsuuid" > "$ima_policy" || \
+			_notrun "Policy rejected"
+	fi
+
+	# Create a file to trigger measurement and verify its entry in
+	# the IMA log.
+	echo "test_data" > $mnt/$foofile
+
+	# For $ima_log column entry please ref to
+	grep $foofile "$ima_log" | awk '{ print $5 }' | filter_pool | \
+						sed "s/$foofile/FOOBAR_FILE/"
+
+	echo "dbg: $mnt $fsuuid $foofile" >> $seqres.full
+	cat $ima_log | tail -1 >> $seqres.full
+	echo >> $seqres.full
+}
+
+devs=()
+_loop_image_create_clone devs
+mnt1=$TEST_DIR/$seq/mnt1
+mnt2=$TEST_DIR/$seq/mnt2
+mkdir -p $mnt1
+mkdir -p $mnt2
+
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
+						_fail "Failed to mount dev1"
+_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
+						_fail "Failed to mount dev2"
+
+do_ima $mnt1 1
+do_ima $mnt2 0
+
+# Btrfs uses in-memory dynamic temp_fsid
+echo mount cycle
+_unmount $mnt2
+_mount $mount_opts ${devs[1]} $mnt2 || _fail "Failed to mount dev2"
+
+do_ima $mnt1 0
+do_ima $mnt2 0
+
+status=0
+exit
diff --git a/tests/generic/804.out b/tests/generic/804.out
new file mode 100644
index 000000000000..9804181d6c17
--- /dev/null
+++ b/tests/generic/804.out
@@ -0,0 +1,10 @@
+QA output created by 804
+MNT1 1
+MNT1/FOOBAR_FILE
+MNT2 0
+MNT2/FOOBAR_FILE
+mount cycle
+MNT1 0
+MNT1/FOOBAR_FILE
+MNT2 0
+MNT2/FOOBAR_FILE
-- 
2.43.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox