Linux EXT4 FS development
 help / color / mirror / Atom feed
* [PATCH v2 4/4] fs: retire sget()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>

sget() and sget_fc() have lived side by side as near-duplicate
find-or-create-and-publish helpers for the legacy and fs_context mount
APIs. The three remaining in-tree callers (CIFS plus the ext4 extents
and mballoc KUnit tests) have all been moved to sget_fc(). Nothing
calls sget() anymore.

Delete sget() from fs/super.c and the prototype in <linux/fs.h>.
Update the two comments that referred to "sget()" or "sget{_fc}()" to
just say "sget_fc()".

This removes ~60 lines of code that only existed to be kept in
lockstep with sget_fc() on every superblock publish-path change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/super.c   |  2 +-
 fs/super.c         | 71 ++++--------------------------------------------------
 include/linux/fs.h |  4 ---
 3 files changed, 6 insertions(+), 71 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b26aa9169e83..636154861d7c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2052,7 +2052,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
 	 * then open_ctree will properly initialize the file system specific
 	 * settings later.  btrfs_init_fs_info initializes the static elements
 	 * of the fs_info (locks and such) to make cleanup easier if we find a
-	 * superblock with our given fs_devices later on at sget() time.
+	 * superblock with our given fs_devices later on at sget_fc() time.
 	 */
 	fs_info = kvzalloc_obj(struct btrfs_fs_info);
 	if (!fs_info)
diff --git a/fs/super.c b/fs/super.c
index 378e81efe643..5fe8cea9f8fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -328,7 +328,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
 	/*
-	 * sget() can have s_umount recursion.
+	 * sget_fc() can have s_umount recursion.
 	 *
 	 * When it cannot find a suitable sb, it allocates a new
 	 * one (this one), and tries again to find a suitable old
@@ -439,7 +439,7 @@ static void kill_super_notify(struct super_block *sb)
 
 	/*
 	 * Remove it from @fs_supers so it isn't found by new
-	 * sget{_fc}() walkers anymore. Any concurrent mounter still
+	 * sget_fc() walkers anymore. Any concurrent mounter still
 	 * managing to grab a temporary reference is guaranteed to
 	 * already see SB_DYING and will wait until we notify them about
 	 * SB_DEAD.
@@ -517,7 +517,7 @@ EXPORT_SYMBOL(deactivate_super);
  * @sb: superblock to acquire
  *
  * Acquire a temporary reference on a superblock and try to trade it for
- * an active reference. This is used in sget{_fc}() to wait for a
+ * an active reference. This is used in sget_fc() to wait for a
  * superblock to either become SB_BORN or for it to pass through
  * sb->kill() and be marked as SB_DEAD.
  *
@@ -673,11 +673,11 @@ void generic_shutdown_super(struct super_block *sb)
 	/*
 	 * Broadcast to everyone that grabbed a temporary reference to this
 	 * superblock before we removed it from @fs_supers that the superblock
-	 * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
+	 * is dying. Every walker of @fs_supers outside of sget_fc() will now
 	 * discard this superblock and treat it as dead.
 	 *
 	 * We leave the superblock on @fs_supers so it can be found by
-	 * sget{_fc}() until we passed sb->kill_sb().
+	 * sget_fc() until we passed sb->kill_sb().
 	 */
 	super_wake(sb, SB_DYING);
 	super_unlock_excl(sb);
@@ -808,67 +808,6 @@ struct super_block *sget_fc(struct fs_context *fc,
 }
 EXPORT_SYMBOL(sget_fc);
 
-/**
- *	sget	-	find or create a superblock
- *	@type:	  filesystem type superblock should belong to
- *	@test:	  comparison callback
- *	@set:	  setup callback
- *	@flags:	  mount flags
- *	@data:	  argument to each of them
- */
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags,
-			void *data)
-{
-	struct user_namespace *user_ns = current_user_ns();
-	struct super_block *s = NULL;
-	struct super_block *old;
-	int err;
-
-retry:
-	spin_lock(&sb_lock);
-	if (test) {
-		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
-			if (!test(old, data))
-				continue;
-			if (user_ns != old->s_user_ns) {
-				spin_unlock(&sb_lock);
-				destroy_unused_super(s);
-				return ERR_PTR(-EBUSY);
-			}
-			if (!grab_super(old))
-				goto retry;
-			destroy_unused_super(s);
-			return old;
-		}
-	}
-	if (!s) {
-		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags, user_ns);
-		if (!s)
-			return ERR_PTR(-ENOMEM);
-		goto retry;
-	}
-
-	err = set(s, data);
-	if (err) {
-		spin_unlock(&sb_lock);
-		destroy_unused_super(s);
-		return ERR_PTR(err);
-	}
-	s->s_type = type;
-	strscpy(s->s_id, type->name, sizeof(s->s_id));
-	list_add_tail(&s->s_list, &super_blocks);
-	hlist_add_head(&s->s_instances, &type->fs_supers);
-	spin_unlock(&sb_lock);
-	get_filesystem(type);
-	shrinker_register(s->s_shrink);
-	return s;
-}
-EXPORT_SYMBOL(sget);
-
 void drop_super(struct super_block *sb)
 {
 	super_unlock_shared(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..6dbe3218dc1e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2327,10 +2327,6 @@ void free_anon_bdev(dev_t);
 struct super_block *sget_fc(struct fs_context *fc,
 			    int (*test)(struct super_block *, struct fs_context *),
 			    int (*set)(struct super_block *, struct fs_context *));
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags, void *data);
 struct super_block *sget_dev(struct fs_context *fc, dev_t dev);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 3/4] smb: client: convert cifs_smb3_do_mount() to sget_fc()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>

The CIFS mount path already runs through fs_context: smb3_get_tree()
calls smb3_get_tree_common() with a struct fs_context * in hand. But
the fc is dropped on the way to sget(). Plumb it through to sget_fc()
so the legacy sget() interface can go.

cifs_smb3_do_mount() now takes (struct fs_context *, struct
smb3_fs_context *). The old (fs_type, flags) pair is reconstructed
from fc->fs_type and fc->sb_flags. The flags argument was always
passed as 0 by the sole caller anyway. The cifs_dbg diagnostic now
prints fc->sb_flags directly.

cifs_match_super() and cifs_set_super() were the two void-data
callbacks for sget(). The match callback now takes
(struct super_block *, struct fs_context *) and reads struct
cifs_mnt_data out of fc->sget_key. The set callback is gone entirely:
sget_fc() pre-populates sb->s_fs_info from fc->s_fs_info before
invoking set() so set_anon_super_fc() (which just allocates an anon
bdev) is sufficient.

Before sget_fc() we stash cifs_sb in fc->s_fs_info, the per-mount data
in fc->sget_key and force fc->sb_flags to SB_NODIRATIME | SB_NOATIME
to reproduce the previous hard-coded behaviour (alloc_super() reads
fc->sb_flags). The original sb_flags is saved and restored around the
call so the rest of the mount path sees the same fc semantics as
before.

mnt_data.flags keeps its historical value of 0 so the CIFS_MS_MASK
comparison in compare_mount_options() returns the same (always-equal)
result.

No functional change. With this in place sget() has no remaining CIFS
caller.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/smb/client/cifsfs.c     | 43 ++++++++++++++++++++++++++-----------------
 fs/smb/client/cifsfs.h     |  3 ++-
 fs/smb/client/cifsproto.h  |  3 ++-
 fs/smb/client/connect.c    |  5 +++--
 fs/smb/client/fs_context.c |  2 +-
 5 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 9f76b0347fa9..d5074e3fbb85 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -12,6 +12,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/filelock.h>
 #include <linux/mount.h>
 #include <linux/slab.h>
@@ -966,26 +967,19 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb)
 	return dentry;
 }
 
-static int cifs_set_super(struct super_block *sb, void *data)
-{
-	struct cifs_mnt_data *mnt_data = data;
-	sb->s_fs_info = mnt_data->cifs_sb;
-	return set_anon_super(sb, NULL);
-}
-
 struct dentry *
-cifs_smb3_do_mount(struct file_system_type *fs_type,
-	      int flags, struct smb3_fs_context *old_ctx)
+cifs_smb3_do_mount(struct fs_context *fc, struct smb3_fs_context *old_ctx)
 {
 	struct cifs_mnt_data mnt_data;
 	struct cifs_sb_info *cifs_sb;
 	struct super_block *sb;
 	struct dentry *root;
+	unsigned int saved_sb_flags;
 	int rc;
 
 	if (cifsFYI) {
-		cifs_dbg(FYI, "%s: devname=%s flags=0x%x\n", __func__,
-			 old_ctx->source, flags);
+		cifs_dbg(FYI, "%s: devname=%s sb_flags=0x%x\n", __func__,
+			 old_ctx->source, fc->sb_flags);
 	} else {
 		cifs_info("Attempting to mount %s\n", old_ctx->source);
 	}
@@ -1012,7 +1006,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
 
 	rc = cifs_mount(cifs_sb, cifs_sb->ctx);
 	if (rc) {
-		if (!(flags & SB_SILENT))
+		if (!(fc->sb_flags & SB_SILENT))
 			cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
 				 rc);
 		root = ERR_PTR(rc);
@@ -1021,12 +1015,27 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
 
 	mnt_data.ctx = cifs_sb->ctx;
 	mnt_data.cifs_sb = cifs_sb;
-	mnt_data.flags = flags;
+	mnt_data.flags = 0;
 
-	/* BB should we make this contingent on mount parm? */
-	flags |= SB_NODIRATIME | SB_NOATIME;
-
-	sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
+	/*
+	 * sb->s_flags is set from fc->sb_flags by alloc_super(). CIFS has
+	 * historically forced SB_NODIRATIME | SB_NOATIME on every mount and
+	 * ignored the caller-supplied SB_* flags. Preserve that behaviour by
+	 * overriding fc->sb_flags around the sget_fc() call.
+	 *
+	 * Hand cifs_sb to sget_fc() via fc->s_fs_info; sget_fc() copies it
+	 * onto sb->s_fs_info before running set() and clears fc->s_fs_info
+	 * on successful publish. Pass the rest of the per-mount context to
+	 * cifs_match_super() through fc->sget_key.
+	 */
+	saved_sb_flags = fc->sb_flags;
+	fc->sb_flags = SB_NODIRATIME | SB_NOATIME;
+	fc->s_fs_info = cifs_sb;
+	fc->sget_key = &mnt_data;
+	sb = sget_fc(fc, cifs_match_super, set_anon_super_fc);
+	fc->sget_key = NULL;
+	fc->s_fs_info = NULL;
+	fc->sb_flags = saved_sb_flags;
 	if (IS_ERR(sb)) {
 		cifs_umount(cifs_sb);
 		return ERR_CAST(sb);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index c455b15f2778..0a93f48924a5 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -144,8 +144,9 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct file *src_file,
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg);
 void cifs_setsize(struct inode *inode, loff_t offset);
 
+struct fs_context;
 struct smb3_fs_context;
-struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, int flags,
+struct dentry *cifs_smb3_do_mount(struct fs_context *fc,
 				  struct smb3_fs_context *old_ctx);
 
 char *cifs_silly_fullpath(struct dentry *dentry);
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 4a25afda9448..a39572cbaadb 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -19,6 +19,7 @@
 struct statfs;
 struct smb_rqst;
 struct smb3_fs_context;
+struct fs_context;
 
 /*
  *****************************************************************
@@ -236,7 +237,7 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx);
 int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx);
 int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx);
 int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx);
-int cifs_match_super(struct super_block *sb, void *data);
+int cifs_match_super(struct super_block *sb, struct fs_context *fc);
 int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx);
 void cifs_umount(struct cifs_sb_info *cifs_sb);
 void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dcde25da468d..79762e6bbe50 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -6,6 +6,7 @@
  *
  */
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/net.h>
 #include <linux/string.h>
 #include <linux/sched/mm.h>
@@ -2991,9 +2992,9 @@ static int match_prepath(struct super_block *sb,
 }
 
 int
-cifs_match_super(struct super_block *sb, void *data)
+cifs_match_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct cifs_mnt_data *mnt_data = data;
+	struct cifs_mnt_data *mnt_data = fc->sget_key;
 	struct smb3_fs_context *ctx;
 	struct cifs_sb_info *cifs_sb;
 	struct TCP_Server_Info *tcp_srv;
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index b9544eb0381b..6aba4e1c9c27 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -920,7 +920,7 @@ static int smb3_get_tree_common(struct fs_context *fc)
 	struct dentry *root;
 	int rc = 0;
 
-	root = cifs_smb3_do_mount(fc->fs_type, 0, ctx);
+	root = cifs_smb3_do_mount(fc, ctx);
 	if (IS_ERR(root))
 		return PTR_ERR(root);
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 2/4] ext4: convert mballoc KUnit test to sget_fc()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>

Same treatment as the extents KUnit test. The mballoc test uses sget()
as a thin "give me an initialized superblock" wrapper for a fake
file_system_type. Move it onto sget_fc() so sget() can go away.

Add a no-op mbt_init_fs_context() so fs_context_for_mount() has
something to call on the fake fs_type. mbt_set() now takes a struct
fs_context * (still a no-op). mbt_ext4_alloc_super_block() allocates
the fc, hands it to sget_fc() and drops the fc reference once the sb
is published.

No functional change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/mballoc-test.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index 90ed505fa4b1..d90da44aadbd 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -5,6 +5,7 @@
 
 #include <kunit/test.h>
 #include <kunit/static_stub.h>
+#include <linux/fs_context.h>
 #include <linux/random.h>
 
 #include "ext4.h"
@@ -63,8 +64,14 @@ static void mbt_kill_sb(struct super_block *sb)
 	generic_shutdown_super(sb);
 }
 
+static int mbt_init_fs_context(struct fs_context *fc)
+{
+	return 0;
+}
+
 static struct file_system_type mbt_fs_type = {
 	.name			= "mballoc test",
+	.init_fs_context	= mbt_init_fs_context,
 	.kill_sb		= mbt_kill_sb,
 };
 
@@ -127,7 +134,7 @@ static void mbt_mb_release(struct super_block *sb)
 	kfree(sb->s_bdev);
 }
 
-static int mbt_set(struct super_block *sb, void *data)
+static int mbt_set(struct super_block *sb, struct fs_context *fc)
 {
 	return 0;
 }
@@ -136,13 +143,19 @@ static struct super_block *mbt_ext4_alloc_super_block(void)
 {
 	struct mbt_ext4_super_block *fsb;
 	struct super_block *sb;
+	struct fs_context *fc;
 	struct ext4_sb_info *sbi;
 
 	fsb = kzalloc_obj(*fsb);
 	if (fsb == NULL)
 		return NULL;
 
-	sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+	fc = fs_context_for_mount(&mbt_fs_type, 0);
+	if (IS_ERR(fc))
+		goto out;
+
+	sb = sget_fc(fc, NULL, mbt_set);
+	put_fs_context(fc);
 	if (IS_ERR(sb))
 		goto out;
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 1/4] ext4: convert extents KUnit test to sget_fc()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>

The extents KUnit test uses sget() to get an initialized superblock for
its fake file_system_type. sget() predates fs_context and we want to
retire it. Switch this caller over to sget_fc().

Add a no-op ext_init_fs_context() so fs_context_for_mount() has
something to call on the fake fs_type. ext_set() now takes a struct
fs_context * (still a no-op). extents_kunit_init() allocates the fc,
hands it to sget_fc() and drops the fc reference once the sb is
published. sget_fc() does not retain a pointer to it.

No functional change for the test.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/extents-test.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
index 6b53a3f39fcd..bd7795a82607 100644
--- a/fs/ext4/extents-test.c
+++ b/fs/ext4/extents-test.c
@@ -37,6 +37,7 @@
 
 #include <kunit/test.h>
 #include <kunit/static_stub.h>
+#include <linux/fs_context.h>
 #include <linux/gfp_types.h>
 #include <linux/stddef.h>
 
@@ -130,14 +131,20 @@ static void ext_kill_sb(struct super_block *sb)
 	generic_shutdown_super(sb);
 }
 
-static int ext_set(struct super_block *sb, void *data)
+static int ext_init_fs_context(struct fs_context *fc)
+{
+	return 0;
+}
+
+static int ext_set(struct super_block *sb, struct fs_context *fc)
 {
 	return 0;
 }
 
 static struct file_system_type ext_fs_type = {
-	.name = "extents test",
-	.kill_sb = ext_kill_sb,
+	.name		 = "extents test",
+	.init_fs_context = ext_init_fs_context,
+	.kill_sb	 = ext_kill_sb,
 };
 
 static void extents_kunit_exit(struct kunit *test)
@@ -223,6 +230,7 @@ static int extents_kunit_init(struct kunit *test)
 	struct ext4_inode_info *ei;
 	struct inode *inode;
 	struct super_block *sb;
+	struct fs_context *fc;
 	struct ext4_sb_info *sbi = NULL;
 	struct kunit_ext_test_param *param =
 		(struct kunit_ext_test_param *)(test->param_value);
@@ -232,7 +240,13 @@ static int extents_kunit_init(struct kunit *test)
 	if (sbi == NULL)
 		return -ENOMEM;
 
-	sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL);
+	fc = fs_context_for_mount(&ext_fs_type, 0);
+	if (IS_ERR(fc)) {
+		kfree(sbi);
+		return PTR_ERR(fc);
+	}
+	sb = sget_fc(fc, NULL, ext_set);
+	put_fs_context(fc);
 	if (IS_ERR(sb)) {
 		kfree(sbi);
 		return PTR_ERR(sb);

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 0/4] super: retire sget()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)

CIFS plus the two ext4 KUnit tests (extents-test, mballoc-test) were the
last in-tree callers, and all three convert cleanly to sget_fc(). That
lets sget() and its prototype come out, taking ~60 lines that only
existed to be kept in lockstep with sget_fc() on every publish-path
change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
Changes in v2:
- Move some changes into a separate series.
- Link to v1: https://patch.msgid.link/20260526-work-sget-v1-0-263f7025cedd@kernel.org

---
Christian Brauner (4):
      ext4: convert extents KUnit test to sget_fc()
      ext4: convert mballoc KUnit test to sget_fc()
      smb: client: convert cifs_smb3_do_mount() to sget_fc()
      fs: retire sget()

 fs/btrfs/super.c           |  2 +-
 fs/ext4/extents-test.c     | 22 +++++++++++---
 fs/ext4/mballoc-test.c     | 17 +++++++++--
 fs/smb/client/cifsfs.c     | 43 +++++++++++++++++-----------
 fs/smb/client/cifsfs.h     |  3 +-
 fs/smb/client/cifsproto.h  |  3 +-
 fs/smb/client/connect.c    |  5 ++--
 fs/smb/client/fs_context.c |  2 +-
 fs/super.c                 | 71 ++++------------------------------------------
 include/linux/fs.h         |  4 ---
 10 files changed, 73 insertions(+), 99 deletions(-)
---
base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
change-id: 20260526-work-sget-6bc80b96cba5


^ permalink raw reply

* Re: [PATCH] ext4: fix quota accounting WARN in bigalloc punch hole
From: Qiliang Yuan @ 2026-05-29  8:34 UTC (permalink / raw)
  To: yi.zhang
  Cc: tytso, adilger.kernel, jack, ritesh.list, ojaswin, libaokun,
	linux-ext4, linux-kernel, yzjaurora
In-Reply-To: <73e91ebd-27de-4834-af2f-9b4ac19a4100@huaweicloud.com>

Hi Zhang Yi,

On 5/29/2026 3:32 PM, Zhang Yi wrote:
> On 5/28/2026 6:21 PM, Qiliang Yuan wrote:
>> __es_remove_extent() -> get_rsvd() already correctly excludes
>> boundary clusters that still contain delayed blocks from resv_used.
>> Adding pending to resv_used double-counts those boundary clusters,
>> erroneously releasing reservations that are still needed.
>
> Hmm, the analysis doesn't seem correct to me. Do you mean the
> following case?
>
> # Assume the cluster size is 16KB.
> xfs_io -f -c "pwrite 12k 4k" /mnt/foo
> xfs_io -d -c "pwrite 0 4k" /mnt/foo
> xfs_io -c "fpunch 0 4k" /mnt/foo
>
> During the direct I/O write, quota space will be added in
> ext4_mb_new_blocks() because the EXT4_MB_DELALLOC_RESERVED flag is
> not set. Therefore, in ext4_es_insert_extent(), we should release the
> quota reservations, since this cluster has already been allocated.
>
> Then, in the third operation (punch hole), it will reclaim the added
> dqb_curspace. This should not cause an insufficiency.
>
> Am I missing something?

Thanks for the review!  Let me explain the issue with your specific
example.

After step 1 (delalloc write 4KB@12KB in a 16KB cluster), we have:

  ES tree: [0,1) hole, [1,3) delayed, [3,4) hole   (blocks 0..3)
  Quota:   dqb_rsvspace += 16KB (one cluster reserved)

Step 2 (DIO write 4KB@0KB, RWF_DSYNC):

  The DIO allocates one cluster, but the mapped extent from
  ext4_ext_map_blocks() only covers the written range, e.g. [0,0].

  In ext4_es_insert_extent():

    a) __es_remove_extent([0,0]) removes the [0,1) hole entry, but
       there are no delayed extents within [0,0], so resv_used = 0.

       This is correct: the DIO extent [0,0] does not overlap the
       delayed region [1,3).

    b) __revise_pending() scans outside the newly inserted extent [0,0]:

       - Left boundary (block 0): the range starts at cluster
         boundary, no blocks to scan on the left → no pending insert.

       - Right boundary (block 3): blocks [1,3] are outside [0,0]
         and are delayed → __revise_pending() inserts a pending
         reservation and returns pending = 1.

       This pending reservation means: "cluster containing block 3
       still has delayed blocks, keep this cluster reserved."

    c) Then comes the bug:

         resv_used += pending;   // resv_used = 0 + 1 = 1

       This causes ext4_da_update_reserve_space() to release 16KB
       of quota reservation (dquot_release_reservation_block()).  But
       block 3 is still delayed!  Its quota reservation should NOT
       be released — no blocks within [0,0] actually used the delalloc
       reservation.

    So after DIO:
      dqb_rsvspace: originally 16KB, now 0 (incorrectly released)
      dqb_curspace: 16KB (from ext4_mb_new_blocks)

Step 3 (punch 4KB@0KB):

  ext4_remove_blocks() sees that block 0's cluster has a pending
  reservation → calls ext4_rereserve_cluster() → dquot_reclaim_block()
  tries to move 16KB from dqb_curspace to dqb_rsvspace.  But
  dqb_curspace may already be insufficient (depending on whether
  other allocs/frees have happened), triggering:

    WARNING at dquot_reclaim_space_nodirty

Step 4 (delalloc writeback of block 3):

  ext4_da_update_reserve_space() → dquot_claim_block() tries to
  move 16KB from dqb_rsvspace to dqb_curspace.  Since rsvspace was
  incorrectly released and not fully restored by the punch hole's
  rereseve, this triggers:

    WARNING at dquot_claim_space_nodirty

The key point is: pending from __revise_pending() indicates clusters
that *still contain delayed blocks outside the newly inserted extent*.
These clusters' quota reservations must be preserved — get_rsvd()
inside __es_remove_extent() already correctly excludes them from
resv_used.  Adding pending to resv_used double-counts them.

I also found a pre-existing retry loop issue: if __es_insert_extent()
fails with -ENOMEM on the first pass, resv_used carries a stale value
back to retry and could be double-released.  I'll include a fix for
that in v2.

-- 
Qiliang Yuan

^ permalink raw reply

* Re: [PATCH] ext4: validate dx count against limit in dx_csum
From: Baokun Li @ 2026-05-29  8:00 UTC (permalink / raw)
  To: Artem Blagodarenko; +Cc: adilger.kernel, linux-ext4
In-Reply-To: <20260528160557.6956-1-ablagodarenko@thelustrecollective.com>

On 2026/5/29 00:05, Artem Blagodarenko wrote:
> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
>
> Sashiko AI, during inspection of the "ext4: replace ext4_dir_entry
> with ext4_dir_entry_2" series, reported a missing bounds check on
> count against limit in DX block checksum verification and setup
> paths.
>
> The code validates that limit fits within block boundaries, but does
> not verify that count <= limit. A maliciously crafted filesystem image
> could therefore provide an artificially large count value.
>
> Since ext4_dx_csum() uses count to determine the checksum region
> size, this may result in an out-of-bounds access crossing page
> boundaries into unmapped memory, potentially leading to a crash during
> checksum verification.
>
> The same issue exists in ext4_dx_csum_set().
>
> The bug was not introduced by the patch under review, so the fix is sent
> separately.
>
> Signed-off-by: Artem Blagodarenko artem.blagodarenko@gmail.com

Looks good, feel free to add:

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/ext4/namei.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index a316fc2ac41b..20b7bac69889 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -472,7 +472,8 @@ static int ext4_dx_csum_verify(struct inode *inode,
>  	}
>  	limit = le16_to_cpu(c->limit);
>  	count = le16_to_cpu(c->count);
> -	if (count_offset + (limit * sizeof(struct dx_entry)) >
> +	if (!count || count > limit ||
> +	    count_offset + (limit * sizeof(struct dx_entry)) >
>  	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
>  		warn_no_space_for_csum(inode);
>  		return 0;
> @@ -501,7 +502,8 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry_2 *diren
>  	}
>  	limit = le16_to_cpu(c->limit);
>  	count = le16_to_cpu(c->count);
> -	if (count_offset + (limit * sizeof(struct dx_entry)) >
> +	if (!count || count > limit ||
> +	    count_offset + (limit * sizeof(struct dx_entry)) >
>  	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
>  		warn_no_space_for_csum(inode);
>  		return;



^ permalink raw reply

* Re: [PATCH] jbd2: update outdated comment for jbd2_journal_try_to_free_buffers()
From: Baokun Li @ 2026-05-29  7:44 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	jack, ojaswin, ritesh.list, yi.zhang, yizhang089, yangerkun,
	yukuai
In-Reply-To: <20260522030540.3896201-1-yi.zhang@huaweicloud.com>

On 2026/5/22 11:05, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> jbd2_journal_try_to_free_buffers() currently only tries to remove
> checkpointed data buffers from the checkpoint list for data=journal
> mode, and bails out if any buffer is still attached to a transaction.
> For data=ordered and writeback modes, data buffers never have
> journal_heads, so the function degenerates to a plain
> try_to_free_buffers() call.
>
> Besides, The release of metadata buffers has been delegated to the jbd2
> journal shrinker in commit 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to
> release checkpointed buffers"). jbd2_journal_try_to_free_buffers() is
> not used for handling metadata buffers now.
>
> However, the comment above the function still references
> jbd2_journal_dirty_data(), __jbd2_journal_unfile_buffer(), t_datalist,
> BKL, and BUF_CLEAN, all of which were removed in commit 87c89c232c8f
> ("jbd2: Remove data=ordered mode support using jbd buffer heads").
>
> Replace it with a description of what the function actually does now.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good!

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/jbd2/transaction.c | 39 ++++++++++++---------------------------
>  1 file changed, 12 insertions(+), 27 deletions(-)
>
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index 4885903bbd10..239bcf88ed1c 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -2139,38 +2139,23 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
>  }
>  
>  /**
> - * jbd2_journal_try_to_free_buffers() - try to free page buffers.
> + * jbd2_journal_try_to_free_buffers() - try to free folio buffers.
>   * @journal: journal for operation
>   * @folio: Folio to detach data from.
>   *
> - * For all the buffers on this page,
> - * if they are fully written out ordered data, move them onto BUF_CLEAN
> - * so try_to_free_buffers() can reap them.
> + * For each buffer_head on @folio, if the buffer has a journal_head but
> + * is not attached to a running or committing transaction, try to remove
> + * it from the checkpoint list.  This is needed for data=journal mode
> + * where data buffers are journaled: once they are checkpointed, the
> + * journal_head can be detached and the buffer freed.  If any buffer is
> + * still attached to a transaction, the folio cannot be released and we
> + * bail out.  Otherwise we call try_to_free_buffers() to detach all
> + * buffer_heads from the folio.
>   *
> - * This function returns non-zero if we wish try_to_free_buffers()
> - * to be called. We do this if the page is releasable by try_to_free_buffers().
> - * We also do it if the page has locked or dirty buffers and the caller wants
> - * us to perform sync or async writeout.
> + * For data=ordered and writeback modes, data buffers never have
> + * journal_heads, so this degenerates to a plain try_to_free_buffers().
>   *
> - * This complicates JBD locking somewhat.  We aren't protected by the
> - * BKL here.  We wish to remove the buffer from its committing or
> - * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
> - *
> - * This may *change* the value of transaction_t->t_datalist, so anyone
> - * who looks at t_datalist needs to lock against this function.
> - *
> - * Even worse, someone may be doing a jbd2_journal_dirty_data on this
> - * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
> - * will come out of the lock with the buffer dirty, which makes it
> - * ineligible for release here.
> - *
> - * Who else is affected by this?  hmm...  Really the only contender
> - * is do_get_write_access() - it could be looking at the buffer while
> - * journal_try_to_free_buffer() is changing its state.  But that
> - * cannot happen because we never reallocate freed data as metadata
> - * while the data is part of a transaction.  Yes?
> - *
> - * Return false on failure, true on success
> + * Return: true if the folio's buffers were freed, false otherwise
>   */
>  bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
>  {



^ permalink raw reply

* Re: [PATCH] ext4: Fix ERR_PTR(0) in ext4_mkdir()
From: Baokun Li @ 2026-05-29  7:37 UTC (permalink / raw)
  To: Hongling Zeng
  Cc: tytso, adilger.kernel, jack, ojaswin, ritesh.list, yi.zhang,
	linux-ext4, linux-kernel, zhongling0719, neil, brauner, jlayton
In-Reply-To: <20260520074634.53656-1-zenghongling@kylinos.cn>

On 2026/5/20 15:46, Hongling Zeng wrote:
> When mkdir succeeds, ext4_mkdir() returns ERR_PTR(0) which is incorrect.
> It should return NULL instead for success and ERR_PTR() only with
> negative error codes for failure.
>
> Fixes: 88d5baf69082 ("Change inode_operations.mkdir to return struct dentry *")
> Signed-off-by: Hongling Zeng <zenghongling@kylinos.cn>

Looks good.

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/ext4/namei.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 4a47fbd8dd30..8cadaeb15b2b 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -3054,7 +3054,7 @@ static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
>  out_retry:
>  	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
>  		goto retry;
> -	return ERR_PTR(err);
> +	return err ? ERR_PTR(err) : NULL;
>  }
>  
>  /*



^ permalink raw reply

* Re: [PATCH] ext4: fix quota accounting WARN in bigalloc punch hole
From: Zhang Yi @ 2026-05-29  7:32 UTC (permalink / raw)
  To: Qiliang Yuan, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani (IBM)
  Cc: linux-ext4, linux-kernel, Zijing Yin
In-Reply-To: <20260528-fix-ext4-bigalloc-punch-hole-quota-v2-v1-1-32871356273b@gmail.com>

Hi, Qiliang!

On 5/28/2026 6:21 PM, Qiliang Yuan wrote:
> When doing direct I/O write on a bigalloc filesystem, the allocated
> extent might not cover entire clusters at its boundaries, leaving
> delayed blocks in those boundary clusters.  In ext4_es_insert_extent(),
> __revise_pending() inserts new pending reservations for those boundary
> clusters, and the return value (pending=true) was added to resv_used,
> causing ext4_da_update_reserve_space() to incorrectly release the
> quota reservations for those boundary clusters.
> 
> Later when PUNCH_HOLE removes the DIO-allocated blocks, the
> extent removal path detects the pending reservation via
> ext4_is_pending() and calls ext4_rereserve_cluster().  This tries
> to reclaim quota from dq_dqb.dqb_curspace back to dqb_rsvspace,
> but since the quota was already incorrectly released, dqb_curspace
> is insufficient, triggering:
> 
>   WARNING at dquot_reclaim_space_nodirty+0x77c/0x8c0

Hmm, the analysis doesn't seem correct to me. Do you mean the
following case?

# Assume the cluster size is 16KB.
xfs_io -f -c "pwrite 12k 4k" /mnt/foo
xfs_io -d -c "pwrite 0 4k" /mnt/foo
xfs_io -c "fpunch 0 4k" /mnt/foo

During the direct I/O write, quota space will be added in
ext4_mb_new_blocks() because the EXT4_MB_DELALLOC_RESERVED flag is
not set. Therefore, in ext4_es_insert_extent(), we should release the
quota reservations, since this cluster has already been allocated.

Then, in the third operation (punch hole), it will reclaim the added
dqb_curspace. This should not cause an insufficiency.

Am I missing something?

> 
> The subsequent delalloc writeback then fires a second WARN from
> dquot_claim_space_nodirty() for the same reason: dqb_rsvspace was
> depleted by the earlier incorrect release.
> 
> __es_remove_extent() -> get_rsvd() already correctly excludes
> boundary clusters that still contain delayed blocks from resv_used.
> Adding pending to resv_used double-counts those boundary clusters,
> erroneously releasing reservations that are still needed.
> 
> Remove the pending variable and the resv_used += pending addition.

That's not correct. Assume the following case.

# Assume the cluster size is 16KB.
xfs_io -f -c "pwrite 0 16k" /mnt/foo
xfs_io -c "sync_range -w 0 4k" /mnt/foo

Although only part of the cluster is written back, the cluster has been
allocated. Therefore, the quota needs to be claimed. However, since we
only wrote back a portion of a cluster, __es_remove_extent() will not
return the reserved clusters that need to be consumed (i.e., resv_used
is zero). Therefore, we need to determine whether a new pending
allocation has been created by checking the pending status, so that we
can correctly claim the quota.

Thanks,
Yi

> 
> Fixes: c543e2429640 ("ext4: update delalloc data reserve spcae in ext4_es_insert_extent()")
> Reported-by: Zijing Yin <yzjaurora@gmail.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221570
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
>  fs/ext4/extents_status.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 6e4a191e82191..fefe0bb8ac4d1 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -909,7 +909,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  	struct extent_status newes;
>  	ext4_lblk_t end = lblk + len - 1;
>  	int err1 = 0, err2 = 0, err3 = 0;
> -	int resv_used = 0, pending = 0;
> +	int resv_used = 0;
>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>  	struct extent_status *es1 = NULL;
>  	struct extent_status *es2 = NULL;
> @@ -977,7 +977,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  			__free_pending(pr);
>  			pr = NULL;
>  		}
> -		pending = err3;
>  	}
>  	ext4_es_inc_seq(inode);
>  error:
> @@ -998,7 +997,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  	 * any previously delayed allocated clusters instead of claim them
>  	 * again.
>  	 */
> -	resv_used += pending;
>  	if (resv_used)
>  		ext4_da_update_reserve_space(inode, resv_used,
>  					     delalloc_reserve_used);
> 
> ---
> base-commit: eb3f4b7426cfd2b79d65b7d37155480b32259a11
> change-id: 20260528-fix-ext4-bigalloc-punch-hole-quota-v2-2adca315d1ba
> 
> Best regards,


^ permalink raw reply

* Re: [PATCH] ext4: convert legacy ext4_debug() to standard pr_debug()
From: Baokun Li @ 2026-05-29  7:32 UTC (permalink / raw)
  To: lirongqing
  Cc: Theodore Ts'o, Jan Kara, Zhang Yi, Andreas Dilger,
	Ojaswin Mujoo, Ritesh Harjani, linux-ext4, linux-kernel
In-Reply-To: <20260521074634.2697-1-lirongqing@baidu.com>

On 2026/5/21 15:46, lirongqing wrote:
> From: Li RongQing <lirongqing@baidu.com>
>
> The ext4 file system historically implemented its own debug logging macro
> ext4_debug() via EXT4FS_DEBUG conditional compilation. This legacy
> implementation suffers from two major drawbacks:
>
> 1. It makes two consecutive un-serialized printk() calls, which can
>    lead to severe log interleaving and corruption under multi-core
>    concurrent workloads.
> 2. It completely bypasses the standard modern kernel dynamic debug
>    (CONFIG_DYNAMIC_DEBUG) infrastructure.
>
> Clean up the legacy implementation by leveraging pr_debug(). This squashes
> the multiple printk() calls into a single atomic execution, ensuring
> log integrity, while seamlessly hooking ext4 into the kernel's native
> dynamic debug framework.
>
> The redundant __FILE__ and __LINE__ macros are intentionally removed from
> the string format because the dynamic debug infrastructure can already
> append them automatically at runtime (via the '+fl' flags) if desired.
> This avoids redundancy and double-logging in modern production/debugging
> environments while keeping the macro clean and robust against dangling
> comma compiler errors.
>
> Signed-off-by: Li RongQing <lirongqing@baidu.com>

Thanks for cleaning this up.

> ---
>  fs/ext4/ext4.h | 20 ++------------------
>  1 file changed, 2 insertions(+), 18 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 94283a9..39e86ff 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -62,24 +62,8 @@
>   */
>  #define DOUBLE_CHECK__
>  
> -/*
> - * Define EXT4FS_DEBUG to produce debug messages
> - */
> -#undef EXT4FS_DEBUG
> -

This looks like it's just disabling EXT4FS_DEBUG, but it's actually
the only toggle point — enabling debug required manually flipping this
to #define EXT4FS_DEBUG in the source.

Since balloc.c, ialloc.c, and page-io.c still have #ifdef EXT4FS_DEBUG
blocks that weren't cleaned up here, it would be cleaner to replace
those with CONFIG_EXT4_DEBUG so they fall under a single
Kconfig-controlled umbrella.

> -/*
> - * Debug code
> - */
> -#ifdef EXT4FS_DEBUG
> -#define ext4_debug(f, a...)						\
> -	do {								\
> -		printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
> -			__FILE__, __LINE__, __func__);			\
> -		printk(KERN_DEBUG f, ## a);				\
> -	} while (0)
> -#else
> -#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
> -#endif
> +#define ext4_debug(fmt, ...)						\
> +	pr_debug("EXT4-fs DEBUG %s: " fmt, __func__,  ##__VA_ARGS__)

Nit: an extra space between __func__, and ##__VA_ARGS__.

>  
>   /*
>    * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c

Also, could ext4_debug be moved next to ext_debug and placed under
#ifdef CONFIG_EXT4_DEBUG together, for a cleaner layout?


Cheers,
Baokun


^ permalink raw reply

* Re: [PATCH] ext4: Remove mention of PageWriteback
From: Baokun Li @ 2026-05-29  6:33 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ojaswin Mujoo,
	Ritesh Harjani (IBM), Zhang Yi, linux-ext4, linux-kernel
In-Reply-To: <20260526190805.341676-1-willy@infradead.org>

On 2026/5/27 03:08, Matthew Wilcox (Oracle) wrote:
> Update a comment to refer to the concept of writeback instead of the
> (now obsolete) detail of how it's implemented.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Looks good to me.

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/ext4/page-io.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dc82e7b57e75..bc674aa4a656 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
>   * written. On IO failure, check if journal abort is needed. Note that
>   * we are protected from truncate touching same part of extent tree by the
>   * fact that truncate code waits for all DIO to finish (thus exclusion from
> - * direct IO is achieved) and also waits for PageWriteback bits. Thus we
> + * direct IO is achieved) and also waits for writeback to complete. Thus we
>   * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
>   * completed (happens from ext4_free_ioend()).
>   */



^ permalink raw reply

* Re: [PATCH] ext4: Remove mention of PageWriteback
From: Ojaswin Mujoo @ 2026-05-29  6:01 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ritesh Harjani (IBM), Zhang Yi, linux-ext4, linux-kernel
In-Reply-To: <20260526190805.341676-1-willy@infradead.org>

On Tue, May 26, 2026 at 08:08:02PM +0100, Matthew Wilcox (Oracle) wrote:
> Update a comment to refer to the concept of writeback instead of the
> (now obsolete) detail of how it's implemented.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  fs/ext4/page-io.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dc82e7b57e75..bc674aa4a656 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
>   * written. On IO failure, check if journal abort is needed. Note that
>   * we are protected from truncate touching same part of extent tree by the
>   * fact that truncate code waits for all DIO to finish (thus exclusion from
> - * direct IO is achieved) and also waits for PageWriteback bits. Thus we
> + * direct IO is achieved) and also waits for writeback to complete. Thus we

Looks good,

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Regards,
Ojaswin

>   * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
>   * completed (happens from ext4_free_ioend()).
>   */
> -- 
> 2.47.3
> 

^ permalink raw reply

* Re: [PATCH v6 06/11] fstests: verify f_fsid for cloned filesystems
From: Darrick J. Wong @ 2026-05-29  4:39 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <e029755044a32de6ec2a0d3391f3ff0089fc3c30.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:37PM +0800, Anand Jain wrote:
> Verify that the cloned filesystem provides an f_fsid that is persistent
> across mount cycles, yet unique from the original filesystem's f_fsid.

Might want to add that last part to the test description itself, because
otherwise I don't know what 'verify' means.

> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  tests/generic/802     | 67 +++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/802.out |  7 +++++
>  2 files changed, 74 insertions(+)
>  create mode 100644 tests/generic/802
>  create mode 100644 tests/generic/802.out
> 
> diff --git a/tests/generic/802 b/tests/generic/802
> new file mode 100644
> index 000000000000..653e74e11b53
> --- /dev/null
> +++ b/tests/generic/802
> @@ -0,0 +1,67 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Anand Jain <asj@kernel.org>.  All Rights Reserved.
> +#
> +# FS QA Test 802
> +# Verify f_fsid and s_uuid of cloned filesystems across mount cycle.
> +
> +. ./common/preamble
> +
> +_begin_fstest auto quick mount clone
> +
> +_require_test
> +_require_block_device $TEST_DEV
> +_require_loop
> +
> +[ "$FSTYP" = "btrfs" ] && _fixed_by_kernel_commit xxxxxxxxxxxx \
> +	"btrfs: use on-disk uuid for s_uuid in temp_fsid mounts"
> +[ "$FSTYP" = "btrfs" ] && _fixed_by_kernel_commit xxxxxxxxxxxx \
> +	"btrfs: derive f_fsid from on-disk fsuuid and dev_t"

_fixed_by_fs_commit?

> +
> +_cleanup()
> +{
> +	cd /
> +	rm -r -f $tmp.*
> +	umount $mnt1 $mnt2 2>/dev/null
> +	_loop_image_destroy "${devs[@]}" 2> /dev/null
> +}
> +
> +# Setup base loop device and its clone
> +devs=()
> +_loop_image_create_clone devs
> +mkdir -p $TEST_DIR/$seq
> +mnt1=$TEST_DIR/$seq/mnt1
> +mnt2=$TEST_DIR/$seq/mnt2
> +mkdir -p $mnt1
> +mkdir -p $mnt2
> +
> +# Mount both filesystems simultaneously using mandatory clone mount options
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> +						_fail "Failed to mount dev1"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> +						_fail "Failed to mount dev2"
> +
> +# Capture baseline filesystem IDs for comparison
> +fsid_scratch=$(stat -f -c "%i" $mnt1)
> +fsid_clone=$(stat -f -c "%i" $mnt2)
> +
> +echo "**** fsid initially ****"
> +echo $fsid_scratch | sed -e "s/$fsid_scratch/FSID_SCRATCH/g"
> +echo $fsid_clone | sed -e "s/$fsid_clone/FSID_CLONE/g"

Why echo only to sed?

--D

> +
> +# Verify that the fsids remain stable after a mount cycle, even when the
> +# mount order is reversed.
> +echo "**** fsid after mount cycle ****"
> +_unmount $mnt1
> +_unmount $mnt2
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> +						_fail "Failed to mount dev2"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> +						_fail "Failed to mount dev1"
> +
> +# Compare post mount-cycle values against the baseline
> +stat -f -c "%i" $mnt1 | sed -e "s/$fsid_scratch/FSID_SCRATCH/g"
> +stat -f -c "%i" $mnt2 | sed -e "s/$fsid_clone/FSID_CLONE/g"
> +
> +status=0
> +exit
> diff --git a/tests/generic/802.out b/tests/generic/802.out
> new file mode 100644
> index 000000000000..d1e008f122bb
> --- /dev/null
> +++ b/tests/generic/802.out
> @@ -0,0 +1,7 @@
> +QA output created by 802
> +**** fsid initially ****
> +FSID_SCRATCH
> +FSID_CLONE
> +**** fsid after mount cycle ****
> +FSID_SCRATCH
> +FSID_CLONE
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH v6 05/11] fstests: verify fanotify isolation on cloned filesystems
From: Darrick J. Wong @ 2026-05-29  4:36 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <ef076b330a047d2f19ed48f5b7166f419433bb73.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:36PM +0800, Anand Jain wrote:
> Verify that fanotify events are correctly routed to the appropriate
> watcher when cloned filesystems are mounted.
> Helps verify kernel's event notification distinguishes between devices
> sharing the same FSID/UUID.
> 
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  tests/generic/801     | 135 ++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/801.out |   7 +++
>  2 files changed, 142 insertions(+)
>  create mode 100644 tests/generic/801
>  create mode 100644 tests/generic/801.out
> 
> diff --git a/tests/generic/801 b/tests/generic/801
> new file mode 100644
> index 000000000000..3bfb87d41922
> --- /dev/null
> +++ b/tests/generic/801
> @@ -0,0 +1,135 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Anand Jain <asj@kernel.org>.  All Rights Reserved.
> +#
> +# FS QA Test 801
> +# Verify fanotify FID functionality on cloned filesystems by setting up
> +# watchers and making sure notifications are in the correct logs files.
> +
> +. ./common/preamble
> +
> +_begin_fstest auto quick mount clone
> +
> +_require_test
> +_require_block_device $TEST_DEV
> +_require_loop
> +_require_command "$FSNOTIFYWAIT_PROG" fsnotifywait
> +_require_unique_f_fsid
> +
> +_cleanup()
> +{
> +	cd /
> +	[[ -n $pid1 ]] && { kill -TERM "$pid1" 2> /dev/null; wait $pid1; }
> +	[[ -n $pid2 ]] && { kill -TERM "$pid2" 2> /dev/null; wait $pid2; }
> +
> +	if [ "$semanage_added" = "yes" ]; then
> +		semanage permissive -d unconfined_t >/dev/null 2>&1 || true
> +	fi
> +
> +	umount $mnt1 $mnt2 2>/dev/null
> +	_loop_image_destroy "${devs[@]}" 2> /dev/null
> +	rm -r -f $tmp.*
> +}
> +
> +# Run fsnotifywait in unbuffered mode to watch filesystem-wide create events
> +monitor_fanotify()
> +{
> +	local mmnt=$1
> +	exec stdbuf -oL $FSNOTIFYWAIT_PROG -m -F -S -e create "$mmnt" 2>&1

I guess you need stdbuf to force fsnotifywait to run in linebuffered
mode even if you pipe/redirect it somewhere?

> +}
> +
> +# Transform f_fsid into the hi.lo format used in fanotify FID logs
> +fsid_to_fid_parts()
> +{
> +	local fsid=$1
> +	# Pad to 16 hex chars (64-bit), then split into two 32-bit halves
> +	local padded=$(printf '%016x' "0x${fsid}")
> +	local hi=$(printf '%x' "0x${padded:0:8}")   # strips leading zeros
> +	local lo=$(printf '%x' "0x${padded:8:8}")   # strips leading zeros
> +	echo "${hi}.${lo}"
> +}
> +
> +# Create base loop device and its clone
> +devs=()
> +_loop_image_create_clone devs
> +mkdir -p $TEST_DIR/$seq
> +mnt1=$TEST_DIR/$seq/mnt1
> +mnt2=$TEST_DIR/$seq/mnt2
> +mkdir -p $mnt1
> +mkdir -p $mnt2
> +
> +# Mount both base and clone filesystems using required clone mount options
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> +						_fail "Failed to mount dev1"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> +						_fail "Failed to mount dev2"
> +
> +# Fetch filesystem IDs to verify the kernel can differentiate between them
> +fsid1=$(stat -f -c "%i" $mnt1)
> +fsid2=$(stat -f -c "%i" $mnt2)
> +
> +log1=$tmp.fanotify1
> +log2=$tmp.fanotify2
> +
> +pid1=""
> +pid2=""
> +echo "Setup FID fanotify watchers on both mnt1 and mnt2"
> +
> +# Permit unconfined_t domains when SELinux is enforcing to prevent fanotify
> +# blockages
> +semanage_added="no"
> +if [ "$(getenforce 2>/dev/null)" = "Enforcing" ]; then
> +    if ! semanage permissive -l | grep -q "unconfined_t"; then
> +        semanage permissive -a unconfined_t >/dev/null 2>&1 && semanage_added="yes"
> +    fi
> +fi

Is there a cleaner way to manage setting up and automatically undoing
this step?

There might not be, since iirc the suggestion to register cleanup
functions in a cleanups=() array and call them all in reverse order
didn't go anywhere.

> +
> +# Start asynchronous fanotify monitors
> +( monitor_fanotify "$mnt1" > "$log1" ) &
> +pid1=$!
> +( monitor_fanotify "$mnt2" > "$log2" ) &
> +pid2=$!
> +sleep 2
> +
> +echo "Trigger file creation on mnt1"
> +touch $mnt1/file_on_mnt1
> +sync
> +sleep 1
> +
> +echo "Trigger file creation on mnt2"
> +touch $mnt2/file_on_mnt2
> +sync
> +sleep 1
> +
> +echo "Verify fsid in the fanotify"
> +kill $pid1 $pid2
> +wait $pid1 $pid2 2>/dev/null
> +pid1=""
> +pid2=""
> +
> +e_fsid1=$(fsid_to_fid_parts "$fsid1")
> +e_fsid2=$(fsid_to_fid_parts "$fsid2")
> +
> +# Dump debug details to the full log
> +echo $fsid1 $e_fsid1 $fsid2 $e_fsid2 >> $seqres.full
> +cat $log1 >> $seqres.full
> +cat $log2 >> $seqres.full
> +
> +# Ensure monitor 1 only captured events belonging to mnt 1 and fsid 1
> +if grep -qF "$e_fsid1" "$log1" && ! grep -qF "$e_fsid2" "$log1"; then
> +	echo "SUCCESS: mnt1 events found"
> +else
> +	[ ! -s "$log1" ] && echo "  - mnt1 received no events."
> +	grep -qF "$e_fsid2" "$log1" && echo "  - mnt1 received event from mnt2."
> +fi
> +
> +# Ensure monitor 2 only captured events belonging to mnt 2 and fsid 2
> +if grep -qF "$e_fsid2" "$log2" && ! grep -qF "$e_fsid1" "$log2"; then
> +	echo "SUCCESS: mnt2 events found"
> +else
> +	[ ! -s "$log2" ] && echo "  - mnt2 received no events."
> +	grep -qF "$e_fsid1" "$log2" && echo "  - mnt2 received event from mnt1."
> +fi
> +
> +status=0
> +exit
> diff --git a/tests/generic/801.out b/tests/generic/801.out
> new file mode 100644
> index 000000000000..d7b318d9f27c
> --- /dev/null
> +++ b/tests/generic/801.out
> @@ -0,0 +1,7 @@
> +QA output created by 801
> +Setup FID fanotify watchers on both mnt1 and mnt2
> +Trigger file creation on mnt1
> +Trigger file creation on mnt2
> +Verify fsid in the fanotify
> +SUCCESS: mnt1 events found
> +SUCCESS: mnt2 events found
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH v6 04/11] fstests: add _require_unique_f_fsid() helper
From: Darrick J. Wong @ 2026-05-29  4:30 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <983ed0f63318c930379ee74220f23aa558c16d51.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:35PM +0800, Anand Jain wrote:
> Add a helper to check if the target filesystem supports unique f_fsid
> tracking across cloned or snapshot instances.
> 
> Certain filesystems like XFS, Btrfs, and F2FS ensure unique f_fsid
> identifiers per filesystem instance. However, Ext4 derives its f_fsid
> directly from its superblock UUID, which leads to identical f_fsid
> values on cloned images until the UUID is manually modified by userspace.
> 
> Introduce _require_unique_f_fsid() to allow test cases requiring strict
> f_fsid uniqueness to skip gracefully on unsupported filesystems.
> 
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  common/rc | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
> 
> diff --git a/common/rc b/common/rc
> index 937f478963b4..5446552aed92 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -6314,6 +6314,27 @@ _require_fanotify_ioerrors()
>  	_notrun "$FSTYP does not support fanotify ioerrors"
>  }
>  
> +# Ext4 derives f_fsid from the superblock UUID, meaning clones share the
> +# same f_fsid until their UUIDs diverge. Conversely, XFS, Btrfs,
> +# and F2FS ensure f_fsid remains unique per filesystem instance (often by
> +# deriving it from the UUID and underlying block device.)
> +#
> +# Across all filesystems, a UUID collision causes libblkid tools to return
> +# non-deterministic device mappings. It is ultimately the responsibility

"device mappings", as in /dev/disk/by-id/$UUID ?

> +# of the userspace utility or use-case to enforce uniqueness when a clone
> +# diverges. For details, see mailing list thread discussions titled:
> +#      "ext4: derive f_fsid from block device to avoid collisions".

How about providing a direct lore link?

--D

> +_require_unique_f_fsid()
> +{
> +	# Skip the test if the filesystem does not enforce unique f_fsids
> +	# natively. Checking this dynamically requires recreating a clone
> +	# layout, so we use a static lookup based on FSTYP.
> +	if [ "$FSTYP" == "ext4" ]; then
> +		_notrun "Target filesystem ($FSTYP) does not guarantee unique f_fsid on clones."
> +	fi
> +}
> +
> +
>  # Computes a percentage of the available space in a filesystem and
>  # returns that quantity in MB. The percentage must not contain a percent
>  # sign ("%").
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH v6 03/11] fstests: add FSNOTIFYWAIT_PROG
From: Darrick J. Wong @ 2026-05-29  4:29 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <f4dc9d5af00133834acf97e1c72232d2a9b34ac6.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:34PM +0800, Anand Jain wrote:
> Define `FSNOTIFYWAIT_PROG` for an upcoming test case that uses `fsnotifywait`.
> 
> Signed-off-by: Anand Jain <asj@kernel.org>

Seems fine to me
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> ---
>  common/config | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/common/config b/common/config
> index d5299d5b926f..5661fa0ec310 100644
> --- a/common/config
> +++ b/common/config
> @@ -242,6 +242,7 @@ export BTRFS_MAP_LOGICAL_PROG=$(type -P btrfs-map-logical)
>  export PARTED_PROG="$(type -P parted)"
>  export XFS_PROPERTY_PROG="$(type -P xfs_property)"
>  export FSCRYPTCTL_PROG="$(type -P fscryptctl)"
> +export FSNOTIFYWAIT_PROG="$(type -P fsnotifywait)"
>  
>  # udev wait functions.
>  #
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH v6 02/11] fstests: add _clone_mount_option() helper
From: Darrick J. Wong @ 2026-05-29  4:28 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <4346c80089c61c8f0d62ea696f9d73f2a9669297.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:33PM +0800, Anand Jain wrote:
> Adds _clone_mount_option() helper function to handle filesystem-specific
> requirements for mounting cloned devices. Abstract the need for -o nouuid
> on XFS.
> 
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  common/rc | 17 +++++++++++++++++
>  1 file changed, 17 insertions(+)
> 
> diff --git a/common/rc b/common/rc
> index d7e3e0bdfb1e..937f478963b4 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -414,6 +414,23 @@ _scratch_mount_options()
>  					$SCRATCH_DEV $SCRATCH_MNT
>  }
>  
> +# Return filesystem-specific mount options required for mounting clone/snapshot
> +# devices.
> +_clone_mount_option()
> +{
> +	local mount_opts=""
> +
> +	case "$FSTYP" in
> +	xfs)
> +		# Allow mounting a duplicate filesystem on the same host
> +		mount_opts="-o nouuid"
> +		;;
> +	*)
> +	esac
> +
> +	echo $mount_opts

I probably would've just echo'd straight from inside the case statement,
but this otherwise looks ok,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>

--D

> +}
> +
>  _supports_filetype()
>  {
>  	local dir=$1
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH v6 01/11] fstests: add _loop_image_create_clone() helper
From: Darrick J. Wong @ 2026-05-29  4:27 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <421c7cdd5aae27b99d04dddf08c5d9df79c2f790.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:32PM +0800, Anand Jain wrote:
> Introduce _loop_image_create_clone() and _loop_image_destroy() to mkfs an
> image file and clone it to another image file, and attach a loop device to
> them. And its destroy part.
> 
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  common/rc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 63 insertions(+)
> 
> diff --git a/common/rc b/common/rc
> index 79189e7e6e94..d7e3e0bdfb1e 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -1520,6 +1520,69 @@ _scratch_resvblks()
>  	esac
>  }
>  
> +# Create a small loop image, run an optional tuning function ($2) on it,
> +# clone it, and attach both to loop devices, returned in ($1).
> +# Args:
> +#   $1: Nameref to return the array of allocated loop devices [base, clone].
> +#   $2: Optional callback function to tune the base filesystem before cloning.
> +_loop_image_create_clone()
> +{
> +	local -n _ret=$1

That switch   ^^ is very clever.  I always wondered how one did indirect
variables in bash.

> +	local pre_clone_tune_func="$2"
> +	local img_file=$TEST_DIR/${seq}.img
> +	local img_file_clone=$TEST_DIR/${seq}_clone.img
> +	local size=$(_small_fs_size_mb 128) # Smallest possible
> +	local loop_devs
> +
> +	# Since we copy the block device image, we keep its size small.
> +	_require_fs_space $TEST_DIR $((size * 1024))
> +
> +	_create_file_sized $((size * 1024 * 1024)) $img_file ||
> +				_fail "Failed: Create $img_file $size"
> +
> +	loop_devs=$(_create_loop_device $img_file)
> +	_ret=($loop_devs)

Should this check that a loopdev actually got created?

> +	case $FSTYP in
> +	xfs)
> +		_mkfs_dev "-s size=4096" ${loop_devs[0]}
> +		;;
> +	btrfs)
> +		_mkfs_dev ${loop_devs[0]}
> +		;;
> +	*)
> +		_mkfs_dev ${loop_devs[0]}
> +		;;
> +	esac
> +
> +	# Only execute if the function argument is not empty
> +	if [ -n "$pre_clone_tune_func" ]; then
> +		$pre_clone_tune_func ${loop_devs[0]}
> +	fi
> +
> +	sync ${loop_devs[0]}
> +	cp $img_file $img_file_clone
> +
> +	loop_devs="$loop_devs $(_create_loop_device $img_file_clone)"

	local lodev="$(_create_loop_device ...)"

	test -z "$lodev" && _fail "second loopdev not created"
	_ret+=("$lodev")

?

> +
> +	_ret=($loop_devs)
> +}
> +
> +# Teardown loop devices and delete their underlying backing image files.
> +# Accepts a list of loop device paths (e.g., /dev/loop0 /dev/loop1).
> +_loop_image_destroy()
> +{
> +	for d in "$@"; do
> +		# Retrieve the path of the backing file
> +		local f=$(losetup --noheadings --output BACK-FILE $d)
> +
> +		# Detach the loop device from the backing file
> +		_destroy_loop_device "$d"
> +
> +		# Clean up the backing disk image file
> +		[ -n "$f" ] && rm -f "$f"
> +	done
> +}
>  
>  # Repair scratch filesystem.  Returns 0 if the FS is good to go (either no
>  # errors found or errors were fixed) and nonzero otherwise; also spits out
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH] ext4: validate dx count against limit in dx_csum
From: Andreas Dilger @ 2026-05-29  0:09 UTC (permalink / raw)
  To: Artem Blagodarenko; +Cc: linux-ext4, Theodore Ts'o
In-Reply-To: <20260528160557.6956-1-ablagodarenko@thelustrecollective.com>

On May 28, 2026, at 10:05, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
> 
> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
> 
> Sashiko AI, during inspection of the "ext4: replace ext4_dir_entry
> with ext4_dir_entry_2" series, reported a missing bounds check on
> count against limit in DX block checksum verification and setup
> paths.
> 
> The code validates that limit fits within block boundaries, but does
> not verify that count <= limit. A maliciously crafted filesystem image
> could therefore provide an artificially large count value.
> 
> Since ext4_dx_csum() uses count to determine the checksum region
> size, this may result in an out-of-bounds access crossing page
> boundaries into unmapped memory, potentially leading to a crash during
> checksum verification.
> 
> The same issue exists in ext4_dx_csum_set().
> 
> The bug was not introduced by the patch under review, so the fix is sent
> separately.
> 
> Signed-off-by: Artem Blagodarenko artem.blagodarenko@gmail.com

Reviewed-by: Andreas Dilger <adilger@dilger.ca <mailto:adilger@dilger.ca>>

> ---
> fs/ext4/namei.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index a316fc2ac41b..20b7bac69889 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -472,7 +472,8 @@ static int ext4_dx_csum_verify(struct inode *inode,
> }
> limit = le16_to_cpu(c->limit);
> count = le16_to_cpu(c->count);
> - if (count_offset + (limit * sizeof(struct dx_entry)) >
> + if (!count || count > limit ||
> +    count_offset + (limit * sizeof(struct dx_entry)) >
>    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
> warn_no_space_for_csum(inode);
> return 0;
> @@ -501,7 +502,8 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry_2 *diren
> }
> limit = le16_to_cpu(c->limit);
> count = le16_to_cpu(c->count);
> - if (count_offset + (limit * sizeof(struct dx_entry)) >
> + if (!count || count > limit ||
> +    count_offset + (limit * sizeof(struct dx_entry)) >
>    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
> warn_no_space_for_csum(inode);
> return;
> -- 
> 2.43.7
> 


Cheers, Andreas






^ permalink raw reply

* [PATCH v2 16/34] jbd2: Convert journal commit to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, Theodore Ts'o, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>

Avoid an extra indirect function call by using bh_submit()
instead of submit_bh() in journal_submit_commit_record()
and jbd2_journal_commit_transaction().  These both use
journal_end_buffer_io_sync(), so it's more straightforward to do them
both at once.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
 fs/jbd2/commit.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8cf61e7185c4..4e91593d27e5 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -29,8 +29,10 @@
 /*
  * IO end handler for temporary buffer_heads handling writes to the journal.
  */
-static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+static void journal_end_buffer_io_sync(struct bio *bio)
 {
+	struct buffer_head *bh;
+	bool uptodate = bio_endio_bh(bio, &bh);
 	struct buffer_head *orig_bh = bh->b_private;
 
 	BUFFER_TRACE(bh, "");
@@ -147,13 +149,12 @@ static int journal_submit_commit_record(journal_t *journal,
 	lock_buffer(bh);
 	clear_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
-	bh->b_end_io = journal_end_buffer_io_sync;
 
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !jbd2_has_feature_async_commit(journal))
 		write_flags |= REQ_PREFLUSH | REQ_FUA;
 
-	submit_bh(write_flags, bh);
+	bh_submit(bh, write_flags, journal_end_buffer_io_sync);
 	*cbh = bh;
 	return 0;
 }
@@ -751,9 +752,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 				lock_buffer(bh);
 				clear_buffer_dirty(bh);
 				set_buffer_uptodate(bh);
-				bh->b_end_io = journal_end_buffer_io_sync;
-				submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
-					  bh);
+				bh_submit(bh,
+					REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
+					journal_end_buffer_io_sync);
 			}
 			cond_resched();
 
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 15/34] ext4: Convert ext4_commit_super() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, Theodore Ts'o, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>

Avoid an extra indirect function call and changing the buffer refcount
by using bh_submit() instead of submit_bh().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/super.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index bc7faedcb8e4..7283108d7609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6316,12 +6316,10 @@ static int ext4_commit_super(struct super_block *sb)
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
-	get_bh(sbh);
 	/* Clear potential dirty bit if it was journalled update */
 	clear_buffer_dirty(sbh);
-	sbh->b_end_io = end_buffer_write_sync;
-	submit_bh(REQ_OP_WRITE | REQ_SYNC |
-		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+	bh_submit(sbh, REQ_OP_WRITE | REQ_SYNC |
+		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), bh_end_write);
 	wait_on_buffer(sbh);
 	if (buffer_write_io_error(sbh)) {
 		ext4_msg(sb, KERN_ERR, "I/O error while writing "
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 17/34] jbd2: Convert jbd2_write_superblock() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, Theodore Ts'o, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>

Avoid an extra indirect function call and changing the buffer refcount
by using bh_submit() instead of submit_bh().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
 fs/jbd2/journal.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 4f397fcdb13c..2040af8c84cb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1820,9 +1820,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
 	}
 	if (jbd2_journal_has_csum_v2or3(journal))
 		sb->s_checksum = jbd2_superblock_csum(sb);
-	get_bh(bh);
-	bh->b_end_io = end_buffer_write_sync;
-	submit_bh(REQ_OP_WRITE | write_flags, bh);
+	bh_submit(bh, REQ_OP_WRITE | write_flags, bh_end_write);
 	wait_on_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		clear_buffer_write_io_error(bh);
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 14/34] ext4: Convert write_mmp_block_thawed() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>

Avoid an extra indirect function call and changing the buffer refcount
by using bh_submit() instead of submit_bh().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/mmp.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 6f57c181ff77..7ce361484b38 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -46,9 +46,8 @@ static int write_mmp_block_thawed(struct super_block *sb,
 
 	ext4_mmp_csum_set(sb, mmp);
 	lock_buffer(bh);
-	bh->b_end_io = end_buffer_write_sync;
-	get_bh(bh);
-	submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
+	bh_submit(bh, REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+			bh_end_write);
 	wait_on_buffer(bh);
 	if (unlikely(!buffer_uptodate(bh)))
 		return -EIO;
-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 13/34] ext4: Convert ext4_fc_submit_bh() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
  To: Jan Kara
  Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
	linux-fsdevel, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>

Avoid an extra indirect function call by converting
ext4_end_buffer_io_sync() from bh_end_io_t to bio_end_io_t and
calling bh_submit().

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/fast_commit.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b3c22636251d..5773b85e43cb 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -184,8 +184,11 @@
 #include <trace/events/ext4.h>
 static struct kmem_cache *ext4_fc_dentry_cachep;
 
-static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+static void ext4_end_buffer_io_sync(struct bio *bio)
 {
+	struct buffer_head *bh;
+	bool uptodate = bio_endio_bh(bio, &bh);
+
 	BUFFER_TRACE(bh, "");
 	if (uptodate) {
 		ext4_debug("%s: Block %lld up-to-date",
@@ -659,8 +662,7 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
 	lock_buffer(bh);
 	set_buffer_dirty(bh);
 	set_buffer_uptodate(bh);
-	bh->b_end_io = ext4_end_buffer_io_sync;
-	submit_bh(REQ_OP_WRITE | write_flags, bh);
+	bh_submit(bh, REQ_OP_WRITE | write_flags, ext4_end_buffer_io_sync);
 	EXT4_SB(sb)->s_fc_bh = NULL;
 }
 
-- 
2.47.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox