* [PATCH v2 4/4] fs: retire sget()
From: Christian Brauner @ 2026-05-29 8:43 UTC (permalink / raw)
To: linux-fsdevel
Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
linux-ext4, linux-cifs, Alexander Viro,
Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>
sget() and sget_fc() have lived side by side as near-duplicate
find-or-create-and-publish helpers for the legacy and fs_context mount
APIs. The three remaining in-tree callers (CIFS plus the ext4 extents
and mballoc KUnit tests) have all been moved to sget_fc(). Nothing
calls sget() anymore.
Delete sget() from fs/super.c and the prototype in <linux/fs.h>.
Update the two comments that referred to "sget()" or "sget{_fc}()" to
just say "sget_fc()".
This removes ~60 lines of code that only existed to be kept in
lockstep with sget_fc() on every superblock publish-path change.
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
fs/btrfs/super.c | 2 +-
fs/super.c | 71 ++++--------------------------------------------------
include/linux/fs.h | 4 ---
3 files changed, 6 insertions(+), 71 deletions(-)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b26aa9169e83..636154861d7c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2052,7 +2052,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
* then open_ctree will properly initialize the file system specific
* settings later. btrfs_init_fs_info initializes the static elements
* of the fs_info (locks and such) to make cleanup easier if we find a
- * superblock with our given fs_devices later on at sget() time.
+ * superblock with our given fs_devices later on at sget_fc() time.
*/
fs_info = kvzalloc_obj(struct btrfs_fs_info);
if (!fs_info)
diff --git a/fs/super.c b/fs/super.c
index 378e81efe643..5fe8cea9f8fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -328,7 +328,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
init_rwsem(&s->s_umount);
lockdep_set_class(&s->s_umount, &type->s_umount_key);
/*
- * sget() can have s_umount recursion.
+ * sget_fc() can have s_umount recursion.
*
* When it cannot find a suitable sb, it allocates a new
* one (this one), and tries again to find a suitable old
@@ -439,7 +439,7 @@ static void kill_super_notify(struct super_block *sb)
/*
* Remove it from @fs_supers so it isn't found by new
- * sget{_fc}() walkers anymore. Any concurrent mounter still
+ * sget_fc() walkers anymore. Any concurrent mounter still
* managing to grab a temporary reference is guaranteed to
* already see SB_DYING and will wait until we notify them about
* SB_DEAD.
@@ -517,7 +517,7 @@ EXPORT_SYMBOL(deactivate_super);
* @sb: superblock to acquire
*
* Acquire a temporary reference on a superblock and try to trade it for
- * an active reference. This is used in sget{_fc}() to wait for a
+ * an active reference. This is used in sget_fc() to wait for a
* superblock to either become SB_BORN or for it to pass through
* sb->kill() and be marked as SB_DEAD.
*
@@ -673,11 +673,11 @@ void generic_shutdown_super(struct super_block *sb)
/*
* Broadcast to everyone that grabbed a temporary reference to this
* superblock before we removed it from @fs_supers that the superblock
- * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
+ * is dying. Every walker of @fs_supers outside of sget_fc() will now
* discard this superblock and treat it as dead.
*
* We leave the superblock on @fs_supers so it can be found by
- * sget{_fc}() until we passed sb->kill_sb().
+ * sget_fc() until we passed sb->kill_sb().
*/
super_wake(sb, SB_DYING);
super_unlock_excl(sb);
@@ -808,67 +808,6 @@ struct super_block *sget_fc(struct fs_context *fc,
}
EXPORT_SYMBOL(sget_fc);
-/**
- * sget - find or create a superblock
- * @type: filesystem type superblock should belong to
- * @test: comparison callback
- * @set: setup callback
- * @flags: mount flags
- * @data: argument to each of them
- */
-struct super_block *sget(struct file_system_type *type,
- int (*test)(struct super_block *,void *),
- int (*set)(struct super_block *,void *),
- int flags,
- void *data)
-{
- struct user_namespace *user_ns = current_user_ns();
- struct super_block *s = NULL;
- struct super_block *old;
- int err;
-
-retry:
- spin_lock(&sb_lock);
- if (test) {
- hlist_for_each_entry(old, &type->fs_supers, s_instances) {
- if (!test(old, data))
- continue;
- if (user_ns != old->s_user_ns) {
- spin_unlock(&sb_lock);
- destroy_unused_super(s);
- return ERR_PTR(-EBUSY);
- }
- if (!grab_super(old))
- goto retry;
- destroy_unused_super(s);
- return old;
- }
- }
- if (!s) {
- spin_unlock(&sb_lock);
- s = alloc_super(type, flags, user_ns);
- if (!s)
- return ERR_PTR(-ENOMEM);
- goto retry;
- }
-
- err = set(s, data);
- if (err) {
- spin_unlock(&sb_lock);
- destroy_unused_super(s);
- return ERR_PTR(err);
- }
- s->s_type = type;
- strscpy(s->s_id, type->name, sizeof(s->s_id));
- list_add_tail(&s->s_list, &super_blocks);
- hlist_add_head(&s->s_instances, &type->fs_supers);
- spin_unlock(&sb_lock);
- get_filesystem(type);
- shrinker_register(s->s_shrink);
- return s;
-}
-EXPORT_SYMBOL(sget);
-
void drop_super(struct super_block *sb)
{
super_unlock_shared(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..6dbe3218dc1e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2327,10 +2327,6 @@ void free_anon_bdev(dev_t);
struct super_block *sget_fc(struct fs_context *fc,
int (*test)(struct super_block *, struct fs_context *),
int (*set)(struct super_block *, struct fs_context *));
-struct super_block *sget(struct file_system_type *type,
- int (*test)(struct super_block *,void *),
- int (*set)(struct super_block *,void *),
- int flags, void *data);
struct super_block *sget_dev(struct fs_context *fc, dev_t dev);
/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
--
2.47.3
^ permalink raw reply related
* [PATCH v2 3/4] smb: client: convert cifs_smb3_do_mount() to sget_fc()
From: Christian Brauner @ 2026-05-29 8:43 UTC (permalink / raw)
To: linux-fsdevel
Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
linux-ext4, linux-cifs, Alexander Viro,
Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>
The CIFS mount path already runs through fs_context: smb3_get_tree()
calls smb3_get_tree_common() with a struct fs_context * in hand. But
the fc is dropped on the way to sget(). Plumb it through to sget_fc()
so the legacy sget() interface can go.
cifs_smb3_do_mount() now takes (struct fs_context *, struct
smb3_fs_context *). The old (fs_type, flags) pair is reconstructed
from fc->fs_type and fc->sb_flags. The flags argument was always
passed as 0 by the sole caller anyway. The cifs_dbg diagnostic now
prints fc->sb_flags directly.
cifs_match_super() and cifs_set_super() were the two void-data
callbacks for sget(). The match callback now takes
(struct super_block *, struct fs_context *) and reads struct
cifs_mnt_data out of fc->sget_key. The set callback is gone entirely:
sget_fc() pre-populates sb->s_fs_info from fc->s_fs_info before
invoking set() so set_anon_super_fc() (which just allocates an anon
bdev) is sufficient.
Before sget_fc() we stash cifs_sb in fc->s_fs_info, the per-mount data
in fc->sget_key and force fc->sb_flags to SB_NODIRATIME | SB_NOATIME
to reproduce the previous hard-coded behaviour (alloc_super() reads
fc->sb_flags). The original sb_flags is saved and restored around the
call so the rest of the mount path sees the same fc semantics as
before.
mnt_data.flags keeps its historical value of 0 so the CIFS_MS_MASK
comparison in compare_mount_options() returns the same (always-equal)
result.
No functional change. With this in place sget() has no remaining CIFS
caller.
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
fs/smb/client/cifsfs.c | 43 ++++++++++++++++++++++++++-----------------
fs/smb/client/cifsfs.h | 3 ++-
fs/smb/client/cifsproto.h | 3 ++-
fs/smb/client/connect.c | 5 +++--
fs/smb/client/fs_context.c | 2 +-
5 files changed, 34 insertions(+), 22 deletions(-)
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 9f76b0347fa9..d5074e3fbb85 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/filelock.h>
#include <linux/mount.h>
#include <linux/slab.h>
@@ -966,26 +967,19 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb)
return dentry;
}
-static int cifs_set_super(struct super_block *sb, void *data)
-{
- struct cifs_mnt_data *mnt_data = data;
- sb->s_fs_info = mnt_data->cifs_sb;
- return set_anon_super(sb, NULL);
-}
-
struct dentry *
-cifs_smb3_do_mount(struct file_system_type *fs_type,
- int flags, struct smb3_fs_context *old_ctx)
+cifs_smb3_do_mount(struct fs_context *fc, struct smb3_fs_context *old_ctx)
{
struct cifs_mnt_data mnt_data;
struct cifs_sb_info *cifs_sb;
struct super_block *sb;
struct dentry *root;
+ unsigned int saved_sb_flags;
int rc;
if (cifsFYI) {
- cifs_dbg(FYI, "%s: devname=%s flags=0x%x\n", __func__,
- old_ctx->source, flags);
+ cifs_dbg(FYI, "%s: devname=%s sb_flags=0x%x\n", __func__,
+ old_ctx->source, fc->sb_flags);
} else {
cifs_info("Attempting to mount %s\n", old_ctx->source);
}
@@ -1012,7 +1006,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
rc = cifs_mount(cifs_sb, cifs_sb->ctx);
if (rc) {
- if (!(flags & SB_SILENT))
+ if (!(fc->sb_flags & SB_SILENT))
cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
rc);
root = ERR_PTR(rc);
@@ -1021,12 +1015,27 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
mnt_data.ctx = cifs_sb->ctx;
mnt_data.cifs_sb = cifs_sb;
- mnt_data.flags = flags;
+ mnt_data.flags = 0;
- /* BB should we make this contingent on mount parm? */
- flags |= SB_NODIRATIME | SB_NOATIME;
-
- sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
+ /*
+ * sb->s_flags is set from fc->sb_flags by alloc_super(). CIFS has
+ * historically forced SB_NODIRATIME | SB_NOATIME on every mount and
+ * ignored the caller-supplied SB_* flags. Preserve that behaviour by
+ * overriding fc->sb_flags around the sget_fc() call.
+ *
+ * Hand cifs_sb to sget_fc() via fc->s_fs_info; sget_fc() copies it
+ * onto sb->s_fs_info before running set() and clears fc->s_fs_info
+ * on successful publish. Pass the rest of the per-mount context to
+ * cifs_match_super() through fc->sget_key.
+ */
+ saved_sb_flags = fc->sb_flags;
+ fc->sb_flags = SB_NODIRATIME | SB_NOATIME;
+ fc->s_fs_info = cifs_sb;
+ fc->sget_key = &mnt_data;
+ sb = sget_fc(fc, cifs_match_super, set_anon_super_fc);
+ fc->sget_key = NULL;
+ fc->s_fs_info = NULL;
+ fc->sb_flags = saved_sb_flags;
if (IS_ERR(sb)) {
cifs_umount(cifs_sb);
return ERR_CAST(sb);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index c455b15f2778..0a93f48924a5 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -144,8 +144,9 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct file *src_file,
long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg);
void cifs_setsize(struct inode *inode, loff_t offset);
+struct fs_context;
struct smb3_fs_context;
-struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, int flags,
+struct dentry *cifs_smb3_do_mount(struct fs_context *fc,
struct smb3_fs_context *old_ctx);
char *cifs_silly_fullpath(struct dentry *dentry);
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 4a25afda9448..a39572cbaadb 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -19,6 +19,7 @@
struct statfs;
struct smb_rqst;
struct smb3_fs_context;
+struct fs_context;
/*
*****************************************************************
@@ -236,7 +237,7 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx);
int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx);
int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx);
int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx);
-int cifs_match_super(struct super_block *sb, void *data);
+int cifs_match_super(struct super_block *sb, struct fs_context *fc);
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx);
void cifs_umount(struct cifs_sb_info *cifs_sb);
void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dcde25da468d..79762e6bbe50 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -6,6 +6,7 @@
*
*/
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/net.h>
#include <linux/string.h>
#include <linux/sched/mm.h>
@@ -2991,9 +2992,9 @@ static int match_prepath(struct super_block *sb,
}
int
-cifs_match_super(struct super_block *sb, void *data)
+cifs_match_super(struct super_block *sb, struct fs_context *fc)
{
- struct cifs_mnt_data *mnt_data = data;
+ struct cifs_mnt_data *mnt_data = fc->sget_key;
struct smb3_fs_context *ctx;
struct cifs_sb_info *cifs_sb;
struct TCP_Server_Info *tcp_srv;
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index b9544eb0381b..6aba4e1c9c27 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -920,7 +920,7 @@ static int smb3_get_tree_common(struct fs_context *fc)
struct dentry *root;
int rc = 0;
- root = cifs_smb3_do_mount(fc->fs_type, 0, ctx);
+ root = cifs_smb3_do_mount(fc, ctx);
if (IS_ERR(root))
return PTR_ERR(root);
--
2.47.3
^ permalink raw reply related
* [PATCH v2 2/4] ext4: convert mballoc KUnit test to sget_fc()
From: Christian Brauner @ 2026-05-29 8:43 UTC (permalink / raw)
To: linux-fsdevel
Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
linux-ext4, linux-cifs, Alexander Viro,
Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>
Same treatment as the extents KUnit test. The mballoc test uses sget()
as a thin "give me an initialized superblock" wrapper for a fake
file_system_type. Move it onto sget_fc() so sget() can go away.
Add a no-op mbt_init_fs_context() so fs_context_for_mount() has
something to call on the fake fs_type. mbt_set() now takes a struct
fs_context * (still a no-op). mbt_ext4_alloc_super_block() allocates
the fc, hands it to sget_fc() and drops the fc reference once the sb
is published.
No functional change.
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
fs/ext4/mballoc-test.c | 17 +++++++++++++++--
1 file changed, 15 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index 90ed505fa4b1..d90da44aadbd 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -5,6 +5,7 @@
#include <kunit/test.h>
#include <kunit/static_stub.h>
+#include <linux/fs_context.h>
#include <linux/random.h>
#include "ext4.h"
@@ -63,8 +64,14 @@ static void mbt_kill_sb(struct super_block *sb)
generic_shutdown_super(sb);
}
+static int mbt_init_fs_context(struct fs_context *fc)
+{
+ return 0;
+}
+
static struct file_system_type mbt_fs_type = {
.name = "mballoc test",
+ .init_fs_context = mbt_init_fs_context,
.kill_sb = mbt_kill_sb,
};
@@ -127,7 +134,7 @@ static void mbt_mb_release(struct super_block *sb)
kfree(sb->s_bdev);
}
-static int mbt_set(struct super_block *sb, void *data)
+static int mbt_set(struct super_block *sb, struct fs_context *fc)
{
return 0;
}
@@ -136,13 +143,19 @@ static struct super_block *mbt_ext4_alloc_super_block(void)
{
struct mbt_ext4_super_block *fsb;
struct super_block *sb;
+ struct fs_context *fc;
struct ext4_sb_info *sbi;
fsb = kzalloc_obj(*fsb);
if (fsb == NULL)
return NULL;
- sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+ fc = fs_context_for_mount(&mbt_fs_type, 0);
+ if (IS_ERR(fc))
+ goto out;
+
+ sb = sget_fc(fc, NULL, mbt_set);
+ put_fs_context(fc);
if (IS_ERR(sb))
goto out;
--
2.47.3
^ permalink raw reply related
* [PATCH v2 1/4] ext4: convert extents KUnit test to sget_fc()
From: Christian Brauner @ 2026-05-29 8:43 UTC (permalink / raw)
To: linux-fsdevel
Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
linux-ext4, linux-cifs, Alexander Viro,
Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>
The extents KUnit test uses sget() to get an initialized superblock for
its fake file_system_type. sget() predates fs_context and we want to
retire it. Switch this caller over to sget_fc().
Add a no-op ext_init_fs_context() so fs_context_for_mount() has
something to call on the fake fs_type. ext_set() now takes a struct
fs_context * (still a no-op). extents_kunit_init() allocates the fc,
hands it to sget_fc() and drops the fc reference once the sb is
published. sget_fc() does not retain a pointer to it.
No functional change for the test.
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
fs/ext4/extents-test.c | 22 ++++++++++++++++++----
1 file changed, 18 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
index 6b53a3f39fcd..bd7795a82607 100644
--- a/fs/ext4/extents-test.c
+++ b/fs/ext4/extents-test.c
@@ -37,6 +37,7 @@
#include <kunit/test.h>
#include <kunit/static_stub.h>
+#include <linux/fs_context.h>
#include <linux/gfp_types.h>
#include <linux/stddef.h>
@@ -130,14 +131,20 @@ static void ext_kill_sb(struct super_block *sb)
generic_shutdown_super(sb);
}
-static int ext_set(struct super_block *sb, void *data)
+static int ext_init_fs_context(struct fs_context *fc)
+{
+ return 0;
+}
+
+static int ext_set(struct super_block *sb, struct fs_context *fc)
{
return 0;
}
static struct file_system_type ext_fs_type = {
- .name = "extents test",
- .kill_sb = ext_kill_sb,
+ .name = "extents test",
+ .init_fs_context = ext_init_fs_context,
+ .kill_sb = ext_kill_sb,
};
static void extents_kunit_exit(struct kunit *test)
@@ -223,6 +230,7 @@ static int extents_kunit_init(struct kunit *test)
struct ext4_inode_info *ei;
struct inode *inode;
struct super_block *sb;
+ struct fs_context *fc;
struct ext4_sb_info *sbi = NULL;
struct kunit_ext_test_param *param =
(struct kunit_ext_test_param *)(test->param_value);
@@ -232,7 +240,13 @@ static int extents_kunit_init(struct kunit *test)
if (sbi == NULL)
return -ENOMEM;
- sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL);
+ fc = fs_context_for_mount(&ext_fs_type, 0);
+ if (IS_ERR(fc)) {
+ kfree(sbi);
+ return PTR_ERR(fc);
+ }
+ sb = sget_fc(fc, NULL, ext_set);
+ put_fs_context(fc);
if (IS_ERR(sb)) {
kfree(sbi);
return PTR_ERR(sb);
--
2.47.3
^ permalink raw reply related
* [PATCH v2 0/4] super: retire sget()
From: Christian Brauner @ 2026-05-29 8:43 UTC (permalink / raw)
To: linux-fsdevel
Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
linux-ext4, linux-cifs, Alexander Viro,
Christian Brauner (Amutable)
CIFS plus the two ext4 KUnit tests (extents-test, mballoc-test) were the
last in-tree callers, and all three convert cleanly to sget_fc(). That
lets sget() and its prototype come out, taking ~60 lines that only
existed to be kept in lockstep with sget_fc() on every publish-path
change.
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
Changes in v2:
- Move some changes into a separate series.
- Link to v1: https://patch.msgid.link/20260526-work-sget-v1-0-263f7025cedd@kernel.org
---
Christian Brauner (4):
ext4: convert extents KUnit test to sget_fc()
ext4: convert mballoc KUnit test to sget_fc()
smb: client: convert cifs_smb3_do_mount() to sget_fc()
fs: retire sget()
fs/btrfs/super.c | 2 +-
fs/ext4/extents-test.c | 22 +++++++++++---
fs/ext4/mballoc-test.c | 17 +++++++++--
fs/smb/client/cifsfs.c | 43 +++++++++++++++++-----------
fs/smb/client/cifsfs.h | 3 +-
fs/smb/client/cifsproto.h | 3 +-
fs/smb/client/connect.c | 5 ++--
fs/smb/client/fs_context.c | 2 +-
fs/super.c | 71 ++++------------------------------------------
include/linux/fs.h | 4 ---
10 files changed, 73 insertions(+), 99 deletions(-)
---
base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
change-id: 20260526-work-sget-6bc80b96cba5
^ permalink raw reply
* Re: [PATCH] ext4: fix quota accounting WARN in bigalloc punch hole
From: Qiliang Yuan @ 2026-05-29 8:34 UTC (permalink / raw)
To: yi.zhang
Cc: tytso, adilger.kernel, jack, ritesh.list, ojaswin, libaokun,
linux-ext4, linux-kernel, yzjaurora
In-Reply-To: <73e91ebd-27de-4834-af2f-9b4ac19a4100@huaweicloud.com>
Hi Zhang Yi,
On 5/29/2026 3:32 PM, Zhang Yi wrote:
> On 5/28/2026 6:21 PM, Qiliang Yuan wrote:
>> __es_remove_extent() -> get_rsvd() already correctly excludes
>> boundary clusters that still contain delayed blocks from resv_used.
>> Adding pending to resv_used double-counts those boundary clusters,
>> erroneously releasing reservations that are still needed.
>
> Hmm, the analysis doesn't seem correct to me. Do you mean the
> following case?
>
> # Assume the cluster size is 16KB.
> xfs_io -f -c "pwrite 12k 4k" /mnt/foo
> xfs_io -d -c "pwrite 0 4k" /mnt/foo
> xfs_io -c "fpunch 0 4k" /mnt/foo
>
> During the direct I/O write, quota space will be added in
> ext4_mb_new_blocks() because the EXT4_MB_DELALLOC_RESERVED flag is
> not set. Therefore, in ext4_es_insert_extent(), we should release the
> quota reservations, since this cluster has already been allocated.
>
> Then, in the third operation (punch hole), it will reclaim the added
> dqb_curspace. This should not cause an insufficiency.
>
> Am I missing something?
Thanks for the review! Let me explain the issue with your specific
example.
After step 1 (delalloc write 4KB@12KB in a 16KB cluster), we have:
ES tree: [0,1) hole, [1,3) delayed, [3,4) hole (blocks 0..3)
Quota: dqb_rsvspace += 16KB (one cluster reserved)
Step 2 (DIO write 4KB@0KB, RWF_DSYNC):
The DIO allocates one cluster, but the mapped extent from
ext4_ext_map_blocks() only covers the written range, e.g. [0,0].
In ext4_es_insert_extent():
a) __es_remove_extent([0,0]) removes the [0,1) hole entry, but
there are no delayed extents within [0,0], so resv_used = 0.
This is correct: the DIO extent [0,0] does not overlap the
delayed region [1,3).
b) __revise_pending() scans outside the newly inserted extent [0,0]:
- Left boundary (block 0): the range starts at cluster
boundary, no blocks to scan on the left → no pending insert.
- Right boundary (block 3): blocks [1,3] are outside [0,0]
and are delayed → __revise_pending() inserts a pending
reservation and returns pending = 1.
This pending reservation means: "cluster containing block 3
still has delayed blocks, keep this cluster reserved."
c) Then comes the bug:
resv_used += pending; // resv_used = 0 + 1 = 1
This causes ext4_da_update_reserve_space() to release 16KB
of quota reservation (dquot_release_reservation_block()). But
block 3 is still delayed! Its quota reservation should NOT
be released — no blocks within [0,0] actually used the delalloc
reservation.
So after DIO:
dqb_rsvspace: originally 16KB, now 0 (incorrectly released)
dqb_curspace: 16KB (from ext4_mb_new_blocks)
Step 3 (punch 4KB@0KB):
ext4_remove_blocks() sees that block 0's cluster has a pending
reservation → calls ext4_rereserve_cluster() → dquot_reclaim_block()
tries to move 16KB from dqb_curspace to dqb_rsvspace. But
dqb_curspace may already be insufficient (depending on whether
other allocs/frees have happened), triggering:
WARNING at dquot_reclaim_space_nodirty
Step 4 (delalloc writeback of block 3):
ext4_da_update_reserve_space() → dquot_claim_block() tries to
move 16KB from dqb_rsvspace to dqb_curspace. Since rsvspace was
incorrectly released and not fully restored by the punch hole's
rereseve, this triggers:
WARNING at dquot_claim_space_nodirty
The key point is: pending from __revise_pending() indicates clusters
that *still contain delayed blocks outside the newly inserted extent*.
These clusters' quota reservations must be preserved — get_rsvd()
inside __es_remove_extent() already correctly excludes them from
resv_used. Adding pending to resv_used double-counts them.
I also found a pre-existing retry loop issue: if __es_insert_extent()
fails with -ENOMEM on the first pass, resv_used carries a stale value
back to retry and could be double-released. I'll include a fix for
that in v2.
--
Qiliang Yuan
^ permalink raw reply
* Re: [PATCH] ext4: validate dx count against limit in dx_csum
From: Baokun Li @ 2026-05-29 8:00 UTC (permalink / raw)
To: Artem Blagodarenko; +Cc: adilger.kernel, linux-ext4
In-Reply-To: <20260528160557.6956-1-ablagodarenko@thelustrecollective.com>
On 2026/5/29 00:05, Artem Blagodarenko wrote:
> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
>
> Sashiko AI, during inspection of the "ext4: replace ext4_dir_entry
> with ext4_dir_entry_2" series, reported a missing bounds check on
> count against limit in DX block checksum verification and setup
> paths.
>
> The code validates that limit fits within block boundaries, but does
> not verify that count <= limit. A maliciously crafted filesystem image
> could therefore provide an artificially large count value.
>
> Since ext4_dx_csum() uses count to determine the checksum region
> size, this may result in an out-of-bounds access crossing page
> boundaries into unmapped memory, potentially leading to a crash during
> checksum verification.
>
> The same issue exists in ext4_dx_csum_set().
>
> The bug was not introduced by the patch under review, so the fix is sent
> separately.
>
> Signed-off-by: Artem Blagodarenko artem.blagodarenko@gmail.com
Looks good, feel free to add:
Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>
> ---
> fs/ext4/namei.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index a316fc2ac41b..20b7bac69889 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -472,7 +472,8 @@ static int ext4_dx_csum_verify(struct inode *inode,
> }
> limit = le16_to_cpu(c->limit);
> count = le16_to_cpu(c->count);
> - if (count_offset + (limit * sizeof(struct dx_entry)) >
> + if (!count || count > limit ||
> + count_offset + (limit * sizeof(struct dx_entry)) >
> EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
> warn_no_space_for_csum(inode);
> return 0;
> @@ -501,7 +502,8 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry_2 *diren
> }
> limit = le16_to_cpu(c->limit);
> count = le16_to_cpu(c->count);
> - if (count_offset + (limit * sizeof(struct dx_entry)) >
> + if (!count || count > limit ||
> + count_offset + (limit * sizeof(struct dx_entry)) >
> EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
> warn_no_space_for_csum(inode);
> return;
^ permalink raw reply
* Re: [PATCH] jbd2: update outdated comment for jbd2_journal_try_to_free_buffers()
From: Baokun Li @ 2026-05-29 7:44 UTC (permalink / raw)
To: Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
jack, ojaswin, ritesh.list, yi.zhang, yizhang089, yangerkun,
yukuai
In-Reply-To: <20260522030540.3896201-1-yi.zhang@huaweicloud.com>
On 2026/5/22 11:05, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> jbd2_journal_try_to_free_buffers() currently only tries to remove
> checkpointed data buffers from the checkpoint list for data=journal
> mode, and bails out if any buffer is still attached to a transaction.
> For data=ordered and writeback modes, data buffers never have
> journal_heads, so the function degenerates to a plain
> try_to_free_buffers() call.
>
> Besides, The release of metadata buffers has been delegated to the jbd2
> journal shrinker in commit 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to
> release checkpointed buffers"). jbd2_journal_try_to_free_buffers() is
> not used for handling metadata buffers now.
>
> However, the comment above the function still references
> jbd2_journal_dirty_data(), __jbd2_journal_unfile_buffer(), t_datalist,
> BKL, and BUF_CLEAN, all of which were removed in commit 87c89c232c8f
> ("jbd2: Remove data=ordered mode support using jbd buffer heads").
>
> Replace it with a description of what the function actually does now.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Looks good!
Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>
> ---
> fs/jbd2/transaction.c | 39 ++++++++++++---------------------------
> 1 file changed, 12 insertions(+), 27 deletions(-)
>
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index 4885903bbd10..239bcf88ed1c 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -2139,38 +2139,23 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
> }
>
> /**
> - * jbd2_journal_try_to_free_buffers() - try to free page buffers.
> + * jbd2_journal_try_to_free_buffers() - try to free folio buffers.
> * @journal: journal for operation
> * @folio: Folio to detach data from.
> *
> - * For all the buffers on this page,
> - * if they are fully written out ordered data, move them onto BUF_CLEAN
> - * so try_to_free_buffers() can reap them.
> + * For each buffer_head on @folio, if the buffer has a journal_head but
> + * is not attached to a running or committing transaction, try to remove
> + * it from the checkpoint list. This is needed for data=journal mode
> + * where data buffers are journaled: once they are checkpointed, the
> + * journal_head can be detached and the buffer freed. If any buffer is
> + * still attached to a transaction, the folio cannot be released and we
> + * bail out. Otherwise we call try_to_free_buffers() to detach all
> + * buffer_heads from the folio.
> *
> - * This function returns non-zero if we wish try_to_free_buffers()
> - * to be called. We do this if the page is releasable by try_to_free_buffers().
> - * We also do it if the page has locked or dirty buffers and the caller wants
> - * us to perform sync or async writeout.
> + * For data=ordered and writeback modes, data buffers never have
> + * journal_heads, so this degenerates to a plain try_to_free_buffers().
> *
> - * This complicates JBD locking somewhat. We aren't protected by the
> - * BKL here. We wish to remove the buffer from its committing or
> - * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
> - *
> - * This may *change* the value of transaction_t->t_datalist, so anyone
> - * who looks at t_datalist needs to lock against this function.
> - *
> - * Even worse, someone may be doing a jbd2_journal_dirty_data on this
> - * buffer. So we need to lock against that. jbd2_journal_dirty_data()
> - * will come out of the lock with the buffer dirty, which makes it
> - * ineligible for release here.
> - *
> - * Who else is affected by this? hmm... Really the only contender
> - * is do_get_write_access() - it could be looking at the buffer while
> - * journal_try_to_free_buffer() is changing its state. But that
> - * cannot happen because we never reallocate freed data as metadata
> - * while the data is part of a transaction. Yes?
> - *
> - * Return false on failure, true on success
> + * Return: true if the folio's buffers were freed, false otherwise
> */
> bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
> {
^ permalink raw reply
* Re: [PATCH] ext4: Fix ERR_PTR(0) in ext4_mkdir()
From: Baokun Li @ 2026-05-29 7:37 UTC (permalink / raw)
To: Hongling Zeng
Cc: tytso, adilger.kernel, jack, ojaswin, ritesh.list, yi.zhang,
linux-ext4, linux-kernel, zhongling0719, neil, brauner, jlayton
In-Reply-To: <20260520074634.53656-1-zenghongling@kylinos.cn>
On 2026/5/20 15:46, Hongling Zeng wrote:
> When mkdir succeeds, ext4_mkdir() returns ERR_PTR(0) which is incorrect.
> It should return NULL instead for success and ERR_PTR() only with
> negative error codes for failure.
>
> Fixes: 88d5baf69082 ("Change inode_operations.mkdir to return struct dentry *")
> Signed-off-by: Hongling Zeng <zenghongling@kylinos.cn>
Looks good.
Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>
> ---
> fs/ext4/namei.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 4a47fbd8dd30..8cadaeb15b2b 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -3054,7 +3054,7 @@ static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
> out_retry:
> if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
> goto retry;
> - return ERR_PTR(err);
> + return err ? ERR_PTR(err) : NULL;
> }
>
> /*
^ permalink raw reply
* Re: [PATCH] ext4: fix quota accounting WARN in bigalloc punch hole
From: Zhang Yi @ 2026-05-29 7:32 UTC (permalink / raw)
To: Qiliang Yuan, Theodore Ts'o, Andreas Dilger, Baokun Li,
Jan Kara, Ojaswin Mujoo, Ritesh Harjani (IBM)
Cc: linux-ext4, linux-kernel, Zijing Yin
In-Reply-To: <20260528-fix-ext4-bigalloc-punch-hole-quota-v2-v1-1-32871356273b@gmail.com>
Hi, Qiliang!
On 5/28/2026 6:21 PM, Qiliang Yuan wrote:
> When doing direct I/O write on a bigalloc filesystem, the allocated
> extent might not cover entire clusters at its boundaries, leaving
> delayed blocks in those boundary clusters. In ext4_es_insert_extent(),
> __revise_pending() inserts new pending reservations for those boundary
> clusters, and the return value (pending=true) was added to resv_used,
> causing ext4_da_update_reserve_space() to incorrectly release the
> quota reservations for those boundary clusters.
>
> Later when PUNCH_HOLE removes the DIO-allocated blocks, the
> extent removal path detects the pending reservation via
> ext4_is_pending() and calls ext4_rereserve_cluster(). This tries
> to reclaim quota from dq_dqb.dqb_curspace back to dqb_rsvspace,
> but since the quota was already incorrectly released, dqb_curspace
> is insufficient, triggering:
>
> WARNING at dquot_reclaim_space_nodirty+0x77c/0x8c0
Hmm, the analysis doesn't seem correct to me. Do you mean the
following case?
# Assume the cluster size is 16KB.
xfs_io -f -c "pwrite 12k 4k" /mnt/foo
xfs_io -d -c "pwrite 0 4k" /mnt/foo
xfs_io -c "fpunch 0 4k" /mnt/foo
During the direct I/O write, quota space will be added in
ext4_mb_new_blocks() because the EXT4_MB_DELALLOC_RESERVED flag is
not set. Therefore, in ext4_es_insert_extent(), we should release the
quota reservations, since this cluster has already been allocated.
Then, in the third operation (punch hole), it will reclaim the added
dqb_curspace. This should not cause an insufficiency.
Am I missing something?
>
> The subsequent delalloc writeback then fires a second WARN from
> dquot_claim_space_nodirty() for the same reason: dqb_rsvspace was
> depleted by the earlier incorrect release.
>
> __es_remove_extent() -> get_rsvd() already correctly excludes
> boundary clusters that still contain delayed blocks from resv_used.
> Adding pending to resv_used double-counts those boundary clusters,
> erroneously releasing reservations that are still needed.
>
> Remove the pending variable and the resv_used += pending addition.
That's not correct. Assume the following case.
# Assume the cluster size is 16KB.
xfs_io -f -c "pwrite 0 16k" /mnt/foo
xfs_io -c "sync_range -w 0 4k" /mnt/foo
Although only part of the cluster is written back, the cluster has been
allocated. Therefore, the quota needs to be claimed. However, since we
only wrote back a portion of a cluster, __es_remove_extent() will not
return the reserved clusters that need to be consumed (i.e., resv_used
is zero). Therefore, we need to determine whether a new pending
allocation has been created by checking the pending status, so that we
can correctly claim the quota.
Thanks,
Yi
>
> Fixes: c543e2429640 ("ext4: update delalloc data reserve spcae in ext4_es_insert_extent()")
> Reported-by: Zijing Yin <yzjaurora@gmail.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221570
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
> fs/ext4/extents_status.c | 4 +---
> 1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 6e4a191e82191..fefe0bb8ac4d1 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -909,7 +909,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
> struct extent_status newes;
> ext4_lblk_t end = lblk + len - 1;
> int err1 = 0, err2 = 0, err3 = 0;
> - int resv_used = 0, pending = 0;
> + int resv_used = 0;
> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> struct extent_status *es1 = NULL;
> struct extent_status *es2 = NULL;
> @@ -977,7 +977,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
> __free_pending(pr);
> pr = NULL;
> }
> - pending = err3;
> }
> ext4_es_inc_seq(inode);
> error:
> @@ -998,7 +997,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
> * any previously delayed allocated clusters instead of claim them
> * again.
> */
> - resv_used += pending;
> if (resv_used)
> ext4_da_update_reserve_space(inode, resv_used,
> delalloc_reserve_used);
>
> ---
> base-commit: eb3f4b7426cfd2b79d65b7d37155480b32259a11
> change-id: 20260528-fix-ext4-bigalloc-punch-hole-quota-v2-2adca315d1ba
>
> Best regards,
^ permalink raw reply
* Re: [PATCH] ext4: convert legacy ext4_debug() to standard pr_debug()
From: Baokun Li @ 2026-05-29 7:32 UTC (permalink / raw)
To: lirongqing
Cc: Theodore Ts'o, Jan Kara, Zhang Yi, Andreas Dilger,
Ojaswin Mujoo, Ritesh Harjani, linux-ext4, linux-kernel
In-Reply-To: <20260521074634.2697-1-lirongqing@baidu.com>
On 2026/5/21 15:46, lirongqing wrote:
> From: Li RongQing <lirongqing@baidu.com>
>
> The ext4 file system historically implemented its own debug logging macro
> ext4_debug() via EXT4FS_DEBUG conditional compilation. This legacy
> implementation suffers from two major drawbacks:
>
> 1. It makes two consecutive un-serialized printk() calls, which can
> lead to severe log interleaving and corruption under multi-core
> concurrent workloads.
> 2. It completely bypasses the standard modern kernel dynamic debug
> (CONFIG_DYNAMIC_DEBUG) infrastructure.
>
> Clean up the legacy implementation by leveraging pr_debug(). This squashes
> the multiple printk() calls into a single atomic execution, ensuring
> log integrity, while seamlessly hooking ext4 into the kernel's native
> dynamic debug framework.
>
> The redundant __FILE__ and __LINE__ macros are intentionally removed from
> the string format because the dynamic debug infrastructure can already
> append them automatically at runtime (via the '+fl' flags) if desired.
> This avoids redundancy and double-logging in modern production/debugging
> environments while keeping the macro clean and robust against dangling
> comma compiler errors.
>
> Signed-off-by: Li RongQing <lirongqing@baidu.com>
Thanks for cleaning this up.
> ---
> fs/ext4/ext4.h | 20 ++------------------
> 1 file changed, 2 insertions(+), 18 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 94283a9..39e86ff 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -62,24 +62,8 @@
> */
> #define DOUBLE_CHECK__
>
> -/*
> - * Define EXT4FS_DEBUG to produce debug messages
> - */
> -#undef EXT4FS_DEBUG
> -
This looks like it's just disabling EXT4FS_DEBUG, but it's actually
the only toggle point — enabling debug required manually flipping this
to #define EXT4FS_DEBUG in the source.
Since balloc.c, ialloc.c, and page-io.c still have #ifdef EXT4FS_DEBUG
blocks that weren't cleaned up here, it would be cleaner to replace
those with CONFIG_EXT4_DEBUG so they fall under a single
Kconfig-controlled umbrella.
> -/*
> - * Debug code
> - */
> -#ifdef EXT4FS_DEBUG
> -#define ext4_debug(f, a...) \
> - do { \
> - printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
> - __FILE__, __LINE__, __func__); \
> - printk(KERN_DEBUG f, ## a); \
> - } while (0)
> -#else
> -#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
> -#endif
> +#define ext4_debug(fmt, ...) \
> + pr_debug("EXT4-fs DEBUG %s: " fmt, __func__, ##__VA_ARGS__)
Nit: an extra space between __func__, and ##__VA_ARGS__.
>
> /*
> * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
Also, could ext4_debug be moved next to ext_debug and placed under
#ifdef CONFIG_EXT4_DEBUG together, for a cleaner layout?
Cheers,
Baokun
^ permalink raw reply
* Re: [PATCH] ext4: Remove mention of PageWriteback
From: Baokun Li @ 2026-05-29 6:33 UTC (permalink / raw)
To: Matthew Wilcox (Oracle)
Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ojaswin Mujoo,
Ritesh Harjani (IBM), Zhang Yi, linux-ext4, linux-kernel
In-Reply-To: <20260526190805.341676-1-willy@infradead.org>
On 2026/5/27 03:08, Matthew Wilcox (Oracle) wrote:
> Update a comment to refer to the concept of writeback instead of the
> (now obsolete) detail of how it's implemented.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Looks good to me.
Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>
> ---
> fs/ext4/page-io.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dc82e7b57e75..bc674aa4a656 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
> * written. On IO failure, check if journal abort is needed. Note that
> * we are protected from truncate touching same part of extent tree by the
> * fact that truncate code waits for all DIO to finish (thus exclusion from
> - * direct IO is achieved) and also waits for PageWriteback bits. Thus we
> + * direct IO is achieved) and also waits for writeback to complete. Thus we
> * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
> * completed (happens from ext4_free_ioend()).
> */
^ permalink raw reply
* Re: [PATCH] ext4: Remove mention of PageWriteback
From: Ojaswin Mujoo @ 2026-05-29 6:01 UTC (permalink / raw)
To: Matthew Wilcox (Oracle)
Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
Ritesh Harjani (IBM), Zhang Yi, linux-ext4, linux-kernel
In-Reply-To: <20260526190805.341676-1-willy@infradead.org>
On Tue, May 26, 2026 at 08:08:02PM +0100, Matthew Wilcox (Oracle) wrote:
> Update a comment to refer to the concept of writeback instead of the
> (now obsolete) detail of how it's implemented.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
> fs/ext4/page-io.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dc82e7b57e75..bc674aa4a656 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
> * written. On IO failure, check if journal abort is needed. Note that
> * we are protected from truncate touching same part of extent tree by the
> * fact that truncate code waits for all DIO to finish (thus exclusion from
> - * direct IO is achieved) and also waits for PageWriteback bits. Thus we
> + * direct IO is achieved) and also waits for writeback to complete. Thus we
Looks good,
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Regards,
Ojaswin
> * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
> * completed (happens from ext4_free_ioend()).
> */
> --
> 2.47.3
>
^ permalink raw reply
* Re: [PATCH v6 06/11] fstests: verify f_fsid for cloned filesystems
From: Darrick J. Wong @ 2026-05-29 4:39 UTC (permalink / raw)
To: Anand Jain
Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
zlang, hch
In-Reply-To: <e029755044a32de6ec2a0d3391f3ff0089fc3c30.1779939330.git.asj@kernel.org>
On Thu, May 28, 2026 at 12:05:37PM +0800, Anand Jain wrote:
> Verify that the cloned filesystem provides an f_fsid that is persistent
> across mount cycles, yet unique from the original filesystem's f_fsid.
Might want to add that last part to the test description itself, because
otherwise I don't know what 'verify' means.
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
> tests/generic/802 | 67 +++++++++++++++++++++++++++++++++++++++++++
> tests/generic/802.out | 7 +++++
> 2 files changed, 74 insertions(+)
> create mode 100644 tests/generic/802
> create mode 100644 tests/generic/802.out
>
> diff --git a/tests/generic/802 b/tests/generic/802
> new file mode 100644
> index 000000000000..653e74e11b53
> --- /dev/null
> +++ b/tests/generic/802
> @@ -0,0 +1,67 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Anand Jain <asj@kernel.org>. All Rights Reserved.
> +#
> +# FS QA Test 802
> +# Verify f_fsid and s_uuid of cloned filesystems across mount cycle.
> +
> +. ./common/preamble
> +
> +_begin_fstest auto quick mount clone
> +
> +_require_test
> +_require_block_device $TEST_DEV
> +_require_loop
> +
> +[ "$FSTYP" = "btrfs" ] && _fixed_by_kernel_commit xxxxxxxxxxxx \
> + "btrfs: use on-disk uuid for s_uuid in temp_fsid mounts"
> +[ "$FSTYP" = "btrfs" ] && _fixed_by_kernel_commit xxxxxxxxxxxx \
> + "btrfs: derive f_fsid from on-disk fsuuid and dev_t"
_fixed_by_fs_commit?
> +
> +_cleanup()
> +{
> + cd /
> + rm -r -f $tmp.*
> + umount $mnt1 $mnt2 2>/dev/null
> + _loop_image_destroy "${devs[@]}" 2> /dev/null
> +}
> +
> +# Setup base loop device and its clone
> +devs=()
> +_loop_image_create_clone devs
> +mkdir -p $TEST_DIR/$seq
> +mnt1=$TEST_DIR/$seq/mnt1
> +mnt2=$TEST_DIR/$seq/mnt2
> +mkdir -p $mnt1
> +mkdir -p $mnt2
> +
> +# Mount both filesystems simultaneously using mandatory clone mount options
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> + _fail "Failed to mount dev1"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> + _fail "Failed to mount dev2"
> +
> +# Capture baseline filesystem IDs for comparison
> +fsid_scratch=$(stat -f -c "%i" $mnt1)
> +fsid_clone=$(stat -f -c "%i" $mnt2)
> +
> +echo "**** fsid initially ****"
> +echo $fsid_scratch | sed -e "s/$fsid_scratch/FSID_SCRATCH/g"
> +echo $fsid_clone | sed -e "s/$fsid_clone/FSID_CLONE/g"
Why echo only to sed?
--D
> +
> +# Verify that the fsids remain stable after a mount cycle, even when the
> +# mount order is reversed.
> +echo "**** fsid after mount cycle ****"
> +_unmount $mnt1
> +_unmount $mnt2
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> + _fail "Failed to mount dev2"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> + _fail "Failed to mount dev1"
> +
> +# Compare post mount-cycle values against the baseline
> +stat -f -c "%i" $mnt1 | sed -e "s/$fsid_scratch/FSID_SCRATCH/g"
> +stat -f -c "%i" $mnt2 | sed -e "s/$fsid_clone/FSID_CLONE/g"
> +
> +status=0
> +exit
> diff --git a/tests/generic/802.out b/tests/generic/802.out
> new file mode 100644
> index 000000000000..d1e008f122bb
> --- /dev/null
> +++ b/tests/generic/802.out
> @@ -0,0 +1,7 @@
> +QA output created by 802
> +**** fsid initially ****
> +FSID_SCRATCH
> +FSID_CLONE
> +**** fsid after mount cycle ****
> +FSID_SCRATCH
> +FSID_CLONE
> --
> 2.43.0
>
>
^ permalink raw reply
* Re: [PATCH v6 05/11] fstests: verify fanotify isolation on cloned filesystems
From: Darrick J. Wong @ 2026-05-29 4:36 UTC (permalink / raw)
To: Anand Jain
Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
zlang, hch
In-Reply-To: <ef076b330a047d2f19ed48f5b7166f419433bb73.1779939330.git.asj@kernel.org>
On Thu, May 28, 2026 at 12:05:36PM +0800, Anand Jain wrote:
> Verify that fanotify events are correctly routed to the appropriate
> watcher when cloned filesystems are mounted.
> Helps verify kernel's event notification distinguishes between devices
> sharing the same FSID/UUID.
>
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
> tests/generic/801 | 135 ++++++++++++++++++++++++++++++++++++++++++
> tests/generic/801.out | 7 +++
> 2 files changed, 142 insertions(+)
> create mode 100644 tests/generic/801
> create mode 100644 tests/generic/801.out
>
> diff --git a/tests/generic/801 b/tests/generic/801
> new file mode 100644
> index 000000000000..3bfb87d41922
> --- /dev/null
> +++ b/tests/generic/801
> @@ -0,0 +1,135 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Anand Jain <asj@kernel.org>. All Rights Reserved.
> +#
> +# FS QA Test 801
> +# Verify fanotify FID functionality on cloned filesystems by setting up
> +# watchers and making sure notifications are in the correct logs files.
> +
> +. ./common/preamble
> +
> +_begin_fstest auto quick mount clone
> +
> +_require_test
> +_require_block_device $TEST_DEV
> +_require_loop
> +_require_command "$FSNOTIFYWAIT_PROG" fsnotifywait
> +_require_unique_f_fsid
> +
> +_cleanup()
> +{
> + cd /
> + [[ -n $pid1 ]] && { kill -TERM "$pid1" 2> /dev/null; wait $pid1; }
> + [[ -n $pid2 ]] && { kill -TERM "$pid2" 2> /dev/null; wait $pid2; }
> +
> + if [ "$semanage_added" = "yes" ]; then
> + semanage permissive -d unconfined_t >/dev/null 2>&1 || true
> + fi
> +
> + umount $mnt1 $mnt2 2>/dev/null
> + _loop_image_destroy "${devs[@]}" 2> /dev/null
> + rm -r -f $tmp.*
> +}
> +
> +# Run fsnotifywait in unbuffered mode to watch filesystem-wide create events
> +monitor_fanotify()
> +{
> + local mmnt=$1
> + exec stdbuf -oL $FSNOTIFYWAIT_PROG -m -F -S -e create "$mmnt" 2>&1
I guess you need stdbuf to force fsnotifywait to run in linebuffered
mode even if you pipe/redirect it somewhere?
> +}
> +
> +# Transform f_fsid into the hi.lo format used in fanotify FID logs
> +fsid_to_fid_parts()
> +{
> + local fsid=$1
> + # Pad to 16 hex chars (64-bit), then split into two 32-bit halves
> + local padded=$(printf '%016x' "0x${fsid}")
> + local hi=$(printf '%x' "0x${padded:0:8}") # strips leading zeros
> + local lo=$(printf '%x' "0x${padded:8:8}") # strips leading zeros
> + echo "${hi}.${lo}"
> +}
> +
> +# Create base loop device and its clone
> +devs=()
> +_loop_image_create_clone devs
> +mkdir -p $TEST_DIR/$seq
> +mnt1=$TEST_DIR/$seq/mnt1
> +mnt2=$TEST_DIR/$seq/mnt2
> +mkdir -p $mnt1
> +mkdir -p $mnt2
> +
> +# Mount both base and clone filesystems using required clone mount options
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> + _fail "Failed to mount dev1"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> + _fail "Failed to mount dev2"
> +
> +# Fetch filesystem IDs to verify the kernel can differentiate between them
> +fsid1=$(stat -f -c "%i" $mnt1)
> +fsid2=$(stat -f -c "%i" $mnt2)
> +
> +log1=$tmp.fanotify1
> +log2=$tmp.fanotify2
> +
> +pid1=""
> +pid2=""
> +echo "Setup FID fanotify watchers on both mnt1 and mnt2"
> +
> +# Permit unconfined_t domains when SELinux is enforcing to prevent fanotify
> +# blockages
> +semanage_added="no"
> +if [ "$(getenforce 2>/dev/null)" = "Enforcing" ]; then
> + if ! semanage permissive -l | grep -q "unconfined_t"; then
> + semanage permissive -a unconfined_t >/dev/null 2>&1 && semanage_added="yes"
> + fi
> +fi
Is there a cleaner way to manage setting up and automatically undoing
this step?
There might not be, since iirc the suggestion to register cleanup
functions in a cleanups=() array and call them all in reverse order
didn't go anywhere.
> +
> +# Start asynchronous fanotify monitors
> +( monitor_fanotify "$mnt1" > "$log1" ) &
> +pid1=$!
> +( monitor_fanotify "$mnt2" > "$log2" ) &
> +pid2=$!
> +sleep 2
> +
> +echo "Trigger file creation on mnt1"
> +touch $mnt1/file_on_mnt1
> +sync
> +sleep 1
> +
> +echo "Trigger file creation on mnt2"
> +touch $mnt2/file_on_mnt2
> +sync
> +sleep 1
> +
> +echo "Verify fsid in the fanotify"
> +kill $pid1 $pid2
> +wait $pid1 $pid2 2>/dev/null
> +pid1=""
> +pid2=""
> +
> +e_fsid1=$(fsid_to_fid_parts "$fsid1")
> +e_fsid2=$(fsid_to_fid_parts "$fsid2")
> +
> +# Dump debug details to the full log
> +echo $fsid1 $e_fsid1 $fsid2 $e_fsid2 >> $seqres.full
> +cat $log1 >> $seqres.full
> +cat $log2 >> $seqres.full
> +
> +# Ensure monitor 1 only captured events belonging to mnt 1 and fsid 1
> +if grep -qF "$e_fsid1" "$log1" && ! grep -qF "$e_fsid2" "$log1"; then
> + echo "SUCCESS: mnt1 events found"
> +else
> + [ ! -s "$log1" ] && echo " - mnt1 received no events."
> + grep -qF "$e_fsid2" "$log1" && echo " - mnt1 received event from mnt2."
> +fi
> +
> +# Ensure monitor 2 only captured events belonging to mnt 2 and fsid 2
> +if grep -qF "$e_fsid2" "$log2" && ! grep -qF "$e_fsid1" "$log2"; then
> + echo "SUCCESS: mnt2 events found"
> +else
> + [ ! -s "$log2" ] && echo " - mnt2 received no events."
> + grep -qF "$e_fsid1" "$log2" && echo " - mnt2 received event from mnt1."
> +fi
> +
> +status=0
> +exit
> diff --git a/tests/generic/801.out b/tests/generic/801.out
> new file mode 100644
> index 000000000000..d7b318d9f27c
> --- /dev/null
> +++ b/tests/generic/801.out
> @@ -0,0 +1,7 @@
> +QA output created by 801
> +Setup FID fanotify watchers on both mnt1 and mnt2
> +Trigger file creation on mnt1
> +Trigger file creation on mnt2
> +Verify fsid in the fanotify
> +SUCCESS: mnt1 events found
> +SUCCESS: mnt2 events found
> --
> 2.43.0
>
>
^ permalink raw reply
* Re: [PATCH v6 04/11] fstests: add _require_unique_f_fsid() helper
From: Darrick J. Wong @ 2026-05-29 4:30 UTC (permalink / raw)
To: Anand Jain
Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
zlang, hch
In-Reply-To: <983ed0f63318c930379ee74220f23aa558c16d51.1779939330.git.asj@kernel.org>
On Thu, May 28, 2026 at 12:05:35PM +0800, Anand Jain wrote:
> Add a helper to check if the target filesystem supports unique f_fsid
> tracking across cloned or snapshot instances.
>
> Certain filesystems like XFS, Btrfs, and F2FS ensure unique f_fsid
> identifiers per filesystem instance. However, Ext4 derives its f_fsid
> directly from its superblock UUID, which leads to identical f_fsid
> values on cloned images until the UUID is manually modified by userspace.
>
> Introduce _require_unique_f_fsid() to allow test cases requiring strict
> f_fsid uniqueness to skip gracefully on unsupported filesystems.
>
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
> common/rc | 21 +++++++++++++++++++++
> 1 file changed, 21 insertions(+)
>
> diff --git a/common/rc b/common/rc
> index 937f478963b4..5446552aed92 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -6314,6 +6314,27 @@ _require_fanotify_ioerrors()
> _notrun "$FSTYP does not support fanotify ioerrors"
> }
>
> +# Ext4 derives f_fsid from the superblock UUID, meaning clones share the
> +# same f_fsid until their UUIDs diverge. Conversely, XFS, Btrfs,
> +# and F2FS ensure f_fsid remains unique per filesystem instance (often by
> +# deriving it from the UUID and underlying block device.)
> +#
> +# Across all filesystems, a UUID collision causes libblkid tools to return
> +# non-deterministic device mappings. It is ultimately the responsibility
"device mappings", as in /dev/disk/by-id/$UUID ?
> +# of the userspace utility or use-case to enforce uniqueness when a clone
> +# diverges. For details, see mailing list thread discussions titled:
> +# "ext4: derive f_fsid from block device to avoid collisions".
How about providing a direct lore link?
--D
> +_require_unique_f_fsid()
> +{
> + # Skip the test if the filesystem does not enforce unique f_fsids
> + # natively. Checking this dynamically requires recreating a clone
> + # layout, so we use a static lookup based on FSTYP.
> + if [ "$FSTYP" == "ext4" ]; then
> + _notrun "Target filesystem ($FSTYP) does not guarantee unique f_fsid on clones."
> + fi
> +}
> +
> +
> # Computes a percentage of the available space in a filesystem and
> # returns that quantity in MB. The percentage must not contain a percent
> # sign ("%").
> --
> 2.43.0
>
>
^ permalink raw reply
* Re: [PATCH v6 03/11] fstests: add FSNOTIFYWAIT_PROG
From: Darrick J. Wong @ 2026-05-29 4:29 UTC (permalink / raw)
To: Anand Jain
Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
zlang, hch
In-Reply-To: <f4dc9d5af00133834acf97e1c72232d2a9b34ac6.1779939330.git.asj@kernel.org>
On Thu, May 28, 2026 at 12:05:34PM +0800, Anand Jain wrote:
> Define `FSNOTIFYWAIT_PROG` for an upcoming test case that uses `fsnotifywait`.
>
> Signed-off-by: Anand Jain <asj@kernel.org>
Seems fine to me
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> common/config | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/common/config b/common/config
> index d5299d5b926f..5661fa0ec310 100644
> --- a/common/config
> +++ b/common/config
> @@ -242,6 +242,7 @@ export BTRFS_MAP_LOGICAL_PROG=$(type -P btrfs-map-logical)
> export PARTED_PROG="$(type -P parted)"
> export XFS_PROPERTY_PROG="$(type -P xfs_property)"
> export FSCRYPTCTL_PROG="$(type -P fscryptctl)"
> +export FSNOTIFYWAIT_PROG="$(type -P fsnotifywait)"
>
> # udev wait functions.
> #
> --
> 2.43.0
>
>
^ permalink raw reply
* Re: [PATCH v6 02/11] fstests: add _clone_mount_option() helper
From: Darrick J. Wong @ 2026-05-29 4:28 UTC (permalink / raw)
To: Anand Jain
Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
zlang, hch
In-Reply-To: <4346c80089c61c8f0d62ea696f9d73f2a9669297.1779939330.git.asj@kernel.org>
On Thu, May 28, 2026 at 12:05:33PM +0800, Anand Jain wrote:
> Adds _clone_mount_option() helper function to handle filesystem-specific
> requirements for mounting cloned devices. Abstract the need for -o nouuid
> on XFS.
>
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
> common/rc | 17 +++++++++++++++++
> 1 file changed, 17 insertions(+)
>
> diff --git a/common/rc b/common/rc
> index d7e3e0bdfb1e..937f478963b4 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -414,6 +414,23 @@ _scratch_mount_options()
> $SCRATCH_DEV $SCRATCH_MNT
> }
>
> +# Return filesystem-specific mount options required for mounting clone/snapshot
> +# devices.
> +_clone_mount_option()
> +{
> + local mount_opts=""
> +
> + case "$FSTYP" in
> + xfs)
> + # Allow mounting a duplicate filesystem on the same host
> + mount_opts="-o nouuid"
> + ;;
> + *)
> + esac
> +
> + echo $mount_opts
I probably would've just echo'd straight from inside the case statement,
but this otherwise looks ok,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> +}
> +
> _supports_filetype()
> {
> local dir=$1
> --
> 2.43.0
>
>
^ permalink raw reply
* Re: [PATCH v6 01/11] fstests: add _loop_image_create_clone() helper
From: Darrick J. Wong @ 2026-05-29 4:27 UTC (permalink / raw)
To: Anand Jain
Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
zlang, hch
In-Reply-To: <421c7cdd5aae27b99d04dddf08c5d9df79c2f790.1779939330.git.asj@kernel.org>
On Thu, May 28, 2026 at 12:05:32PM +0800, Anand Jain wrote:
> Introduce _loop_image_create_clone() and _loop_image_destroy() to mkfs an
> image file and clone it to another image file, and attach a loop device to
> them. And its destroy part.
>
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
> common/rc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 63 insertions(+)
>
> diff --git a/common/rc b/common/rc
> index 79189e7e6e94..d7e3e0bdfb1e 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -1520,6 +1520,69 @@ _scratch_resvblks()
> esac
> }
>
> +# Create a small loop image, run an optional tuning function ($2) on it,
> +# clone it, and attach both to loop devices, returned in ($1).
> +# Args:
> +# $1: Nameref to return the array of allocated loop devices [base, clone].
> +# $2: Optional callback function to tune the base filesystem before cloning.
> +_loop_image_create_clone()
> +{
> + local -n _ret=$1
That switch ^^ is very clever. I always wondered how one did indirect
variables in bash.
> + local pre_clone_tune_func="$2"
> + local img_file=$TEST_DIR/${seq}.img
> + local img_file_clone=$TEST_DIR/${seq}_clone.img
> + local size=$(_small_fs_size_mb 128) # Smallest possible
> + local loop_devs
> +
> + # Since we copy the block device image, we keep its size small.
> + _require_fs_space $TEST_DIR $((size * 1024))
> +
> + _create_file_sized $((size * 1024 * 1024)) $img_file ||
> + _fail "Failed: Create $img_file $size"
> +
> + loop_devs=$(_create_loop_device $img_file)
> + _ret=($loop_devs)
Should this check that a loopdev actually got created?
> + case $FSTYP in
> + xfs)
> + _mkfs_dev "-s size=4096" ${loop_devs[0]}
> + ;;
> + btrfs)
> + _mkfs_dev ${loop_devs[0]}
> + ;;
> + *)
> + _mkfs_dev ${loop_devs[0]}
> + ;;
> + esac
> +
> + # Only execute if the function argument is not empty
> + if [ -n "$pre_clone_tune_func" ]; then
> + $pre_clone_tune_func ${loop_devs[0]}
> + fi
> +
> + sync ${loop_devs[0]}
> + cp $img_file $img_file_clone
> +
> + loop_devs="$loop_devs $(_create_loop_device $img_file_clone)"
local lodev="$(_create_loop_device ...)"
test -z "$lodev" && _fail "second loopdev not created"
_ret+=("$lodev")
?
> +
> + _ret=($loop_devs)
> +}
> +
> +# Teardown loop devices and delete their underlying backing image files.
> +# Accepts a list of loop device paths (e.g., /dev/loop0 /dev/loop1).
> +_loop_image_destroy()
> +{
> + for d in "$@"; do
> + # Retrieve the path of the backing file
> + local f=$(losetup --noheadings --output BACK-FILE $d)
> +
> + # Detach the loop device from the backing file
> + _destroy_loop_device "$d"
> +
> + # Clean up the backing disk image file
> + [ -n "$f" ] && rm -f "$f"
> + done
> +}
>
> # Repair scratch filesystem. Returns 0 if the FS is good to go (either no
> # errors found or errors were fixed) and nonzero otherwise; also spits out
> --
> 2.43.0
>
>
^ permalink raw reply
* Re: [PATCH] ext4: validate dx count against limit in dx_csum
From: Andreas Dilger @ 2026-05-29 0:09 UTC (permalink / raw)
To: Artem Blagodarenko; +Cc: linux-ext4, Theodore Ts'o
In-Reply-To: <20260528160557.6956-1-ablagodarenko@thelustrecollective.com>
On May 28, 2026, at 10:05, Artem Blagodarenko <artem.blagodarenko@gmail.com> wrote:
>
> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
>
> Sashiko AI, during inspection of the "ext4: replace ext4_dir_entry
> with ext4_dir_entry_2" series, reported a missing bounds check on
> count against limit in DX block checksum verification and setup
> paths.
>
> The code validates that limit fits within block boundaries, but does
> not verify that count <= limit. A maliciously crafted filesystem image
> could therefore provide an artificially large count value.
>
> Since ext4_dx_csum() uses count to determine the checksum region
> size, this may result in an out-of-bounds access crossing page
> boundaries into unmapped memory, potentially leading to a crash during
> checksum verification.
>
> The same issue exists in ext4_dx_csum_set().
>
> The bug was not introduced by the patch under review, so the fix is sent
> separately.
>
> Signed-off-by: Artem Blagodarenko artem.blagodarenko@gmail.com
Reviewed-by: Andreas Dilger <adilger@dilger.ca <mailto:adilger@dilger.ca>>
> ---
> fs/ext4/namei.c | 6 ++++--
> 1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index a316fc2ac41b..20b7bac69889 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -472,7 +472,8 @@ static int ext4_dx_csum_verify(struct inode *inode,
> }
> limit = le16_to_cpu(c->limit);
> count = le16_to_cpu(c->count);
> - if (count_offset + (limit * sizeof(struct dx_entry)) >
> + if (!count || count > limit ||
> + count_offset + (limit * sizeof(struct dx_entry)) >
> EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
> warn_no_space_for_csum(inode);
> return 0;
> @@ -501,7 +502,8 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry_2 *diren
> }
> limit = le16_to_cpu(c->limit);
> count = le16_to_cpu(c->count);
> - if (count_offset + (limit * sizeof(struct dx_entry)) >
> + if (!count || count > limit ||
> + count_offset + (limit * sizeof(struct dx_entry)) >
> EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
> warn_no_space_for_csum(inode);
> return;
> --
> 2.43.7
>
Cheers, Andreas
^ permalink raw reply
* [PATCH v2 16/34] jbd2: Convert journal commit to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
To: Jan Kara
Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
linux-fsdevel, Theodore Ts'o, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>
Avoid an extra indirect function call by using bh_submit()
instead of submit_bh() in journal_submit_commit_record()
and jbd2_journal_commit_transaction(). These both use
journal_end_buffer_io_sync(), so it's more straightforward to do them
both at once.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
fs/jbd2/commit.c | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 8cf61e7185c4..4e91593d27e5 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -29,8 +29,10 @@
/*
* IO end handler for temporary buffer_heads handling writes to the journal.
*/
-static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+static void journal_end_buffer_io_sync(struct bio *bio)
{
+ struct buffer_head *bh;
+ bool uptodate = bio_endio_bh(bio, &bh);
struct buffer_head *orig_bh = bh->b_private;
BUFFER_TRACE(bh, "");
@@ -147,13 +149,12 @@ static int journal_submit_commit_record(journal_t *journal,
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_has_feature_async_commit(journal))
write_flags |= REQ_PREFLUSH | REQ_FUA;
- submit_bh(write_flags, bh);
+ bh_submit(bh, write_flags, journal_end_buffer_io_sync);
*cbh = bh;
return 0;
}
@@ -751,9 +752,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
- bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
- bh);
+ bh_submit(bh,
+ REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
+ journal_end_buffer_io_sync);
}
cond_resched();
--
2.47.3
^ permalink raw reply related
* [PATCH v2 15/34] ext4: Convert ext4_commit_super() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
To: Jan Kara
Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
linux-fsdevel, Theodore Ts'o, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>
Avoid an extra indirect function call and changing the buffer refcount
by using bh_submit() instead of submit_bh().
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
fs/ext4/super.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index bc7faedcb8e4..7283108d7609 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -6316,12 +6316,10 @@ static int ext4_commit_super(struct super_block *sb)
clear_buffer_write_io_error(sbh);
set_buffer_uptodate(sbh);
}
- get_bh(sbh);
/* Clear potential dirty bit if it was journalled update */
clear_buffer_dirty(sbh);
- sbh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE | REQ_SYNC |
- (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
+ bh_submit(sbh, REQ_OP_WRITE | REQ_SYNC |
+ (test_opt(sb, BARRIER) ? REQ_FUA : 0), bh_end_write);
wait_on_buffer(sbh);
if (buffer_write_io_error(sbh)) {
ext4_msg(sb, KERN_ERR, "I/O error while writing "
--
2.47.3
^ permalink raw reply related
* [PATCH v2 17/34] jbd2: Convert jbd2_write_superblock() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
To: Jan Kara
Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
linux-fsdevel, Theodore Ts'o, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>
Avoid an extra indirect function call and changing the buffer refcount
by using bh_submit() instead of submit_bh().
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Cc: linux-ext4@vger.kernel.org
---
fs/jbd2/journal.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 4f397fcdb13c..2040af8c84cb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1820,9 +1820,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
}
if (jbd2_journal_has_csum_v2or3(journal))
sb->s_checksum = jbd2_superblock_csum(sb);
- get_bh(bh);
- bh->b_end_io = end_buffer_write_sync;
- submit_bh(REQ_OP_WRITE | write_flags, bh);
+ bh_submit(bh, REQ_OP_WRITE | write_flags, bh_end_write);
wait_on_buffer(bh);
if (buffer_write_io_error(bh)) {
clear_buffer_write_io_error(bh);
--
2.47.3
^ permalink raw reply related
* [PATCH v2 14/34] ext4: Convert write_mmp_block_thawed() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
To: Jan Kara
Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
linux-fsdevel, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>
Avoid an extra indirect function call and changing the buffer refcount
by using bh_submit() instead of submit_bh().
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: linux-ext4@vger.kernel.org
---
fs/ext4/mmp.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 6f57c181ff77..7ce361484b38 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -46,9 +46,8 @@ static int write_mmp_block_thawed(struct super_block *sb,
ext4_mmp_csum_set(sb, mmp);
lock_buffer(bh);
- bh->b_end_io = end_buffer_write_sync;
- get_bh(bh);
- submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
+ bh_submit(bh, REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
+ bh_end_write);
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh)))
return -EIO;
--
2.47.3
^ permalink raw reply related
* [PATCH v2 13/34] ext4: Convert ext4_fc_submit_bh() to bh_submit()
From: Matthew Wilcox (Oracle) @ 2026-05-28 17:31 UTC (permalink / raw)
To: Jan Kara
Cc: Matthew Wilcox (Oracle), Christian Brauner, Christoph Hellwig,
linux-fsdevel, linux-ext4
In-Reply-To: <20260528173150.1093780-1-willy@infradead.org>
Avoid an extra indirect function call by converting
ext4_end_buffer_io_sync() from bh_end_io_t to bio_end_io_t and
calling bh_submit().
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: linux-ext4@vger.kernel.org
---
fs/ext4/fast_commit.c | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b3c22636251d..5773b85e43cb 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -184,8 +184,11 @@
#include <trace/events/ext4.h>
static struct kmem_cache *ext4_fc_dentry_cachep;
-static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
+static void ext4_end_buffer_io_sync(struct bio *bio)
{
+ struct buffer_head *bh;
+ bool uptodate = bio_endio_bh(bio, &bh);
+
BUFFER_TRACE(bh, "");
if (uptodate) {
ext4_debug("%s: Block %lld up-to-date",
@@ -659,8 +662,7 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
lock_buffer(bh);
set_buffer_dirty(bh);
set_buffer_uptodate(bh);
- bh->b_end_io = ext4_end_buffer_io_sync;
- submit_bh(REQ_OP_WRITE | write_flags, bh);
+ bh_submit(bh, REQ_OP_WRITE | write_flags, ext4_end_buffer_io_sync);
EXT4_SB(sb)->s_fc_bh = NULL;
}
--
2.47.3
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox