* [PATCH v2 06/10] ext4: add ext4_dir_entry_len() and harden dirdata parsing
From: Artem Blagodarenko @ 2026-06-10 15:24 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260610152417.13576-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Introduce ext4_dir_entry_len() helper to compute the required
rec_len for a directory entry, taking into account dirdata and
casefold+fscrypt hash space.
Convert ext4_dirent_get_data_len() to take the decoded rec_len
as an argument and add bounds checking when walking dirdata
extensions to avoid overruns on malformed entries.
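Update dx_root_limit() to take the '.' dirent and compute the space
consumed ahead of the dx entries from the dx_root_info location and its
info_length, instead of open-coded ext4_dir_rec_len() for '.' and '..'
entries.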
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 45 ++++++++++++++++++++++++++++++++++++++++++---
fs/ext4/namei.c | 23 +++++++++++++++--------
2 files changed, 57 insertions(+), 11 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f833f6ef0040..45e90b8be9e8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3988,6 +3988,7 @@ static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
/*
* ext4_dirent_get_data_len() - Compute the total dirdata length for an entry.
* @de: directory entry
+ * @rec_len: the record length of the directory entry (decoded)
*
* Computes the length of optional data stored after the filename (and its
* implicit NUL terminator). Each extension is indicated by a bit in the
@@ -3996,22 +3997,41 @@ static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
*
* Returns 0 for tail entries and for entries with no dirdata.
*/
-static inline int ext4_dirent_get_data_len(struct ext4_dir_entry_2 *de)
+static inline int ext4_dirent_get_data_len(struct ext4_dir_entry_2 *de,
+ unsigned int rec_len)
{
__u8 extra_data_flags;
struct ext4_dirent_data_header *ddh;
int dlen = 0;
+ unsigned int offset;
if (ext4_dir_entry_is_tail(de))
return 0;
extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
- ddh = (struct ext4_dirent_data_header *)(de->name + de->name_len +
- 1 /* NUL terminator */);
+ /* offset from start of entry to after filename + NUL */
+ offset = EXT4_BASE_DIR_LEN + de->name_len + 1;
+ /* bounds check: ensure we start reading within the entry */
+ if (offset >= rec_len)
+ return 0;
+
+ ddh = (struct ext4_dirent_data_header *)((char *)de + offset);
+
while (extra_data_flags) {
if (extra_data_flags & 1) {
+ /* bounds check before reading ddh_length */
+ if (offset + sizeof(*ddh) >
+ rec_len)
+ return dlen;
+
+ /* validate ddh_length is reasonable */
+ if (ddh->ddh_length == 0 || ddh->ddh_length >
+ rec_len - offset)
+ return dlen;
+
dlen += ddh->ddh_length + (dlen == 0);
+ offset += ddh->ddh_length;
ddh = ext4_dirdata_next(ddh);
}
extra_data_flags >>= 1;
@@ -4019,6 +4039,25 @@ static inline int ext4_dirent_get_data_len(struct ext4_dir_entry_2 *de)
return dlen;
}
+/*
+ * ext4_dir_entry_len() - Compute the required rec_len for a directory entry.
+ * @de: directory entry (used to read name_len and any dirdata length)
+ * @dir: directory inode (may be NULL for '.' and '..' entries)
+ *
+ * Returns the minimum record length needed to hold @de, rounded up to the
+ * directory alignment and including room for the casefold+fscrypt hash if
+ * the directory requires it.
+ */
+static inline unsigned int ext4_dir_entry_len(struct ext4_dir_entry_2 *de,
+ const struct inode *dir)
+{
+ unsigned int blocksize = (dir && dir->i_sb) ? dir->i_sb->s_blocksize : 4096;
+ unsigned int rec_len = ext4_rec_len_from_disk(de->rec_len, blocksize);
+ unsigned int dirdata = ext4_dirent_get_data_len(de, rec_len);
+
+ return ext4_dir_rec_len(de->name_len + dirdata, dir);
+}
+
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 87d8cd2c6377..0635eac2de8d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -570,11 +570,15 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
}
-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
+static inline unsigned dx_root_limit(struct inode *dir,
+ struct ext4_dir_entry_2 *dot_de)
{
- unsigned int entry_space = dir->i_sb->s_blocksize -
- ext4_dir_rec_len(1, NULL) -
- ext4_dir_rec_len(2, NULL) - infosize;
+ struct dx_root_info *info;
+ unsigned int entry_space;
+
+ info = dx_get_dx_info(dot_de);
+ entry_space = dir->i_sb->s_blocksize - ((char *)info - (char *)dot_de) -
+ info->info_length;
if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
@@ -850,10 +854,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
entries = (struct dx_entry *)(((char *)info) + info->info_length);
- if (dx_get_limit(entries) != dx_root_limit(dir, info->info_length)) {
+ if (dx_get_limit(entries) !=
+ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data)) {
ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
dx_get_limit(entries),
- dx_root_limit(dir, info->info_length));
+ dx_root_limit(dir,
+ (struct ext4_dir_entry_2 *)frame->bh->b_data
+ ));
goto fail;
}
@@ -2278,10 +2285,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
dx_info->hash_version =
EXT4_SB(dir->i_sb)->s_def_hash_version;
- entries = (void *)dx_info + sizeof(*dx_info);
+ entries = (void *)dx_info + dx_info->info_length;
dx_set_block(entries, 1);
dx_set_count(entries, 1);
- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
+ dx_set_limit(entries, dx_root_limit(dir, dot_de));
/* Initialize as for dx_probe */
fname->hinfo.hash_version = dx_info->hash_version;
--
2.43.7
^ permalink raw reply related
* [PATCH v2 04/10] ext4: add dirdata format definitions and access helpers
From: Artem Blagodarenko @ 2026-06-10 15:24 UTC (permalink / raw)
To: linux-ext4
Cc: adilger.kernel, Artem Blagodarenko, Pravin Shelar, Andreas Dilger
In-Reply-To: <20260610152417.13576-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Define the on-disk format for ext4 directory entry extension data.
The upper four bits of de->file_type indicate the presence of
optional data stored after the filename NUL terminator. This patch
defines flags for LUFID, 64-bit inode numbers, and casefold hash
data stored in that area.
Add struct ext4_dirent_data_header to describe variable-length
extension records and struct ext4_dirent_hash for hash storage used
by casefold and fscrypt.
Provide ext4_dirdata_next() to advance to the next extension record
and ext4_dirent_get_data_len() to compute the total extension data
length associated with a directory entry.
No functional changes.
Signed-off-by: Pravin Shelar <pravin.shelar@sun.com>
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 01b1222b1454..c36c3bf54590 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2470,6 +2470,49 @@ struct ext4_dir_entry_tail {
#define EXT4_FT_SYMLINK 7
#define EXT4_FT_MAX 8
+#define EXT4_FT_MASK 0xf
+
+#if EXT4_FT_MAX > EXT4_FT_MASK
+#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK"
+#endif
+
+/*
+ * d_type has 4 unused bits, so it can hold four types of data. These different
+ * types of data (e.g. fscrypt hash, high 32 bits of 64-bit inode number) can be
+ * stored, in flag order, after file-name in ext4 dirent.
+ *
+ * These flags are added to d_type if ext4 dirent has extra data after
+ * filename. This data length is variable and length is stored in first byte
+ * of data. Data starts after filename NUL byte.
+ */
+#define EXT4_DIRENT_LUFID 0x10
+#define EXT4_DIRENT_INO64 0x20
+#define EXT4_DIRENT_CFHASH 0x40
+
+struct ext4_fid {
+ char fid[16]; /* 128-bit unique file identifier */
+};
+
+struct ext4_dirent_data_header {
+ /* length of this header + the whole data blob */
+ __u8 ddh_length;
+} __packed;
+
+struct ext4_dirent_fid {
+ struct ext4_dirent_data_header df_header;
+ struct ext4_fid df_fid[];
+};
+
+#define EXT4_LUFID_MAGIC 0xAD200907UL
+struct ext4_dentry_param {
+ __u32 edp_magic; /* EXT4_LUFID_MAGIC */
+ struct ext4_dirent_fid edp_dfid;
+};
+
+struct ext4_dirent_hash {
+ struct ext4_dirent_data_header dh_header;
+ struct ext4_dir_entry_hash dh_hash;
+} __packed;
#define EXT4_FT_DIR_CSUM 0xDE
@@ -3917,6 +3960,12 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
}
+/*
+ * Advance to the next dirdata record header starting from @ddh.
+ */
+#define ext4_dirdata_next(ddh) \
+ ((struct ext4_dirent_data_header *)((char *)(ddh) + (ddh)->ddh_length))
+
/*
* ext4_dir_entry_is_tail() - Check if a directory entry is a tail entry.
* @de: directory entry to check
@@ -3933,6 +3982,40 @@ static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
t->det_reserved_ft == EXT4_FT_DIR_CSUM;
}
+/*
+ * ext4_dirent_get_data_len() - Compute the total dirdata length for an entry.
+ * @de: directory entry
+ *
+ * Computes the length of optional data stored after the filename (and its
+ * implicit NUL terminator). Each extension is indicated by a bit in the
+ * high 4 bits of de->file_type; the first byte of each extension is its
+ * length (including that length byte itself).
+ *
+ * Returns 0 for tail entries and for entries with no dirdata.
+ */
+static inline int ext4_dirent_get_data_len(struct ext4_dir_entry_2 *de)
+{
+ __u8 extra_data_flags;
+ struct ext4_dirent_data_header *ddh;
+ int dlen = 0;
+
+ if (ext4_dir_entry_is_tail(de))
+ return 0;
+
+ extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
+ ddh = (struct ext4_dirent_data_header *)(de->name + de->name_len +
+ 1 /* NUL terminator */);
+
+ while (extra_data_flags) {
+ if (extra_data_flags & 1) {
+ dlen += ddh->ddh_length + (dlen == 0);
+ ddh = ext4_dirdata_next(ddh);
+ }
+ extra_data_flags >>= 1;
+ }
+ return dlen;
+}
+
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
--
2.43.7
^ permalink raw reply related
* [PATCH v2 05/10] ext4: preserve dirdata bits in get_dtype()
From: Artem Blagodarenko @ 2026-06-10 15:24 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260610152417.13576-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Mask the filetype with EXT4_FT_MASK when indexing
ext4_filetype_table[] to avoid using dirdata bits as an index.
Preserve the extra bits stored in the upper part of filetype and
propagate them to the returned dtype value.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c36c3bf54590..f833f6ef0040 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2960,12 +2960,15 @@ static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static inline unsigned char get_dtype(struct super_block *sb, int filetype)
+static inline unsigned char get_dtype(struct super_block *sb, int filetype)
{
- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
+ unsigned char fl_index = filetype & EXT4_FT_MASK;
+
+ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX)
return DT_UNKNOWN;
- return ext4_filetype_table[filetype];
+ return (ext4_filetype_table[fl_index]) |
+ (filetype & ~EXT4_FT_MASK);
}
extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size);
--
2.43.7
^ permalink raw reply related
* [PATCH v2 03/10] ext4: refactor dx_root to support variable dirent sizes
From: Artem Blagodarenko @ 2026-06-10 15:24 UTC (permalink / raw)
To: linux-ext4
Cc: adilger.kernel, Artem Blagodarenko, Pravin Shelar, Andreas Dilger
In-Reply-To: <20260610152417.13576-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Split the monolithic definition of struct dx_root to separate dx_root_info
from the fake struct ext4_dir_entry_2 for improved code readability.
This allows "." and ".." dirents to have different sizes if necessary,
since we can't assume a rec_len of 12 if dx_root dirents have dirdata.
Adds dx_get_dx_info() accessor instead of complex typecast at callers.
Does not change any functionality.
Signed-off-by: Pravin Shelar <pravin.shelar@sun.com>
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/namei.c | 145 +++++++++++++++++++++++-------------------------
1 file changed, 70 insertions(+), 75 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0196d954cba1..87d8cd2c6377 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -244,22 +244,13 @@ struct dx_entry
* hash version mod 4 should never be 0. Sincerely, the paranoia department.
*/
-struct dx_root
+struct dx_root_info
{
- struct fake_dirent dot;
- char dot_name[4];
- struct fake_dirent dotdot;
- char dotdot_name[4];
- struct dx_root_info
- {
- __le32 reserved_zero;
- u8 hash_version;
- u8 info_length; /* 8 */
- u8 indirect_levels;
- u8 unused_flags;
- }
- info;
- struct dx_entry entries[];
+ __le32 reserved_zero;
+ u8 hash_version;
+ u8 info_length; /* 8 */
+ u8 indirect_levels;
+ u8 unused_flags;
};
struct dx_node
@@ -528,6 +519,16 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
+static struct dx_root_info *dx_get_dx_info(void *de_buf)
+{
+ /* get dotdot first */
+ de_buf = de_buf + ext4_dir_rec_len(1, NULL);
+
+ /* dx root info is after dotdot entry */
+ de_buf = de_buf + ext4_dir_rec_len(2, NULL);
+
+ return (struct dx_root_info *)de_buf;
+}
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
@@ -775,7 +776,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
{
unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
- struct dx_root *root;
+ struct dx_root_info *info;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
@@ -787,23 +788,24 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
if (IS_ERR(frame->bh))
return (struct dx_frame *) frame->bh;
- root = (struct dx_root *) frame->bh->b_data;
- if (root->info.hash_version != DX_HASH_TEA &&
- root->info.hash_version != DX_HASH_HALF_MD4 &&
- root->info.hash_version != DX_HASH_LEGACY &&
- root->info.hash_version != DX_HASH_SIPHASH) {
- ext4_warning_inode(dir, "Unrecognised inode hash code %u",
- root->info.hash_version);
+ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frame->bh->b_data);
+ if (info->hash_version != DX_HASH_TEA &&
+ info->hash_version != DX_HASH_HALF_MD4 &&
+ info->hash_version != DX_HASH_LEGACY &&
+ info->hash_version != DX_HASH_SIPHASH) {
+ ext4_warning(dir->i_sb,
+ "Unrecognised inode hash code %d for directory #%llu",
+ info->hash_version, dir->i_ino);
goto fail;
}
if (ext4_hash_in_dirent(dir)) {
- if (root->info.hash_version != DX_HASH_SIPHASH) {
+ if (info->hash_version != DX_HASH_SIPHASH) {
ext4_warning_inode(dir,
"Hash in dirent, but hash is not SIPHASH");
goto fail;
}
} else {
- if (root->info.hash_version == DX_HASH_SIPHASH) {
+ if (info->hash_version == DX_HASH_SIPHASH) {
ext4_warning_inode(dir,
"Hash code is SIPHASH, but hash not in dirent");
goto fail;
@@ -811,7 +813,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
if (fname)
hinfo = &fname->hinfo;
- hinfo->hash_version = root->info.hash_version;
+ hinfo->hash_version = info->hash_version;
if (hinfo->hash_version <= DX_HASH_TEA)
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
@@ -827,13 +829,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
hash = hinfo->hash;
- if (root->info.unused_flags & 1) {
+ if (info->unused_flags & 1) {
ext4_warning_inode(dir, "Unimplemented hash flags: %#06x",
- root->info.unused_flags);
+ info->unused_flags);
goto fail;
}
- indirect = root->info.indirect_levels;
+ indirect = info->indirect_levels;
if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
ext4_warning(dir->i_sb,
"Directory (ino: %llu) htree depth %#06x exceed"
@@ -846,14 +848,12 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
goto fail;
}
- entries = (struct dx_entry *)(((char *)&root->info) +
- root->info.info_length);
+ entries = (struct dx_entry *)(((char *)info) + info->info_length);
- if (dx_get_limit(entries) != dx_root_limit(dir,
- root->info.info_length)) {
+ if (dx_get_limit(entries) != dx_root_limit(dir, info->info_length)) {
ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
dx_get_limit(entries),
- dx_root_limit(dir, root->info.info_length));
+ dx_root_limit(dir, info->info_length));
goto fail;
}
@@ -939,7 +939,7 @@ static void dx_release(struct dx_frame *frames)
if (frames[0].bh == NULL)
return;
- info = &((struct dx_root *)frames[0].bh->b_data)->info;
+ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data);
/* save local copy, "info" may be freed after brelse() */
indirect_levels = info->indirect_levels;
for (i = 0; i <= indirect_levels; i++) {
@@ -2151,44 +2151,38 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err ? err : err2;
}
-static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
+static bool ext4_check_dx_root(struct inode *dir,
+ struct ext4_dir_entry_2 *dot_de,
+ struct ext4_dir_entry_2 *dotdot_de,
+ struct ext4_dir_entry_2 **entry)
{
- struct fake_dirent *fde;
const char *error_msg;
- unsigned int rlen;
unsigned int blocksize = dir->i_sb->s_blocksize;
- char *blockend = (char *)root + dir->i_sb->s_blocksize;
+ struct ext4_dir_entry_2 *de = NULL;
- fde = &root->dot;
- if (unlikely(fde->name_len != 1)) {
+ if (unlikely(dot_de->name_len != 1)) {
error_msg = "invalid name_len for '.'";
goto corrupted;
}
- if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
+ if (unlikely(strncmp(dot_de->name, ".", dot_de->name_len))) {
error_msg = "invalid name for '.'";
goto corrupted;
}
- rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
- if (unlikely((char *)fde + rlen >= blockend)) {
- error_msg = "invalid rec_len for '.'";
- goto corrupted;
- }
- fde = &root->dotdot;
- if (unlikely(fde->name_len != 2)) {
+ if (unlikely(dotdot_de->name_len != 2)) {
error_msg = "invalid name_len for '..'";
goto corrupted;
}
- if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
+ if (unlikely(strncmp(dotdot_de->name, "..", dotdot_de->name_len))) {
error_msg = "invalid name for '..'";
goto corrupted;
}
- rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
- if (unlikely((char *)fde + rlen >= blockend)) {
+ de = ext4_next_entry(dotdot_de, blocksize);
+ if ((char *)de >= (((char *)dot_de) + blocksize)) {
error_msg = "invalid rec_len for '..'";
goto corrupted;
}
-
+ *entry = de;
return true;
corrupted:
@@ -2206,16 +2200,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
struct inode *inode, struct buffer_head *bh)
{
struct buffer_head *bh2;
- struct dx_root *root;
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries;
- struct ext4_dir_entry_2 *de, *de2;
+ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
char *data2, *top;
unsigned len;
int retval;
unsigned blocksize;
ext4_lblk_t block;
- struct fake_dirent *fde;
+ struct dx_root_info *dx_info;
int csum_size = 0;
if (ext4_has_feature_metadata_csum(inode->i_sb))
@@ -2232,17 +2225,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
return retval;
}
- root = (struct dx_root *) bh->b_data;
- if (!ext4_check_dx_root(dir, root)) {
+ dot_de = (struct ext4_dir_entry_2 *)bh->b_data;
+ dotdot_de = ext4_next_entry(dot_de, blocksize);
+ if (!ext4_check_dx_root(dir, dot_de, dotdot_de, &de)) {
brelse(bh);
return -EFSCORRUPTED;
}
/* The 0th block becomes the root, move the dirents out */
- fde = &root->dotdot;
- de = (struct ext4_dir_entry_2 *)((char *)fde +
- ext4_rec_len_from_disk(fde->rec_len, blocksize));
- len = ((char *) root) + (blocksize - csum_size) - (char *) de;
+ len = ((char *)dot_de) + (blocksize - csum_size) - (char *)de;
/* Allocate new block for the 0th block's dirents */
bh2 = ext4_append(handle, dir, &block);
@@ -2273,24 +2264,27 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
ext4_initialize_dirent_tail(bh2, blocksize);
/* Initialize the root; the dot dirents already exist */
- de = (struct ext4_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext4_rec_len_to_disk(
- blocksize - ext4_dir_rec_len(2, NULL), blocksize);
- memset (&root->info, 0, sizeof(root->info));
- root->info.info_length = sizeof(root->info);
+ dotdot_de->rec_len =
+ ext4_rec_len_to_disk(blocksize - le16_to_cpu(dot_de->rec_len),
+ blocksize);
+
+ /* initialize hashing info */
+ dx_info = dx_get_dx_info(dot_de);
+ memset(dx_info, 0, sizeof(*dx_info));
+ dx_info->info_length = sizeof(*dx_info);
if (ext4_hash_in_dirent(dir))
- root->info.hash_version = DX_HASH_SIPHASH;
+ dx_info->hash_version = DX_HASH_SIPHASH;
else
- root->info.hash_version =
+ dx_info->hash_version =
EXT4_SB(dir->i_sb)->s_def_hash_version;
- entries = root->entries;
+ entries = (void *)dx_info + sizeof(*dx_info);
dx_set_block(entries, 1);
dx_set_count(entries, 1);
- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
+ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
/* Initialize as for dx_probe */
- fname->hinfo.hash_version = root->info.hash_version;
+ fname->hinfo.hash_version = dx_info->hash_version;
if (fname->hinfo.hash_version <= DX_HASH_TEA)
fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
@@ -2600,7 +2594,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
if (restart || err)
goto journal_error;
} else {
- struct dx_root *dxroot;
+ struct dx_root_info *info;
memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -2608,8 +2602,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
/* Set up root */
dx_set_count(entries, 1);
dx_set_block(entries + 0, newblock);
- dxroot = (struct dx_root *)frames[0].bh->b_data;
- dxroot->info.indirect_levels += 1;
+ info = dx_get_dx_info((struct ext4_dir_entry_2 *)
+ frames[0].bh->b_data);
+ info->indirect_levels += 1;
dxtrace(printk(KERN_DEBUG
"Creating %d level index...\n",
dxroot->info.indirect_levels));
--
2.43.7
^ permalink raw reply related
* [PATCH v2 02/10] ext4: add ext4_dir_entry_is_tail()
From: Artem Blagodarenko @ 2026-06-10 15:24 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260610152417.13576-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Replace open-coded checks for directory tail entries with a call
to ext4_dir_entry_is_tail(). This helper will also be used by
upcoming changes.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 16 ++++++++++++++++
fs/ext4/namei.c | 7 +------
2 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..01b1222b1454 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3917,6 +3917,22 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
}
+/*
+ * ext4_dir_entry_is_tail() - Check if a directory entry is a tail entry.
+ * @de: directory entry to check
+ *
+ * Returns true if @de is a directory block tail entry (checksum record).
+ */
+static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
+{
+ struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de;
+
+ return !t->det_reserved_zero1 &&
+ le16_to_cpu(t->det_rec_len) == sizeof(*t) &&
+ !t->det_reserved_zero2 &&
+ t->det_reserved_ft == EXT4_FT_DIR_CSUM;
+}
+
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5805001ff1d9..0196d954cba1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -314,7 +314,6 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
- int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
struct ext4_dir_entry_2 *d, *top;
@@ -334,11 +333,7 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb));
#endif
- if (t->det_reserved_zero1 ||
- (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
- sizeof(struct ext4_dir_entry_tail)) ||
- t->det_reserved_zero2 ||
- t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+ if (!ext4_dir_entry_is_tail((struct ext4_dir_entry_2 *)t))
return NULL;
return t;
--
2.43.7
^ permalink raw reply related
* [PATCH v2 01/10] ext4: replace ext4_dir_entry with ext4_dir_entry_2
From: Artem Blagodarenko @ 2026-06-10 15:24 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260610152417.13576-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Replace remaining uses of struct ext4_dir_entry in namei.c
with struct ext4_dir_entry_2.
The code paths affected by this change already depend on the
filetype feature, so using struct ext4_dir_entry_2 is
appropriate and avoids mixing the two directory entry types
unnecessarily.
This change does not affect support for 16-bit rec_len.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/namei.c | 38 +++++++++++++++++++-------------------
1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4a47fbd8dd30..5805001ff1d9 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -102,7 +102,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
}
static int ext4_dx_csum_verify(struct inode *inode,
- struct ext4_dir_entry *dirent);
+ struct ext4_dir_entry_2 *dirent);
/*
* Hints to ext4_read_dirblock regarding whether we expect a directory
@@ -128,7 +128,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
unsigned int line)
{
struct buffer_head *bh;
- struct ext4_dir_entry *dirent;
+ struct ext4_dir_entry_2 *dirent;
int is_dx_block = 0;
if (block >= inode->i_size >> inode->i_blkbits) {
@@ -160,7 +160,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
}
if (!bh)
return NULL;
- dirent = (struct ext4_dir_entry *) bh->b_data;
+ dirent = (struct ext4_dir_entry_2 *) bh->b_data;
/* Determine whether or not we have an index block */
if (is_dx(inode)) {
if (block == 0)
@@ -317,13 +317,13 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
- struct ext4_dir_entry *d, *top;
+ struct ext4_dir_entry_2 *d, *top;
- d = (struct ext4_dir_entry *)bh->b_data;
- top = (struct ext4_dir_entry *)(bh->b_data +
+ d = (struct ext4_dir_entry_2 *)bh->b_data;
+ top = (struct ext4_dir_entry_2 *)(bh->b_data +
(blocksize - sizeof(struct ext4_dir_entry_tail)));
while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
- d = (struct ext4_dir_entry *)(((void *)d) +
+ d = (struct ext4_dir_entry_2 *)(((void *)d) +
ext4_rec_len_from_disk(d->rec_len, blocksize));
if (d != top)
@@ -410,22 +410,22 @@ int ext4_handle_dirty_dirblock(handle_t *handle,
}
static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
- struct ext4_dir_entry *dirent,
+ struct ext4_dir_entry_2 *dirent,
int *offset)
{
- struct ext4_dir_entry *dp;
+ struct ext4_dir_entry_2 *de;
struct dx_root_info *root;
int count_offset;
int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
if (rlen == blocksize)
- count_offset = 8;
+ count_offset = sizeof(struct dx_node);
else if (rlen == 12) {
- dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
- if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
+ de = (struct ext4_dir_entry_2 *)(((void *)dirent) + 12);
+ if (ext4_rec_len_from_disk(de->rec_len, blocksize) != blocksize - 12)
return NULL;
- root = (struct dx_root_info *)(((void *)dp + 12));
+ root = (struct dx_root_info *)(((void *)de + 12));
if (root->reserved_zero ||
root->info_length != sizeof(struct dx_root_info))
return NULL;
@@ -438,7 +438,7 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
return (struct dx_countlimit *)(((void *)dirent) + count_offset);
}
-static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry_2 *dirent,
int count_offset, int count, struct dx_tail *t)
{
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -456,7 +456,7 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
}
static int ext4_dx_csum_verify(struct inode *inode,
- struct ext4_dir_entry *dirent)
+ struct ext4_dir_entry_2 *dirent)
{
struct dx_countlimit *c;
struct dx_tail *t;
@@ -485,7 +485,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
return 1;
}
-static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry_2 *dirent)
{
struct dx_countlimit *c;
struct dx_tail *t;
@@ -515,7 +515,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh)
{
- ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ ext4_dx_csum_set(inode, (struct ext4_dir_entry_2 *)bh->b_data);
return ext4_handle_dirty_metadata(handle, inode, bh);
}
@@ -1488,7 +1488,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
}
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
- struct ext4_dir_entry *de)
+ struct ext4_dir_entry_2 *de)
{
struct super_block *sb = dir->i_sb;
@@ -1619,7 +1619,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
}
if (!buffer_verified(bh) &&
!is_dx_internal_node(dir, block,
- (struct ext4_dir_entry *)bh->b_data) &&
+ (struct ext4_dir_entry_2 *)bh->b_data) &&
!ext4_dirblock_csum_verify(dir, bh)) {
EXT4_ERROR_INODE_ERR(dir, EFSBADCRC,
"checksumming directory "
--
2.43.7
^ permalink raw reply related
* [PATCH v2 00/10] Data in direntry (dirdata) feature
From: Artem Blagodarenko @ 2026-06-10 15:24 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger, syzbot
EXT4 currently stores a hash in the directory entry
(dirent) immediately after the file name to support
simultaneous fscrypt and casefold functionality.
It has been discussed within the EXT4 community that
this hash could instead be stored in dirdata. This
would make it the second (or third, in the case of
64-bit inode counts) user of dirdata.
At the same time, the existing format—where the hash
is placed after the file name—must continue to be
supported. With these patches, EXT4 can handle the
hash in both formats.
The first user of this feature is LUFID -
Locally Unique File ID.
Support for fscrypt and case-insensitive directories
with dirdata enabled has been verified using a
dedicated xfstest submitted to the xfstests list as
a separate patch.
e2fsprogs support is provided in a separate patch
series.
Changes in v2:
- Split the patch set into 10 smaller patches for
easier reading and review.
- Added an IOCTL to set the LUFID for testing purposes.
LUFIDs can be listed via debugfs. Corresponding support
has been added in the related e2fsprogs series.
- Removed the dirdata mount option.
- Fixed the following issue:
KASAN: slab-out-of-bounds read in __ext4_check_dir_entry
- Rebased onto the latest codebase.
Artem Blagodarenko (10):
ext4: replace ext4_dir_entry with ext4_dir_entry_2
ext4: add ext4_dir_entry_is_tail()
ext4: refactor dx_root to support variable dirent sizes
ext4: add dirdata format definitions and access helpers
ext4: preserve dirdata bits in get_dtype()
ext4: add ext4_dir_entry_len() and harden dirdata parsing
ext4: rename ext4_dir_rec_len() and clarify dirdata usage
ext4: dirdata feature
ext4: add dirdata set/get helpers
ext4: Add EXT4_IOC_SET_LUFID ioctl for setting LUFID on directory
entries
foofile.txt | 0
fs/ext4/dir.c | 9 +-
fs/ext4/ext4.h | 205 ++++++++++++-
fs/ext4/inline.c | 37 ++-
fs/ext4/ioctl.c | 62 ++++
fs/ext4/namei.c | 587 +++++++++++++++++++++++++++-----------
fs/ext4/sysfs.c | 2 +
include/uapi/linux/ext4.h | 13 +
8 files changed, 723 insertions(+), 192 deletions(-)
create mode 100644 foofile.txt
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Tested-by: syzbot@syzkaller.appspotmail.com
--
2.43.7
^ permalink raw reply
* [PATCH] iomap: enforce DIO alignment check in iomap
From: cem @ 2026-06-10 14:52 UTC (permalink / raw)
To: brauner
Cc: linux-block, linux-fsdevel, linux-ext4, linux-xfs,
Carlos Maiolino, Keith Busch, Hannes Reinecke, Martin K. Petersen,
Christoph Hellwig, Jens Axboe
From: Carlos Maiolino <cem@kernel.org>
The DIO alignment check has been lifted from the iomap layer to rely on the
block layer to enforce proper alignment when issuing direct IO
operations. This, though, depending on the IO size and buffer address
passed to the IO operation, may lead to a user-visible behavior change.
This has been caught initially by LTP test diotest4 running on
PPC architecture, where the test fails because a read() operation
with a supposedly misaligned buffer succeeds instead of an expected
-EINVAL.
This has no direct relationship with PPC, but seems to have to do with
whether the IO size crosses page boundaries or not.
The test allocates a 4k buffer, and then increments the buffer pointer
by a single byte to enforce a misaligned address. It then issues a 4k
read() using such buffer. The read is supposed to return an -EINVAL but
it ends up succeeding.
The allocated buffer is at least a single page, so a read() of a smaller
size will, most of the time, end up within the very same page initially
allocated, which seems to suffice for the block layer to accept the IO.
On x86 though, the same 4k read will end up crossing page boundaries
causing a bio_split which ends up properly checking the address and
rejecting it due to misalignment.
The test itself is buggy (which seems to be by design) because it ends up
attempting to read 4096 bytes into a 4095-byte buffer, but I believe the test
expected the address to be rejected prior to any write attempt.
The problematic behavior is reproducible on x86 by reducing the IO size
to something < PAGE_SIZE, so the misaligned read()s will also be accepted
by the block layer.
Fixing this is just a matter of enforcing daddr and memory
alignment back into iomap.
This behavior is reproducible in ext4 and xfs due to both relying on
the iomap layer; btrfs does not present this behavior change as it does its
own DIO alignment checking.
Fixes: 7eac33186957 ("iomap: simplify direct io validity check")
Cc: Keith Busch <kbusch@kernel.org>
Cc: Hannes Reinecke <hare@suse.de>
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
---
While I didn't spot any memory/disk corruption looking into this, it
changes the user behavior that dictates buffer addresses must be
properly aligned when issuing direct IO operations so I thought making
iomap check again for the buffer address alignment is reasonable.
fs/iomap/direct-io.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 95254aa1b654..0064984e64e5 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -400,6 +400,9 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
if ((pos | length) & (alignment - 1))
return -EINVAL;
+ if (iov_iter_alignment(dio->submit.iter) & (alignment - 1))
+ return -EINVAL;
+
if (dio->flags & IOMAP_DIO_WRITE) {
bool need_completion_work = true;
--
2.54.0
^ permalink raw reply related
* [PATCH v2] ext4: fix circular lock dependency in ext4_ext_migrate
From: Yun Zhou @ 2026-06-10 10:30 UTC (permalink / raw)
To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
yi.zhang, ebiggers, yun.zhou
Cc: linux-ext4, linux-kernel
In-Reply-To: <20260609084007.3432061-1-yun.zhou@windriver.com>
Move iput(tmp_inode) after ext4_writepages_up_write() to avoid a
circular lock dependency between s_writepages_rwsem and sb_internal
(freeze protection).
The deadlock scenario:
CPU0 (EXT4_IOC_MIGRATE) CPU1 (orphan cleanup during mount)
---- ----
ext4_ext_migrate()
ext4_writepages_down_write()
s_writepages_rwsem (write)
ext4_evict_inode()
sb_start_intwrite() [sb_internal]
...
ext4_writepages()
s_writepages_rwsem (read) [BLOCKED]
iput(tmp_inode)
ext4_evict_inode()
sb_start_intwrite() [BLOCKED]
The tmp_inode is a temporary inode with nlink=0 created solely for
building the extent tree. Its eviction does not require
s_writepages_rwsem protection, so deferring iput() until after
releasing the rwsem is safe.
Reported-by: syzbot+f0b58a1f5075a90dd9a5@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=f0b58a1f5075a90dd9a5
Fixes: cb85f4d23f79 ("ext4: fix race between writepages and enabling EXT4_EXTENTS_FL")
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
v2: remove redundant null pointer check for iput(tmp_inode)
fs/ext4/migrate.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 477d43d7e294..5d60ef10fe11 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -464,6 +464,7 @@ int ext4_ext_migrate(struct inode *inode)
if (IS_ERR(tmp_inode)) {
retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
+ tmp_inode = NULL;
goto out_unlock;
}
/*
@@ -591,9 +592,9 @@ int ext4_ext_migrate(struct inode *inode)
ext4_journal_stop(handle);
out_tmp_inode:
unlock_new_inode(tmp_inode);
- iput(tmp_inode);
out_unlock:
ext4_writepages_up_write(inode->i_sb, alloc_ctx);
+ iput(tmp_inode);
return retval;
}
--
2.43.0
^ permalink raw reply related
* Re: [PATCH] ext4: fix circular lock dependency in ext4_ext_migrate
From: Jan Kara @ 2026-06-10 10:21 UTC (permalink / raw)
To: Zhou, Yun
Cc: Jan Kara, tytso, adilger.kernel, libaokun, ojaswin, ritesh.list,
yi.zhang, ebiggers, linux-ext4, linux-kernel
In-Reply-To: <7fe6eec7-acd1-4511-beb7-bac9bbdb9cb2@windriver.com>
On Wed 10-06-26 15:04:33, Zhou, Yun wrote:
>
>
> On 6/9/26 20:05, Jan Kara wrote:
> > Looks good. Feel free to add:
> >
> > Reviewed-by: Jan Kara <jack@suse.cz>
> >
> > Just one nit below:
> >
> > > @@ -591,9 +592,10 @@ int ext4_ext_migrate(struct inode *inode)
> > > ext4_journal_stop(handle);
> > > out_tmp_inode:
> > > unlock_new_inode(tmp_inode);
> > > - iput(tmp_inode);
> > > out_unlock:
> > > ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> > > + if (tmp_inode)
> > > + iput(tmp_inode);
> > iput(NULL) is properly handled so you don't need the if (tmp_inode) check
> > here.
> Hi Jan,
>
> Thank you for your careful review. Should I remove this redundant check in
> v2?
Yes, please. Thank you!
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH net v2] ext4: fix out-of-bounds read in ext4_read_inline_dir()
From: Jan Kara @ 2026-06-10 10:01 UTC (permalink / raw)
To: Xiang Mei
Cc: linux-ext4, Theodore Ts'o, Andreas Dilger, Baokun Li,
Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Weiming Shi
In-Reply-To: <20260609010739.2278172-1-xmei5@asu.edu>
What does the 'net' in [PATCH net v2] mean?
On Mon 08-06-26 18:07:39, Xiang Mei wrote:
> ext4_read_inline_dir() reads de->rec_len / de->name past the end of its
> inline buffer for a crafted or corrupted inline directory, triggering a
> slab-out-of-bounds read during getdents64():
>
> BUG: KASAN: slab-out-of-bounds in filldir64 (fs/readdir.c:371)
> Read of size 8 at addr ffff88800fd3da3c by task exploit/146
> ...
> kasan_report (mm/kasan/report.c:595)
> filldir64 (fs/readdir.c:371)
> iterate_dir (fs/readdir.c:110)
> ...
>
> The payload is copied into a buffer of exactly inline_size bytes:
>
> dir_buf = kmalloc(inline_size, GFP_NOFS);
>
> but iteration runs in a logical position space extra_offset bytes larger
> than the buffer (extra_size = extra_offset + inline_size), so the synthetic
> "." and ".." entries land at the offsets they would have in a block-based
> directory. A real dirent is formed at "dir_buf + pos - extra_offset", yet
> the loop bounds and the ext4_check_dir_entry() length argument are all
> expressed in the larger extra_size. Two reachable sites dereference a
> dirent before confirming its physical offset is inside the allocation:
>
> In the main loop, ctx->pos is attacker-controlled via lseek() and the entry
> is validated with extra_size, so ext4_check_dir_entry() accepts a dirent
> running up to extra_offset bytes past the allocation before its length
> check fires. ctx->pos is also a signed loff_t: an lseek() to a small value
> below extra_offset makes "ctx->pos - extra_offset" negative, so a check
> that only bounds the top of the buffer is bypassed by underflow and de is
> formed before dir_buf.
>
> In the cookie-rescan loop, entered when i_version changed since the last
> readdir(2), the walk restarts from the beginning with i bounded by
> extra_size, so as i approaches extra_size the unconditional read of
> de->rec_len runs past the allocation before any validation.
>
> Both are the same defect, logical extra_size space versus the physical
> inline_size buffer. In each loop, reject a dirent whose header would not
> fit within inline_size before forming de, and in the main loop also reject a
> position that underflows below extra_offset. Validate the main-loop entry
> against inline_size rather than extra_size. Entries that legitimately fill
> the inline data still pass.
>
> Fixes: c4d8b0235aa9 ("ext4: fix readdir error in case inline_data+^dir_index.")
> Reported-by: Weiming Shi <bestswngs@gmail.com>
> Assisted-by: Claude:claude-opus-4-8
> Signed-off-by: Xiang Mei <xmei5@asu.edu>
Thanks for the analysis and the patch. See some suggestions for improvement
below:
> @@ -1488,10 +1491,20 @@ int ext4_read_inline_dir(struct file *file,
> continue;
> }
>
> + /*
> + * de lives at dir_buf + ctx->pos - extra_offset, so the dirent
> + * header must fit within inline_size. ctx->pos is a signed,
> + * lseek()-controlled loff_t: check the lower bound first, or
> + * ctx->pos < extra_offset underflows and points de before dir_buf.
> + */
> + if (ctx->pos < extra_offset ||
> + ctx->pos - extra_offset + ext4_dir_rec_len(1, NULL) >
> + inline_size)
> + goto out;
So I don't think this is really possible. ctx->pos isn't really fully user
controlled. When you use seek to modify ctx->pos, ext4_dir_llseek() does
set info->cookie to invalid value so the next time we enter
ext4_read_inline_dir() we are guaranteed to revalidate the offset and reset
it to 0, dotdot_offset, or some value greater than extra_size.
> de = (struct ext4_dir_entry_2 *)
> (dir_buf + ctx->pos - extra_offset);
> if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
> - extra_size, ctx->pos))
> + inline_size, ctx->pos))
> goto out;
> if (le32_to_cpu(de->inode)) {
> if (!dir_emit(ctx, de->name, de->name_len,
Otherwise the patch looks good.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* [syzbot ci] Re: ext4: move inline data cleanup to ext4_writepages to fix deadlock
From: syzbot ci @ 2026-06-10 8:06 UTC (permalink / raw)
To: adilger.kernel, daeho.jeong, jack, libaokun, linux-ext4,
linux-kernel, ojaswin, ritesh.list, tytso, yi.zhang, yun.zhou
Cc: syzbot, syzkaller-bugs
In-Reply-To: <20260609154505.2104659-1-yun.zhou@windriver.com>
syzbot ci has tested the following series
[v1] ext4: move inline data cleanup to ext4_writepages to fix deadlock
https://lore.kernel.org/all/20260609154505.2104659-1-yun.zhou@windriver.com
* [PATCH] ext4: move inline data cleanup to ext4_writepages to fix deadlock
and found the following issue:
kernel BUG in ext4_writepages
Full report is available here:
https://ci.syzbot.org/series/1ede6029-df2a-4e08-bffc-05540c1f4934
***
kernel BUG in ext4_writepages
tree: torvalds
URL: https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base: 2d3090a8aeb596a26935db0955d46c9a5db5c6ce
arch: amd64
compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
config: https://ci.syzbot.org/builds/63ee0324-2d17-4b32-aca2-c6230ff64be6/config
syz repro: https://ci.syzbot.org/findings/676a447c-ea73-43ea-9949-054dac1961e5/syz_repro
EXT4-fs warning (device loop2): ext4_expand_extra_isize_ea:2860: Unable to expand inode 15. Delete some EAs or run e2fsck.
------------[ cut here ]------------
kernel BUG at fs/ext4/inode.c:3047!
Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
CPU: 1 UID: 0 PID: 5875 Comm: syz.2.19 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:ext4_writepages+0x622/0x630 fs/ext4/inode.c:3046
Code: ff e9 61 fc ff ff 44 89 f1 80 e1 07 80 c1 03 38 c1 0f 8c de fc ff ff 4c 89 f7 e8 f9 2f a8 ff e9 d1 fc ff ff e8 ef d7 3c ff 90 <0f> 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90
RSP: 0018:ffffc900034df2e0 EFLAGS: 00010293
RAX: ffffffff8288dfb1 RBX: 1ffff9200069be60 RCX: ffff888110555940
RDX: 0000000000000000 RSI: 0000004000000000 RDI: 0000000000000000
RBP: ffffc900034df410 R08: ffff8881b48c2f0f R09: 1ffff110369185e1
R10: dffffc0000000000 R11: ffffed10369185e2 R12: dffffc0000000000
R13: 0000004000000000 R14: 0000004610000000 R15: 1ffff11020c6fcc5
FS: 00007f033e9346c0(0000) GS:ffff8882a92a0000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005556842f3058 CR3: 000000016d5c0000 CR4: 00000000000006f0
Call Trace:
<TASK>
do_writepages+0x32e/0x550 mm/page-writeback.c:2571
__writeback_single_inode+0x133/0x10e0 fs/fs-writeback.c:1764
writeback_single_inode+0x4ac/0xdc0 fs/fs-writeback.c:1883
write_inode_now+0x1c2/0x290 fs/fs-writeback.c:2974
iput_final fs/inode.c:1950 [inline]
iput+0x8c1/0xe80 fs/inode.c:2009
ext4_orphan_cleanup+0xc38/0x1470 fs/ext4/orphan.c:472
__ext4_fill_super fs/ext4/super.c:5701 [inline]
ext4_fill_super+0x5a19/0x6330 fs/ext4/super.c:5824
get_tree_bdev_flags+0x431/0x4f0 fs/super.c:1694
vfs_get_tree+0x92/0x2a0 fs/super.c:1754
fc_mount fs/namespace.c:1193 [inline]
do_new_mount_fc fs/namespace.c:3758 [inline]
do_new_mount+0x341/0xd30 fs/namespace.c:3834
do_mount fs/namespace.c:4167 [inline]
__do_sys_mount fs/namespace.c:4383 [inline]
__se_sys_mount+0x31d/0x420 fs/namespace.c:4360
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f033d99e0ca
Code: 48 c7 c2 e8 ff ff ff f7 d8 64 89 02 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f033e933e58 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 00007f033e933ee0 RCX: 00007f033d99e0ca
RDX: 0000200000000040 RSI: 00002000000016c0 RDI: 00007f033e933ea0
RBP: 0000200000000040 R08: 00007f033e933ee0 R09: 000000000000840e
R10: 000000000000840e R11: 0000000000000246 R12: 00002000000016c0
R13: 00007f033e933ea0 R14: 000000000000042f R15: 0000200000000080
</TASK>
Modules linked in:
---[ end trace 0000000000000000 ]---
RIP: 0010:ext4_writepages+0x622/0x630 fs/ext4/inode.c:3046
Code: ff e9 61 fc ff ff 44 89 f1 80 e1 07 80 c1 03 38 c1 0f 8c de fc ff ff 4c 89 f7 e8 f9 2f a8 ff e9 d1 fc ff ff e8 ef d7 3c ff 90 <0f> 0b 66 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90
RSP: 0018:ffffc900034df2e0 EFLAGS: 00010293
RAX: ffffffff8288dfb1 RBX: 1ffff9200069be60 RCX: ffff888110555940
RDX: 0000000000000000 RSI: 0000004000000000 RDI: 0000000000000000
RBP: ffffc900034df410 R08: ffff8881b48c2f0f R09: 1ffff110369185e1
R10: dffffc0000000000 R11: ffffed10369185e2 R12: dffffc0000000000
R13: 0000004000000000 R14: 0000004610000000 R15: 1ffff11020c6fcc5
FS: 00007f033e9346c0(0000) GS:ffff8882a92a0000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005556842f3058 CR3: 000000016d5c0000 CR4: 00000000000006f0
***
If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com
---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
To test a patch for this bug, please reply with `#syz test`
(should be on a separate line).
The patch should be attached to the email.
Note: arguments like custom git repos and branches are not supported.
^ permalink raw reply
* Re: [PATCH] ext4: fix circular lock dependency in ext4_ext_migrate
From: Zhou, Yun @ 2026-06-10 7:04 UTC (permalink / raw)
To: Jan Kara
Cc: tytso, adilger.kernel, libaokun, ojaswin, ritesh.list, yi.zhang,
ebiggers, linux-ext4, linux-kernel
In-Reply-To: <lr2gyeoay4eai2nujk3siaq7wnqwg3t46an6sipqkmhxarvcrb@tqxhmnstmwnv>
On 6/9/26 20:05, Jan Kara wrote:
> Looks good. Feel free to add:
>
> Reviewed-by: Jan Kara <jack@suse.cz>
>
> Just one nit below:
>
>> @@ -591,9 +592,10 @@ int ext4_ext_migrate(struct inode *inode)
>> ext4_journal_stop(handle);
>> out_tmp_inode:
>> unlock_new_inode(tmp_inode);
>> - iput(tmp_inode);
>> out_unlock:
>> ext4_writepages_up_write(inode->i_sb, alloc_ctx);
>> + if (tmp_inode)
>> + iput(tmp_inode);
> iput(NULL) is properly handled so you don't need the if (tmp_inode) check
> here.
Hi Jan,
Thank you for your careful review. Should I remove this redundant check
in v2?
BR,
Yun
^ permalink raw reply
* Re: [PATCH RFC 7/8] erofs: open via dedicated fs bdev helpers
From: Gao Xiang @ 2026-06-10 6:55 UTC (permalink / raw)
To: Christian Brauner
Cc: Jens Axboe, Alexander Viro, linux-block, linux-kernel,
linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
Gao Xiang, linux-erofs, Christoph Hellwig, Jan Kara
In-Reply-To: <20260603-nieder-ausdehnen-siebdruck-aa96f40ebec6@brauner>
Hi Christian,
On 2026/6/3 21:42, Christian Brauner wrote:
>> May I ask if it's an urgent 7.2 work? If not, I could
>
> No no, it's way too late for that this cycle.
>
>> make a preparation patch for the upcoming 7.2 cycle
>> to handle erofs_map_dev() failure here so you don't
>> need to bother with this in this patchset.
>
> Sounds good. I take it you can just do this yourself without me.
>
>> I will seek more time to resolve the recent todos
>
> Thanks!
>
>> yet always intercepted by other unrelated stuffs.
>
> :)
I removed the .shutdown() and .remove_bdev() implementations since I
don't think they are necessary for immutable filesystems, but I
would like to know your thoughts too. My overall comments are
documented in the commit message below:
From 933f6c6f2e704116d9a15815c880196bec7b9ee3 Mon Sep 17 00:00:00 2001
From: Christian Brauner <brauner@kernel.org>
Date: Tue, 2 Jun 2026 12:10:13 +0200
Subject: [PATCH] erofs: open via dedicated fs bdev helpers
Route opens through fs_bdev_file_open_by_path() so each external device
is registered against the correct superblock, and convert the matching
releases.
Gao Xiang: I think typical immutable filesystems don't need .shutdown()
and .remove_bdev() for the following reasons:
- blk_mark_disk_dead() sets GD_DEAD in advance of fs_bdev_mark_dead()
so that the following bios will fail immediately; block_device
references are still valid so it seems overkill to handle dead
blockdevs in the deep filesystem I/O submission path.
- Immutable filesystems like EROFS don't have write paths and journals,
so they don't need to block writes (i.e., new dirty pages), metadata
changes, and abort journals.
- The comment above loop_change_fd() documents a valid read-only use
case we need to support anyway, but it calls disk_force_media_change()
which will call fs_bdev_mark_dead() later: we don't want loop_change_fd()
to shut down the active filesystems and return -EIO unconditionally.
Currently I think the default behavior (shrink_dcache_sb + evict_inodes)
in fs_bdev_mark_dead() is enough for immutable filesystems; this is
documented in the commit message here for later reference.
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
Signed-off-by: Gao Xiang <hsiangkao@linux.alibaba.com>
---
fs/erofs/super.c | 35 +++++++++++++++++++++++------------
1 file changed, 23 insertions(+), 12 deletions(-)
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 802add6652fd..def9cbfbc9d8 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -153,8 +153,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
} else if (!sbi->devs->flatdev) {
file = erofs_is_fileio_mode(sbi) ?
filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) :
- bdev_file_open_by_path(dif->path,
- BLK_OPEN_READ, sb->s_type, NULL);
+ fs_bdev_file_open_by_path(dif->path,
+ BLK_OPEN_READ, sb->s_type, sb);
if (IS_ERR(file)) {
if (file == ERR_PTR(-ENOTBLK))
return -EINVAL;
@@ -843,11 +843,16 @@ static int erofs_fc_reconfigure(struct fs_context *fc)
static int erofs_release_device_info(int id, void *ptr, void *data)
{
+ struct super_block *sb = data;
struct erofs_device_info *dif = ptr;
fs_put_dax(dif->dax_dev, NULL);
- if (dif->file)
- fput(dif->file);
+ if (dif->file) {
+ if (S_ISBLK(file_inode(dif->file)->i_mode))
+ fs_bdev_file_release(dif->file, sb);
+ else
+ fput(dif->file);
+ }
erofs_fscache_unregister_cookie(dif->fscache);
dif->fscache = NULL;
kfree(dif->path);
@@ -855,18 +860,19 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
return 0;
}
-static void erofs_free_dev_context(struct erofs_dev_context *devs)
+static void erofs_free_dev_context(struct erofs_dev_context *devs,
+ struct super_block *sb)
{
if (!devs)
return;
- idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+ idr_for_each(&devs->tree, &erofs_release_device_info, sb);
idr_destroy(&devs->tree);
kfree(devs);
}
-static void erofs_sb_free(struct erofs_sb_info *sbi)
+static void erofs_sb_free(struct erofs_sb_info *sbi, struct super_block *sb)
{
- erofs_free_dev_context(sbi->devs);
+ erofs_free_dev_context(sbi->devs, sb);
kfree(sbi->fsid);
kfree_sensitive(sbi->domain_id);
if (sbi->dif0.file)
@@ -879,8 +885,13 @@ static void erofs_fc_free(struct fs_context *fc)
{
struct erofs_sb_info *sbi = fc->s_fs_info;
- if (sbi) /* free here if an error occurs before transferring to sb */
- erofs_sb_free(sbi);
+ /*
+ * Freed here only if an error occurs before the sb is set up; at that
+ * point no block-backed device has been claimed (that happens in
+ * fill_super), so the NULL sb never reaches fs_bdev_file_release().
+ */
+ if (sbi)
+ erofs_sb_free(sbi, NULL);
}
static const struct fs_context_operations erofs_context_ops = {
@@ -936,7 +947,7 @@ static void erofs_kill_sb(struct super_block *sb)
erofs_drop_internal_inodes(sbi);
fs_put_dax(sbi->dif0.dax_dev, NULL);
erofs_fscache_unregister_fs(sb);
- erofs_sb_free(sbi);
+ erofs_sb_free(sbi, sb);
sb->s_fs_info = NULL;
}
@@ -948,7 +959,7 @@ static void erofs_put_super(struct super_block *sb)
erofs_shrinker_unregister(sb);
erofs_xattr_prefixes_cleanup(sb);
erofs_drop_internal_inodes(sbi);
- erofs_free_dev_context(sbi->devs);
+ erofs_free_dev_context(sbi->devs, sb);
sbi->devs = NULL;
erofs_fscache_unregister_fs(sb);
}
--
2.43.5
^ permalink raw reply related
* [PATCH v3] ext4: drop s_writepages_rwsem around inline data handling in writepages
From: Yun Zhou @ 2026-06-10 6:37 UTC (permalink / raw)
To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
yi.zhang, ebiggers, yun.zhou
Cc: linux-ext4, linux-kernel
In-Reply-To: <20260609154505.2104659-1-yun.zhou@windriver.com>
ext4_do_writepages() calls ext4_destroy_inline_data() which acquires
xattr_sem while s_writepages_rwsem is held (read). This creates a
circular lock dependency:
CPU0 CPU1
---- ----
ext4_writepages()
ext4_writepages_down_read()
[holds s_writepages_rwsem]
ext4_evict_inode()
__ext4_mark_inode_dirty()
ext4_expand_extra_isize_ea()
ext4_xattr_block_set()
[holds xattr_sem]
iput(old_bh inode)
write_inode_now()
ext4_writepages()
ext4_writepages_down_read()
[BLOCKED on s_writepages_rwsem]
ext4_do_writepages()
ext4_destroy_inline_data()
down_write(xattr_sem)
[BLOCKED on xattr_sem]
Fix by temporarily dropping s_writepages_rwsem for the entire inline
data handling block, including the journal handle start/stop. The
rwsem must be dropped before ext4_journal_start() -- not between
journal_start and journal_stop -- to avoid a secondary deadlock with
ext4_change_inode_journal_flag() which takes rwsem (write) and then
calls jbd2_journal_lock_updates() waiting for active handles to stop.
This is safe because:
- This code runs before any block mapping or IO submission, so no
writepages state depends on the rwsem being held at this point.
- Inline data destruction is a one-way format transition (once cleared,
EXT4_INODE_INLINE_DATA is never set again). The rwsem is
re-acquired after journal_stop, ensuring format stability for the
remainder of writepages.
- The can_map flag identifies the ext4_writepages() path (holds rwsem)
vs ext4_normal_submit_inode_data_buffers() (does not), so the
drop/reacquire is skipped when the rwsem is not held.
Also check the return value of ext4_destroy_inline_data() to avoid
proceeding with an inconsistent inode format on failure.
Reported-by: syzbot+bb2455d02bda0b5701e3@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=bb2455d02bda0b5701e3
Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages")
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
v3: Drop s_writepages_rwsem before ext4_journal_start() and reacquire
after ext4_journal_stop(), instead of dropping between journal_start
and journal_stop as in v2. This avoids two issues identified in v2
review:
- memalloc_nofs_restore() in ext4_writepages_up_read() would clear
PF_MEMALLOC_NOFS while the jbd2 handle is active.
- Reacquiring s_writepages_rwsem while holding a handle creates an
ABBA deadlock with ext4_change_inode_journal_flag() which takes
the rwsem (write) then calls jbd2_journal_lock_updates().
v2: Instead of moving inline data handling to ext4_writepages(),
temporarily drop s_writepages_rwsem around ext4_destroy_inline_data()
in ext4_do_writepages(). The move approach had a race where concurrent
writes could create dirty pages with inline data after the early check,
and unconditional destruction without dirty pages would lose data.
v1: Moved inline data cleanup from ext4_do_writepages() to
ext4_writepages() before acquiring s_writepages_rwsem.
fs/ext4/inode.c | 31 ++++++++++++++++++++++++++-----
1 file changed, 26 insertions(+), 5 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..cd7588a3fa45 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1694,6 +1694,9 @@ struct mpage_da_data {
struct writeback_control *wbc;
unsigned int can_map:1; /* Can writepages call map blocks? */
+ /* Saved memalloc context from ext4_writepages_down_read() */
+ int alloc_ctx;
+
/* These are internal state of ext4_do_writepages() */
loff_t start_pos; /* The start pos to write */
loff_t next_pos; /* Current pos to examine */
@@ -2816,16 +2819,35 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
* we'd better clear the inline data here.
*/
if (ext4_has_inline_data(inode)) {
- /* Just inode will be modified... */
+ /*
+ * Temporarily drop s_writepages_rwsem because
+ * ext4_destroy_inline_data() acquires xattr_sem, which has
+ * a higher lock ordering rank. Holding both would create a
+ * circular dependency with ext4_xattr_block_set() -> iput()
+ * -> ext4_writepages() -> s_writepages_rwsem.
+ *
+ * Drop the rwsem before starting the journal handle to also
+ * avoid a deadlock with ext4_change_inode_journal_flag(),
+ * which takes rwsem (write) then jbd2_journal_lock_updates().
+ */
+ if (mpd->can_map)
+ ext4_writepages_up_read(inode->i_sb, mpd->alloc_ctx);
handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle)) {
+ if (mpd->can_map)
+ mpd->alloc_ctx =
+ ext4_writepages_down_read(inode->i_sb);
ret = PTR_ERR(handle);
goto out_writepages;
}
BUG_ON(ext4_test_inode_state(inode,
EXT4_STATE_MAY_INLINE_DATA));
- ext4_destroy_inline_data(handle, inode);
+ ret = ext4_destroy_inline_data(handle, inode);
ext4_journal_stop(handle);
+ if (mpd->can_map)
+ mpd->alloc_ctx = ext4_writepages_down_read(inode->i_sb);
+ if (ret)
+ goto out_writepages;
}
/*
@@ -3032,13 +3054,12 @@ static int ext4_writepages(struct address_space *mapping,
.can_map = 1,
};
int ret;
- int alloc_ctx;
ret = ext4_emergency_state(sb);
if (unlikely(ret))
return ret;
- alloc_ctx = ext4_writepages_down_read(sb);
+ mpd.alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
/*
* For data=journal writeback we could have come across pages marked
@@ -3047,7 +3068,7 @@ static int ext4_writepages(struct address_space *mapping,
*/
if (!ret && mpd.journalled_more_data)
ret = ext4_do_writepages(&mpd);
- ext4_writepages_up_read(sb, alloc_ctx);
+ ext4_writepages_up_read(sb, mpd.alloc_ctx);
return ret;
}
--
2.43.0
^ permalink raw reply related
* [PATCH v2] ext4: drop s_writepages_rwsem around ext4_destroy_inline_data
From: Yun Zhou @ 2026-06-10 5:08 UTC (permalink / raw)
To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
yi.zhang, ebiggers, yun.zhou
Cc: linux-ext4, linux-kernel
In-Reply-To: <20260609154505.2104659-1-yun.zhou@windriver.com>
ext4_do_writepages() calls ext4_destroy_inline_data() which acquires
xattr_sem while s_writepages_rwsem is held (read). This creates a
circular lock dependency:
CPU0 CPU1
---- ----
ext4_writepages()
ext4_writepages_down_read()
[holds s_writepages_rwsem]
ext4_evict_inode()
__ext4_mark_inode_dirty()
ext4_expand_extra_isize_ea()
ext4_xattr_block_set()
[holds xattr_sem]
iput(old_bh inode)
write_inode_now()
ext4_writepages()
ext4_writepages_down_read()
[BLOCKED on s_writepages_rwsem]
ext4_do_writepages()
ext4_destroy_inline_data()
down_write(xattr_sem)
[BLOCKED on xattr_sem]
Fix by temporarily dropping s_writepages_rwsem around the call to
ext4_destroy_inline_data(). This is safe because:
- This code runs before any block mapping or IO submission, so no
writepages state depends on the rwsem being held at this point.
- Inline data destruction is a one-way format transition (once cleared,
EXT4_INODE_INLINE_DATA is never set again). The rwsem is
re-acquired immediately after, ensuring format stability for the
remainder of writepages.
- The can_map flag naturally identifies the ext4_writepages() path
(holds rwsem) vs ext4_normal_submit_inode_data_buffers() (does not),
so the drop/reacquire is skipped when the rwsem is not held.
Also check the return value of ext4_destroy_inline_data(). It was previously
ignored, so a failure would leave inline data intact while writepages
proceeded assuming a block-mapped layout.
Reported-by: syzbot+bb2455d02bda0b5701e3@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=bb2455d02bda0b5701e3
Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages")
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
v2:
- Instead of moving inline data handling to ext4_writepages(),
temporarily drop s_writepages_rwsem around ext4_destroy_inline_data()
in ext4_do_writepages(). The move approach had a race where concurrent
writes could create dirty pages with inline data after the early check,
and unconditional destruction without dirty pages would lose data.
fs/ext4/inode.c | 23 +++++++++++++++++++----
1 file changed, 19 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..7ec16adf4685 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1694,6 +1694,9 @@ struct mpage_da_data {
struct writeback_control *wbc;
unsigned int can_map:1; /* Can writepages call map blocks? */
+ /* Saved memalloc context from ext4_writepages_down_read() */
+ int alloc_ctx;
+
/* These are internal state of ext4_do_writepages() */
loff_t start_pos; /* The start pos to write */
loff_t next_pos; /* Current pos to examine */
@@ -2824,8 +2827,21 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
}
BUG_ON(ext4_test_inode_state(inode,
EXT4_STATE_MAY_INLINE_DATA));
- ext4_destroy_inline_data(handle, inode);
+ /*
+ * Temporarily drop s_writepages_rwsem because
+ * ext4_destroy_inline_data() acquires xattr_sem, which has
+ * a higher lock ordering rank. Holding both would create a
+ * circular dependency with ext4_xattr_block_set() -> iput()
+ * -> ext4_writepages() -> s_writepages_rwsem.
+ */
+ if (mpd->can_map)
+ ext4_writepages_up_read(inode->i_sb, mpd->alloc_ctx);
+ ret = ext4_destroy_inline_data(handle, inode);
+ if (mpd->can_map)
+ mpd->alloc_ctx = ext4_writepages_down_read(inode->i_sb);
ext4_journal_stop(handle);
+ if (ret)
+ goto out_writepages;
}
/*
@@ -3032,13 +3048,12 @@ static int ext4_writepages(struct address_space *mapping,
.can_map = 1,
};
int ret;
- int alloc_ctx;
ret = ext4_emergency_state(sb);
if (unlikely(ret))
return ret;
- alloc_ctx = ext4_writepages_down_read(sb);
+ mpd.alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
/*
* For data=journal writeback we could have come across pages marked
@@ -3047,7 +3062,7 @@ static int ext4_writepages(struct address_space *mapping,
*/
if (!ret && mpd.journalled_more_data)
ret = ext4_do_writepages(&mpd);
- ext4_writepages_up_read(sb, alloc_ctx);
+ ext4_writepages_up_read(sb, mpd.alloc_ctx);
return ret;
}
--
2.43.0
^ permalink raw reply related
* Re: [PATCH] ext4: fix kernel BUG in ext4_write_inline_data_end
From: Aditya Prakash Srivastava @ 2026-06-10 3:16 UTC (permalink / raw)
To: Jan Kara
Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Ojaswin Mujoo,
Ritesh Harjani, Zhang Yi, linux-ext4, linux-kernel,
syzbot+0c89d865531d053abb2d
In-Reply-To: <w7zmtkoa4ieb676gkl6m2ax5hp76dxr2rhkfzgqvlydvw4hpfr@hixijfpumliv>
Hi Jan,
Thank you for the review and for the Reviewed-by tag!
Best regards,
Aditya Prakash Srivastava
^ permalink raw reply
* [syzbot] [ext4?] KASAN: use-after-free Read in ext4_xattr_list_entries (2)
From: syzbot @ 2026-06-09 21:03 UTC (permalink / raw)
To: adilger.kernel, jack, libaokun, linux-ext4, linux-kernel, ojaswin,
ritesh.list, syzkaller-bugs, tytso, yi.zhang
Hello,
syzbot found the following issue on:
HEAD commit: 8e65320d91cd Merge tag 'drm-fixes-2026-06-06' of https://g..
git tree: upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=106aabec580000
kernel config: https://syzkaller.appspot.com/x/.config?x=b4166e8ea5fbf7e3
dashboard link: https://syzkaller.appspot.com/bug?extid=3fbf2337de43f5581aec
compiler: Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
Unfortunately, I don't have any reproducer for this issue yet.
Downloadable assets:
disk image (non-bootable): https://storage.googleapis.com/syzbot-assets/d900f083ada3/non_bootable_disk-8e65320d.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/0bee42e3c28b/vmlinux-8e65320d.xz
kernel image: https://storage.googleapis.com/syzbot-assets/57e4c1a3c321/bzImage-8e65320d.xz
IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+3fbf2337de43f5581aec@syzkaller.appspotmail.com
loop0: detected capacity change from 0 to 2048
EXT4-fs (loop0): mounted filesystem 00000000-0000-0000-0000-000000000000 r/w without journal. Quota mode: none.
cdc_ether 5-1:1.0: probe with driver cdc_ether failed with error -22
loop0: detected capacity change from 2048 to 64
EXT4-fs error (device loop0): xattr_find_entry:337: inode #15: comm syz.0.0: corrupted xattr entries
==================================================================
BUG: KASAN: use-after-free in ext4_xattr_list_entries+0x302/0x3d0 fs/ext4/xattr.c:724
Read of size 4 at addr ffff8880568de014 by task syz.0.0/5342
CPU: 0 UID: 0 PID: 5342 Comm: syz.0.0 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
Call Trace:
<TASK>
dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
print_address_description+0x55/0x1e0 mm/kasan/report.c:378
print_report+0x58/0x70 mm/kasan/report.c:482
kasan_report+0x117/0x150 mm/kasan/report.c:595
ext4_xattr_list_entries+0x302/0x3d0 fs/ext4/xattr.c:724
ext4_xattr_ibody_list fs/ext4/xattr.c:793 [inline]
ext4_listxattr+0x221/0x670 fs/ext4/xattr.c:818
vfs_listxattr fs/xattr.c:511 [inline]
listxattr+0x112/0x2a0 fs/xattr.c:933
filename_listxattr fs/xattr.c:966 [inline]
path_listxattrat+0x1a3/0x3f0 fs/xattr.c:993
do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f9709f9ce59
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f970af29fe8 EFLAGS: 00000246 ORIG_RAX: 00000000000000c3
RAX: ffffffffffffffda RBX: 00007f970a215fa0 RCX: 00007f9709f9ce59
RDX: 000000000000002d RSI: 0000200000000100 RDI: 0000200000000140
RBP: 00007f970a032d6f R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f970a216038 R14: 00007f970a215fa0 R15: 00007fff4c5d5648
</TASK>
The buggy address belongs to the physical page:
page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x568de
flags: 0x4fff00000000000(node=1|zone=1|lastcpupid=0x7ff)
raw: 04fff00000000000 ffffea00015a37c8 ffffea000141f208 0000000000000000
raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: kasan: bad access detected
page_owner info is not present (never set?)
Memory state around the buggy address:
ffff8880568ddf00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
ffff8880568ddf80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>ffff8880568de000: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
^
ffff8880568de080: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
ffff8880568de100: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
==================================================================
---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title
If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)
If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report
If you want to undo deduplication, reply with:
#syz undup
^ permalink raw reply
* [syzbot] Monthly ext4 report (Jun 2026)
From: syzbot @ 2026-06-09 20:32 UTC (permalink / raw)
To: linux-ext4, linux-kernel, syzkaller-bugs
Hello ext4 maintainers/developers,
This is a 31-day syzbot report for the ext4 subsystem.
All related reports/information can be found at:
https://syzkaller.appspot.com/upstream/s/ext4
During the period, 2 new issues were detected and 0 were fixed.
In total, 45 issues are still open and 175 have already been fixed.
There are also 8 low-priority issues.
Some of the still happening issues:
Ref Crashes Repro Title
<1> 10098 Yes possible deadlock in ext4_writepages (2)
https://syzkaller.appspot.com/bug?extid=eb5b4ef634a018917f3c
<2> 7580 Yes KASAN: out-of-bounds Read in ext4_xattr_set_entry
https://syzkaller.appspot.com/bug?extid=f792df426ff0f5ceb8d1
<3> 3212 Yes kernel BUG in ext4_do_writepages
https://syzkaller.appspot.com/bug?extid=d1da16f03614058fdc48
<4> 481 Yes possible deadlock in ext4_evict_inode (5)
https://syzkaller.appspot.com/bug?extid=212e8f62790f8e0bc63b
<5> 406 Yes possible deadlock in wait_transaction_locked (3)
https://syzkaller.appspot.com/bug?extid=5d19358d7eb30ffb0cc5
<6> 139 Yes KMSAN: uninit-value in fscrypt_crypt_data_unit
https://syzkaller.appspot.com/bug?extid=7add5c56bc2a14145d20
<7> 20 No possible deadlock in evict (4)
https://syzkaller.appspot.com/bug?extid=a30a00d3e694e4fa1315
<8> 10 No WARNING in ext4_write_inode (3)
https://syzkaller.appspot.com/bug?extid=070d9738dbe6a10fadc8
<9> 8 Yes INFO: task hung in block_read_full_folio (3)
https://syzkaller.appspot.com/bug?extid=03afbb29537f0336b7ad
<10> 2951 Yes INFO: task hung in sync_inodes_sb (5)
https://syzkaller.appspot.com/bug?extid=30476ec1b6dc84471133
---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.
To disable reminders for individual bugs, reply with the following command:
#syz set <Ref> no-reminders
To change bug's subsystems, reply with:
#syz set <Ref> subsystems: new-subsystem
You may send multiple commands in a single email message.
^ permalink raw reply
* [PATCH 5.10/5.15] ext4: validate p_idx bounds in ext4_ext_correct_indexes
From: Alexey Panov @ 2026-06-09 16:44 UTC (permalink / raw)
To: stable, Greg Kroah-Hartman
Cc: Alexey Panov, Theodore Ts'o, Andreas Dilger, linux-ext4,
linux-kernel, Baokun Li, Jan Kara, Ojaswin Mujoo,
Ritesh Harjani (IBM), Zhang Yi, lvc-project,
syzbot+04c4e65cab786a2e5b7e, Tejas Bharambe, stable
From: Tejas Bharambe <tejas.bharambe@outlook.com>
commit 2acb5c12ebd860f30e4faf67e6cc8c44ddfe5fe8 upstream.
ext4_ext_correct_indexes() walks up the extent tree correcting
index entries when the first extent in a leaf is modified. Before
accessing path[k].p_idx->ei_block, there is no validation that
p_idx falls within the valid range of index entries for that
level.
If the on-disk extent header contains a corrupted or crafted
eh_entries value, p_idx can point past the end of the allocated
buffer, causing a slab-out-of-bounds read.
Fix this by validating path[k].p_idx against EXT_LAST_INDEX() at
both access sites: before the while loop and inside it. Return
-EFSCORRUPTED if the index pointer is out of range, consistent
with how other bounds violations are handled in the ext4 extent
tree code.
Reported-by: syzbot+04c4e65cab786a2e5b7e@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=04c4e65cab786a2e5b7e
Signed-off-by: Tejas Bharambe <tejas.bharambe@outlook.com>
Link: https://patch.msgid.link/JH0PR06MB66326016F9B6AD24097D232B897CA@JH0PR06MB6632.apcprd06.prod.outlook.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@kernel.org
[ Alexey: Adapt goto clean to break because the clean error path is not
present in linux-5.10.y and linux-5.15.y. ]
Signed-off-by: Alexey Panov <apanov@astralinux.ru>
---
Backport fix for CVE-2026-31449
fs/ext4/extents.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 80b7783c65b4..e6dbb2dfb331 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1736,6 +1736,13 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode, path + k);
if (err)
return err;
+ if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_idx %p > EXT_LAST_INDEX %p",
+ k, path[k].p_idx,
+ EXT_LAST_INDEX(path[k].p_hdr));
+ return -EFSCORRUPTED;
+ }
path[k].p_idx->ei_block = border;
err = ext4_ext_dirty(handle, inode, path + k);
if (err)
@@ -1748,6 +1755,14 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
err = ext4_ext_get_access(handle, inode, path + k);
if (err)
break;
+ if (unlikely(path[k].p_idx > EXT_LAST_INDEX(path[k].p_hdr))) {
+ EXT4_ERROR_INODE(inode,
+ "path[%d].p_idx %p > EXT_LAST_INDEX %p",
+ k, path[k].p_idx,
+ EXT_LAST_INDEX(path[k].p_hdr));
+ err = -EFSCORRUPTED;
+ break;
+ }
path[k].p_idx->ei_block = border;
err = ext4_ext_dirty(handle, inode, path + k);
if (err)
--
2.47.3
^ permalink raw reply related
* [PATCH] ext4: move inline data cleanup to ext4_writepages to fix deadlock
From: Yun Zhou @ 2026-06-09 15:45 UTC (permalink / raw)
To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
yi.zhang, daeho.jeong
Cc: linux-ext4, linux-kernel, yun.zhou
ext4_do_writepages() calls ext4_destroy_inline_data() which acquires
xattr_sem while s_writepages_rwsem is held (read). This creates a
circular lock dependency with the xattr writeback path:
CPU0 CPU1
---- ----
ext4_writepages()
ext4_writepages_down_read()
[holds s_writepages_rwsem]
ext4_evict_inode()
__ext4_mark_inode_dirty()
ext4_expand_extra_isize_ea()
ext4_xattr_block_set()
[holds xattr_sem]
iput(old_bh inode)
write_inode_now()
ext4_writepages()
ext4_writepages_down_read()
[BLOCKED on s_writepages_rwsem]
ext4_do_writepages()
ext4_destroy_inline_data()
down_write(xattr_sem)
[BLOCKED on xattr_sem]
Move inline data destruction from ext4_do_writepages() into
ext4_writepages(), before acquiring s_writepages_rwsem.
This is safe because the other caller of ext4_do_writepages()
(ext4_normal_submit_inode_data_buffers, invoked by jbd2 during commit)
can never encounter inline data: jbd2 only tracks inodes with
block-mapped dirty ranges registered via ext4_jbd2_inode_add_write(),
and all such registration paths either explicitly bail out when inline
data is present (ext4_journalled_write_end) or are logically
unreachable for inline data inodes (ext4_map_blocks requires block
allocation, ext4_block_zero_eof requires existing blocks).
Reported-by: syzbot+bb2455d02bda0b5701e3@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=bb2455d02bda0b5701e3
Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages")
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
fs/ext4/inode.c | 47 +++++++++++++++++++++++++++++------------------
1 file changed, 29 insertions(+), 18 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..0c7461ab4fd0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2810,24 +2810,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
if (unlikely(ret))
goto out_writepages;
- /*
- * If we have inline data and arrive here, it means that
- * we will soon create the block for the 1st page, so
- * we'd better clear the inline data here.
- */
- if (ext4_has_inline_data(inode)) {
- /* Just inode will be modified... */
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- goto out_writepages;
- }
- BUG_ON(ext4_test_inode_state(inode,
- EXT4_STATE_MAY_INLINE_DATA));
- ext4_destroy_inline_data(handle, inode);
- ext4_journal_stop(handle);
- }
-
/*
* data=journal mode does not do delalloc so we just need to writeout /
* journal already mapped buffers. On the other hand we need to commit
@@ -3038,6 +3020,35 @@ static int ext4_writepages(struct address_space *mapping,
if (unlikely(ret))
return ret;
+ /*
+ * Clearing inline data acquires xattr_sem, which ranks above
+ * s_writepages_rwsem. Do it here before taking the rwsem to avoid
+ * a circular dependency:
+ * ext4_writepages (s_writepages_rwsem) -> ext4_destroy_inline_data
+ * (xattr_sem)
+ * ext4_xattr_block_set (xattr_sem) -> iput -> ext4_writepages
+ * (s_writepages_rwsem)
+ *
+ * This is only needed in the ext4_writepages() path. The other
+ * caller of ext4_do_writepages() -- ext4_normal_submit_inode_data_buffers
+ * (jbd2 commit callback) -- cannot encounter inline data because jbd2
+ * only tracks inodes with block-mapped dirty ranges registered via
+ * ext4_jbd2_inode_add_write(), and all such callers either bail out
+ * for inline data inodes (e.g. ext4_journalled_write_end) or are
+ * unreachable for them (ext4_map_blocks, ext4_block_zero_eof).
+ */
+ if (ext4_has_inline_data(mapping->host)) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(mapping->host, EXT4_HT_INODE, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ BUG_ON(ext4_test_inode_state(mapping->host,
+ EXT4_STATE_MAY_INLINE_DATA));
+ ext4_destroy_inline_data(handle, mapping->host);
+ ext4_journal_stop(handle);
+ }
+
alloc_ctx = ext4_writepages_down_read(sb);
ret = ext4_do_writepages(&mpd);
/*
--
2.43.0
^ permalink raw reply related
* Re: [PATCH v6 01/11] fstests: add _loop_image_create_clone() helper
From: Darrick J. Wong @ 2026-06-09 14:37 UTC (permalink / raw)
To: Anand Suveer Jain
Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
zlang, hch
In-Reply-To: <9c0989d8-202f-42ab-9347-df082c25aa72@kernel.org>
On Mon, Jun 08, 2026 at 10:39:04PM +0800, Anand Suveer Jain wrote:
> On 29/5/26 12:27, Darrick J. Wong wrote:
> > On Thu, May 28, 2026 at 12:05:32PM +0800, Anand Jain wrote:
> > > Introduce _loop_image_create_clone() and _loop_image_destroy() to mkfs an
> > > image file and clone it to another image file, and attach a loop device to
> > > them. And its destroy part.
> > >
> > > Signed-off-by: Anand Jain <asj@kernel.org>
> > > ---
> > > common/rc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 63 insertions(+)
> > >
> > > diff --git a/common/rc b/common/rc
> > > index 79189e7e6e94..d7e3e0bdfb1e 100644
> > > --- a/common/rc
> > > +++ b/common/rc
> > > @@ -1520,6 +1520,69 @@ _scratch_resvblks()
> > > esac
> > > }
> > > +# Create a small loop image, run an optional tuning function ($2) on it,
> > > +# clone it, and attach both to loop devices, returned in ($1).
> > > +# Args:
> > > +# $1: Nameref to return the array of allocated loop devices [base, clone].
> > > +# $2: Optional callback function to tune the base filesystem before cloning.
> > > +_loop_image_create_clone()
> > > +{
> > > + local -n _ret=$1
> >
> > That switch ^^ is very clever. I always wondered how one did indirect
> > variables in bash.
> >
> > > + local pre_clone_tune_func="$2"
> > > + local img_file=$TEST_DIR/${seq}.img
> > > + local img_file_clone=$TEST_DIR/${seq}_clone.img
> > > + local size=$(_small_fs_size_mb 128) # Smallest possible
> > > + local loop_devs
> > > +
> > > + # Since we copy the block device image, we keep its size small.
> > > + _require_fs_space $TEST_DIR $((size * 1024))
> > > +
> > > + _create_file_sized $((size * 1024 * 1024)) $img_file ||
> > > + _fail "Failed: Create $img_file $size"
> > > +
> > > + loop_devs=$(_create_loop_device $img_file)
> > > + _ret=($loop_devs)
> >
> > Should this check that a loopdev actually got created?
> >
>
> Hmm, in the function _create_loop_device(), we are
> calling _fail if create fails, so no need to duplicate, right?
Oh right. Question withdrawn.
> > > + case $FSTYP in
> > > + xfs)
> > > + _mkfs_dev "-s size=4096" ${loop_devs[0]}
> > > + ;;
> > > + btrfs)
> > > + _mkfs_dev ${loop_devs[0]}
> > > + ;;
> > > + *)
> > > + _mkfs_dev ${loop_devs[0]}
> > > + ;;
> > > + esac
> > > +
> > > + # Only execute if the function argument is not empty
> > > + if [ -n "$pre_clone_tune_func" ]; then
> > > + $pre_clone_tune_func ${loop_devs[0]}
> > > + fi
> > > +
> > > + sync ${loop_devs[0]}
> > > + cp $img_file $img_file_clone
> > > +
>
>
> > > + loop_devs="$loop_devs $(_create_loop_device $img_file_clone)"
> >
> > local lodev="$(_create_loop_device ...)"
> >
> > test -z "$lodev" && _fail "second loopdev not created"
> > _ret+=("$lodev")
> >
> > ?
>
> If the second `_create_loop_device()` happens to fail, it will
> already have called `_fail`, so "second loopdev..." won't be
> used at all.
<nod> Both comments withdrawn :)
--D
>
> Thanks, Anand
>
>
>
> > > +
> > > + _ret=($loop_devs)
> > > +}
> > > +
> > > +# Teardown loop devices and delete their underlying backing image files.
> > > +# Accepts a list of loop device paths (e.g., /dev/loop0 /dev/loop1).
> > > +_loop_image_destroy()
> > > +{
> > > + for d in "$@"; do
> > > + # Retrieve the path of the backing file
> > > + local f=$(losetup --noheadings --output BACK-FILE $d)
> > > +
> > > + # Detach the loop device from the backing file
> > > + _destroy_loop_device "$d"
> > > +
> > > + # Clean up the backing disk image file
> > > + [ -n "$f" ] && rm -f "$f"
> > > + done
> > > +}
> > > # Repair scratch filesystem. Returns 0 if the FS is good to go (either no
> > > # errors found or errors were fixed) and nonzero otherwise; also spits out
> > > --
> > > 2.43.0
> > >
> > >
>
>
^ permalink raw reply
* Re: [PATCH v4] ext4: fix kernel BUG in ext4_write_inline_data_end
From: Aditya Prakash Srivastava @ 2026-06-09 13:08 UTC (permalink / raw)
To: Jan Kara
Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Ojaswin Mujoo,
Ritesh Harjani, Zhang Yi, sashiko-reviews, linux-ext4,
linux-kernel, syzbot+0c89d865531d053abb2d
In-Reply-To: <o3k4wongcbuacu4rjsb7h2utzsrhpnun55vzdnp46imnlbn5x6@matvyu6j2xhc>
Hi Jan,
Thank you very much for the incredibly detailed review and the design
insights!
I completely agree with your suggestion. Rushing too many fixes for
complex, concurrent race conditions into a single patch makes the
code harder to review and risks introducing subtle regressions.
Let's go ahead with the simple and straightforward v1 patch (which
has your Reviewed-by) to fix the original syzbot crash for now.
I will take your excellent suggestion to use fsdata for state
communication between write_begin and write_end, and I will work on
formulating a separate, cleaner patch series in the future to
address the remaining concurrent locking races you mentioned.
I will withdraw this v4 thread for now.
Thanks again for your guidance!
Best regards,
Aditya Prakash Srivastava
^ permalink raw reply
* Re: [PATCH v4] ext4: fix kernel BUG in ext4_write_inline_data_end
From: Jan Kara @ 2026-06-09 12:46 UTC (permalink / raw)
To: Aditya Prakash Srivastava
Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Baokun Li,
Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, sashiko-reviews,
linux-ext4, linux-kernel, syzbot+0c89d865531d053abb2d
In-Reply-To: <20260609062005.1702-1-aditya.ansh182@gmail.com>
On Tue 09-06-26 06:20:05, Aditya Prakash Srivastava wrote:
> When the data=journal mount option is used, the ext4_journalled_write_end()
> function incorrectly calls ext4_write_inline_data_end() without checking
> if the EXT4_STATE_MAY_INLINE_DATA flag is still set on the inode.
>
> If a previous attempt to convert the inline data to an extent failed (e.g.
> due to ENOSPC), the EXT4_STATE_MAY_INLINE_DATA flag is cleared, but
> the EXT4_INODE_INLINE_DATA flag remains set. In this scenario, the next
> call to ext4_write_begin() will not prepare the inline data xattr for
> writing, but ext4_journalled_write_end() will incorrectly attempt to write
> to it, triggering a BUG_ON(pos + len > EXT4_I(inode)->i_inline_size) in
> ext4_write_inline_data() since i_inline_size was not expanded.
>
> Additionally, two separate TOCTOU race conditions exist due to concurrent
> ext4_page_mkwrite() execution:
> 1) A concurrent ext4_page_mkwrite() can execute ext4_convert_inline_data()
> between write_begin and write_end, clearing the inline flags. Since block
> buffers were not allocated in write_begin, this results in a NULL pointer
> dereference in the write_end fallback paths because folio_buffers(folio) is
> NULL.
> 2) If ext4_convert_inline_data() clears the flags exactly after the inline
> flags checks pass in write_end, but before ext4_write_inline_data_end()
> acquires the xattr semaphore, the subsequent check will hit a panic via
> BUG_ON(!ext4_has_inline_data(inode)).
Yes, locking of inline data writes is broken (and difficult to fix). Your
v1 patch was actually a simple and obvious improvement of the situation.
These additional fixes belong in separate patches.
> Fix these issues completely by:
> 1) Having write_end functions (ext4_write_end(),
> ext4_journalled_write_end(), and ext4_da_do_write_end()) return 0
> (VFS retry) if they fall through to the block fallback path and detect
> that folio_buffers(folio) is NULL, after safely stopping any active
> journal handle (protecting against a NULL handle panic in
> ext4_put_nojournal()).
> 2) Replacing BUG_ON(!ext4_has_inline_data(inode)) inside
> ext4_write_inline_data_end() with a graceful error path. If the inline flag
> is cleared after locking the xattr, we unlock the xattr, release the iloc,
> unlock/put the folio, stop the journal, and return 0 to trigger a retry.
>
> Reported-by: syzbot+0c89d865531d053abb2d@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=0c89d865531d053abb2d
> Fixes: 3fdcfb668fd7 ("ext4: add journalled write support for inline data")
> Signed-off-by: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
> ---
> v4:
> - Address critical TOCTOU race condition (reported by Sashiko AI review):
> * Scenario: A buffered write holds the folio lock and evaluates the inline
> flags checks in write_end to true. Before it enters or locks the xattr_sem
> in ext4_write_inline_data_end(), a concurrent memory-mapped page fault
> (ext4_page_mkwrite()) converts the inline data to an extent. This page fault
> bypasses the folio lock (since ext4_convert_inline_data() runs lockless),
> acquires the xattr_sem, and clears the inline flags. When the buffered write
> resumes and enters ext4_write_inline_data_end(), it acquires the xattr_sem
> and immediately triggers BUG_ON(!ext4_has_inline_data(inode)) causing a
> kernel panic.
> * Fix: Replace the BUG_ON() with a graceful error-handling retry path that
> releases all resources (locks/buffers/folios/journals) and returns 0.
> v3:
> - Fix journal handle leak and NULL handle crash (reported by Sashiko AI review):
> * Scenario 1 (leak): During a delayed allocation write (ext4_da_write_begin),
> inline data was prepared and a transaction handle started. If a concurrent
> page fault converts the inline data before write_end, ext4_da_write_end()
> falls through to ext4_da_do_write_end(). If the fallback check for
> !folio_buffers(folio) returns 0 to retry without calling ext4_journal_stop(),
> the transaction handle is leaked open-ended, eventually hanging the filesystem.
> * Scenario 2 (crash): If we blindly call ext4_journal_stop() on a NULL handle
> (e.g., when no transaction was started because we never took the inline path),
> __ext4_journal_stop() delegates to ext4_put_nojournal(NULL) which triggers
> BUG_ON(ref_cnt == 0), panicking the kernel.
> * Fix: Retrieve the active handle in ext4_da_do_write_end() and stop it
> if non-NULL. Also explicitly check "if (handle)" before calling
> ext4_journal_stop() in ext4_write_end() and ext4_journalled_write_end().
> v2:
> - Address TOCTOU race condition (reported by Sashiko AI review):
> * Scenario: A concurrent ext4_page_mkwrite() converts inline data to extents
> and clears the flags between ext4_write_begin() and write_end(). The
> write_end function falls through to the block fallback path. Since block
> buffers were not allocated in write_begin (because it took the inline path),
> folio_buffers(folio) is NULL, causing a NULL pointer dereference in
> ext4_journalled_zero_new_buffers() or ext4_walk_page_buffers(), or silent
> data loss in the standard write path.
> * Fix: Have the write_end functions return 0 if folio_buffers(folio) is NULL,
> triggering a safe VFS-level retry. On the next write attempt, the inline
> flags will be detected as cleared, and blocks/buffers will be properly allocated.
> fs/ext4/inline.c | 9 ++++++++-
> fs/ext4/inode.c | 24 ++++++++++++++++++++++--
> 2 files changed, 30 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
> index 8045e4ff270c..161136e84661 100644
> --- a/fs/ext4/inline.c
> +++ b/fs/ext4/inline.c
> @@ -812,7 +812,14 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
> goto out;
> }
> ext4_write_lock_xattr(inode, &no_expand);
> - BUG_ON(!ext4_has_inline_data(inode));
> + if (unlikely(!ext4_has_inline_data(inode))) {
> + ext4_write_unlock_xattr(inode, &no_expand);
> + brelse(iloc.bh);
> + folio_unlock(folio);
> + folio_put(folio);
> + ext4_journal_stop(handle);
> + return 0;
> + }
This deserves a comment before the 'if' that we could have raced with
ext4_page_mkwrite() converting the inode and so we just retry the whole
write.
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index c2c2d6ac7f3d..bc2688e03c19 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1455,6 +1455,14 @@ static int ext4_write_end(const struct kiocb *iocb,
> return ext4_write_inline_data_end(inode, pos, len, copied,
> folio);
>
> + if (unlikely(!folio_buffers(folio))) {
> + folio_unlock(folio);
> + folio_put(folio);
> + if (handle)
> + ext4_journal_stop(handle);
> + return 0;
> + }
> +
Ouch, this is a crude hack. I think a much cleaner solution would be for
ext4_write_begin() to set in fsdata in what state it prepared the inode
(inline, extent based) - we already use that mechanism to communicate some
state for delayed allocations. Then ->write_end handler will use fsdata
(not inode state) to determine what function to call. IMHO the code will be
much more obvious that way.
> @@ -3231,7 +3248,10 @@ static int ext4_da_do_write_end(struct address_space *mapping,
> if (unlikely(!folio_buffers(folio))) {
> folio_unlock(folio);
> folio_put(folio);
> - return -EIO;
> + handle = ext4_journal_current_handle();
> + if (handle)
> + ext4_journal_stop(handle);
> + return 0;
> }
> /*
> * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
Huh, what is this about? It definitely looks very suspicious...
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH] ext4: fix circular lock dependency in ext4_ext_migrate
From: Jan Kara @ 2026-06-09 12:05 UTC (permalink / raw)
To: Yun Zhou
Cc: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
yi.zhang, ebiggers, linux-ext4, linux-kernel
In-Reply-To: <20260609084007.3432061-1-yun.zhou@windriver.com>
On Tue 09-06-26 16:40:07, Yun Zhou wrote:
> Move iput(tmp_inode) after ext4_writepages_up_write() to avoid a
> circular lock dependency between s_writepages_rwsem and sb_internal
> (freeze protection).
>
> The deadlock scenario:
>
> CPU0 (EXT4_IOC_MIGRATE) CPU1 (orphan cleanup during mount)
> ---- ----
> ext4_ext_migrate()
> ext4_writepages_down_write()
> s_writepages_rwsem (write)
> ext4_evict_inode()
> sb_start_intwrite() [sb_internal]
> ...
> ext4_writepages()
> s_writepages_rwsem (read) [BLOCKED]
> iput(tmp_inode)
> ext4_evict_inode()
> sb_start_intwrite() [BLOCKED]
>
> The tmp_inode is a temporary inode with nlink=0 created solely for
> building the extent tree. Its eviction does not require
> s_writepages_rwsem protection, so deferring iput() until after
> releasing the rwsem is safe.
>
> Reported-by: syzbot+f0b58a1f5075a90dd9a5@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=f0b58a1f5075a90dd9a5
> Fixes: cb85f4d23f79 ("ext4: fix race between writepages and enabling EXT4_EXTENTS_FL")
> Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Just one nit below:
> @@ -591,9 +592,10 @@ int ext4_ext_migrate(struct inode *inode)
> ext4_journal_stop(handle);
> out_tmp_inode:
> unlock_new_inode(tmp_inode);
> - iput(tmp_inode);
> out_unlock:
> ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> + if (tmp_inode)
> + iput(tmp_inode);
iput(NULL) is properly handled so you don't need the if (tmp_inode) check
here.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox