* [PATCH v5 2/2] ext4: improve str2hashbuf by processing 4-byte chunks and removing function pointers
From: Guan-Chun Wu @ 2026-05-30 15:58 UTC (permalink / raw)
To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
Ojaswin Mujoo, Ritesh Harjani, Zhang Yi
Cc: linux-ext4, linux-kernel, edward062254, visitorckw,
david.laight.linux, Guan-Chun Wu
In-Reply-To: <20260530155817.2311587-1-409411716@gms.tku.edu.tw>
The original byte-by-byte implementation with modulo checks is less
efficient. Refactor str2hashbuf_unsigned() and str2hashbuf_signed()
to process input in explicit 4-byte chunks instead of using a
modulus-based loop to emit words byte by byte.
Additionally, the use of function pointers for selecting the appropriate
str2hashbuf implementation has been removed. Instead, the functions are
directly invoked based on the hash type, eliminating the overhead of
dynamic function calls.
Performance test (x86_64, Intel Core i7-10700 @ 2.90GHz, average over 10000
runs, using kernel module for testing):
len | orig_s | new_s | orig_u | new_u
----+--------+-------+--------+-------
1 | 70 | 71 | 63 | 63
8 | 68 | 64 | 64 | 62
32 | 75 | 70 | 75 | 63
64 | 96 | 71 | 100 | 68
255 | 192 | 108 | 187 | 84
This change improves performance, especially for larger input sizes.
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
---
fs/ext4/hash.c | 64 +++++++++++++++++++++++++++++++++-----------------
1 file changed, 42 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 72645bd92..978bd92da 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -9,6 +9,7 @@
#include <linux/unicode.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
+#include <linux/unaligned.h>
#include "ext4.h"
#define DELTA 0x9E3779B9
@@ -141,21 +142,28 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
- val = pad;
if (len > num*4)
len = num * 4;
- for (i = 0; i < len; i++) {
- val = ((int) scp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
+
+ while (len >= 4) {
+ val = ((__u32)scp[0] << 24) + ((__u32)scp[1] << 16) + ((__u32)scp[2] << 8) + scp[3];
+ *buf++ = val;
+ scp += 4;
+ len -= 4;
+ num--;
}
+
+ val = pad;
+
+ for (i = 0; i < len; i++)
+ val = scp[i] + (val << 8);
+
if (--num >= 0)
*buf++ = val;
+
while (--num >= 0)
*buf++ = pad;
+
}
static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
@@ -167,21 +175,28 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
- val = pad;
if (len > num*4)
len = num * 4;
- for (i = 0; i < len; i++) {
- val = ((int) ucp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
+
+ while (len >= 4) {
+ val = get_unaligned_be32(ucp);
+ *buf++ = val;
+ ucp += 4;
+ len -= 4;
+ num--;
}
+
+ val = pad;
+
+ for (i = 0; i < len; i++)
+ val = ucp[i] + (val << 8);
+
if (--num >= 0)
*buf++ = val;
+
while (--num >= 0)
*buf++ = pad;
+
}
/*
@@ -205,8 +220,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
const char *p;
int i;
__u32 in[8], buf[4];
- void (*str2hashbuf)(const char *, int, __u32 *, int) =
- str2hashbuf_signed;
+ bool use_unsigned = false;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -232,12 +246,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
hash = dx_hack_hash_signed(name, len);
break;
case DX_HASH_HALF_MD4_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
+ use_unsigned = true;
fallthrough;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
- (*str2hashbuf)(p, len, in, 8);
+ if (use_unsigned)
+ str2hashbuf_unsigned(p, len, in, 8);
+ else
+ str2hashbuf_signed(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -246,12 +263,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
hash = buf[1];
break;
case DX_HASH_TEA_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
+ use_unsigned = true;
fallthrough;
case DX_HASH_TEA:
p = name;
while (len > 0) {
- (*str2hashbuf)(p, len, in, 4);
+ if (use_unsigned)
+ str2hashbuf_unsigned(p, len, in, 4);
+ else
+ str2hashbuf_signed(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
--
2.34.1
^ permalink raw reply related
* [PATCH v5 1/2] ext4: add Kunit coverage for directory hash computation
From: Guan-Chun Wu @ 2026-05-30 15:58 UTC (permalink / raw)
To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
Ojaswin Mujoo, Ritesh Harjani, Zhang Yi
Cc: linux-ext4, linux-kernel, edward062254, visitorckw,
david.laight.linux, Guan-Chun Wu
In-Reply-To: <20260530155817.2311587-1-409411716@gms.tku.edu.tw>
Introduce Kunit tests for fs/ext4/hash.c to verify ext4fs_dirhash()
across the legacy, half-MD4, and TEA hash variants.
The tests cover empty, seeded hashing, and non-ASCII name handling.
They also verify error paths, including invalid hash versions and
SipHash without a configured key, and check that the signed and
unsigned hash variants differ on non-ASCII input as expected.
When CONFIG_UNICODE is enabled, the tests further verify casefolded-name
hashing and the fallback behavior for invalid input.
Co-developed-by: Chen Hao Yu <edward062254@gmail.com>
Signed-off-by: Chen Hao Yu <edward062254@gmail.com>
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
---
fs/ext4/Makefile | 2 +-
fs/ext4/hash-test.c | 560 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/hash.c | 4 +
3 files changed, 565 insertions(+), 1 deletion(-)
create mode 100644 fs/ext4/hash-test.c
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 3baee4e7c..3f9fc0eb8 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -15,7 +15,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
ext4-test-objs += inode-test.o mballoc-test.o \
- extents-test.o
+ extents-test.o hash-test.o
obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ext4/hash-test.c b/fs/ext4/hash-test.c
new file mode 100644
index 000000000..2da66cafb
--- /dev/null
+++ b/fs/ext4/hash-test.c
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for ext4 directory hash computation.
+ */
+
+#include <kunit/test.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/unicode.h>
+#include "ext4.h"
+
+static void ext4_hash_init_fake_dir(struct inode *dir, struct super_block *sb)
+{
+ memset(sb, 0, sizeof(*sb));
+ memset(dir, 0, sizeof(*dir));
+ dir->i_sb = sb;
+ strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id));
+}
+
+static void ext4_hash_init_fake_dir_with_sbi(struct inode *dir,
+ struct super_block *sb,
+ struct ext4_sb_info *sbi)
+{
+ ext4_hash_init_fake_dir(dir, sb);
+ memset(sbi, 0, sizeof(*sbi));
+ sb->s_fs_info = sbi;
+ sbi->s_sb = sb;
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+KUNIT_DEFINE_ACTION_WRAPPER(utf8_unload_action, utf8_unload,
+ struct unicode_map *);
+#endif
+
+static void ext4_hash_init_fake_ext4_dir(struct ext4_inode_info *ei,
+ struct super_block *sb,
+ struct ext4_sb_info *sbi)
+{
+ struct inode *dir = &ei->vfs_inode;
+
+ memset(sb, 0, sizeof(*sb));
+ memset(ei, 0, sizeof(*ei));
+ memset(sbi, 0, sizeof(*sbi));
+
+ strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id));
+ sb->s_fs_info = sbi;
+ sbi->s_sb = sb;
+
+ dir->i_sb = sb;
+ dir->i_mode = S_IFDIR;
+
+#ifdef CONFIG_FS_ENCRYPTION
+ fscrypt_set_ops(sb, &ext4_cryptops);
+#endif
+}
+
+struct ext4_dirhash_test_case {
+ const char *name;
+ u32 hash_version;
+ const char *input;
+ int len;
+ u32 seed[4];
+ bool use_seed;
+ u32 expected_hash;
+ u32 expected_minor_hash;
+};
+
+static const struct ext4_dirhash_test_case ext4_dirhash_test_cases[] = {
+ {
+ .name = "legacy_abc",
+ .hash_version = DX_HASH_LEGACY,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0x75afd992,
+ .expected_minor_hash = 0x00000000,
+ },
+ {
+ .name = "legacy_unsigned_abc",
+ .hash_version = DX_HASH_LEGACY_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0x75afd992,
+ .expected_minor_hash = 0x00000000,
+ },
+ {
+ .name = "half_md4_abc",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xd196a868,
+ .expected_minor_hash = 0xc420eb28,
+ },
+ {
+ .name = "half_md4_unsigned_abc",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xd196a868,
+ .expected_minor_hash = 0xc420eb28,
+ },
+ {
+ .name = "tea_abc",
+ .hash_version = DX_HASH_TEA,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xb1435ec4,
+ .expected_minor_hash = 0x3f7eaa0e,
+ },
+ {
+ .name = "tea_unsigned_abc",
+ .hash_version = DX_HASH_TEA_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xb1435ec4,
+ .expected_minor_hash = 0x3f7eaa0e,
+ },
+ {
+ .name = "empty_half_md4",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "",
+ .len = 0,
+ .use_seed = false,
+ .expected_hash = 0xefcdab88,
+ .expected_minor_hash = 0x98badcfe,
+ },
+ {
+ .name = "half_md4_31bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "1234567890123456789012345678901",
+ .len = 31,
+ .use_seed = false,
+ .expected_hash = 0xc4db1f78,
+ .expected_minor_hash = 0xea23921b,
+ },
+ {
+ .name = "half_md4_32bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "12345678901234567890123456789012",
+ .len = 32,
+ .use_seed = false,
+ .expected_hash = 0xfa6cc63e,
+ .expected_minor_hash = 0x2f77bd1c,
+ },
+ {
+ .name = "half_md4_33bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "123456789012345678901234567890123",
+ .len = 33,
+ .use_seed = false,
+ .expected_hash = 0xdc0c2dec,
+ .expected_minor_hash = 0x5ca23365,
+ },
+ {
+ .name = "half_md4_unsigned_31bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "1234567890123456789012345678901",
+ .len = 31,
+ .use_seed = false,
+ .expected_hash = 0xc4db1f78,
+ .expected_minor_hash = 0xea23921b,
+ },
+ {
+ .name = "half_md4_unsigned_32bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "12345678901234567890123456789012",
+ .len = 32,
+ .use_seed = false,
+ .expected_hash = 0xfa6cc63e,
+ .expected_minor_hash = 0x2f77bd1c,
+ },
+ {
+ .name = "half_md4_unsigned_33bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "123456789012345678901234567890123",
+ .len = 33,
+ .use_seed = false,
+ .expected_hash = 0xdc0c2dec,
+ .expected_minor_hash = 0x5ca23365,
+ },
+ {
+ .name = "tea_15bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "123456789abcdef",
+ .len = 15,
+ .use_seed = false,
+ .expected_hash = 0xa562903a,
+ .expected_minor_hash = 0x6174a00f,
+ },
+ {
+ .name = "tea_16bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "1234567890abcdef",
+ .len = 16,
+ .use_seed = false,
+ .expected_hash = 0x8449f258,
+ .expected_minor_hash = 0x49a16d46,
+ },
+ {
+ .name = "tea_17bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "123456789abcdefgh",
+ .len = 17,
+ .use_seed = false,
+ .expected_hash = 0xf32ec10c,
+ .expected_minor_hash = 0x58ceae61,
+ },
+ {
+ .name = "half_md4_seeded",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "same-name",
+ .len = 9,
+ .seed = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 },
+ .use_seed = true,
+ .expected_hash = 0x8aebf604,
+ .expected_minor_hash = 0x66ce48fe,
+ },
+ {
+ .name = "half_md4_non_ascii_signed",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x8bab0498,
+ .expected_minor_hash = 0xc326632d,
+ },
+ {
+ .name = "half_md4_non_ascii_unsigned",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0xbc48596e,
+ .expected_minor_hash = 0xde0fad41,
+ },
+ {
+ .name = "tea_non_ascii_signed",
+ .hash_version = DX_HASH_TEA,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x21e3a154,
+ .expected_minor_hash = 0x90112c3d,
+ },
+ {
+ .name = "tea_non_ascii_unsigned",
+ .hash_version = DX_HASH_TEA_UNSIGNED,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x9b648616,
+ .expected_minor_hash = 0x011dd507,
+ },
+};
+
+static void test_ext4fs_dirhash_vectors(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ int i;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+
+ ext4_hash_init_fake_dir(dir, sb);
+
+ for (i = 0; i < ARRAY_SIZE(ext4_dirhash_test_cases); i++) {
+ const struct ext4_dirhash_test_case *tc =
+ &ext4_dirhash_test_cases[i];
+ struct dx_hash_info hinfo;
+ int ret;
+
+ memset(&hinfo, 0, sizeof(hinfo));
+ hinfo.hash_version = tc->hash_version;
+ hinfo.seed = tc->use_seed ? (u32 *)tc->seed : NULL;
+
+ ret = ext4fs_dirhash(dir, tc->input, tc->len, &hinfo);
+
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, "case=%s", tc->name);
+ KUNIT_EXPECT_EQ_MSG(test, hinfo.hash, tc->expected_hash,
+ "case=%s", tc->name);
+ KUNIT_EXPECT_EQ_MSG(test, hinfo.minor_hash,
+ tc->expected_minor_hash,
+ "case=%s", tc->name);
+ }
+}
+
+static void test_ext4fs_dirhash_seed_changes_result(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ u32 seed[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
+ struct dx_hash_info plain = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ struct dx_hash_info seeded = {
+ .hash_version = DX_HASH_HALF_MD4,
+ .seed = seed,
+ };
+ int ret_plain, ret_seeded;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+
+ ext4_hash_init_fake_dir(dir, sb);
+
+ ret_plain = ext4fs_dirhash(dir, "same-name", 9, &plain);
+ ret_seeded = ext4fs_dirhash(dir, "same-name", 9, &seeded);
+
+ KUNIT_ASSERT_EQ(test, ret_plain, 0);
+ KUNIT_ASSERT_EQ(test, ret_seeded, 0);
+
+ KUNIT_EXPECT_TRUE(test,
+ plain.hash != seeded.hash ||
+ plain.minor_hash != seeded.minor_hash);
+}
+
+static void test_ext4fs_dirhash_invalid_version_returns_einval(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ struct ext4_sb_info *sbi;
+ struct dx_hash_info hinfo = {
+ .hash = 0xdeadbeef,
+ .minor_hash = 0xcafebabe,
+ .hash_version = DX_HASH_LAST + 1,
+ };
+ int ret;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+ KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+ ext4_hash_init_fake_dir_with_sbi(dir, sb, sbi);
+
+ ret = ext4fs_dirhash(dir, "abc", 3, &hinfo);
+
+ KUNIT_EXPECT_EQ(test, ret, -EINVAL);
+ KUNIT_EXPECT_EQ(test, hinfo.hash, 0);
+ KUNIT_EXPECT_EQ(test, hinfo.minor_hash, 0);
+}
+
+static void test_ext4fs_dirhash_siphash_without_key_returns_einval(struct kunit *test)
+{
+ struct super_block *sb;
+ struct ext4_inode_info *ei;
+ struct inode *dir;
+ struct ext4_sb_info *sbi;
+ struct dx_hash_info hinfo = {
+ .hash_version = DX_HASH_SIPHASH,
+ };
+ int ret;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+ sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, ei);
+ KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+ ext4_hash_init_fake_ext4_dir(ei, sb, sbi);
+ dir = &ei->vfs_inode;
+
+ ret = ext4fs_dirhash(dir, "name", strlen("name"), &hinfo);
+
+ KUNIT_EXPECT_EQ(test, ret, -EINVAL);
+}
+
+static void test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ static const char input[] = "\x80\xff\x81\xfe\101bc";
+ struct dx_hash_info legacy_signed = {
+ .hash_version = DX_HASH_LEGACY,
+ };
+ struct dx_hash_info legacy_unsigned = {
+ .hash_version = DX_HASH_LEGACY_UNSIGNED,
+ };
+ struct dx_hash_info md4_signed = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ struct dx_hash_info md4_unsigned = {
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ };
+ struct dx_hash_info tea_signed = {
+ .hash_version = DX_HASH_TEA,
+ };
+ struct dx_hash_info tea_unsigned = {
+ .hash_version = DX_HASH_TEA_UNSIGNED,
+ };
+ int ret;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+
+ ext4_hash_init_fake_dir(dir, sb);
+
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &legacy_signed);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &legacy_unsigned);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_NE(test, legacy_signed.hash, legacy_unsigned.hash);
+
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &md4_signed);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &md4_unsigned);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_TRUE(test,
+ md4_signed.hash != md4_unsigned.hash ||
+ md4_signed.minor_hash != md4_unsigned.minor_hash);
+
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &tea_signed);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &tea_unsigned);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_TRUE(test,
+ tea_signed.hash != tea_unsigned.hash ||
+ tea_signed.minor_hash != tea_unsigned.minor_hash);
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static void test_ext4fs_dirhash_casefolded_names_hash_consistently(struct kunit *test)
+{
+ struct super_block *sb;
+ struct ext4_inode_info *ei;
+ struct ext4_sb_info *sbi;
+ struct unicode_map *um;
+ struct dx_hash_info h1 = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ struct dx_hash_info h2 = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ int ret, ret1, ret2;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+ sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, ei);
+ KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+ um = utf8_load(UTF8_LATEST);
+ if (IS_ERR(um)) {
+ kunit_skip(test, "utf8_load(UTF8_LATEST) failed: %pe",
+ um);
+ return;
+ }
+
+ ret = kunit_add_action_or_reset(test, utf8_unload_action, um);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ext4_hash_init_fake_ext4_dir(ei, sb, sbi);
+ sb->s_encoding = um;
+ ei->vfs_inode.i_flags |= S_CASEFOLD;
+
+ KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(&ei->vfs_inode));
+
+ ret1 = ext4fs_dirhash(&ei->vfs_inode, "Alpha", 5, &h1);
+ ret2 = ext4fs_dirhash(&ei->vfs_inode, "aLPHa", 5, &h2);
+
+ KUNIT_ASSERT_EQ(test, ret1, 0);
+ KUNIT_ASSERT_EQ(test, ret2, 0);
+ KUNIT_EXPECT_EQ(test, h1.hash, h2.hash);
+ KUNIT_EXPECT_EQ(test, h1.minor_hash, h2.minor_hash);
+}
+
+static void test_ext4fs_dirhash_casefold_fallback(struct kunit *test)
+{
+ struct super_block *sb_cf, *sb_plain;
+ struct ext4_inode_info *ei;
+ struct ext4_sb_info *sbi;
+ struct inode *plain_dir;
+ struct unicode_map *um;
+ static const char invalid_utf8[] = "\xc3\x28";
+ struct dx_hash_info folded_dir = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ struct dx_hash_info plain = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ int ret, ret_cf, ret_plain;
+
+ sb_cf = kunit_kzalloc(test, sizeof(*sb_cf), GFP_KERNEL);
+ sb_plain = kunit_kzalloc(test, sizeof(*sb_plain), GFP_KERNEL);
+ ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+ sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+ plain_dir = kunit_kzalloc(test, sizeof(*plain_dir), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb_cf);
+ KUNIT_ASSERT_NOT_NULL(test, sb_plain);
+ KUNIT_ASSERT_NOT_NULL(test, ei);
+ KUNIT_ASSERT_NOT_NULL(test, sbi);
+ KUNIT_ASSERT_NOT_NULL(test, plain_dir);
+
+ um = utf8_load(UTF8_LATEST);
+ if (IS_ERR(um)) {
+ kunit_skip(test, "utf8_load(UTF8_LATEST) failed: %pe",
+ um);
+ return;
+ }
+
+ ret = kunit_add_action_or_reset(test, utf8_unload_action, um);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+
+ ext4_hash_init_fake_ext4_dir(ei, sb_cf, sbi);
+ sb_cf->s_encoding = um;
+ ei->vfs_inode.i_flags |= S_CASEFOLD;
+
+ KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(&ei->vfs_inode));
+
+ ext4_hash_init_fake_dir(plain_dir, sb_plain);
+
+ ret_cf = ext4fs_dirhash(&ei->vfs_inode, invalid_utf8,
+ sizeof(invalid_utf8) - 1, &folded_dir);
+ ret_plain = ext4fs_dirhash(plain_dir, invalid_utf8,
+ sizeof(invalid_utf8) - 1, &plain);
+
+ KUNIT_ASSERT_EQ(test, ret_cf, 0);
+ KUNIT_ASSERT_EQ(test, ret_plain, 0);
+ KUNIT_EXPECT_EQ(test, folded_dir.hash, plain.hash);
+ KUNIT_EXPECT_EQ(test, folded_dir.minor_hash, plain.minor_hash);
+}
+#endif
+
+static struct kunit_case ext4_hash_test_cases[] = {
+ KUNIT_CASE(test_ext4fs_dirhash_vectors),
+ KUNIT_CASE(test_ext4fs_dirhash_seed_changes_result),
+ KUNIT_CASE(test_ext4fs_dirhash_invalid_version_returns_einval),
+ KUNIT_CASE(test_ext4fs_dirhash_siphash_without_key_returns_einval),
+ KUNIT_CASE(test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii),
+#if IS_ENABLED(CONFIG_UNICODE)
+ KUNIT_CASE(test_ext4fs_dirhash_casefolded_names_hash_consistently),
+ KUNIT_CASE(test_ext4fs_dirhash_casefold_fallback),
+#endif
+ {}
+};
+
+static struct kunit_suite ext4_hash_test_suite = {
+ .name = "ext4_hash",
+ .test_cases = ext4_hash_test_cases,
+};
+
+kunit_test_suites(&ext4_hash_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 48483cd01..72645bd92 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -321,3 +321,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
#endif
return __ext4fs_dirhash(dir, name, len, hinfo);
}
+
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4fs_dirhash);
+#endif
--
2.34.1
^ permalink raw reply related
* [PATCH v5 0/2] ext4: add hash Kunit tests and optimize str2hashbuf
From: Guan-Chun Wu @ 2026-05-30 15:58 UTC (permalink / raw)
To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
Ojaswin Mujoo, Ritesh Harjani, Zhang Yi
Cc: linux-ext4, linux-kernel, edward062254, visitorckw,
david.laight.linux, Guan-Chun Wu
This series adds Kunit tests for fs/ext4/hash.c and refactors
the str2hashbuf_{signed,unsigned}() helpers.
Patch 1 adds test coverage for ext4fs_dirhash(), including the main
hash variants and relevant edge cases.
Patch 2 simplifies the str2hashbuf helper implementation by processing
input in 4-byte chunks and removing function-pointer dispatch. This also
reduces overhead and shows roughly 2x improvement on longer inputs in
local testing.
Thanks,
Guan-Chun Wu
Link: https://lore.kernel.org/lkml/20260529200308.889706-1-409411716@gms.tku.edu.tw/
---
v4 -> v5 :
- Fix NULL pointer dereference and out-of-bounds read in SipHash tests.
- Use IS_ERR() instead of NULL check for utf8_load() error handling.
- Fix unicode_map memory leaks on assertion failures via kunit_add_action_or_reset().
- Avoid a UBSAN shift warning in str2hashbuf by casting signed char values
to __u32 before left-shifting them.
---
Guan-Chun Wu (2):
ext4: add Kunit coverage for directory hash computation
ext4: improve str2hashbuf by processing 4-byte chunks and removing
function pointers
fs/ext4/Makefile | 2 +-
fs/ext4/hash-test.c | 560 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/hash.c | 68 ++++--
3 files changed, 607 insertions(+), 23 deletions(-)
create mode 100644 fs/ext4/hash-test.c
--
2.34.1
^ permalink raw reply
* [PATCH] ext2: fix ignored return value of generic_write_sync()
From: Danila Chernetsov @ 2026-05-30 12:23 UTC (permalink / raw)
To: Jan Kara; +Cc: Danila Chernetsov, linux-ext4, linux-kernel, lvc-project
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Fix ext2_dio_write_iter() to propagate the error returned by
generic_write_sync() instead of silently discarding it, which could
cause write(2) to return success to userspace on O_SYNC/O_DSYNC files
even when the sync failed.
The correct pattern, already used in ext2_dax_write_iter() in the same
file and in ext4, xfs, f2fs among others, is:
if (ret > 0)
ret = generic_write_sync(iocb, ret);
Found by Linux Verification Center (linuxtesting.org) with SVACE.
Fixes: fb5de4358e1a ("ext2: Move direct-io to use iomap")
Signed-off-by: Danila Chernetsov <listdansp@mail.ru>
---
fs/ext2/file.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index d9b1eb34694a..855a62e96c38 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -272,7 +272,7 @@ static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
pos >> PAGE_SHIFT,
endbyte >> PAGE_SHIFT);
if (ret > 0)
- generic_write_sync(iocb, ret);
+ ret = generic_write_sync(iocb, ret);
}
out_unlock:
--
2.25.1
^ permalink raw reply related
* Re: [PATCH 4/4] select: make select() and poll() waits freezable
From: Jan Kara @ 2026-05-30 10:12 UTC (permalink / raw)
To: Dai Junbing
Cc: linux-fsdevel, viro, brauner, tytso, jack, linux-ext4, jack,
linux-kernel
In-Reply-To: <20260527064912.1038-5-daijunbing@vivo.com>
On Wed 27-05-26 14:49:12, Dai Junbing wrote:
> Tasks blocked in select() or poll() may be woken during suspend and
> resume due to freezer state transitions. This can cause avoidable
> activity in the suspend/resume path and add unnecessary overhead.
>
> Mark the waits in do_select() and do_poll() as freezable so these tasks
> are not unnecessarily woken by the freezer.
>
> Both functions are only used from their respective system call paths,
> where the task sleeps without holding locks that would make freezing
> unsafe.
>
> Signed-off-by: Dai Junbing <daijunbing@vivo.com>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/select.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/fs/select.c b/fs/select.c
> index bf71c9838dfe..b0b279748355 100644
> --- a/fs/select.c
> +++ b/fs/select.c
> @@ -600,7 +600,7 @@ static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec
> to = &expire;
> }
>
> - if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
> + if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE|TASK_FREEZABLE,
> to, slack))
> timed_out = 1;
> }
> @@ -962,7 +962,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
> to = &expire;
> }
>
> - if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
> + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE, to, slack))
> timed_out = 1;
> }
> return count;
> --
> 2.25.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH 3/4] pipe: mark blocking pipe read and FIFO open sleeps as freezable
From: Jan Kara @ 2026-05-30 10:10 UTC (permalink / raw)
To: Dai Junbing
Cc: linux-fsdevel, viro, brauner, tytso, jack, linux-ext4, jack,
linux-kernel
In-Reply-To: <20260527064912.1038-4-daijunbing@vivo.com>
On Wed 27-05-26 14:49:11, Dai Junbing wrote:
> Tasks blocked in pipe read or FIFO open may be woken during suspend and
> resume due to freezer state transitions. This can cause avoidable
> activity in the suspend/resume path and add unnecessary overhead.
>
> Mark these sleeps as freezable so they are not unnecessarily woken by
> the freezer.
>
> Signed-off-by: Dai Junbing <daijunbing@vivo.com>
So, you don't mention here the reason why not all interruptible sleeps can
be freezable - and that is because you have to make sure we aren't holding
any resources that shouldn't be held when freezing a task. Typically these
are sleeping locks. Freezing task holding a mutex or semaphore or similar
lock is nasty and would need *much* stronger justification than "save a
pointless wakeup on hibernation".
Making sure we don't hold any lock is rather non-trivial in some cases. For
example in your case here, anon_pipe_read() is used in .read_iter method so
you should make sure all places calling .read_iter don't hold some lock. If
nothing else, I'm missing this analysis in the changelog and I actually
strongly suspect it is not true in some corner cases (like when playing
with splice(2), some files in /proc, or similar).
Honza
> diff --git a/fs/pipe.c b/fs/pipe.c
> index 9841648c9cf3..594726a7e542 100644
> --- a/fs/pipe.c
> +++ b/fs/pipe.c
> @@ -385,7 +385,7 @@ anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
> * since we've done any required wakeups and there's no need
> * to mark anything accessed. And we've dropped the lock.
> */
> - if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
> + if (wait_event_freezable_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
> return -ERESTARTSYS;
>
> wake_next_reader = true;
> @@ -1102,7 +1102,7 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
> int cur = *cnt;
>
> while (cur == *cnt) {
> - prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
> + prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE|TASK_FREEZABLE);
> pipe_unlock(pipe);
> schedule();
> finish_wait(&pipe->rd_wait, &rdwait);
> --
> 2.25.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH 2/4] jbd2: make kjournald2 commit wait freezable
From: Jan Kara @ 2026-05-30 9:55 UTC (permalink / raw)
To: Dai Junbing
Cc: linux-fsdevel, viro, brauner, tytso, jack, linux-ext4, jack,
linux-kernel
In-Reply-To: <20260527064912.1038-3-daijunbing@vivo.com>
On Wed 27-05-26 14:49:10, Dai Junbing wrote:
> While waiting for commit work, kjournald2 may be woken during suspend
> and resume due to freezer state transitions. This causes avoidable CPU
> activity in the suspend/resume path and adds unnecessary power overhead.
>
> Make the commit wait freezable so the thread is not unnecessarily woken
> by the freezer during suspend/resume.
>
> Signed-off-by: Dai Junbing <daijunbing@vivo.com>
Looks good to me. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/jbd2/journal.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 4f397fcdb13c..d7ffe60c8793 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -222,7 +222,7 @@ static int kjournald2(void *arg)
> DEFINE_WAIT(wait);
>
> prepare_to_wait(&journal->j_wait_commit, &wait,
> - TASK_INTERRUPTIBLE);
> + TASK_INTERRUPTIBLE|TASK_FREEZABLE);
> transaction = journal->j_running_transaction;
> if (transaction == NULL ||
> time_before(jiffies, transaction->t_expires)) {
> --
> 2.25.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH v2 1/2] ext4: avoid RWM atomic in EXT4_MB_GRP_TEST_AND_SET_READ
From: Jan Kara @ 2026-05-30 9:52 UTC (permalink / raw)
To: Andreas Dilger
Cc: Bohdan Trach, Theodore Ts'o, Baokun Li, Jan Kara,
Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, mchehab+huawei,
bohdan.trach, lilith.oberhauser, linux-ext4, linux-kernel
In-Reply-To: <2D89BCA2-8317-4399-B926-7F5094236C1C@dilger.ca>
On Wed 27-05-26 13:46:43, Andreas Dilger wrote:
> On May 27, 2026, at 03:03, Bohdan Trach <bohdan.trach@huaweicloud.com> wrote:
> >
> > EXT4_MB_GRP_TEST_AND_SET_READ uses test_and_set_bit function which
> > issues an atomic write. This can cause high overhead due to cache
> > contention when multiple threads iterate over groups in a tight loop,
> > as is the case for ext4_mb_prefetch(). We have seen this to be a
> > problem for Kunpeng 920b CPUs which uses a single ARM LSE instruction
> > for this purpose.
> >
> > Avoid this unconditional atomic write by testing the bit first without
> > changing its value. This is OK for this use case as this bit is never
> > unset.
> >
> > This change significantly reduces costs of fallocate() operations which
> > trigger linear group scans on large multicore machines where
> > test_and_set_bit issues an atomic write operation unconditionally.
> >
> > Signed-off-by: Bohdan Trach <bohdan.trach@huaweicloud.com>
>
> Thanks for the patch. Definitely the benchmarks in the 0/2 email show
> significant gains for the Kunpeng system, and reducing contention makes sense
> as core counts increase and the likely case is that the bit is already set.
>
> That said, I wonder if this should (also/instead) be put into test_and_set_bit()
> itself, or add test_and_unlikely_set_bit() or test_and_rarely_set_bit()
> (or similar) optimized for the case where the bit is likely to already be set.
Well, it is a common enough pattern in the kernel that it might be worth it
but I'd say that's a separate effort spanning much more areas.
> I see in your benchmarking that there is not "apples-to-apples"
> comparisons for ARM(Kunpeng) vs. AMD on the same storage. The storage
> hardware and space usage is different for each test run, and the ARM
> numbers show only marginal gains and more negative than positive results
> at all thread counts:
>
> > Benchmark on an existing file system for AMD 9654 (15T FS, 6% space
> > used), kernel 7.1-rc3. This shows the performance impact on a mostly
> > free file system.
> > | thr. | base | patched | improv. |
> > | | perf | perf | |
> > |------|-------|---------|------------|
> > | 1 | 30901 | 31191 | +0.9384810 |
> > | 2 | 50874 | 50504 | -0.7272870 |
> > | 4 | 66068 | 64108 | -2.9666404 |
> > | 8 | 63963 | 61927 | -3.1830902 |
> > | 16 | 47809 | 47044 | -1.6001171 |
> > | 32 | 42441 | 42326 | -0.2709644 |
> > | 64 | 39773 | 39929 | +0.3922259 |
> > | 128 | 37065 | 36413 | -1.7590719 |
>
> The performance reduction might be caused by the now double memory access on
> AMD that is only adding overhead on that CPU implementation? It would be useful
> to see the testing on Kunpeng vs. AMD/Intel on the same storage device/usage.
>
> That would tell us if it is more appropriate to optimize this in the aarch64
> test_and_set_bit() rather than in ext4.
Well, even for x86 test_bit() || test_and_set_bit() is significantly
cheaper when the cacheline is hot because test_and_set_bit()
unconditionally invalidates the cacheline for all the other CPUs. And the
cost of additional test_bit() for something that is already in L1 cache is
very small (a few cycles) so I think the above differences are either pure
benchmarking noise or due to changes in code alignment or similar.
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH v2 1/2] ext4: avoid RWM atomic in EXT4_MB_GRP_TEST_AND_SET_READ
From: Jan Kara @ 2026-05-30 9:42 UTC (permalink / raw)
To: Bohdan Trach
Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, mchehab+huawei,
bohdan.trach, lilith.oberhauser, linux-ext4, linux-kernel
In-Reply-To: <20260527090329.2680170-2-bohdan.trach@huaweicloud.com>
On Wed 27-05-26 11:03:26, Bohdan Trach wrote:
> EXT4_MB_GRP_TEST_AND_SET_READ uses test_and_set_bit function which
> issues an atomic write. This can cause high overhead due to cache
> contention when multiple threads iterate over groups in a tight loop,
> as is the case for ext4_mb_prefetch(). We have seen this to be a
> problem for Kunpeng 920b CPUs which uses a single ARM LSE instruction
> for this purpose.
>
> Avoid this unconditional atomic write by testing the bit first without
> changing its value. This is OK for this use case as this bit is never
> unset.
>
> This change significantly reduces costs of fallocate() operations which
> trigger linear group scans on large multicore machines where
> test_and_set_bit issues an atomic write operation unconditionally.
>
> Signed-off-by: Bohdan Trach <bohdan.trach@huaweicloud.com>
Looks good to me. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/ext4.h | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 56b82d4a15d7..f8eacf1375f8 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -3551,7 +3551,13 @@ struct ext4_group_info {
> #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \
> (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
> #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \
> - (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
> + (ext4_mb_grp_test_and_set_read((grp)))
> +
> +static inline int ext4_mb_grp_test_and_set_read(struct ext4_group_info *grp)
> +{
> + return (test_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state) ||
> + test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state));
> +}
>
> #define EXT4_MAX_CONTENTION 8
> #define EXT4_CONTENTION_THRESHOLD 2
> --
> 2.43.0
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [PATCH v4 18/23] ext4: wait for ordered I/O in the iomap buffered I/O path
From: Zhang Yi @ 2026-05-30 9:32 UTC (permalink / raw)
To: Ojaswin Mujoo
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <ahhEV3STaaVI4dTs@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>
On 5/28/2026 9:34 PM, Ojaswin Mujoo wrote:
> On Wed, May 27, 2026 at 09:28:28PM +0530, Ojaswin Mujoo wrote:
>> On Mon, May 11, 2026 at 03:23:38PM +0800, Zhang Yi wrote:
>>> From: Zhang Yi <yi.zhang@huawei.com>
>>>
>>> For append writes, wait for ordered I/O to complete before updating
>>> i_disksize. This ensures that zeroed data is flushed to disk before the
>>> metadata update, preventing stale data from being exposed during
>>> unaligned post-EOF append writes.
>>>
>>> Suggested-by: Jan Kara <jack@suse.cz>
>>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>>> ---
>>> fs/ext4/ext4.h | 11 +++++++
>>> fs/ext4/inode.c | 80 ++++++++++++++++++++++++++++++++++++++++++-----
>>> fs/ext4/page-io.c | 60 +++++++++++++++++++++++++++++++++++
>>> fs/ext4/super.c | 23 ++++++++++----
>>> 4 files changed, 161 insertions(+), 13 deletions(-)
>>>
[...]
>>> @@ -4746,8 +4771,10 @@ static int ext4_iomap_submit_zero_block(struct inode *inode,
>>> loff_t from, loff_t end)
>>> {
>>> struct address_space *mapping = inode->i_mapping;
>>> + struct ext4_inode_info *ei = EXT4_I(inode);
>>> struct folio *folio;
>>> bool do_submit = false;
>>> + int ret;
>>>
>>> folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
>>> if (IS_ERR(folio))
>>> @@ -4757,14 +4784,50 @@ static int ext4_iomap_submit_zero_block(struct inode *inode,
>>> folio_wait_writeback(folio);
>>> WARN_ON_ONCE(folio_test_writeback(folio));
>>>
>>> - if (likely(folio_test_dirty(folio)))
>>> + /*
>>> + * Mark the ordered range. It will be cleared upon I/O completion
>>> + * in ext4_iomap_end_bio(). Any operation that extends i_disksize
>>> + * (including append write end io past the zeroed boundary,
>>> + * truncate up and append fallocate) must wait for this I/O to
>>> + * complete before updating i_disksize.
>>> + *
>>> + * When multiple overlapping unaligned EOF writes are in flight, we
>>> + * only need to track and wait for the first one. Subsequent writes
>>> + * will zero the gap in memory and ensure that the zeroed data is
>>> + * written out along with the valid data in the same block before
>>> + * i_disksize is updated.
>>> + */
>>> + if (likely(folio_test_dirty(folio) &&
>>> + READ_ONCE(ei->i_ordered_len) == 0)) {
>>> + WRITE_ONCE(ei->i_ordered_lblk,
>>> + from >> inode->i_blkbits);
>>> + /*
>>> + * Pairs with smp_rmb() in ext4_iomap_writeback_submit()
>>> + * and ext4_iomap_wb_ordered_wait(). Ensure the updated
>>> + * i_ordered_lblk is visible when i_ordered_len becomes
>>> + * non-zero.
>>> + */
>>> + smp_store_release(&ei->i_ordered_len, 1);
>>> do_submit = true;
>>> + }
>>> folio_unlock(folio);
>>> folio_put(folio);
>>>
>>> /* Submit zeroed block. */
>>> - if (do_submit)
>>> - return filemap_fdatawrite_range(mapping, from, end - 1);
>>> + if (do_submit) {
>>> + ret = filemap_fdatawrite_range(mapping, from, end - 1);
>>> + if (ret) {
>>> + /*
>>> + * Pairs with wait_event() in
>>> + * ext4_iomap_wb_ordered_wait(). Ensure
>>> + * i_ordered_len = 0 is visible before waking up
>>> + * waiters.
>>> + */
>>> + smp_store_release(&ei->i_ordered_len, 0);
>>> + wake_up_all(&ei->i_ordered_wq);
>>> + return ret;
>
> Okay so even if the ordered IO fails we still let the i_disksize updates
> go ahead?
Yes when data_err=ignore, no when data_err=abort.
> I think this is a deviation from the current behavior where we
> abort the journal. If this is acceptable we should atleast add a comment
> on why its okay.
>
I think this behavior is consistent with the current data=ordered mode.
In the data_err=ignore mode, if an I/O write fails, ext4_end_io_end()
does not abort the journal, so i_disksize is still updated normally.
Conversely, in the data_err=abort mode, the journal is aborted, and
since i_disksize is not updated, it cannot be updated afterwards. Am I
missing something?
>>> + }
>>> + }
>>> return 0;
>>> }
>>>
>>> @@ -4827,10 +4890,13 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
>>> * data=ordered mode. We submit zeroed range directly here.
>>> * Do not wait for I/O completion for performance.
>>> *
>>> - * TODO: Any operation that extends i_disksize (including
>>> - * append write end io past the zeroed boundary, truncate up,
>>> - * and append fallocate) must wait for the relevant I/O to
>>> - * complete before updating i_disksize.
>>> + * The end_io handler ext4_iomap_wb_ordered_wait() will wait
>>> + * for I/O completion before updating i_disksize if the write
>>> + * extends beyond the zeroed boundary.
>>> + *
>>> + * TODO: Any other operation that extends i_disksize
>>> + * (including truncate up and append fallocate) must wait for
>>> + * the relevant I/O to complete before updating i_disksize.
>>> */
>>> } else if (ext4_inode_buffered_iomap(inode)) {
>>> err = ext4_iomap_submit_zero_block(inode, from, end);
>>> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
>>> index 3050c887329f..ad05ebb49bf6 100644
>>> --- a/fs/ext4/page-io.c
>>> +++ b/fs/ext4/page-io.c
>>> @@ -613,6 +613,46 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
>>> return 0;
>>> }
>>>
>>> +/*
>>> + * If the old disk size is not block size aligned and the current
>>> + * writeback range is entirely beyond the old EOF block, we should
>>> + * wait for the zeroed data written in ext4_block_zero_eof() to be
>>> + * written out, otherwise, it may expose stale data in that block.
>>> + */
>>> +static void ext4_iomap_wb_ordered_wait(struct inode *inode,
>>> + loff_t pos, loff_t end)
>>> +{
>>> + struct ext4_inode_info *ei = EXT4_I(inode);
>>> + unsigned int blocksize = i_blocksize(inode);
>>> + loff_t disksize = READ_ONCE(ei->i_disksize);
>>> + ext4_lblk_t order_lblk, order_len;
>>> +
>>> + /*
>>> + * Waiting for ordered I/O is unnecessary when:
>>> + * - The on-disk size is block-aligned (no stale data exists).
>>> + * - The write start is within the block of the old EOF
>>> + * (overwriting, or appending to a block that already contains
>>> + * valid data).
>>> + */
>>> + if (!(disksize & (blocksize - 1)) ||
>>> + pos < round_up(disksize, blocksize))
>>> + return;
>
> Okay these checks are pretty confusing. I was intially thinking that
> i_disksize's block would always be equal to i_ordered_lblk but seems
> like that is not true because ext4_block_zero_eof() uses from=i_size.
Yeah, this is the key point that I was a bit confused about as
well.
>
> So we could have a sequence where
>
> 1. truncate 4k (i_disksize = i_size = 4k)
> 2. write 8k,10k (i_disksize = 4k i_size = 10k, i_ordered_len = 0 (old isisze is block aligned))
> 3. write 16k,18k (i_disksize = 4k i_size = 10k, i_ordered_len = 1, lblk=4)
18k lblk=2, (10k >> 12)
>
> Here we issue ordered IO even though it' probably not needed. Now if
> write 3 finishes first we see disksize as 4k so we don't wait for
> ordered write. Which seems okay since we don't risk any stale data
> exposure. However, this flow is pretty confuing.
Indeed!
>
> Can't we somehow avoid having to issue/set ordered len/lblk in case it
> is not really needed, like only issue it if i_disksize (and not i_size)
> is unaligned. That can simplify some of our check and avoid extra IO
> overhead.
>
I was also planning to explore optimizations on this point next.
However, since the original logic in buffer_head already works this way,
keeping the same logic in the iomap path will not introduce any
additional side effects. To avoid unnecessary waiting, I simply added
the disksize alignment check in ext4_iomap_wb_ordered_wait().
Therefore, I do not plan to implement this optimization in this series.
I can open a separate series later to address this optimization — perhaps
by checking i_disksize in ext4_block_zero_eof() before issuing or adding
ordered I/O, and the buffer_head path might also benefit from optimization.
Meanwhile, to avoid confusion, I can add a TODO comment in this patch.
What do you think?
Cheers,
Yi.
^ permalink raw reply
* Re: [PATCH v4 18/23] ext4: wait for ordered I/O in the iomap buffered I/O path
From: Zhang Yi @ 2026-05-30 8:24 UTC (permalink / raw)
To: Zhang Yi, Ojaswin Mujoo
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
libaokun, jack, ritesh.list, djwong, hch, yizhang089, yangerkun,
yukuai
In-Reply-To: <5d2b13b9-d5a0-47ce-876c-ab01070cbe49@huaweicloud.com>
On 5/30/2026 3:22 PM, Zhang Yi wrote:
> Hi, Ojaswin!
>
> On 5/27/2026 11:58 PM, Ojaswin Mujoo wrote:
>> On Mon, May 11, 2026 at 03:23:38PM +0800, Zhang Yi wrote:
>>> From: Zhang Yi <yi.zhang@huawei.com>
>>>
>>> For append writes, wait for ordered I/O to complete before updating
>>> i_disksize. This ensures that zeroed data is flushed to disk before the
>>> metadata update, preventing stale data from being exposed during
>>> unaligned post-EOF append writes.
>>>
>>> Suggested-by: Jan Kara <jack@suse.cz>
>>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>>> ---
>>> fs/ext4/ext4.h | 11 +++++++
>>> fs/ext4/inode.c | 80 ++++++++++++++++++++++++++++++++++++++++++-----
>>> fs/ext4/page-io.c | 60 +++++++++++++++++++++++++++++++++++
>>> fs/ext4/super.c | 23 ++++++++++----
>>> 4 files changed, 161 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>> index 078feda47e36..9ce2128eea3e 100644
>>> --- a/fs/ext4/ext4.h
>>> +++ b/fs/ext4/ext4.h
>>> @@ -1195,6 +1195,15 @@ struct ext4_inode_info {
>>> #ifdef CONFIG_FS_ENCRYPTION
>>> struct fscrypt_inode_info *i_crypt_info;
>>> #endif
>>> +
>>> + /*
>>> + * Track ordered zeroed data during post-EOF append writes, fallocate,
>>> + * and truncate-up operations. These parameters are used only in the
>>> + * iomap buffered I/O path.
>>> + */
>>> + ext4_lblk_t i_ordered_lblk;
>>> + ext4_lblk_t i_ordered_len;
>>> + wait_queue_head_t i_ordered_wq;
>>> };
>>>
>>> /*
>>> @@ -3858,6 +3867,8 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
>>> __u64 len, __u64 *moved_len);
>>>
>>> /* page-io.c */
>>> +#define EXT4_IOMAP_IOEND_ORDER_IO 1UL /* This I/O is an ordered one */
>>> +
>>> extern int __init ext4_init_pageio(void);
>>> extern void ext4_exit_pageio(void);
>>> extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>> index e013aeb03d7b..11fb369efeb1 100644
>>> --- a/fs/ext4/inode.c
>>> +++ b/fs/ext4/inode.c
>>> @@ -4345,6 +4345,7 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
>>> {
>>> struct iomap_ioend *ioend = wpc->wb_ctx;
>>> struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
>>> + ext4_lblk_t start, end, order_lblk, order_len;
>>>
>>> /*
>>> * After I/O completion, a worker needs to be scheduled when:
>>> @@ -4357,6 +4358,30 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
>>> test_opt(ioend->io_inode->i_sb, DATA_ERR_ABORT))
>>> ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
>>>
>>> + /*
>>> + * Mark the I/O as ordered. Ordered I/O requires separate endio
>>> + * handling and must not be merged with regular I/O operations.
>>> + */
>>> + order_len = READ_ONCE(ei->i_ordered_len);
>>> + if (order_len) {
>>> + /*
>>> + * Pair with smp_store_release() in ext4_block_zero_eof().
>>> + * Ensure we see the updated i_ordered_lblk that was written
>>> + * before the release store to i_ordered_len.
>>> + */
>>> + smp_rmb();
>>> + order_lblk = READ_ONCE(ei->i_ordered_lblk);
>>> + start = ioend->io_offset >> ioend->io_inode->i_blkbits;
>>> + end = EXT4_B_TO_LBLK(ioend->io_inode,
>>> + ioend->io_offset + ioend->io_size);
>>> +
>>> + if (start <= order_lblk && end >= order_lblk + order_len) {
>>
>> Hi Zhang,
>>
>> I guess this check is enough cause ordered_lblk and ordered_len will
>> always be contained in a single block.
>
> Yeah.
>
>>
>>> + ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
>>> + ioend->io_private = (void *)EXT4_IOMAP_IOEND_ORDER_IO;
>>> + ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
>>
>> FWIU, we are wanting the ordered IO to not be merged and submitted asap
>> since we want to wake up the waiters. Is there any other reason?
>
> My original intention was to prevent the loss of the
> EXT4_IOMAP_IOEND_ORDER_IO flag during worker processing triggered by I/O
> completion, which could be caused by merging an ordered ioend with a
> normal ioend. In patch 19, we need to determine the flag to update
> i_disksize to the correct position.
>
>>
>> Adding the boundary in ->writeback_submit() only affects
>> iomap_ioend_can_merge() which happens after we have woken up the waiters
>> and deferred the IO to the wq. We ideally want it affect
>> iomap_can_add_to_ioend() ie we need to add IOMAP_F_BOUNDARY in
>> ->writeback_range().
>
> IIUC, merging into the same ioend during the submission stage doesn't
> seem to cause any problems.
>
>>
>> Secondly, I don't think boundary is the right flag here. It ensures
>> that everything before the ordered iomap gets submitted and the ordered
>> iomap starts a new ioend. This can still keep getting merged with the
>> newer ioends untils we decide to submit the IO, which can delay waking
>> up the waiters. If we really want the "no merge" behavior, we'll have to
>> do something like [1] (Check the 2 NOMERGE flag patches).
>
> Yeah, IOMAP_IOEND_BOUNDARY appears to be just a one-way barrier and
> still cannot prevent merging. I missed this, thank you for pointing this
> out. However, I think perhaps we should change iomap_ioend_can_merge()
> to check the iomap_ioend->io_private. Something like:
>
> if (ioend->io_private || next->io_private)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ioend->io_private != next->io_private
> return false;
>
> What do you think?
>
> Thanks,
> Yi.
^ permalink raw reply
* Re: [PATCH] ext4: fix quota accounting WARN in bigalloc punch hole
From: Qiliang Yuan @ 2026-05-30 7:47 UTC (permalink / raw)
To: yi.zhang
Cc: tytso, adilger.kernel, jack, ritesh.list, ojaswin, libaokun,
linux-ext4, linux-kernel, yzjaurora
In-Reply-To: <9cf8569a-4f79-400b-ad80-197decec4b15@huaweicloud.com>
On 5/29/2026 6:37 PM, Zhang Yi wrote:
> On 5/29/2026 4:34 PM, Qiliang Yuan wrote:
>> Thanks for the review! Let me explain the issue with your specific
>> example.
>
> Thanks for the explanation, some comments below.
>
>> ext4_remove_blocks() sees that block 0's cluster has a pending
>> reservation → calls ext4_rereserve_cluster() → dquot_reclaim_block()
>> tries to move 16KB from dqb_curspace to dqb_rsvspace. But
>> dqb_curspace may already be insufficient (depending on whether
>> other allocs/frees have happened), triggering:
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> How exactly did it happen? All data cluster allocations are precisely
> calculated. In addition, the current example cannot reproduce the issue
> you encountered.
>
>> The key point is: pending from __revise_pending() indicates clusters
>> that *still contain delayed blocks outside the newly inserted extent*.
>> These clusters' quota reservations must be preserved — get_rsvd()
>
> Whether to reserve a quota is determined not by whether there are
> delalloc extents left, but by whether there are only pure delalloc
> extents in this cluster range.
You are right about the simple case — I missed the ext4_clu_alloc_state()
logic in ext4_insert_delayed_blocks(). When a cluster already has
allocated blocks, subsequent delalloc within that cluster does not
increase rsvspace, so releasing rsvspace after DIO allocation is
correct there.
However, the syzkaller reproducer's scenario is more complex. It uses
three operations:
1. DIO write at sub-block-aligned offset 0x6400 with RWF_DSYNC
(pwritev2, 0x140000 bytes at offset 0x6400)
2. Delalloc single-byte write at offset 0x8080c61 (far away cluster)
3. PUNCH_HOLE from offset 0x7f covering 0x8000c61 bytes
The DIO and delalloc are in different, distant clusters. The
sub-block-aligned DIO may cause it to fall through a different
code path — possibly zeroing or partial writeback within the cluster
boundary.
I'll instrument a kernel to trace the exact rsvspace/curspace
transitions and get back with concrete numbers showing where the
mismatch occurs. This should settle whether the root cause is
in resv_used += pending or something else.
Thanks for the detailed review!
--
Qiliang Yuan
^ permalink raw reply
* Re: [PATCH v4 18/23] ext4: wait for ordered I/O in the iomap buffered I/O path
From: Zhang Yi @ 2026-05-30 7:22 UTC (permalink / raw)
To: Ojaswin Mujoo
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <ahcUeq_IA_X988Ca@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>
Hi, Ojaswin!
On 5/27/2026 11:58 PM, Ojaswin Mujoo wrote:
> On Mon, May 11, 2026 at 03:23:38PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> For append writes, wait for ordered I/O to complete before updating
>> i_disksize. This ensures that zeroed data is flushed to disk before the
>> metadata update, preventing stale data from being exposed during
>> unaligned post-EOF append writes.
>>
>> Suggested-by: Jan Kara <jack@suse.cz>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>> fs/ext4/ext4.h | 11 +++++++
>> fs/ext4/inode.c | 80 ++++++++++++++++++++++++++++++++++++++++++-----
>> fs/ext4/page-io.c | 60 +++++++++++++++++++++++++++++++++++
>> fs/ext4/super.c | 23 ++++++++++----
>> 4 files changed, 161 insertions(+), 13 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 078feda47e36..9ce2128eea3e 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1195,6 +1195,15 @@ struct ext4_inode_info {
>> #ifdef CONFIG_FS_ENCRYPTION
>> struct fscrypt_inode_info *i_crypt_info;
>> #endif
>> +
>> + /*
>> + * Track ordered zeroed data during post-EOF append writes, fallocate,
>> + * and truncate-up operations. These parameters are used only in the
>> + * iomap buffered I/O path.
>> + */
>> + ext4_lblk_t i_ordered_lblk;
>> + ext4_lblk_t i_ordered_len;
>> + wait_queue_head_t i_ordered_wq;
>> };
>>
>> /*
>> @@ -3858,6 +3867,8 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
>> __u64 len, __u64 *moved_len);
>>
>> /* page-io.c */
>> +#define EXT4_IOMAP_IOEND_ORDER_IO 1UL /* This I/O is an ordered one */
>> +
>> extern int __init ext4_init_pageio(void);
>> extern void ext4_exit_pageio(void);
>> extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index e013aeb03d7b..11fb369efeb1 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4345,6 +4345,7 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
>> {
>> struct iomap_ioend *ioend = wpc->wb_ctx;
>> struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
>> + ext4_lblk_t start, end, order_lblk, order_len;
>>
>> /*
>> * After I/O completion, a worker needs to be scheduled when:
>> @@ -4357,6 +4358,30 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
>> test_opt(ioend->io_inode->i_sb, DATA_ERR_ABORT))
>> ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
>>
>> + /*
>> + * Mark the I/O as ordered. Ordered I/O requires separate endio
>> + * handling and must not be merged with regular I/O operations.
>> + */
>> + order_len = READ_ONCE(ei->i_ordered_len);
>> + if (order_len) {
>> + /*
>> + * Pair with smp_store_release() in ext4_block_zero_eof().
>> + * Ensure we see the updated i_ordered_lblk that was written
>> + * before the release store to i_ordered_len.
>> + */
>> + smp_rmb();
>> + order_lblk = READ_ONCE(ei->i_ordered_lblk);
>> + start = ioend->io_offset >> ioend->io_inode->i_blkbits;
>> + end = EXT4_B_TO_LBLK(ioend->io_inode,
>> + ioend->io_offset + ioend->io_size);
>> +
>> + if (start <= order_lblk && end >= order_lblk + order_len) {
>
> Hi Zhang,
>
> I guess this check is enough cause ordered_lblk and ordered_len will
> always be contained in a single block.
Yeah.
>
>> + ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
>> + ioend->io_private = (void *)EXT4_IOMAP_IOEND_ORDER_IO;
>> + ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
>
> FWIU, we are wanting the ordered IO to not be merged and submitted asap
> since we want to wake up the waiters. Is there any other reason?
My original intention was to prevent the loss of the
EXT4_IOMAP_IOEND_ORDER_IO flag during worker processing triggered by I/O
completion, which could be caused by merging an ordered ioend with a
normal ioend. In patch 19, we need to determine the flag to update
i_disksize to the correct position.
>
> Adding the boundary in ->writeback_submit() only affects
> iomap_ioend_can_merge() which happens after we have woken up the waiters
> and deferred the IO to the wq. We ideally want it affect
> iomap_can_add_to_ioend() ie we need to add IOMAP_F_BOUNDARY in
> ->writeback_range().
IIUC, merging into the same ioend during the submission stage doesn't
seem to cause any problems.
>
> Secondly, I don't think boundary is the right flag here. It ensures
> that everything before the ordered iomap gets submitted and the ordered
> iomap starts a new ioend. This can still keep getting merged with the
> newer ioends untils we decide to submit the IO, which can delay waking
> up the waiters. If we really want the "no merge" behavior, we'll have to
> do something like [1] (Check the 2 NOMERGE flag patches).
Yeah, IOMAP_IOEND_BOUNDARY appears to be just a one-way barrier and
still cannot prevent merging. I missed this, thank you for pointing this
out. However, I think perhaps we should change iomap_ioend_can_merge()
to check the iomap_ioend->io_private. Something like:
if (ioend->io_private || next->io_private)
return false;
What do you think?
Thanks,
Yi.
>
>> + }
>> + }
>> +
>> return iomap_ioend_writeback_submit(wpc, error);
>> }
>>
>> @@ -4746,8 +4771,10 @@ static int ext4_iomap_submit_zero_block(struct inode *inode,
>> loff_t from, loff_t end)
>> {
>> struct address_space *mapping = inode->i_mapping;
>> + struct ext4_inode_info *ei = EXT4_I(inode);
>> struct folio *folio;
>> bool do_submit = false;
>> + int ret;
>>
>> folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
>> if (IS_ERR(folio))
>> @@ -4757,14 +4784,50 @@ static int ext4_iomap_submit_zero_block(struct inode *inode,
>> folio_wait_writeback(folio);
>> WARN_ON_ONCE(folio_test_writeback(folio));
>>
>> - if (likely(folio_test_dirty(folio)))
>> + /*
>> + * Mark the ordered range. It will be cleared upon I/O completion
>> + * in ext4_iomap_end_bio(). Any operation that extends i_disksize
>> + * (including append write end io past the zeroed boundary,
>> + * truncate up and append fallocate) must wait for this I/O to
>> + * complete before updating i_disksize.
>> + *
>> + * When multiple overlapping unaligned EOF writes are in flight, we
>> + * only need to track and wait for the first one. Subsequent writes
>> + * will zero the gap in memory and ensure that the zeroed data is
>> + * written out along with the valid data in the same block before
>> + * i_disksize is updated.
>> + */
>> + if (likely(folio_test_dirty(folio) &&
>> + READ_ONCE(ei->i_ordered_len) == 0)) {
>> + WRITE_ONCE(ei->i_ordered_lblk,
>> + from >> inode->i_blkbits);
>> + /*
>> + * Pairs with smp_rmb() in ext4_iomap_writeback_submit()
>> + * and ext4_iomap_wb_ordered_wait(). Ensure the updated
>> + * i_ordered_lblk is visible when i_ordered_len becomes
>> + * non-zero.
>> + */
>> + smp_store_release(&ei->i_ordered_len, 1);
>> do_submit = true;
>> + }
>> folio_unlock(folio);
>> folio_put(folio);
>>
>> /* Submit zeroed block. */
>> - if (do_submit)
>> - return filemap_fdatawrite_range(mapping, from, end - 1);
>> + if (do_submit) {
>> + ret = filemap_fdatawrite_range(mapping, from, end - 1);
>> + if (ret) {
>> + /*
>> + * Pairs with wait_event() in
>> + * ext4_iomap_wb_ordered_wait(). Ensure
>> + * i_ordered_len = 0 is visible before waking up
>> + * waiters.
>> + */
>> + smp_store_release(&ei->i_ordered_len, 0);
>> + wake_up_all(&ei->i_ordered_wq);
>> + return ret;
>> + }
>> + }
>> return 0;
>> }
>>
>> @@ -4827,10 +4890,13 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
>> * data=ordered mode. We submit zeroed range directly here.
>> * Do not wait for I/O completion for performance.
>> *
>> - * TODO: Any operation that extends i_disksize (including
>> - * append write end io past the zeroed boundary, truncate up,
>> - * and append fallocate) must wait for the relevant I/O to
>> - * complete before updating i_disksize.
>> + * The end_io handler ext4_iomap_wb_ordered_wait() will wait
>> + * for I/O completion before updating i_disksize if the write
>> + * extends beyond the zeroed boundary.
>> + *
>> + * TODO: Any other operation that extends i_disksize
>> + * (including truncate up and append fallocate) must wait for
>> + * the relevant I/O to complete before updating i_disksize.
>> */
>> } else if (ext4_inode_buffered_iomap(inode)) {
>> err = ext4_iomap_submit_zero_block(inode, from, end);
>> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
>> index 3050c887329f..ad05ebb49bf6 100644
>> --- a/fs/ext4/page-io.c
>> +++ b/fs/ext4/page-io.c
>> @@ -613,6 +613,46 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
>> return 0;
>> }
>>
>> +/*
>> + * If the old disk size is not block size aligned and the current
>> + * writeback range is entirely beyond the old EOF block, we should
>> + * wait for the zeroed data written in ext4_block_zero_eof() to be
>> + * written out, otherwise, it may expose stale data in that block.
>> + */
>> +static void ext4_iomap_wb_ordered_wait(struct inode *inode,
>> + loff_t pos, loff_t end)
>> +{
>> + struct ext4_inode_info *ei = EXT4_I(inode);
>> + unsigned int blocksize = i_blocksize(inode);
>> + loff_t disksize = READ_ONCE(ei->i_disksize);
>> + ext4_lblk_t order_lblk, order_len;
>> +
>> + /*
>> + * Waiting for ordered I/O is unnecessary when:
>> + * - The on-disk size is block-aligned (no stale data exists).
>> + * - The write start is within the block of the old EOF
>> + * (overwriting, or appending to a block that already contains
>> + * valid data).
>> + */
>> + if (!(disksize & (blocksize - 1)) ||
>> + pos < round_up(disksize, blocksize))
>> + return;
>> +
>> + order_len = READ_ONCE(ei->i_ordered_len);
>> + if (!order_len)
>> + return;
>> +
>> + /*
>> + * Pair with smp_store_release() in ext4_iomap_end_bio() and
>> + * ext4_block_zero_eof(). Ensure we see the updated i_ordered_lblk
>> + * that was written before the release store to i_ordered_len.
>> + */
>> + smp_rmb();
>> + order_lblk = READ_ONCE(ei->i_ordered_lblk);
>> + if ((pos >> inode->i_blkbits) >= order_lblk + order_len)
>> + wait_event(ei->i_ordered_wq, READ_ONCE(ei->i_ordered_len) == 0);
>> +}
>> +
>> static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
>> loff_t end)
>> {
>> @@ -656,6 +696,9 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
>> goto out;
>> }
>>
>> + /* Wait ordered zero data to be written out. */
>> + ext4_iomap_wb_ordered_wait(inode, pos, pos + size);
>> +
>> /* We may need to convert one extent and dirty the inode. */
>> credits = ext4_chunk_trans_blocks(inode,
>> EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits));
>> @@ -717,8 +760,25 @@ void ext4_iomap_end_bio(struct bio *bio)
>> struct inode *inode = ioend->io_inode;
>> struct ext4_inode_info *ei = EXT4_I(inode);
>> struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>> + unsigned long io_mode = (unsigned long)ioend->io_private;
>> unsigned long flags;
>>
>> + /*
>> + * This is an ordered I/O, clear the ordered range set in
>> + * ext4_block_zero_eof() and wake up all waiters that will update
>> + * the inode i_disksize.
>> + */
>> + if (io_mode == EXT4_IOMAP_IOEND_ORDER_IO) {
>> + /*
>> + * Pairs with wait_event() in ext4_iomap_wb_ordered_wait().
>> + * Ensure i_ordered_len = 0 is visible before waking up
>> + * waiters.
>> + */
>> + smp_store_release(&ei->i_ordered_len, 0);
>> + wake_up_all(&ei->i_ordered_wq);
>> + goto defer;
>> + }
>> +
>> /* Needs to convert unwritten extents or update the i_disksize. */
>> if ((ioend->io_flags & IOMAP_IOEND_UNWRITTEN) ||
>> ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 62bfe05a64bc..9c0a00e716f3 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -1444,6 +1444,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
>> ext4_fc_init_inode(&ei->vfs_inode);
>> spin_lock_init(&ei->i_fc_lock);
>> mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
>> + ei->i_ordered_lblk = 0;
>> + ei->i_ordered_len = 0;
>> + init_waitqueue_head(&ei->i_ordered_wq);
>> return &ei->vfs_inode;
>> }
>>
>> @@ -1480,12 +1483,20 @@ static void ext4_destroy_inode(struct inode *inode)
>> dump_stack();
>> }
>>
>> - if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
>> - WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
>> - ext4_msg(inode->i_sb, KERN_ERR,
>> - "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!",
>> - inode->i_ino, EXT4_I(inode),
>> - EXT4_I(inode)->i_reserved_data_blocks);
>> + if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS)) {
>> + if (WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
>> + ext4_msg(inode->i_sb, KERN_ERR,
>> + "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!",
>> + inode->i_ino, EXT4_I(inode),
>> + EXT4_I(inode)->i_reserved_data_blocks);
>> +
>> + if (WARN_ON_ONCE(EXT4_I(inode)->i_ordered_len))
>> + ext4_msg(inode->i_sb, KERN_ERR,
>> + "Inode %llu (%p): i_ordered_lblk (%u) and i_ordered_len (%u) not cleared!",
>> + inode->i_ino, EXT4_I(inode),
>> + EXT4_I(inode)->i_ordered_lblk,
>> + EXT4_I(inode)->i_ordered_len);
>> + }
>> }
>>
>> static void ext4_shutdown(struct super_block *sb)
>> --
>> 2.52.0
>>
^ permalink raw reply
* [tytso-ext4:dev 1/5] ERROR: modpost: "ext4fs_dirhash" [fs/ext4/ext4-test.ko] undefined!
From: kernel test robot @ 2026-05-30 2:53 UTC (permalink / raw)
To: Guan-Chun Wu; +Cc: oe-kbuild-all, linux-ext4, Theodore Ts'o, Chen Hao Yu
tree: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
head: 87280ac61ae6ef27fc6ee78b733689754e425592
commit: a4841d91a7f57d38d3a63c18b4ce68ee5e06829b [1/5] ext4: add Kunit coverage for directory hash computation
config: x86_64-rhel-9.4-kunit (https://download.01.org/0day-ci/archive/20260530/202605301009.iWa1l3AV-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260530/202605301009.iWa1l3AV-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605301009.iWa1l3AV-lkp@intel.com/
All errors (new ones prefixed by >>, old ones prefixed by <<):
>> ERROR: modpost: "ext4fs_dirhash" [fs/ext4/ext4-test.ko] undefined!
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* Re: [PATCH v4 17/23] ext4: submit zeroed post-EOF data immediately in the iomap buffered I/O path
From: Zhang Yi @ 2026-05-30 2:53 UTC (permalink / raw)
To: Ojaswin Mujoo
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <ahb0fqVCige08_--@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>
On 5/27/2026 9:41 PM, Ojaswin Mujoo wrote:
> On Mon, May 11, 2026 at 03:23:37PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> In the generic buffered_head I/O path, we rely on the data=order mode to
>> ensure that the zeroed EOF block data is written before updating
>> i_disksize, thus preventing stale data from being exposed.
>>
>> However, the iomap buffered I/O path cannot use this mechanism. Instead,
>> we issue the I/O immediately after performing the zero operation
>> (without synchronous waiting for performance). This can reduce the risk
>> of exposing stale data, but it does not guarantee that the zero data
>> will be flushed to disk before the metadata of i_disksize is updated.
>> The subsequent patches will wait for this I/O to complete before
>> updating i_disksize.
>>
>> Suggested-by: Jan Kara <jack@suse.cz>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>
> I think we discussed that we may not need to do this [1] but I guess
> you've decided to make the tradeoff of issuing the IO to avoid having to
> wait for bg flush to complete the tail page zeroing
>
Yes. For truncate up and append fallocate, originally i_disksize would
be updated immediately, and the change would be persisted via the
journal within default 5 seconds. But now, if the tail page is not
committed immediately, the update to i_disksize will be delayed by about
30 seconds, and persistence will be postponed to around 35 seconds. I'm
not sure what impact this change might have — I just don't really want
to introduce it.
For normal append writes, the impact is minimal, unless we call
sync_range to sync the portion of data that extends beyond EOF.
In addition, if the zeroed page is not issued here immediately, the
logic will become more complex because we need to more careful about the
order of write-back IOs to prevent deadlock issues caused by mutual
waiting.
> However, I think one side effect might be many threads calling the
> writeback mechanism to issue zero IOs which might not scale well. I
> don't know if it'll be a huge problem though, I guess it's a sort of
> thing we will have to deal with in case we see it in real world
> workloads.
>
I agree with you. However, I suspect that unless we run some specific
benchmark tests, it should be difficult to encounter a large number of
post-EOF append writes and truncate up operations in real-world usage
scenarios — and I haven't come across such scenarios yet. For
simplicity, I'd like to proceed with this implementation for now. If we
do run into actual problems later, we can consider not issuing I/O
directly here, but instead: 1) find the ordered block in
ext4_sync_file() and perform writeback; 2) ensure writeback ordering
for normal background writeback as well — otherwise, there is a risk of
deadlock (mutual waiting). What do you think?
Cheers,
Yi.
> [1] https://lore.kernel.org/linux-ext4/yhy4cgc4fnk7tzfejuhy6m6ljo425ebpg6khss6vtvpidg6lyp@5xcyabxrl6zm/
>
>> ---
>> fs/ext4/inode.c | 66 ++++++++++++++++++++++++++++++++++++++++---------
>> 1 file changed, 55 insertions(+), 11 deletions(-)
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 239d387ffaf2..e013aeb03d7b 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4742,6 +4742,32 @@ static int ext4_block_zero_range(struct inode *inode,
>> zero_written);
>> }
>>
>> +static int ext4_iomap_submit_zero_block(struct inode *inode,
>> + loff_t from, loff_t end)
>> +{
>> + struct address_space *mapping = inode->i_mapping;
>> + struct folio *folio;
>> + bool do_submit = false;
>> +
>> + folio = filemap_lock_folio(mapping, from >> PAGE_SHIFT);
>> + if (IS_ERR(folio))
>> + /* Already writeback and clear? */
>> + return PTR_ERR(folio) == -ENOENT ? 0 : PTR_ERR(folio);
>> +
>> + folio_wait_writeback(folio);
>> + WARN_ON_ONCE(folio_test_writeback(folio));
>> +
>> + if (likely(folio_test_dirty(folio)))
>> + do_submit = true;
>> + folio_unlock(folio);
>> + folio_put(folio);
>> +
>> + /* Submit zeroed block. */
>> + if (do_submit)
>> + return filemap_fdatawrite_range(mapping, from, end - 1);
>> + return 0;
>> +}
>> +
>> /*
>> * Zero out a mapping from file offset 'from' up to the end of the block
>> * which corresponds to 'from' or to the given 'end' inside this block.
>> @@ -4765,8 +4791,10 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
>> if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
>> return 0;
>>
>> - if (length > blocksize - offset)
>> + if (length > blocksize - offset) {
>> length = blocksize - offset;
>> + end = from + length;
>> + }
>>
>> err = ext4_block_zero_range(inode, from, length,
>> &did_zero, &zero_written);
>> @@ -4781,18 +4809,34 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
>> * TODO: In the iomap path, handle this by updating i_disksize to
>> * i_size after the zeroed data has been written back.
>> */
>> - if (ext4_should_order_data(inode) &&
>> - did_zero && zero_written && !IS_DAX(inode)) {
>> - handle_t *handle;
>> + if (did_zero && zero_written && !IS_DAX(inode)) {
>> + if (ext4_should_order_data(inode)) {
>> + handle_t *handle;
>>
>> - handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
>> - if (IS_ERR(handle))
>> - return PTR_ERR(handle);
>> + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
>> + if (IS_ERR(handle))
>> + return PTR_ERR(handle);
>>
>> - err = ext4_jbd2_inode_add_write(handle, inode, from, length);
>> - ext4_journal_stop(handle);
>> - if (err)
>> - return err;
>> + err = ext4_jbd2_inode_add_write(handle, inode, from,
>> + length);
>> + ext4_journal_stop(handle);
>> + if (err)
>> + return err;
>> + /*
>> + * inodes using the iomap buffered I/O path do not use the
>> + * data=ordered mode. We submit zeroed range directly here.
>> + * Do not wait for I/O completion for performance.
>> + *
>> + * TODO: Any operation that extends i_disksize (including
>> + * append write end io past the zeroed boundary, truncate up,
>> + * and append fallocate) must wait for the relevant I/O to
>> + * complete before updating i_disksize.
>> + */
>> + } else if (ext4_inode_buffered_iomap(inode)) {
>> + err = ext4_iomap_submit_zero_block(inode, from, end);
>> + if (err)
>> + return err;
>> + }
>> }
>>
>> return 0;
>> --
>> 2.52.0
>>
^ permalink raw reply
* Re: [PATCH v4 09/23] ext4: implement writeback path using iomap
From: Zhang Yi @ 2026-05-30 1:21 UTC (permalink / raw)
To: Ojaswin Mujoo, Zhang Yi
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yangerkun,
yukuai
In-Reply-To: <ahmxmrWzIWRR9PUm@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>
On 5/29/2026 11:32 PM, Ojaswin Mujoo wrote:
> On Fri, May 29, 2026 at 10:12:12PM +0800, Zhang Yi wrote:
>> Hi, Ojaswin!
>>
>> On 5/27/2026 8:49 PM, Ojaswin Mujoo wrote:
>>> On Mon, May 11, 2026 at 03:23:29PM +0800, Zhang Yi wrote:
>>>> From: Zhang Yi <yi.zhang@huawei.com>
>>>>
>>>> Add the iomap writeback path for ext4 buffered I/O. This introduces:
>>>>
>>>> - ext4_iomap_writepages(): the main writeback entry point.
>>>> - ext4_writeback_ops: a new iomap_writeback_ops instance to handle
>>>> block mapping and I/O submission.
>>>> - A new end I/O worker for converting unwritten extents, updating file
>>>> size, and handling DATA_ERR_ABORT after I/O completion.
>>>>
>>>> Core implementation details:
>>>>
>>>> - ->writeback_range() callback
>>>> Calls ext4_iomap_map_writeback_range() to query the longest range of
>>>> existing mapped extents. For performance, when a block range is not
>>>> yet allocated, it allocates based on the writeback length and delalloc
>>>> extent length, rather than allocating for a single folio at a time.
>>>> The folio is then added to an iomap_ioend instance.
>>>>
>>>> - ->writeback_submit() callback
>>>> Registers ext4_iomap_end_bio() as the end bio callback. This callback
>>>> schedules a worker to handle:
>>>> - Unwritten extent conversion.
>>>> - i_disksize update after data is written back.
>>>> - Journal abort on writeback I/O failure.
>>>
>>> Hi Zhang, the changes look good. I have a few comments below:
>>>>
>>>> Key changes and considerations:
>>>>
>>>> - Append write and unwritten extents
>>>> Since data=ordered mode is not used to prevent stale data exposure
>>>> during append writebacks, new blocks are always allocated as unwritten
>>>> extents (i.e. always enable dioread_nolock), and i_disksize update is
>>>> postponed until I/O completion.
>>>
>>> Makes sense.
>>>
>>>> Additionally, the deadlock that the
>>>> reserve handle was expected to resolve does not occur anymore.
>>>
>>> I guess this is since we don't use ordered data so we can't block on
>>> starting a txn in end io.
>>
>> Yep.
>>
>>>
>>>> Therefore, the end I/O worker can start a normal journal handle
>>>> instead of a reserve handle when converting unwritten extents.
>>>>
>>>> - Lock ordering
>>>> The ->writeback_range() callback runs under the folio lock, requiring
>>>> the journal handle to be started under that same lock. This reverses
>>>> the order compared to the buffer_head writeback path. The lock ordering
>>>> documentation in super.c has been updated accordingly.
>>>>
>>>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>>>> ---
>>>> fs/ext4/ext4.h | 4 +
>>>> fs/ext4/inode.c | 208 +++++++++++++++++++++++++++++++++++++++++-
>>>> fs/ext4/page-io.c | 126 +++++++++++++++++++++++++
>>>> fs/ext4/super.c | 7 +-
>>>> fs/iomap/ioend.c | 3 +-
>>>> include/linux/iomap.h | 1 +
>>>> 6 files changed, 346 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>>>> index 4832e7f7db82..078feda47e36 100644
>>>> --- a/fs/ext4/ext4.h
>>>> +++ b/fs/ext4/ext4.h
>>>> @@ -1173,6 +1173,8 @@ struct ext4_inode_info {
>>>> */
>>>> struct list_head i_rsv_conversion_list;
>>>> struct work_struct i_rsv_conversion_work;
>>>> + struct list_head i_iomap_ioend_list;
>>>> + struct work_struct i_iomap_ioend_work;
>>>> /*
>>>> * Transactions that contain inode's metadata needed to complete
>>>> @@ -3870,6 +3872,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
>>>> size_t len);
>>>> extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
>>>> extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
>>>> +extern void ext4_iomap_end_io(struct work_struct *work);
>>>> +extern void ext4_iomap_end_bio(struct bio *bio);
>>>> /* mmp.c */
>>>> extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
>>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>>> index 1ae7d3f4a1c8..a80195bd6f20 100644
>>>> --- a/fs/ext4/inode.c
>>>> +++ b/fs/ext4/inode.c
>>>> @@ -44,6 +44,7 @@
>>>> #include <linux/iversion.h>
>>>> #include "ext4_jbd2.h"
>>>> +#include "ext4_extents.h"
>>>> #include "xattr.h"
>>>> #include "acl.h"
>>>> #include "truncate.h"
>>>> @@ -4120,10 +4121,215 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
>>>> iomap_bio_readahead(rac, &ext4_iomap_buffered_read_ops);
>>>> }
>>>> +static int ext4_iomap_map_one_extent(struct inode *inode,
>>>> + struct ext4_map_blocks *map)
>>>> +{
>>>> + struct extent_status es;
>>>> + handle_t *handle = NULL;
>>>> + int credits, map_flags;
>>>> + int retval;
>>>> +
>>>> + credits = ext4_chunk_trans_blocks(inode, map->m_len);
>>>> + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
>>>> + if (IS_ERR(handle))
>>>> + return PTR_ERR(handle);
>>>> +
>>>> + map->m_flags = 0;
>>>> + /*
>>>> + * It is necessary to look up extent and map blocks under i_data_sem
>>>> + * in write mode, otherwise, the delalloc extent may become stale
>>>> + * during concurrent truncate operations.
>>>> + */
>>>> + ext4_fc_track_inode(handle, inode);
>>>> + down_write(&EXT4_I(inode)->i_data_sem);
>>>> + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
>>>> + retval = es.es_len - (map->m_lblk - es.es_lblk);
>>>> + map->m_len = min_t(unsigned int, retval, map->m_len);
>>>> +
>>>> + if (ext4_es_is_delayed(&es)) {
>>>
>>> I understand that it is okay for us to rely on extent status ==
>>> delayed here because we never reclaim delayed es entries and hence we
>>> are sure to not skip any delayed block allocations here.
>>
>> Yeah, right.
>>
>>>
>>>> + map->m_flags |= EXT4_MAP_DELAYED;
>>>> + trace_ext4_da_write_pages_extent(inode, map);
>>>> + /*
>>>> + * Call ext4_map_create_blocks() to allocate any
>>>> + * delayed allocation blocks. It is possible that
>>>> + * we're going to need more metadata blocks, however
>>>> + * we must not fail because we're in writeback and
>>>> + * there is nothing we can do so it might result in
>>>> + * data loss. So use reserved blocks to allocate
>>>> + * metadata if possible.
>>>> + */
>>>> + map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
>>>> + EXT4_GET_BLOCKS_METADATA_NOFAIL |
>>>> + EXT4_EX_NOCACHE;
>>>> +
>>>> + retval = ext4_map_create_blocks(handle, inode, map,
>>>> + map_flags);
>>>> + if (retval > 0)
>>>> + ext4_fc_track_range(handle, inode, map->m_lblk,
>>>> + map->m_lblk + map->m_len - 1);
>>>> + goto out;
>>>> + } else if (unlikely(ext4_es_is_hole(&es)))
>>>
>>> Now that you've fixed the partial invalidate in iomap (patch 12/23)
>>> can we still hit this hole case?
>>
>> Theoretically, no hole should be encountered; this is just defensive
>> programming.
>>
>>>
>>>> + goto out;
>>>> +
>>>> + /* Found written or unwritten extent. */
>>>> + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
>>>> + map->m_flags = ext4_es_is_written(&es) ?
>>>> + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
>>>> + goto out;
>>>> + }
>>>> +
>>>> + retval = ext4_map_query_blocks(handle, inode, map, EXT4_EX_NOCACHE);
>>>> +out:
>>>> + up_write(&EXT4_I(inode)->i_data_sem);
>>>> + ext4_journal_stop(handle);
>>>> + return retval < 0 ? retval : 0;
>>>> +}
>>>> +
>>>> +static int ext4_iomap_map_writeback_range(struct iomap_writepage_ctx *wpc,
>>>> + loff_t offset, unsigned int dirty_len)
>>>> +{
>>>> + struct inode *inode = wpc->inode;
>>>> + struct super_block *sb = inode->i_sb;
>>>> + struct journal_s *journal = EXT4_SB(sb)->s_journal;
>>>> + struct ext4_map_blocks map;
>>>> + unsigned int blkbits = inode->i_blkbits;
>>>> + unsigned int index = offset >> blkbits;
>>>> + unsigned int blk_end, blk_len;
>>>> + int ret;
>>>> +
>>>> + ret = ext4_emergency_state(sb);
>>>> + if (unlikely(ret))
>>>> + return ret;
>>>> +
>>>> + /* Check validity of the cached writeback mapping. */
>>>> + if (offset >= wpc->iomap.offset &&
>>>> + offset < wpc->iomap.offset + wpc->iomap.length &&
>>>> + ext4_iomap_valid(inode, &wpc->iomap))
>>>> + return 0;
>>>> +
>>>> + blk_len = dirty_len >> blkbits;
>>>> + blk_end = min_t(unsigned int, (wpc->wbc->range_end >> blkbits),
>>>> + (UINT_MAX - 1));
>>>
>>> This is an interesting idea. I'm just a bit worried when we have
>>> range_end == LLONG_MAX (bg flush) and we will always be trying to allocate
>>> MAX_WRITEPAGES, incase of a slightly fragmented FS, we might keep
>>> falling into slower mballoc criterias and might waste a lot of time
>>> scanning the groups.
>>
>> Actually, MAX_WRITEPAGES is not allocated every time; the allocated
>> length also depends on the length of data that has already been delayed
>> for writing, and the smaller value is taken. If the user has indeed
>
> Hmm so we take the blk_end based on range_end (which is LLONG_MAX for bg
> flusher) and then our blk_len will be set accordingly and would become a
> large number as well. Then we will set map.m_len based on this blk_len
> and MAX_WRITEPAGES. Am I missing something that clamps our m_len?
>
Please take a look at ext4_iomap_map_one_extent():
+ retval = es.es_len - (map->m_lblk - es.es_lblk);
+ map->m_len = min_t(unsigned int, retval, map->m_len);
In this case, m_len is truncated to the length of the delalloc extent.
Thanks,
Yi.
^ permalink raw reply
* Re: [PATCH v2] jbd2: Remove special jbd2 slabs
From: Tal Zussman @ 2026-05-29 21:09 UTC (permalink / raw)
To: Matthew Wilcox (Oracle), Theodore Tso
Cc: Jan Kara, linux-ext4, linux-fsdevel, Mike Rapoport (Microsoft),
Vlastimil Babka, Jan Kara
In-Reply-To: <20260528171413.1088143-1-willy@infradead.org>
On 5/28/26 1:14 PM, Matthew Wilcox (Oracle) wrote:
> When jbd2 was originally written, kmalloc() would not guarantee memory
> alignment for the requested objects. Since commit 59bb47985c1d in 2019,
> kmalloc has guaranteed natural alignment for power-of-two allocations.
> We can now remove the jbd2 special slabs and just use kmalloc() directly.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Reviewed-by: Jan Kara <jack@suse.cz>
> Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Acked-by: Vlastimil Babka (SUSE) <vbabka@kernel.org>
FWIW:
Reviewed-by: Tal Zussman <tz2294@columbia.edu>
^ permalink raw reply
* [PATCH v4 2/2] ext4: improve str2hashbuf by processing 4-byte chunks and removing function pointers
From: Guan-Chun Wu @ 2026-05-29 20:03 UTC (permalink / raw)
To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
Ojaswin Mujoo, Ritesh Harjani, Zhang Yi
Cc: linux-ext4, linux-kernel, edward062254, visitorckw,
david.laight.linux, Guan-Chun Wu
In-Reply-To: <20260529200308.889706-1-409411716@gms.tku.edu.tw>
The original byte-by-byte implementation with modulo checks is less
efficient. Refactor str2hashbuf_unsigned() and str2hashbuf_signed()
to process input in explicit 4-byte chunks instead of using a
modulus-based loop to emit words byte by byte.
Additionally, the use of function pointers for selecting the appropriate
str2hashbuf implementation has been removed. Instead, the functions are
directly invoked based on the hash type, eliminating the overhead of
dynamic function calls.
Performance test (x86_64, Intel Core i7-10700 @ 2.90GHz, average over 10000
runs, using kernel module for testing):
len | orig_s | new_s | orig_u | new_u
----+--------+-------+--------+-------
1 | 70 | 71 | 63 | 63
8 | 68 | 64 | 64 | 62
32 | 75 | 70 | 75 | 63
64 | 96 | 71 | 100 | 68
255 | 192 | 108 | 187 | 84
This change improves performance, especially for larger input sizes.
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
---
fs/ext4/hash.c | 64 +++++++++++++++++++++++++++++++++-----------------
1 file changed, 42 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 72645bd92..d6e33b1f3 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -9,6 +9,7 @@
#include <linux/unicode.h>
#include <linux/compiler.h>
#include <linux/bitops.h>
+#include <linux/unaligned.h>
#include "ext4.h"
#define DELTA 0x9E3779B9
@@ -141,21 +142,28 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num)
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
- val = pad;
if (len > num*4)
len = num * 4;
- for (i = 0; i < len; i++) {
- val = ((int) scp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
+
+ while (len >= 4) {
+ val = (scp[0] << 24) + (scp[1] << 16) + (scp[2] << 8) + scp[3];
+ *buf++ = val;
+ scp += 4;
+ len -= 4;
+ num--;
}
+
+ val = pad;
+
+ for (i = 0; i < len; i++)
+ val = scp[i] + (val << 8);
+
if (--num >= 0)
*buf++ = val;
+
while (--num >= 0)
*buf++ = pad;
+
}
static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
@@ -167,21 +175,28 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num)
pad = (__u32)len | ((__u32)len << 8);
pad |= pad << 16;
- val = pad;
if (len > num*4)
len = num * 4;
- for (i = 0; i < len; i++) {
- val = ((int) ucp[i]) + (val << 8);
- if ((i % 4) == 3) {
- *buf++ = val;
- val = pad;
- num--;
- }
+
+ while (len >= 4) {
+ val = get_unaligned_be32(ucp);
+ *buf++ = val;
+ ucp += 4;
+ len -= 4;
+ num--;
}
+
+ val = pad;
+
+ for (i = 0; i < len; i++)
+ val = ucp[i] + (val << 8);
+
if (--num >= 0)
*buf++ = val;
+
while (--num >= 0)
*buf++ = pad;
+
}
/*
@@ -205,8 +220,7 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
const char *p;
int i;
__u32 in[8], buf[4];
- void (*str2hashbuf)(const char *, int, __u32 *, int) =
- str2hashbuf_signed;
+ bool use_unsigned = false;
/* Initialize the default seed for the hash checksum functions */
buf[0] = 0x67452301;
@@ -232,12 +246,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
hash = dx_hack_hash_signed(name, len);
break;
case DX_HASH_HALF_MD4_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
+ use_unsigned = true;
fallthrough;
case DX_HASH_HALF_MD4:
p = name;
while (len > 0) {
- (*str2hashbuf)(p, len, in, 8);
+ if (use_unsigned)
+ str2hashbuf_unsigned(p, len, in, 8);
+ else
+ str2hashbuf_signed(p, len, in, 8);
half_md4_transform(buf, in);
len -= 32;
p += 32;
@@ -246,12 +263,15 @@ static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len,
hash = buf[1];
break;
case DX_HASH_TEA_UNSIGNED:
- str2hashbuf = str2hashbuf_unsigned;
+ use_unsigned = true;
fallthrough;
case DX_HASH_TEA:
p = name;
while (len > 0) {
- (*str2hashbuf)(p, len, in, 4);
+ if (use_unsigned)
+ str2hashbuf_unsigned(p, len, in, 4);
+ else
+ str2hashbuf_signed(p, len, in, 4);
TEA_transform(buf, in);
len -= 16;
p += 16;
--
2.34.1
^ permalink raw reply related
* [PATCH v4 1/2] ext4: add Kunit coverage for directory hash computation
From: Guan-Chun Wu @ 2026-05-29 20:03 UTC (permalink / raw)
To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
Ojaswin Mujoo, Ritesh Harjani, Zhang Yi
Cc: linux-ext4, linux-kernel, edward062254, visitorckw,
david.laight.linux, Guan-Chun Wu
In-Reply-To: <20260529200308.889706-1-409411716@gms.tku.edu.tw>
Introduce Kunit tests for fs/ext4/hash.c to verify ext4fs_dirhash()
across the legacy, half-MD4, and TEA hash variants.
The tests cover empty, seeded hashing, and non-ASCII name handling.
They also verify error paths, including invalid hash versions and
SipHash without a configured key, and check that the signed and
unsigned hash variants differ on non-ASCII input as expected.
When CONFIG_UNICODE is enabled, the tests further verify casefolded-name
hashing and the fallback behavior for invalid input.
Co-developed-by: Chen Hao Yu <edward062254@gmail.com>
Signed-off-by: Chen Hao Yu <edward062254@gmail.com>
Signed-off-by: Guan-Chun Wu <409411716@gms.tku.edu.tw>
---
fs/ext4/Makefile | 2 +-
fs/ext4/hash-test.c | 546 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/hash.c | 4 +
3 files changed, 551 insertions(+), 1 deletion(-)
create mode 100644 fs/ext4/hash-test.c
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 3baee4e7c..3f9fc0eb8 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -15,7 +15,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
ext4-test-objs += inode-test.o mballoc-test.o \
- extents-test.o
+ extents-test.o hash-test.o
obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-test.o
ext4-$(CONFIG_FS_VERITY) += verity.o
ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o
diff --git a/fs/ext4/hash-test.c b/fs/ext4/hash-test.c
new file mode 100644
index 000000000..a151b5684
--- /dev/null
+++ b/fs/ext4/hash-test.c
@@ -0,0 +1,546 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KUnit tests for ext4 directory hash computation.
+ */
+
+#include <kunit/test.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/unicode.h>
+#include "ext4.h"
+
+static void ext4_hash_init_fake_dir(struct inode *dir, struct super_block *sb)
+{
+ memset(sb, 0, sizeof(*sb));
+ memset(dir, 0, sizeof(*dir));
+ dir->i_sb = sb;
+ strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id));
+}
+
+static void ext4_hash_init_fake_dir_with_sbi(struct inode *dir,
+ struct super_block *sb,
+ struct ext4_sb_info *sbi)
+{
+ ext4_hash_init_fake_dir(dir, sb);
+ memset(sbi, 0, sizeof(*sbi));
+ sb->s_fs_info = sbi;
+ sbi->s_sb = sb;
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static void ext4_hash_init_fake_ext4_dir(struct ext4_inode_info *ei,
+ struct super_block *sb,
+ struct ext4_sb_info *sbi)
+{
+ memset(sb, 0, sizeof(*sb));
+ memset(ei, 0, sizeof(*ei));
+ memset(sbi, 0, sizeof(*sbi));
+
+ inode_init_once(&ei->vfs_inode);
+ ei->vfs_inode.i_sb = sb;
+ ei->vfs_inode.i_mode = S_IFDIR;
+
+ strscpy(sb->s_id, "kunit-ext4", sizeof(sb->s_id));
+ sb->s_fs_info = sbi;
+ sbi->s_sb = sb;
+}
+#endif
+
+struct ext4_dirhash_test_case {
+ const char *name;
+ u32 hash_version;
+ const char *input;
+ int len;
+ u32 seed[4];
+ bool use_seed;
+ u32 expected_hash;
+ u32 expected_minor_hash;
+};
+
+static const struct ext4_dirhash_test_case ext4_dirhash_test_cases[] = {
+ {
+ .name = "legacy_abc",
+ .hash_version = DX_HASH_LEGACY,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0x75afd992,
+ .expected_minor_hash = 0x00000000,
+ },
+ {
+ .name = "legacy_unsigned_abc",
+ .hash_version = DX_HASH_LEGACY_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0x75afd992,
+ .expected_minor_hash = 0x00000000,
+ },
+ {
+ .name = "half_md4_abc",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xd196a868,
+ .expected_minor_hash = 0xc420eb28,
+ },
+ {
+ .name = "half_md4_unsigned_abc",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xd196a868,
+ .expected_minor_hash = 0xc420eb28,
+ },
+ {
+ .name = "tea_abc",
+ .hash_version = DX_HASH_TEA,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xb1435ec4,
+ .expected_minor_hash = 0x3f7eaa0e,
+ },
+ {
+ .name = "tea_unsigned_abc",
+ .hash_version = DX_HASH_TEA_UNSIGNED,
+ .input = "abc",
+ .len = 3,
+ .use_seed = false,
+ .expected_hash = 0xb1435ec4,
+ .expected_minor_hash = 0x3f7eaa0e,
+ },
+ {
+ .name = "empty_half_md4",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "",
+ .len = 0,
+ .use_seed = false,
+ .expected_hash = 0xefcdab88,
+ .expected_minor_hash = 0x98badcfe,
+ },
+ {
+ .name = "half_md4_31bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "1234567890123456789012345678901",
+ .len = 31,
+ .use_seed = false,
+ .expected_hash = 0xc4db1f78,
+ .expected_minor_hash = 0xea23921b,
+ },
+ {
+ .name = "half_md4_32bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "12345678901234567890123456789012",
+ .len = 32,
+ .use_seed = false,
+ .expected_hash = 0xfa6cc63e,
+ .expected_minor_hash = 0x2f77bd1c,
+ },
+ {
+ .name = "half_md4_33bytes",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "123456789012345678901234567890123",
+ .len = 33,
+ .use_seed = false,
+ .expected_hash = 0xdc0c2dec,
+ .expected_minor_hash = 0x5ca23365,
+ },
+ {
+ .name = "half_md4_unsigned_31bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "1234567890123456789012345678901",
+ .len = 31,
+ .use_seed = false,
+ .expected_hash = 0xc4db1f78,
+ .expected_minor_hash = 0xea23921b,
+ },
+ {
+ .name = "half_md4_unsigned_32bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "12345678901234567890123456789012",
+ .len = 32,
+ .use_seed = false,
+ .expected_hash = 0xfa6cc63e,
+ .expected_minor_hash = 0x2f77bd1c,
+ },
+ {
+ .name = "half_md4_unsigned_33bytes",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "123456789012345678901234567890123",
+ .len = 33,
+ .use_seed = false,
+ .expected_hash = 0xdc0c2dec,
+ .expected_minor_hash = 0x5ca23365,
+ },
+ {
+ .name = "tea_15bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "123456789abcdef",
+ .len = 15,
+ .use_seed = false,
+ .expected_hash = 0xa562903a,
+ .expected_minor_hash = 0x6174a00f,
+ },
+ {
+ .name = "tea_16bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "1234567890abcdef",
+ .len = 16,
+ .use_seed = false,
+ .expected_hash = 0x8449f258,
+ .expected_minor_hash = 0x49a16d46,
+ },
+ {
+ .name = "tea_17bytes",
+ .hash_version = DX_HASH_TEA,
+ .input = "123456789abcdefgh",
+ .len = 17,
+ .use_seed = false,
+ .expected_hash = 0xf32ec10c,
+ .expected_minor_hash = 0x58ceae61,
+ },
+ {
+ .name = "half_md4_seeded",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "same-name",
+ .len = 9,
+ .seed = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 },
+ .use_seed = true,
+ .expected_hash = 0x8aebf604,
+ .expected_minor_hash = 0x66ce48fe,
+ },
+ {
+ .name = "half_md4_non_ascii_signed",
+ .hash_version = DX_HASH_HALF_MD4,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x8bab0498,
+ .expected_minor_hash = 0xc326632d,
+ },
+ {
+ .name = "half_md4_non_ascii_unsigned",
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0xbc48596e,
+ .expected_minor_hash = 0xde0fad41,
+ },
+ {
+ .name = "tea_non_ascii_signed",
+ .hash_version = DX_HASH_TEA,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x21e3a154,
+ .expected_minor_hash = 0x90112c3d,
+ },
+ {
+ .name = "tea_non_ascii_unsigned",
+ .hash_version = DX_HASH_TEA_UNSIGNED,
+ .input = "\x80\x81\x82\x83\x84",
+ .len = 5,
+ .use_seed = false,
+ .expected_hash = 0x9b648616,
+ .expected_minor_hash = 0x011dd507,
+ },
+};
+
+static void test_ext4fs_dirhash_vectors(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ int i;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+
+ ext4_hash_init_fake_dir(dir, sb);
+
+ for (i = 0; i < ARRAY_SIZE(ext4_dirhash_test_cases); i++) {
+ const struct ext4_dirhash_test_case *tc =
+ &ext4_dirhash_test_cases[i];
+ struct dx_hash_info hinfo;
+ int ret;
+
+ memset(&hinfo, 0, sizeof(hinfo));
+ hinfo.hash_version = tc->hash_version;
+ hinfo.seed = tc->use_seed ? (u32 *)tc->seed : NULL;
+
+ ret = ext4fs_dirhash(dir, tc->input, tc->len, &hinfo);
+
+ KUNIT_ASSERT_EQ_MSG(test, ret, 0, "case=%s", tc->name);
+ KUNIT_EXPECT_EQ_MSG(test, hinfo.hash, tc->expected_hash,
+ "case=%s", tc->name);
+ KUNIT_EXPECT_EQ_MSG(test, hinfo.minor_hash,
+ tc->expected_minor_hash,
+ "case=%s", tc->name);
+ }
+}
+
+static void test_ext4fs_dirhash_seed_changes_result(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ u32 seed[4] = { 0x11111111, 0x22222222, 0x33333333, 0x44444444 };
+ struct dx_hash_info plain = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ struct dx_hash_info seeded = {
+ .hash_version = DX_HASH_HALF_MD4,
+ .seed = seed,
+ };
+ int ret_plain, ret_seeded;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+
+ ext4_hash_init_fake_dir(dir, sb);
+
+ ret_plain = ext4fs_dirhash(dir, "same-name", 9, &plain);
+ ret_seeded = ext4fs_dirhash(dir, "same-name", 9, &seeded);
+
+ KUNIT_ASSERT_EQ(test, ret_plain, 0);
+ KUNIT_ASSERT_EQ(test, ret_seeded, 0);
+
+ KUNIT_EXPECT_TRUE(test,
+ plain.hash != seeded.hash ||
+ plain.minor_hash != seeded.minor_hash);
+}
+
+static void test_ext4fs_dirhash_invalid_version_returns_einval(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ struct ext4_sb_info *sbi;
+ struct dx_hash_info hinfo = {
+ .hash = 0xdeadbeef,
+ .minor_hash = 0xcafebabe,
+ .hash_version = DX_HASH_LAST + 1,
+ };
+ int ret;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+ KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+ ext4_hash_init_fake_dir_with_sbi(dir, sb, sbi);
+
+ ret = ext4fs_dirhash(dir, "abc", 3, &hinfo);
+
+ KUNIT_EXPECT_EQ(test, ret, -EINVAL);
+ KUNIT_EXPECT_EQ(test, hinfo.hash, 0);
+ KUNIT_EXPECT_EQ(test, hinfo.minor_hash, 0);
+}
+
+static void test_ext4fs_dirhash_siphash_without_key_returns_einval(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ struct ext4_sb_info *sbi;
+ struct dx_hash_info hinfo = {
+ .hash_version = DX_HASH_SIPHASH,
+ };
+ int ret;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+ KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+ ext4_hash_init_fake_dir_with_sbi(dir, sb, sbi);
+
+ ret = ext4fs_dirhash(dir, "abc", 3, &hinfo);
+
+ KUNIT_EXPECT_EQ(test, ret, -EINVAL);
+}
+
+static void test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii(struct kunit *test)
+{
+ struct super_block *sb;
+ struct inode *dir;
+ static const char input[] = "\x80\xff\x81\xfe\101bc";
+ struct dx_hash_info legacy_signed = {
+ .hash_version = DX_HASH_LEGACY,
+ };
+ struct dx_hash_info legacy_unsigned = {
+ .hash_version = DX_HASH_LEGACY_UNSIGNED,
+ };
+ struct dx_hash_info md4_signed = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ struct dx_hash_info md4_unsigned = {
+ .hash_version = DX_HASH_HALF_MD4_UNSIGNED,
+ };
+ struct dx_hash_info tea_signed = {
+ .hash_version = DX_HASH_TEA,
+ };
+ struct dx_hash_info tea_unsigned = {
+ .hash_version = DX_HASH_TEA_UNSIGNED,
+ };
+ int ret;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ dir = kunit_kzalloc(test, sizeof(*dir), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, dir);
+
+ ext4_hash_init_fake_dir(dir, sb);
+
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &legacy_signed);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &legacy_unsigned);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_NE(test, legacy_signed.hash, legacy_unsigned.hash);
+
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &md4_signed);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &md4_unsigned);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_TRUE(test,
+ md4_signed.hash != md4_unsigned.hash ||
+ md4_signed.minor_hash != md4_unsigned.minor_hash);
+
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &tea_signed);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ ret = ext4fs_dirhash(dir, input, sizeof(input) - 1, &tea_unsigned);
+ KUNIT_ASSERT_EQ(test, ret, 0);
+ KUNIT_EXPECT_TRUE(test,
+ tea_signed.hash != tea_unsigned.hash ||
+ tea_signed.minor_hash != tea_unsigned.minor_hash);
+}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+static void test_ext4fs_dirhash_casefolded_names_hash_consistently(struct kunit *test)
+{
+ struct super_block *sb;
+ struct ext4_inode_info *ei;
+ struct ext4_sb_info *sbi;
+ struct unicode_map *um;
+ struct dx_hash_info h1 = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ struct dx_hash_info h2 = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ int ret1, ret2;
+
+ sb = kunit_kzalloc(test, sizeof(*sb), GFP_KERNEL);
+ ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+ sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb);
+ KUNIT_ASSERT_NOT_NULL(test, ei);
+ KUNIT_ASSERT_NOT_NULL(test, sbi);
+
+ um = utf8_load(UTF8_LATEST);
+ if (!um) {
+ kunit_skip(test, "utf8_load(UTF8_LATEST) failed");
+ return;
+ }
+
+ ext4_hash_init_fake_ext4_dir(ei, sb, sbi);
+ sb->s_encoding = um;
+ ei->vfs_inode.i_flags |= S_CASEFOLD;
+
+ KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(&ei->vfs_inode));
+
+ ret1 = ext4fs_dirhash(&ei->vfs_inode, "Alpha", 5, &h1);
+ ret2 = ext4fs_dirhash(&ei->vfs_inode, "aLPHa", 5, &h2);
+
+ KUNIT_ASSERT_EQ(test, ret1, 0);
+ KUNIT_ASSERT_EQ(test, ret2, 0);
+ KUNIT_EXPECT_EQ(test, h1.hash, h2.hash);
+ KUNIT_EXPECT_EQ(test, h1.minor_hash, h2.minor_hash);
+
+ utf8_unload(um);
+}
+
+static void test_ext4fs_dirhash_casefold_fallback(struct kunit *test)
+{
+ struct super_block *sb_cf, *sb_plain;
+ struct ext4_inode_info *ei;
+ struct ext4_sb_info *sbi;
+ struct inode *plain_dir;
+ struct unicode_map *um;
+ static const char invalid_utf8[] = "\xc3\x28";
+ struct dx_hash_info folded_dir = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ struct dx_hash_info plain = {
+ .hash_version = DX_HASH_HALF_MD4,
+ };
+ int ret_cf, ret_plain;
+
+ sb_cf = kunit_kzalloc(test, sizeof(*sb_cf), GFP_KERNEL);
+ sb_plain = kunit_kzalloc(test, sizeof(*sb_plain), GFP_KERNEL);
+ ei = kunit_kzalloc(test, sizeof(*ei), GFP_KERNEL);
+ sbi = kunit_kzalloc(test, sizeof(*sbi), GFP_KERNEL);
+ plain_dir = kunit_kzalloc(test, sizeof(*plain_dir), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, sb_cf);
+ KUNIT_ASSERT_NOT_NULL(test, sb_plain);
+ KUNIT_ASSERT_NOT_NULL(test, ei);
+ KUNIT_ASSERT_NOT_NULL(test, sbi);
+ KUNIT_ASSERT_NOT_NULL(test, plain_dir);
+
+ um = utf8_load(UTF8_LATEST);
+ if (!um) {
+ kunit_skip(test, "utf8_load(UTF8_LATEST) failed");
+ return;
+ }
+
+ ext4_hash_init_fake_ext4_dir(ei, sb_cf, sbi);
+ sb_cf->s_encoding = um;
+ ei->vfs_inode.i_flags |= S_CASEFOLD;
+
+ KUNIT_ASSERT_TRUE(test, IS_CASEFOLDED(&ei->vfs_inode));
+
+ ext4_hash_init_fake_dir(plain_dir, sb_plain);
+
+ ret_cf = ext4fs_dirhash(&ei->vfs_inode, invalid_utf8,
+ sizeof(invalid_utf8) - 1, &folded_dir);
+ ret_plain = ext4fs_dirhash(plain_dir, invalid_utf8,
+ sizeof(invalid_utf8) - 1, &plain);
+
+ KUNIT_ASSERT_EQ(test, ret_cf, 0);
+ KUNIT_ASSERT_EQ(test, ret_plain, 0);
+ KUNIT_EXPECT_EQ(test, folded_dir.hash, plain.hash);
+ KUNIT_EXPECT_EQ(test, folded_dir.minor_hash, plain.minor_hash);
+
+ utf8_unload(um);
+}
+#endif
+
+static struct kunit_case ext4_hash_test_cases[] = {
+ KUNIT_CASE(test_ext4fs_dirhash_vectors),
+ KUNIT_CASE(test_ext4fs_dirhash_seed_changes_result),
+ KUNIT_CASE(test_ext4fs_dirhash_invalid_version_returns_einval),
+ KUNIT_CASE(test_ext4fs_dirhash_siphash_without_key_returns_einval),
+ KUNIT_CASE(test_ext4fs_dirhash_signed_unsigned_differ_on_nonascii),
+#if IS_ENABLED(CONFIG_UNICODE)
+ KUNIT_CASE(test_ext4fs_dirhash_casefolded_names_hash_consistently),
+ KUNIT_CASE(test_ext4fs_dirhash_casefold_fallback),
+#endif
+ {}
+};
+
+static struct kunit_suite ext4_hash_test_suite = {
+ .name = "ext4_hash",
+ .test_cases = ext4_hash_test_cases,
+};
+
+kunit_test_suites(&ext4_hash_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index 48483cd01..72645bd92 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -321,3 +321,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
#endif
return __ext4fs_dirhash(dir, name, len, hinfo);
}
+
+#if IS_ENABLED(CONFIG_EXT4_KUNIT_TESTS)
+EXPORT_SYMBOL_FOR_EXT4_TEST(ext4fs_dirhash);
+#endif
--
2.34.1
^ permalink raw reply related
* [PATCH v4 0/2] ext4: add hash Kunit tests and optimize str2hashbuf
From: Guan-Chun Wu @ 2026-05-29 20:03 UTC (permalink / raw)
To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
Ojaswin Mujoo, Ritesh Harjani, Zhang Yi
Cc: linux-ext4, linux-kernel, edward062254, visitorckw,
david.laight.linux, Guan-Chun Wu
This series adds Kunit tests for fs/ext4/hash.c and refactors
the str2hashbuf_{signed,unsigned}() helpers.
Patch 1 adds test coverage for ext4fs_dirhash(), including the main
hash variants and relevant edge cases.
Patch 2 simplifies the str2hashbuf helper implementation by processing
input in 4-byte chunks and removing function-pointer dispatch. This also
reduces overhead and shows roughly 2x improvement on longer inputs in
local testing.
Thanks,
Guan-Chun Wu
Link: https://lore.kernel.org/all/20260413065114.730231-1-409411716@gms.tku.edu.tw/
---
v3 -> v4 :
- Fix a modpost undefined symbol error for ext4fs_dirhash when building
ext4-test.ko.
---
Guan-Chun Wu (2):
ext4: add Kunit coverage for directory hash computation
ext4: improve str2hashbuf by processing 4-byte chunks and removing
function pointers
fs/ext4/Makefile | 2 +-
fs/ext4/hash-test.c | 546 ++++++++++++++++++++++++++++++++++++++++++++
fs/ext4/hash.c | 67 ++++--
3 files changed, 592 insertions(+), 23 deletions(-)
create mode 100644 fs/ext4/hash-test.c
--
2.34.1
^ permalink raw reply
* [tytso-ext4:dev] BUILD SUCCESS 87280ac61ae6ef27fc6ee78b733689754e425592
From: kernel test robot @ 2026-05-29 16:26 UTC (permalink / raw)
To: Theodore Ts'o; +Cc: linux-ext4
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
branch HEAD: 87280ac61ae6ef27fc6ee78b733689754e425592 ext4: Use %pe to print PTR_ERR()
elapsed time: 1030m
configs tested: 189
configs skipped: 2
The following configs have been built successfully.
More configs may be tested in the coming days.
tested configs:
alpha allnoconfig gcc-15.2.0
alpha allyesconfig gcc-15.2.0
alpha defconfig gcc-15.2.0
arc allmodconfig clang-16
arc allnoconfig gcc-15.2.0
arc allyesconfig clang-23
arc defconfig gcc-15.2.0
arc randconfig-001-20260529 clang-23
arc randconfig-002-20260529 clang-23
arm allnoconfig gcc-15.2.0
arm allyesconfig clang-16
arm defconfig gcc-15.2.0
arm lpc32xx_defconfig clang-17
arm randconfig-001-20260529 clang-23
arm randconfig-002-20260529 clang-23
arm randconfig-003-20260529 clang-23
arm randconfig-004-20260529 clang-23
arm64 allmodconfig clang-23
arm64 allnoconfig gcc-15.2.0
arm64 defconfig gcc-15.2.0
arm64 randconfig-001-20260529 clang-23
arm64 randconfig-002-20260529 clang-23
arm64 randconfig-003-20260529 clang-23
arm64 randconfig-004-20260529 clang-23
csky allmodconfig gcc-15.2.0
csky allnoconfig gcc-15.2.0
csky defconfig gcc-15.2.0
csky randconfig-001-20260529 clang-23
csky randconfig-002-20260529 clang-23
hexagon allmodconfig gcc-15.2.0
hexagon allnoconfig gcc-15.2.0
hexagon defconfig gcc-15.2.0
hexagon randconfig-001-20260529 gcc-8.5.0
hexagon randconfig-002-20260529 gcc-8.5.0
i386 allmodconfig clang-20
i386 allnoconfig gcc-15.2.0
i386 allyesconfig clang-20
i386 buildonly-randconfig-001-20260529 gcc-12
i386 buildonly-randconfig-002-20260529 gcc-12
i386 buildonly-randconfig-003-20260529 gcc-12
i386 buildonly-randconfig-004-20260529 gcc-12
i386 buildonly-randconfig-005-20260529 gcc-12
i386 buildonly-randconfig-006-20260529 gcc-12
i386 defconfig gcc-15.2.0
i386 randconfig-001-20260529 gcc-14
i386 randconfig-002-20260529 gcc-14
i386 randconfig-003-20260529 gcc-14
i386 randconfig-004-20260529 gcc-14
i386 randconfig-005-20260529 gcc-14
i386 randconfig-006-20260529 gcc-14
i386 randconfig-007-20260529 gcc-14
i386 randconfig-011-20260529 gcc-14
i386 randconfig-012-20260529 gcc-14
i386 randconfig-013-20260529 gcc-14
i386 randconfig-014-20260529 gcc-14
i386 randconfig-015-20260529 gcc-14
i386 randconfig-016-20260529 gcc-14
i386 randconfig-017-20260529 gcc-14
loongarch allmodconfig clang-23
loongarch allnoconfig gcc-15.2.0
loongarch defconfig clang-19
loongarch randconfig-001-20260529 gcc-8.5.0
loongarch randconfig-002-20260529 gcc-8.5.0
m68k allmodconfig gcc-15.2.0
m68k allnoconfig gcc-15.2.0
m68k allyesconfig clang-16
m68k defconfig clang-19
microblaze allnoconfig gcc-15.2.0
microblaze allyesconfig gcc-15.2.0
microblaze defconfig clang-19
mips allmodconfig gcc-15.2.0
mips allnoconfig gcc-15.2.0
mips allyesconfig gcc-15.2.0
nios2 alldefconfig gcc-11.5.0
nios2 allmodconfig clang-23
nios2 allnoconfig clang-23
nios2 defconfig clang-19
nios2 randconfig-001-20260529 gcc-8.5.0
nios2 randconfig-002-20260529 gcc-8.5.0
openrisc allmodconfig clang-23
openrisc allnoconfig clang-23
openrisc defconfig gcc-15.2.0
parisc allmodconfig gcc-15.2.0
parisc allnoconfig clang-23
parisc allyesconfig clang-19
parisc defconfig gcc-15.2.0
parisc randconfig-001 clang-19
parisc randconfig-001-20260529 clang-19
parisc randconfig-002 clang-19
parisc randconfig-002-20260529 clang-19
parisc64 defconfig clang-19
powerpc allmodconfig gcc-15.2.0
powerpc allnoconfig clang-23
powerpc randconfig-001 clang-19
powerpc randconfig-001-20260529 clang-19
powerpc randconfig-002 clang-19
powerpc randconfig-002-20260529 clang-19
powerpc64 randconfig-001 clang-19
powerpc64 randconfig-001-20260529 clang-19
powerpc64 randconfig-002 clang-19
powerpc64 randconfig-002-20260529 clang-19
riscv allmodconfig clang-23
riscv allnoconfig clang-23
riscv allyesconfig clang-16
riscv defconfig gcc-15.2.0
riscv randconfig-001-20260529 gcc-15.2.0
riscv randconfig-002-20260529 gcc-15.2.0
s390 allmodconfig clang-19
s390 allnoconfig clang-23
s390 allyesconfig gcc-15.2.0
s390 defconfig gcc-15.2.0
s390 randconfig-001-20260529 gcc-15.2.0
s390 randconfig-002-20260529 gcc-15.2.0
sh allmodconfig gcc-15.2.0
sh allnoconfig clang-23
sh allyesconfig clang-19
sh defconfig gcc-14
sh randconfig-001-20260529 gcc-15.2.0
sh randconfig-002-20260529 gcc-15.2.0
sparc allnoconfig clang-23
sparc defconfig gcc-15.2.0
sparc randconfig-001 gcc-11.5.0
sparc randconfig-001-20260529 gcc-11.5.0
sparc randconfig-002 gcc-11.5.0
sparc randconfig-002-20260529 gcc-11.5.0
sparc64 allmodconfig clang-23
sparc64 defconfig gcc-14
sparc64 randconfig-001 gcc-11.5.0
sparc64 randconfig-001-20260529 gcc-11.5.0
sparc64 randconfig-002 gcc-11.5.0
sparc64 randconfig-002-20260529 gcc-11.5.0
um allmodconfig clang-19
um allnoconfig clang-23
um allyesconfig gcc-15.2.0
um defconfig gcc-14
um i386_defconfig gcc-14
um randconfig-001 gcc-11.5.0
um randconfig-001-20260529 gcc-11.5.0
um randconfig-002 gcc-11.5.0
um randconfig-002-20260529 gcc-11.5.0
um x86_64_defconfig gcc-14
x86_64 allmodconfig clang-20
x86_64 allnoconfig clang-23
x86_64 allyesconfig clang-20
x86_64 buildonly-randconfig-001-20260529 gcc-14
x86_64 buildonly-randconfig-002-20260529 gcc-14
x86_64 buildonly-randconfig-003-20260529 gcc-14
x86_64 buildonly-randconfig-004-20260529 gcc-14
x86_64 buildonly-randconfig-005-20260529 gcc-14
x86_64 buildonly-randconfig-006-20260529 gcc-14
x86_64 defconfig gcc-14
x86_64 kexec clang-20
x86_64 randconfig-001 clang-20
x86_64 randconfig-001-20260529 clang-20
x86_64 randconfig-002 clang-20
x86_64 randconfig-002-20260529 clang-20
x86_64 randconfig-003 clang-20
x86_64 randconfig-003-20260529 clang-20
x86_64 randconfig-004 clang-20
x86_64 randconfig-004-20260529 clang-20
x86_64 randconfig-005 clang-20
x86_64 randconfig-005-20260529 clang-20
x86_64 randconfig-006 clang-20
x86_64 randconfig-006-20260529 clang-20
x86_64 randconfig-011-20260529 clang-20
x86_64 randconfig-012-20260529 clang-20
x86_64 randconfig-013-20260529 clang-20
x86_64 randconfig-014-20260529 clang-20
x86_64 randconfig-015-20260529 clang-20
x86_64 randconfig-016-20260529 clang-20
x86_64 randconfig-071-20260529 clang-20
x86_64 randconfig-072-20260529 clang-20
x86_64 randconfig-073-20260529 clang-20
x86_64 randconfig-074-20260529 clang-20
x86_64 randconfig-075-20260529 clang-20
x86_64 randconfig-076-20260529 clang-20
x86_64 rhel-9.4 clang-20
x86_64 rhel-9.4-bpf gcc-14
x86_64 rhel-9.4-func clang-20
x86_64 rhel-9.4-kselftests clang-20
x86_64 rhel-9.4-kunit gcc-14
x86_64 rhel-9.4-ltp gcc-14
x86_64 rhel-9.4-rust clang-20
xtensa allnoconfig clang-23
xtensa allyesconfig clang-23
xtensa randconfig-001 gcc-11.5.0
xtensa randconfig-001-20260529 gcc-11.5.0
xtensa randconfig-002 gcc-11.5.0
xtensa randconfig-002-20260529 gcc-11.5.0
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply
* Re: Re: [PATCH v2 1/2] ext4: avoid RWM atomic in EXT4_MB_GRP_TEST_AND_SET_READ
From: Bohdan Trach @ 2026-05-29 15:33 UTC (permalink / raw)
To: Andreas Dilger
Cc: Bohdan Trach, Theodore Ts'o, Jan Kara, Baokun Li,
Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
linux-kernel, lilith.oberhauser, bohdan.trach, mchehab+huawei
In-Reply-To: <560F1E7C-47C0-41D4-856A-ECB9BE4BDC3E@dilger.ca>
On Wed, 27 May 2026 19:46:43, Andreas Dilger wrote:
> Thanks for the patch. Definitely the benchmarks in the 0/2 email show
> significant gains for the Kunpeng system, and reducing contention makes sense
> as core counts increase and the likely case is that the bit is already set.
>
> That said, I wonder if this should (also/instead) be put into
> test_and_set_bit() itself, or add test_and_unlikely_set_bit() or
> test_and_rarely_set_bit() (or similar) optimized for the case where the bit is
> likely to already be set.
Thank you for the review. I understand this concern; I have looked for other
parts of the kernel code to see if this change can be generalized, in which case
introducing a new test_and_set_unlikely_bit()/test_test_and_set_bit() would have
been a more appropriate approach indeed. Alas, I could not find more of such
cases, and thus I decided to limit the change to the ext4 code. Maybe developers
with more knowledge of their subsystems know other potential use cases for such
functions.
> I see in your benchmarking that there is not "apples-to-apples" comparisons
> for ARM(Kunpeng) vs. AMD on the same storage. The storage hardware and space
> usage is different for each test run, and the ARM numbers show only marginal
> gains and more negative than positive results at all thread counts:
>
> > Benchmark on an existing file system for AMD 9654 (15T FS, 6% space
> > used), kernel 7.1-rc3. This shows the performance impact on a mostly
> > free file system.
> > | thr. | base | patched | improv. |
> > | | perf | perf | |
> > |------|-------|---------|------------|
> > | 1 | 30901 | 31191 | +0.9384810 |
> > | 2 | 50874 | 50504 | -0.7272870 |
> > | 4 | 66068 | 64108 | -2.9666404 |
> > | 8 | 63963 | 61927 | -3.1830902 |
> > | 16 | 47809 | 47044 | -1.6001171 |
> > | 32 | 42441 | 42326 | -0.2709644 |
> > | 64 | 39773 | 39929 | +0.3922259 |
> > | 128 | 37065 | 36413 | -1.7590719 |
>
>
> The performance reduction might be caused by the now double memory access on
> AMD that is only adding overhead on that CPU implementation? It would be useful
> to see the testing on Kunpeng vs. AMD/Intel on the same storage device/usage.
I've looked a bit deeper into these results (which are different from the 133G
FS image results on AMD). The flame graph is dominated by contention between
open and unlink, e.g.:
... -> __x64_sys_openat -> .. -> path_openat -> down_write ->
-> rwsem_down_write_slowpath -> osq_lock
against
... -> __x64_sys_unlink -> filename_unlinkat -> down_write ->
-> rwsem_down_write_slowpath -> osq_lock
and ext4_mb_prefetch does not feature anywhere in the flame graph. So I think
in this FS configuration, different code path got stressed, the numbers should
be ~0-centered noise? I have definitely seen a correlation with the space usage
percentage of the file system -- on a completely free file system this patch
does not help much irrespective of the architecture, though I did not pinpoint
the root cause for triggering the linear group scan to the usage% specifically,
or if other factors can influence it as well (fragmentation, etc.)
OTOH, the storage device should not matter much for this (admittedly very
limited) kind of benchmark, as the main case where this optimization matters is
if there is a long iteration over groups, which makes ext4_mb_prefetch()
CPU-bound, and this does seem to happen more often as the FS becomes more full.
This was actually the intention of the test with 133G FS image (below the "big"
FS results in the email) -- it should fix parameters other than the underlying
storage device (which should not matter for this test).
> That would tell us if it is more appropriate to optimize this in the aarch64
> test_and_set_bit() rather than in ext4.
I do not believe this change can happen generically in aarch64
test_and_set_bit(), as in a situation where the bit is flipped frequently, this
change will most likely hurt the performance. This probably depends more on the
CPU microarchitecture/cache coherence implementation in hardware.
--
With best regards,
Bohdan Trach
^ permalink raw reply
* Re: [PATCH v4 09/23] ext4: implement writeback path using iomap
From: Ojaswin Mujoo @ 2026-05-29 15:32 UTC (permalink / raw)
To: Zhang Yi
Cc: Zhang Yi, linux-ext4, linux-fsdevel, linux-kernel, tytso,
adilger.kernel, libaokun, jack, ritesh.list, djwong, hch,
yi.zhang, yangerkun, yukuai
In-Reply-To: <458bda7a-c3c6-4cb3-9c05-0718cf2ef95f@gmail.com>
On Fri, May 29, 2026 at 10:12:12PM +0800, Zhang Yi wrote:
> Hi, Ojaswin!
>
> On 5/27/2026 8:49 PM, Ojaswin Mujoo wrote:
> > On Mon, May 11, 2026 at 03:23:29PM +0800, Zhang Yi wrote:
> > > From: Zhang Yi <yi.zhang@huawei.com>
> > >
> > > Add the iomap writeback path for ext4 buffered I/O. This introduces:
> > >
> > > - ext4_iomap_writepages(): the main writeback entry point.
> > > - ext4_writeback_ops: a new iomap_writeback_ops instance to handle
> > > block mapping and I/O submission.
> > > - A new end I/O worker for converting unwritten extents, updating file
> > > size, and handling DATA_ERR_ABORT after I/O completion.
> > >
> > > Core implementation details:
> > >
> > > - ->writeback_range() callback
> > > Calls ext4_iomap_map_writeback_range() to query the longest range of
> > > existing mapped extents. For performance, when a block range is not
> > > yet allocated, it allocates based on the writeback length and delalloc
> > > extent length, rather than allocating for a single folio at a time.
> > > The folio is then added to an iomap_ioend instance.
> > >
> > > - ->writeback_submit() callback
> > > Registers ext4_iomap_end_bio() as the end bio callback. This callback
> > > schedules a worker to handle:
> > > - Unwritten extent conversion.
> > > - i_disksize update after data is written back.
> > > - Journal abort on writeback I/O failure.
> >
> > Hi Zhang, the changes look good. I have a few comments below:
> > >
> > > Key changes and considerations:
> > >
> > > - Append write and unwritten extents
> > > Since data=ordered mode is not used to prevent stale data exposure
> > > during append writebacks, new blocks are always allocated as unwritten
> > > extents (i.e. always enable dioread_nolock), and i_disksize update is
> > > postponed until I/O completion.
> >
> > Makes sense.
> >
> > > Additionally, the deadlock that the
> > > reserve handle was expected to resolve does not occur anymore.
> >
> > I guess this is since we don't use ordered data so we can't block on
> > starting a txn in end io.
>
> Yep.
>
> >
> > > Therefore, the end I/O worker can start a normal journal handle
> > > instead of a reserve handle when converting unwritten extents.
> > >
> > > - Lock ordering
> > > The ->writeback_range() callback runs under the folio lock, requiring
> > > the journal handle to be started under that same lock. This reverses
> > > the order compared to the buffer_head writeback path. The lock ordering
> > > documentation in super.c has been updated accordingly.
> > >
> > > Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> > > ---
> > > fs/ext4/ext4.h | 4 +
> > > fs/ext4/inode.c | 208 +++++++++++++++++++++++++++++++++++++++++-
> > > fs/ext4/page-io.c | 126 +++++++++++++++++++++++++
> > > fs/ext4/super.c | 7 +-
> > > fs/iomap/ioend.c | 3 +-
> > > include/linux/iomap.h | 1 +
> > > 6 files changed, 346 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > > index 4832e7f7db82..078feda47e36 100644
> > > --- a/fs/ext4/ext4.h
> > > +++ b/fs/ext4/ext4.h
> > > @@ -1173,6 +1173,8 @@ struct ext4_inode_info {
> > > */
> > > struct list_head i_rsv_conversion_list;
> > > struct work_struct i_rsv_conversion_work;
> > > + struct list_head i_iomap_ioend_list;
> > > + struct work_struct i_iomap_ioend_work;
> > > /*
> > > * Transactions that contain inode's metadata needed to complete
> > > @@ -3870,6 +3872,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
> > > size_t len);
> > > extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
> > > extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
> > > +extern void ext4_iomap_end_io(struct work_struct *work);
> > > +extern void ext4_iomap_end_bio(struct bio *bio);
> > > /* mmp.c */
> > > extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index 1ae7d3f4a1c8..a80195bd6f20 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -44,6 +44,7 @@
> > > #include <linux/iversion.h>
> > > #include "ext4_jbd2.h"
> > > +#include "ext4_extents.h"
> > > #include "xattr.h"
> > > #include "acl.h"
> > > #include "truncate.h"
> > > @@ -4120,10 +4121,215 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
> > > iomap_bio_readahead(rac, &ext4_iomap_buffered_read_ops);
> > > }
> > > +static int ext4_iomap_map_one_extent(struct inode *inode,
> > > + struct ext4_map_blocks *map)
> > > +{
> > > + struct extent_status es;
> > > + handle_t *handle = NULL;
> > > + int credits, map_flags;
> > > + int retval;
> > > +
> > > + credits = ext4_chunk_trans_blocks(inode, map->m_len);
> > > + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
> > > + if (IS_ERR(handle))
> > > + return PTR_ERR(handle);
> > > +
> > > + map->m_flags = 0;
> > > + /*
> > > + * It is necessary to look up extent and map blocks under i_data_sem
> > > + * in write mode, otherwise, the delalloc extent may become stale
> > > + * during concurrent truncate operations.
> > > + */
> > > + ext4_fc_track_inode(handle, inode);
> > > + down_write(&EXT4_I(inode)->i_data_sem);
> > > + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
> > > + retval = es.es_len - (map->m_lblk - es.es_lblk);
> > > + map->m_len = min_t(unsigned int, retval, map->m_len);
> > > +
> > > + if (ext4_es_is_delayed(&es)) {
> >
> > I understand that it is okay for us to rely on extent status ==
> > delayed here because we never reclaim delayed es entries and hence we
> > are sure to not skip any delayed block allocations here.
>
> Yeah, right.
>
> >
> > > + map->m_flags |= EXT4_MAP_DELAYED;
> > > + trace_ext4_da_write_pages_extent(inode, map);
> > > + /*
> > > + * Call ext4_map_create_blocks() to allocate any
> > > + * delayed allocation blocks. It is possible that
> > > + * we're going to need more metadata blocks, however
> > > + * we must not fail because we're in writeback and
> > > + * there is nothing we can do so it might result in
> > > + * data loss. So use reserved blocks to allocate
> > > + * metadata if possible.
> > > + */
> > > + map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
> > > + EXT4_GET_BLOCKS_METADATA_NOFAIL |
> > > + EXT4_EX_NOCACHE;
> > > +
> > > + retval = ext4_map_create_blocks(handle, inode, map,
> > > + map_flags);
> > > + if (retval > 0)
> > > + ext4_fc_track_range(handle, inode, map->m_lblk,
> > > + map->m_lblk + map->m_len - 1);
> > > + goto out;
> > > + } else if (unlikely(ext4_es_is_hole(&es)))
> >
> > Now that you've fixed the partial invalidate in iomap (patch 12/23)
> > can we still hit this hole case?
>
> Theoretically, no hole should be encountered; this is just defensive
> programming.
>
> >
> > > + goto out;
> > > +
> > > + /* Found written or unwritten extent. */
> > > + map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
> > > + map->m_flags = ext4_es_is_written(&es) ?
> > > + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
> > > + goto out;
> > > + }
> > > +
> > > + retval = ext4_map_query_blocks(handle, inode, map, EXT4_EX_NOCACHE);
> > > +out:
> > > + up_write(&EXT4_I(inode)->i_data_sem);
> > > + ext4_journal_stop(handle);
> > > + return retval < 0 ? retval : 0;
> > > +}
> > > +
> > > +static int ext4_iomap_map_writeback_range(struct iomap_writepage_ctx *wpc,
> > > + loff_t offset, unsigned int dirty_len)
> > > +{
> > > + struct inode *inode = wpc->inode;
> > > + struct super_block *sb = inode->i_sb;
> > > + struct journal_s *journal = EXT4_SB(sb)->s_journal;
> > > + struct ext4_map_blocks map;
> > > + unsigned int blkbits = inode->i_blkbits;
> > > + unsigned int index = offset >> blkbits;
> > > + unsigned int blk_end, blk_len;
> > > + int ret;
> > > +
> > > + ret = ext4_emergency_state(sb);
> > > + if (unlikely(ret))
> > > + return ret;
> > > +
> > > + /* Check validity of the cached writeback mapping. */
> > > + if (offset >= wpc->iomap.offset &&
> > > + offset < wpc->iomap.offset + wpc->iomap.length &&
> > > + ext4_iomap_valid(inode, &wpc->iomap))
> > > + return 0;
> > > +
> > > + blk_len = dirty_len >> blkbits;
> > > + blk_end = min_t(unsigned int, (wpc->wbc->range_end >> blkbits),
> > > + (UINT_MAX - 1));
> >
> > This is an interesting idea. I'm just a bit worried when we have
> > range_end == LLONG_MAX (bg flush) and we will always be trying to allocate
> > MAX_WRITEPAGES, incase of a slightly fragmented FS, we might keep
> > falling into slower mballoc criterias and might waste a lot of time
> > scanning the groups.
>
> Actually, MAX_WRITEPAGES is not allocated every time; the allocated
> length also depends on the length of data that has already been delayed
> for writing, and the smaller value is taken. If the user has indeed
Hmm so we take the blk_end based on range_end (which is LLONG_MAX for bg
flusher) and then our blk_len will be set accordingly and would become a
large number as well. Then we will set map.m_len based on this blk_len
and MAX_WRITEPAGES. Am I missing something that clamps our m_len?
> performed delalloc writes on data of up to MAX_WRITEPAGES in length,
> then regardless of how fragmented the file system is, I will need to
> allocate blocks of that length. Reducing the number of calls is always
> beneficial.
>
> >
> > > + if (blk_end > index + blk_len)
> > > + blk_len = blk_end - index + 1;
> > > +
> > > +retry:
> > > + map.m_lblk = index;
> > > + map.m_len = min_t(unsigned int, MAX_WRITEPAGES_EXTENT_LEN, blk_len);
> > > + ret = ext4_map_blocks(NULL, inode, &map,
> > > + EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE);
> >
> > Do we really need the IO_SUBMIT flag here now that we are:
> > 1. Not using ordered data
> > 2. We anyways don't use it in ext4_iomap_map_one_extent().
> >
> > I think we can drop it.
>
> We can't drop it, because IO_SUBMIT is also used to avoid the check of
> ext4_check_map_extents_env() in ext4_map_blocks() under the writeback
> path.
Yes, noted :)
Thanks,
Ojaswin
>
> Cheers,
> Yi.
>
> >
> > > + if (ret < 0)
> > > + return ret;
> > > +
> > > + /*
> > > + * The map is not a delalloc extent, it must either be a hole
> > > + * or an extent which have already been allocated.
> > > + */
> > > + if (!(map.m_flags & EXT4_MAP_DELAYED))
> > > + goto out;
> > > +
> > > + /* Map one delalloc extent. */
> > > + ret = ext4_iomap_map_one_extent(inode, &map);
> > > + if (ret < 0) {
> > > + if (ext4_emergency_state(sb))
> > > + return ret;
> > > +
> > > + /*
> > > + * Retry transient ENOSPC errors, if
> > > + * ext4_count_free_blocks() is non-zero, a commit
> > > + * should free up blocks.
> > > + */
> > > + if (ret == -ENOSPC && journal && ext4_count_free_clusters(sb)) {
> > > + jbd2_journal_force_commit_nested(journal);
> > > + goto retry;
> > > + }
> > > +
> > > + ext4_msg(sb, KERN_CRIT,
> > > + "Delayed block allocation failed for inode %llu at logical offset %llu with max blocks %u with error %d",
> > > + inode->i_ino, (unsigned long long)map.m_lblk,
> > > + (unsigned int)map.m_len, -ret);
> > > + ext4_msg(sb, KERN_CRIT,
> > > + "This should not happen!! Data will be lost\n");
> > > + if (ret == -ENOSPC)
> > > + ext4_print_free_blocks(inode);
> > > + return ret;
> > > + }
> > > +out:
> > > + ext4_set_iomap(inode, &wpc->iomap, &map, offset, dirty_len, 0);
> > > + return 0;
> > > +}
> > > +
> >
> > <snip>
> > >
>
^ permalink raw reply
* [PATCH 2/4] libext2fs: add quota to libext2fs
From: eaujames @ 2026-05-29 11:57 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger, dongyangli
add quota related interface to libext2fs and install the
relevant headers.
Change-Id: I17e6b5aa74e0f1bb1465168a1cf4e03184e003b0
Signed-off-by: Li Dongyang <dongyangli@ddn.com>
Reviewed-on: https://review.whamcloud.com/38027
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Lustre-bug-id: https://jira.whamcloud.com/browse/LU-13241
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
---
lib/ext2fs/Makefile.in | 43 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 43 insertions(+)
diff --git a/lib/ext2fs/Makefile.in b/lib/ext2fs/Makefile.in
index e9a6ced24..0656c4c5c 100644
--- a/lib/ext2fs/Makefile.in
+++ b/lib/ext2fs/Makefile.in
@@ -28,6 +28,8 @@ DEBUG_OBJS= debug_cmds.o extent_cmds.o tst_cmds.o debugfs.o util.o \
create_inode_libarchive.o journal.o revoke.o recovery.o \
do_journal.o do_orphan.o
+QUOTA_LIB_OBJS= mkquota.o quotaio.o quotaio_v2.o quotaio_tree.o dict.o
+
DEBUG_SRCS= debug_cmds.c extent_cmds.c tst_cmds.c \
$(top_srcdir)/debugfs/debugfs.c \
$(top_srcdir)/debugfs/util.c \
@@ -57,6 +59,7 @@ DEBUG_SRCS= debug_cmds.c extent_cmds.c tst_cmds.c \
@TDB_CMT@TDB_OBJ= tdb.o
OBJS= $(DEBUGFS_LIB_OBJS) $(RESIZE_LIB_OBJS) $(E2IMAGE_LIB_OBJS) \
+ $(QUOTA_LIB_OBJS) \
$(TEST_IO_LIB_OBJS) \
ext2_err.o \
alloc.o \
@@ -236,6 +239,7 @@ SRCS= ext2_err.c \
HFILES= bitops.h ext2fs.h ext2_io.h ext2_fs.h ext2_ext_attr.h ext3_extents.h \
tdb.h qcow2.h hashmap.h
+QUOTA_HFILES= quotaio.h dqblk_v2.h quotaio_tree.h dict.h
HFILES_IN= ext2_err.h ext2_types.h
LIBRARY= libext2fs
@@ -459,6 +463,41 @@ do_orphan.o: $(top_srcdir)/debugfs/do_orphan.c
$(E) " CC $<"
$(Q) $(CC) $(DEBUGFS_CFLAGS) -c $< -o $@
+mkquota.o: $(top_srcdir)/lib/support/mkquota.c
+ $(E) " CC $<"
+ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -c $< -o $@
+@PROFILE_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -g -pg -o profiled/$*.o -c $<
+@ELF_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) -fPIC -shared -o elfshared/$*.o -c $<
+@BSDLIB_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) $(BSDLIB_PIC_FLAG) -o pic/$*.o -c $<
+
+quotaio.o: $(top_srcdir)/lib/support/quotaio.c
+ $(E) " CC $<"
+ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -c $< -o $@
+@PROFILE_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -g -pg -o profiled/$*.o -c $<
+@ELF_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) -fPIC -shared -o elfshared/$*.o -c $<
+@BSDLIB_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) $(BSDLIB_PIC_FLAG) -o pic/$*.o -c $<
+
+quotaio_v2.o: $(top_srcdir)/lib/support/quotaio_v2.c
+ $(E) " CC $<"
+ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -c $< -o $@
+@PROFILE_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -g -pg -o profiled/$*.o -c $<
+@ELF_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) -fPIC -shared -o elfshared/$*.o -c $<
+@BSDLIB_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) $(BSDLIB_PIC_FLAG) -o pic/$*.o -c $<
+
+quotaio_tree.o: $(top_srcdir)/lib/support/quotaio_tree.c
+ $(E) " CC $<"
+ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -c $< -o $@
+@PROFILE_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -g -pg -o profiled/$*.o -c $<
+@ELF_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) -fPIC -shared -o elfshared/$*.o -c $<
+@BSDLIB_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) $(BSDLIB_PIC_FLAG) -o pic/$*.o -c $<
+
+dict.o: $(top_srcdir)/lib/support/dict.c
+ $(E) " CC $<"
+ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -c $< -o $@
+@PROFILE_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_STLIB) -g -pg -o profiled/$*.o -c $<
+@ELF_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) -fPIC -shared -o elfshared/$*.o -c $<
+@BSDLIB_CMT@ $(Q) $(CC) -I$(top_srcdir)/lib/support $(ALL_CFLAGS_SHLIB) $(BSDLIB_PIC_FLAG) -o pic/$*.o -c $<
+
xattrs.o: $(top_srcdir)/debugfs/xattrs.c
$(E) " CC $<"
$(Q) $(CC) $(DEBUGFS_CFLAGS) -c $< -o $@
@@ -586,6 +625,10 @@ install:: all $(HFILES) $(HFILES_IN) installdirs ext2fs.pc
echo " INSTALL_DATA $(includedir)/ext2fs/$$i"; \
$(INSTALL_DATA) $(srcdir)/$$i $(DESTDIR)$(includedir)/ext2fs/$$i; \
done
+ $(Q) for i in $(QUOTA_HFILES); do \
+ echo " INSTALL_DATA $(includedir)/ext2fs/$$i"; \
+ $(INSTALL_DATA) $(top_srcdir)/lib/support/$$i $(DESTDIR)$(includedir)/ext2fs/$$i; \
+ done
$(Q) for i in $(HFILES_IN); do \
echo " INSTALL_DATA $(includedir)/ext2fs/$$i"; \
$(INSTALL_DATA) $$i $(DESTDIR)$(includedir)/ext2fs/$$i; \
--
2.43.7
^ permalink raw reply related
* [PATCH 3/4] libext2fs: update iblock when using ea_inode feature
From: Etienne AUJAMES @ 2026-05-29 11:58 UTC (permalink / raw)
To: linux-ext4; +Cc: adilger, dongyangli
When a xattr is stored in an ea_inode, ext2fs_xattrs_* functions do
not update the inode block count.
This patch uses a cached inode to update the block count and quota
when writing xattrs.
It also fix an xattr remove case: the current ACL block was not
release by ext2fs_xattrs_write().
Add a helper function ext2fs_iblk_get() to get the inode block count
in cluster count unit.
Add ext2fs_xattrs_open_inode() to specify an optional cached inode to
use or update.
The function handle an optional quota context argument to update quota
accounting. It is hard to predict inode quota usage with ea_inode
deduplication.
For testing purposes, modify the debugfs "ea_set" command to handle
input file larger than the FS block size.
Add a regression test: d_xattr_ea_inode
Fixes: 50d0998cfe ("libext2fs: add ea_inode support to set xattr")
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Change-Id: I34733255bb76ffe2386d8cd6c19ce4561be4da3a
Lustre-bug-id: https://jira.whamcloud.com/browse/LU-20049
---
debugfs/xattrs.c | 19 ++-
e2fsck/pass1.c | 12 +-
lib/ext2fs/ext2fs.h | 7 ++
lib/ext2fs/ext_attr.c | 227 ++++++++++++++++++++++------------
lib/ext2fs/i_block.c | 14 +++
lib/support/quotaio.h | 1 -
tests/d_xattr_ea_inode/expect | 137 ++++++++++++++++++++
tests/d_xattr_ea_inode/name | 1 +
tests/d_xattr_ea_inode/script | 85 +++++++++++++
9 files changed, 415 insertions(+), 88 deletions(-)
create mode 100644 tests/d_xattr_ea_inode/expect
create mode 100644 tests/d_xattr_ea_inode/name
create mode 100644 tests/d_xattr_ea_inode/script
diff --git a/debugfs/xattrs.c b/debugfs/xattrs.c
index b518941c9..8364281f4 100644
--- a/debugfs/xattrs.c
+++ b/debugfs/xattrs.c
@@ -14,6 +14,7 @@ extern int optind;
extern char *optarg;
#endif
#include <ctype.h>
+#include <unistd.h>
#include "support/cstring.h"
#include "debugfs.h"
@@ -299,10 +300,24 @@ void do_set_xattr(int argc, ss_argv_t argv, int sci_idx EXT2FS_ATTR((unused)),
goto out;
if (fp) {
- err = ext2fs_get_mem(current_fs->blocksize, &buf);
+ struct stat st;
+
+ if (fstat(fileno(fp), &st)) {
+ err = errno;
+ goto out;
+ }
+ if (st.st_size > sysconf(_SC_ARG_MAX)) {
+ err = EFBIG;
+ goto out;
+ }
+ err = ext2fs_get_mem(st.st_size, &buf);
if (err)
goto out;
- buflen = fread(buf, 1, current_fs->blocksize, fp);
+ buflen = fread(buf, 1, st.st_size, fp);
+ if (ferror(fp)) {
+ err = errno;
+ goto out;
+ }
} else {
buf = argv[optind + 2];
buflen = parse_c_string(buf);
diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c
index fdde76cc2..364128f4d 100644
--- a/e2fsck/pass1.c
+++ b/e2fsck/pass1.c
@@ -968,18 +968,18 @@ static void reserve_block_for_lnf_repair(e2fsck_t ctx)
static errcode_t get_inline_data_ea_size(ext2_filsys fs, ext2_ino_t ino,
struct ext2_inode *inode,
- size_t *sz)
+ size_t inode_size, size_t *sz)
{
void *p;
struct ext2_xattr_handle *handle;
errcode_t retval;
- retval = ext2fs_xattrs_open(fs, ino, &handle);
+ retval = ext2fs_xattrs_open_inode(fs, ino, inode, inode_size, NULL,
+ &handle);
if (retval)
return retval;
- retval = ext2fs_xattrs_read_inode(handle,
- (struct ext2_inode_large *)inode);
+ retval = ext2fs_xattrs_read(handle);
if (retval)
goto err;
@@ -1580,6 +1580,7 @@ void e2fsck_pass1(e2fsck_t ctx)
size_t size = 0;
pctx.errcode = get_inline_data_ea_size(fs, ino, inode,
+ inode_size,
&size);
if (!pctx.errcode &&
fix_problem(ctx, PR_1_INLINE_DATA_FEATURE, &pctx)) {
@@ -1603,7 +1604,8 @@ void e2fsck_pass1(e2fsck_t ctx)
flags = fs->flags;
if (failed_csum)
fs->flags |= EXT2_FLAG_IGNORE_CSUM_ERRORS;
- err = get_inline_data_ea_size(fs, ino, inode, &size);
+ err = get_inline_data_ea_size(fs, ino, inode,
+ inode_size, &size);
fs->flags = (flags & EXT2_FLAG_IGNORE_CSUM_ERRORS) |
(fs->flags & ~EXT2_FLAG_IGNORE_CSUM_ERRORS);
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index c4fcb10be..56de5ea50 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -94,6 +94,8 @@ typedef __s64 __bitwise ext2_off64_t;
typedef __s64 __bitwise e2_blkcnt_t;
typedef __u32 __bitwise ext2_dirhash_t;
+typedef struct quota_ctx *quota_ctx_t;
+
#if EXT2_FLAT_INCLUDES
#include "com_err.h"
#include "ext2_io.h"
@@ -1408,6 +1410,10 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *handle,
errcode_t ext2fs_xattr_remove(struct ext2_xattr_handle *handle,
const char *key);
errcode_t ext2fs_xattr_remove_all(struct ext2_xattr_handle *handle);
+errcode_t ext2fs_xattrs_open_inode(ext2_filsys fs, ext2_ino_t ino,
+ struct ext2_inode *inode, size_t inode_size,
+ quota_ctx_t qctx,
+ struct ext2_xattr_handle **handle);
errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
struct ext2_xattr_handle **handle);
errcode_t ext2fs_xattrs_close(struct ext2_xattr_handle **handle);
@@ -1607,6 +1613,7 @@ errcode_t ext2fs_iblk_add_blocks(ext2_filsys fs, struct ext2_inode *inode,
errcode_t ext2fs_iblk_sub_blocks(ext2_filsys fs, struct ext2_inode *inode,
blk64_t num_blocks);
errcode_t ext2fs_iblk_set(ext2_filsys fs, struct ext2_inode *inode, blk64_t b);
+blk64_t ext2fs_iblk_get(ext2_filsys fs, struct ext2_inode *inode);
/* imager.c */
extern errcode_t ext2fs_image_inode_write(ext2_filsys fs, int fd, int flags);
diff --git a/lib/ext2fs/ext_attr.c b/lib/ext2fs/ext_attr.c
index 7723d0f91..3b90b70bb 100644
--- a/lib/ext2fs/ext_attr.c
+++ b/lib/ext2fs/ext_attr.c
@@ -22,6 +22,7 @@
#include "ext2_fs.h"
#include "ext2_ext_attr.h"
#include "ext4_acl.h"
+#include "support/quotaio.h"
#include "ext2fsP.h"
@@ -361,8 +362,14 @@ struct ext2_xattr_handle {
int capacity;
int count;
int ibody_count;
+ struct ext2_inode *in_inode;
+ size_t in_inode_size;
+ struct ext2_inode_large *alloc_inode;
+ struct ext2_inode_large *inode;
+ size_t inode_size;
ext2_ino_t ino;
unsigned int flags;
+ quota_ctx_t qctx;
};
static errcode_t ext2fs_xattrs_expand(struct ext2_xattr_handle *h,
@@ -499,9 +506,11 @@ out:
return err;
}
-static errcode_t prep_ea_block_for_write(ext2_filsys fs, ext2_ino_t ino,
- struct ext2_inode_large *inode)
+static errcode_t prep_ea_block_for_write(ext2_filsys fs,
+ struct ext2_xattr_handle *handle)
{
+ struct ext2_inode_large *inode = handle->inode;
+ ext2_ino_t ino = handle->ino;
struct ext2_ext_attr_header *header;
void *block_buf = NULL;
blk64_t blk, goal;
@@ -541,11 +550,15 @@ static errcode_t prep_ea_block_for_write(ext2_filsys fs, ext2_ino_t ino,
if (err)
goto out2;
} else {
- /* No block, we must increment i_blocks */
+ /* No block, we must increment i_blocks and quota */
err = ext2fs_iblk_add_blocks(fs, (struct ext2_inode *)inode,
1);
if (err)
goto out;
+
+ if (handle->qctx)
+ quota_data_add(handle->qctx, handle->inode, handle->ino,
+ EXT2FS_C2B(fs, 1) * fs->blocksize);
}
/* Allocate a block */
@@ -744,8 +757,7 @@ write_xattrs_to_buffer(ext2_filsys fs, struct ext2_xattr *attrs, int count,
errcode_t ext2fs_xattrs_write(struct ext2_xattr_handle *handle)
{
ext2_filsys fs = handle->fs;
- const unsigned int inode_size = EXT2_INODE_SIZE(fs->super);
- struct ext2_inode_large *inode;
+ struct ext2_inode_large *inode = handle->inode;
char *start, *block_buf = NULL;
struct ext2_ext_attr_header *header;
__u32 ea_inode_magic;
@@ -755,21 +767,12 @@ errcode_t ext2fs_xattrs_write(struct ext2_xattr_handle *handle)
errcode_t err;
EXT2_CHECK_MAGIC(handle, EXT2_ET_MAGIC_EA_HANDLE);
- i = inode_size;
- if (i < sizeof(*inode))
- i = sizeof(*inode);
- err = ext2fs_get_memzero(i, &inode);
- if (err)
- return err;
-
- err = ext2fs_read_inode_full(fs, handle->ino, EXT2_INODE(inode),
- inode_size);
- if (err)
- goto out;
+ if (!inode)
+ return EINVAL;
/* If extra_isize isn't set, we need to set it now */
if (inode->i_extra_isize == 0 &&
- inode_size > EXT2_GOOD_OLD_INODE_SIZE) {
+ handle->inode_size > EXT2_GOOD_OLD_INODE_SIZE) {
char *p = (char *)inode;
size_t extra = fs->super->s_want_extra_isize;
@@ -778,22 +781,20 @@ errcode_t ext2fs_xattrs_write(struct ext2_xattr_handle *handle)
memset(p + EXT2_GOOD_OLD_INODE_SIZE, 0, extra);
inode->i_extra_isize = extra;
}
- if (inode->i_extra_isize & 3) {
- err = EXT2_ET_INODE_CORRUPTED;
- goto out;
- }
+ if (inode->i_extra_isize & 3)
+ return EXT2_ET_INODE_CORRUPTED;
/* Does the inode have space for EA? */
if (inode->i_extra_isize < sizeof(inode->i_extra_isize) ||
- inode_size <= EXT2_GOOD_OLD_INODE_SIZE + inode->i_extra_isize +
- sizeof(__u32))
+ handle->inode_size <= EXT2_GOOD_OLD_INODE_SIZE +
+ inode->i_extra_isize + sizeof(__u32))
goto write_ea_block;
/* Write the inode EA */
ea_inode_magic = EXT2_EXT_ATTR_MAGIC;
memcpy(((char *) inode) + EXT2_GOOD_OLD_INODE_SIZE +
inode->i_extra_isize, &ea_inode_magic, sizeof(__u32));
- storage_size = inode_size - EXT2_GOOD_OLD_INODE_SIZE -
+ storage_size = handle->inode_size - EXT2_GOOD_OLD_INODE_SIZE -
inode->i_extra_isize - sizeof(__u32);
start = ((char *) inode) + EXT2_GOOD_OLD_INODE_SIZE +
inode->i_extra_isize + sizeof(__u32);
@@ -801,17 +802,16 @@ errcode_t ext2fs_xattrs_write(struct ext2_xattr_handle *handle)
err = write_xattrs_to_buffer(fs, handle->attrs, handle->ibody_count,
start, storage_size, 0, 0);
if (err)
- goto out;
+ return err;
write_ea_block:
/* Are we done? */
- if (handle->ibody_count == handle->count &&
- !ext2fs_file_acl_block(fs, EXT2_INODE(inode)))
+ if (handle->ibody_count == handle->count)
goto skip_ea_block;
/* Write the EA block */
err = ext2fs_get_memzero(fs->blocksize, &block_buf);
if (err)
- goto out;
+ return err;
storage_size = fs->blocksize - sizeof(struct ext2_ext_attr_header);
start = block_buf + sizeof(struct ext2_ext_attr_header);
@@ -820,7 +820,7 @@ write_ea_block:
handle->count - handle->ibody_count, start,
storage_size, start - block_buf, 1);
if (err)
- goto out2;
+ goto out;
/* Write a header on the EA block */
header = (struct ext2_ext_attr_header *) block_buf;
@@ -829,15 +829,15 @@ write_ea_block:
header->h_blocks = 1;
/* Get a new block for writing */
- err = prep_ea_block_for_write(fs, handle->ino, inode);
+ err = prep_ea_block_for_write(fs, handle);
if (err)
- goto out2;
+ goto out;
/* Finally, write the new EA block */
blk = ext2fs_file_acl_block(fs, EXT2_INODE(inode));
err = ext2fs_write_ext_attr3(fs, blk, block_buf, handle->ino);
if (err)
- goto out2;
+ goto out;
skip_ea_block:
blk = ext2fs_file_acl_block(fs, (struct ext2_inode *)inode);
@@ -845,19 +845,26 @@ skip_ea_block:
/* xattrs shrunk, free the block */
err = ext2fs_free_ext_attr(fs, handle->ino, inode);
if (err)
- goto out;
+ return err;
+ if (handle->qctx)
+ quota_data_sub(handle->qctx, inode, handle->ino,
+ EXT2FS_C2B(fs, 1) * fs->blocksize);
}
/* Write the inode */
err = ext2fs_write_inode_full(fs, handle->ino, EXT2_INODE(inode),
- inode_size);
+ handle->inode_size);
if (err)
- goto out2;
+ goto out;
+
+ /* Update the caller cached inode if provided */
+ if (handle->in_inode && handle->in_inode != EXT2_INODE(handle->inode))
+ memcpy(handle->in_inode, EXT2_INODE(handle->inode),
+ handle->in_inode_size);
-out2:
- ext2fs_free_mem(&block_buf);
out:
- ext2fs_free_mem(&inode);
+ ext2fs_free_mem(&block_buf);
+
return err;
}
@@ -1130,29 +1137,51 @@ out:
return err;
}
-errcode_t ext2fs_xattrs_read(struct ext2_xattr_handle *handle)
+errcode_t ext2fs_xattrs_read(struct ext2_xattr_handle *h)
{
- struct ext2_inode_large *inode;
- size_t inode_size = EXT2_INODE_SIZE(handle->fs->super);
errcode_t err;
- EXT2_CHECK_MAGIC(handle, EXT2_ET_MAGIC_EA_HANDLE);
+ EXT2_CHECK_MAGIC(h, EXT2_ET_MAGIC_EA_HANDLE);
+
+ h->inode_size = EXT2_INODE_SIZE(h->fs->super);
+ if (h->inode_size < sizeof(*h->inode))
+ h->inode_size = sizeof(*h->inode);
+
+ /* Use the caller cached inode if possible */
+ if (h->in_inode && h->in_inode_size >= h->inode_size) {
+ h->inode = (struct ext2_inode_large *) h->in_inode;
+ goto xattrs_read;
+ }
+
+ /* Flush the caller cached inode if provided */
+ if (h->in_inode) {
+ err = ext2fs_write_inode_full(h->fs, h->ino, h->in_inode,
+ h->in_inode_size);
+ if (err)
+ goto err;
+ }
- if (inode_size < sizeof(*inode))
- inode_size = sizeof(*inode);
- err = ext2fs_get_memzero(inode_size, &inode);
+ err = ext2fs_get_memzero(h->inode_size, &h->alloc_inode);
if (err)
return err;
- err = ext2fs_read_inode_full(handle->fs, handle->ino, EXT2_INODE(inode),
- EXT2_INODE_SIZE(handle->fs->super));
+ err = ext2fs_read_inode_full(h->fs, h->ino, EXT2_INODE(h->alloc_inode),
+ h->inode_size);
if (err)
- goto out;
+ goto err;
- err = ext2fs_xattrs_read_inode(handle, inode);
+ h->inode = h->alloc_inode;
-out:
- ext2fs_free_mem(&inode);
+xattrs_read:
+ err = ext2fs_xattrs_read_inode(h, h->inode);
+ if (err)
+ goto err;
+
+ return 0;
+
+err:
+ h->inode_size = 0;
+ ext2fs_free_mem(&h->alloc_inode);
return err;
}
@@ -1272,12 +1301,15 @@ out:
return err;
}
-static errcode_t xattr_create_ea_inode(ext2_filsys fs, const void *value,
- size_t value_len, ext2_ino_t *ea_ino)
+static errcode_t xattr_create_ea_inode(struct ext2_xattr_handle *handle,
+ const void *value, size_t value_len,
+ ext2_ino_t *ea_ino)
{
+ ext2_filsys fs = handle->fs;
struct ext2_inode inode;
ext2_ino_t ino;
ext2_file_t file;
+ blk64_t iblk;
__u32 hash;
errcode_t ret;
@@ -1317,16 +1349,30 @@ static errcode_t xattr_create_ea_inode(ext2_filsys fs, const void *value,
if (ret)
return ret;
+ ret = ext2fs_read_inode(fs, ino, &inode);
+ if (ret)
+ return ret;
+
ext2fs_inode_alloc_stats2(fs, ino, 1 /* inuse */, 0 /* isdir */);
+ iblk = ext2fs_iblk_get(fs, &inode);
+ ext2fs_iblk_add_blocks(fs, EXT2_INODE(handle->inode), iblk);
+ if (handle->qctx) {
+ quota_data_add(handle->qctx, handle->inode, handle->ino,
+ EXT2FS_C2B(fs, iblk) * fs->blocksize);
+ quota_data_inodes(handle->qctx, handle->inode, handle->ino, +1);
+ }
*ea_ino = ino;
return 0;
}
-static errcode_t xattr_inode_dec_ref(ext2_filsys fs, ext2_ino_t ino)
+static errcode_t xattr_inode_dec_ref(struct ext2_xattr_handle *handle,
+ ext2_ino_t ino)
{
+ ext2_filsys fs = handle->fs;
struct ext2_inode_large inode;
__u64 ref_count;
+ blk64_t iblk;
errcode_t ret;
ret = ext2fs_read_inode_full(fs, ino, (struct ext2_inode *)&inode,
@@ -1338,6 +1384,14 @@ static errcode_t xattr_inode_dec_ref(ext2_filsys fs, ext2_ino_t ino)
ref_count--;
ext2fs_set_ea_inode_ref(EXT2_INODE(&inode), ref_count);
+ iblk = ext2fs_iblk_get(fs, EXT2_INODE(&inode));
+ ext2fs_iblk_sub_blocks(fs, EXT2_INODE(handle->inode), iblk);
+ if (handle->qctx) {
+ quota_data_sub(handle->qctx, handle->inode, handle->ino,
+ EXT2FS_C2B(fs, iblk) * fs->blocksize);
+ quota_data_inodes(handle->qctx, handle->inode, handle->ino, -1);
+ }
+
if (ref_count)
goto write_out;
@@ -1365,7 +1419,8 @@ out:
return ret;
}
-static errcode_t xattr_update_entry(ext2_filsys fs, struct ext2_xattr *x,
+static errcode_t xattr_update_entry(struct ext2_xattr_handle *handle,
+ struct ext2_xattr *x,
const char *name, const char *short_name,
int index, const void *value,
size_t value_len, int in_inode)
@@ -1390,13 +1445,13 @@ static errcode_t xattr_update_entry(ext2_filsys fs, struct ext2_xattr *x,
memcpy(new_value, value, value_len);
if (in_inode) {
- ret = xattr_create_ea_inode(fs, value, value_len, &ea_ino);
+ ret = xattr_create_ea_inode(handle, value, value_len, &ea_ino);
if (ret)
goto fail;
}
if (x->ea_ino) {
- ret = xattr_inode_dec_ref(fs, x->ea_ino);
+ ret = xattr_inode_dec_ref(handle, x->ea_ino);
if (ret)
goto fail;
}
@@ -1419,7 +1474,7 @@ fail:
if (new_value)
ext2fs_free_mem(&new_value);
if (ea_ino)
- xattr_inode_dec_ref(fs, ea_ino);
+ xattr_inode_dec_ref(handle, ea_ino);
return ret;
}
@@ -1486,7 +1541,7 @@ static errcode_t xattr_array_update(struct ext2_xattr_handle *h,
}
/* Update the existing entry. */
- ret = xattr_update_entry(h->fs, &h->attrs[old_idx], name,
+ ret = xattr_update_entry(h, &h->attrs[old_idx], name,
shortname, name_idx, value,
value_len, in_inode);
if (ret)
@@ -1515,7 +1570,7 @@ static errcode_t xattr_array_update(struct ext2_xattr_handle *h,
if (old_idx >= 0) {
/* Update the existing entry. */
- ret = xattr_update_entry(h->fs, &h->attrs[old_idx], name,
+ ret = xattr_update_entry(h, &h->attrs[old_idx], name,
shortname, name_idx, value,
value_len, in_inode);
if (ret)
@@ -1551,7 +1606,7 @@ add_new:
return ret;
}
- ret = xattr_update_entry(h->fs, &h->attrs[h->count], name, shortname,
+ ret = xattr_update_entry(h, &h->attrs[h->count], name, shortname,
name_idx, value, value_len, in_inode);
if (ret)
return ret;
@@ -1594,8 +1649,7 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *h,
size_t value_len)
{
ext2_filsys fs = h->fs;
- const int inode_size = EXT2_INODE_SIZE(fs->super);
- struct ext2_inode_large *inode = NULL;
+ struct ext2_inode_large *inode = h->inode;
struct ext2_xattr *x;
char *new_value;
int ibody_free, block_free;
@@ -1605,6 +1659,8 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *h,
errcode_t ret;
EXT2_CHECK_MAGIC(h, EXT2_ET_MAGIC_EA_HANDLE);
+ if (!inode || !h->inode_size)
+ return EINVAL;
ret = ext2fs_get_mem(value_len, &new_value);
if (ret)
@@ -1632,23 +1688,14 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *h,
break;
}
}
-
- ret = ext2fs_get_memzero(inode_size, &inode);
- if (ret)
- goto out;
- ret = ext2fs_read_inode_full(fs, h->ino,
- (struct ext2_inode *)inode,
- inode_size);
- if (ret)
- goto out;
- if (inode_size > EXT2_GOOD_OLD_INODE_SIZE) {
+ if (h->inode_size > EXT2_GOOD_OLD_INODE_SIZE) {
extra_isize = inode->i_extra_isize;
if (extra_isize == 0) {
extra_isize = fs->super->s_want_extra_isize;
if (extra_isize == 0)
extra_isize = sizeof(__u32);
}
- ibody_free = inode_size - EXT2_GOOD_OLD_INODE_SIZE;
+ ibody_free = h->inode_size - EXT2_GOOD_OLD_INODE_SIZE;
ibody_free -= extra_isize;
/* Extended attribute magic and final null entry. */
ibody_free -= sizeof(__u32) * 2;
@@ -1694,8 +1741,6 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *h,
write_out:
ret = ext2fs_xattrs_write(h);
out:
- if (inode)
- ext2fs_free_mem(&inode);
ext2fs_free_mem(&new_value);
return ret;
}
@@ -1712,7 +1757,7 @@ errcode_t ext2fs_xattr_remove(struct ext2_xattr_handle *handle,
ext2fs_free_mem(&x->name);
ext2fs_free_mem(&x->value);
if (x->ea_ino)
- xattr_inode_dec_ref(handle->fs, x->ea_ino);
+ xattr_inode_dec_ref(handle, x->ea_ino);
memmove(x, x + 1, (end - x - 1)*sizeof(*x));
memset(end - 1, 0, sizeof(*end));
if (x < handle->attrs + handle->ibody_count)
@@ -1736,7 +1781,7 @@ errcode_t ext2fs_xattr_remove_all(struct ext2_xattr_handle *handle)
ext2fs_free_mem(&x->name);
ext2fs_free_mem(&x->value);
if (x->ea_ino)
- xattr_inode_dec_ref(handle->fs, x->ea_ino);
+ xattr_inode_dec_ref(handle, x->ea_ino);
}
handle->ibody_count = 0;
@@ -1744,8 +1789,14 @@ errcode_t ext2fs_xattr_remove_all(struct ext2_xattr_handle *handle)
return ext2fs_xattrs_write(handle);
}
-errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
- struct ext2_xattr_handle **handle)
+/* If the inode size is set to EXT2_INODE_SIZE(fs), the input inode is used
+ * directly. Otherwise, the ext2fs_xattrs_* functions operate on a separate copy
+ * (with inline xattrs) and update the caller's cached inode on write.
+ */
+errcode_t ext2fs_xattrs_open_inode(ext2_filsys fs, ext2_ino_t ino,
+ struct ext2_inode *inode, size_t inode_size,
+ quota_ctx_t qctx,
+ struct ext2_xattr_handle **handle)
{
struct ext2_xattr_handle *h;
errcode_t err;
@@ -1754,6 +1805,9 @@ errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
!ext2fs_has_feature_inline_data(fs->super))
return EXT2_ET_MISSING_EA_FEATURE;
+ if (inode && inode_size < sizeof(*inode))
+ return EINVAL;
+
err = ext2fs_get_memzero(sizeof(*h), &h);
if (err)
return err;
@@ -1768,11 +1822,23 @@ errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
}
h->count = 0;
h->ino = ino;
+ h->in_inode = inode;
+ h->in_inode_size = inode_size;
+ h->alloc_inode = NULL;
+ h->inode = NULL;
+ h->inode_size = 0;
h->fs = fs;
+ h->qctx = qctx;
*handle = h;
return 0;
}
+errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
+ struct ext2_xattr_handle **handle)
+{
+ return ext2fs_xattrs_open_inode(fs, ino, NULL, 0, NULL, handle);
+}
+
errcode_t ext2fs_xattrs_close(struct ext2_xattr_handle **handle)
{
struct ext2_xattr_handle *h = *handle;
@@ -1780,6 +1846,7 @@ errcode_t ext2fs_xattrs_close(struct ext2_xattr_handle **handle)
EXT2_CHECK_MAGIC(h, EXT2_ET_MAGIC_EA_HANDLE);
xattrs_free_keys(h);
ext2fs_free_mem(&h->attrs);
+ ext2fs_free_mem(&h->alloc_inode);
ext2fs_free_mem(handle);
return 0;
}
diff --git a/lib/ext2fs/i_block.c b/lib/ext2fs/i_block.c
index 2eecf02fc..064a5c989 100644
--- a/lib/ext2fs/i_block.c
+++ b/lib/ext2fs/i_block.c
@@ -88,3 +88,17 @@ errcode_t ext2fs_iblk_set(ext2_filsys fs, struct ext2_inode *inode, blk64_t b)
return EOVERFLOW;
return 0;
}
+
+blk64_t ext2fs_iblk_get(ext2_filsys fs, struct ext2_inode *inode)
+{
+ blk64_t blk = inode->i_blocks;
+
+ if (ext2fs_has_feature_huge_file(fs->super))
+ blk += ((blk64_t) inode->osd2.linux2.l_i_blocks_hi) << 32;
+
+ if (!ext2fs_has_feature_huge_file(fs->super) ||
+ !(inode->i_flags & EXT4_HUGE_FILE_FL))
+ blk /= fs->blocksize / 512;
+
+ return EXT2FS_B2C(fs, blk);
+}
diff --git a/lib/support/quotaio.h b/lib/support/quotaio.h
index 6152416fb..c76486919 100644
--- a/lib/support/quotaio.h
+++ b/lib/support/quotaio.h
@@ -58,7 +58,6 @@ enum quota_type {
#define QUOTA_PRJ_BIT (1 << PRJQUOTA)
#define QUOTA_ALL_BIT (QUOTA_USR_BIT | QUOTA_GRP_BIT | QUOTA_PRJ_BIT)
-typedef struct quota_ctx *quota_ctx_t;
struct dict_t;
struct quota_ctx {
diff --git a/tests/d_xattr_ea_inode/expect b/tests/d_xattr_ea_inode/expect
new file mode 100644
index 000000000..aaad9c5b3
--- /dev/null
+++ b/tests/d_xattr_ea_inode/expect
@@ -0,0 +1,137 @@
+debugfs edit extended attributes with ea_inode feature
+mke2fs -Fq -b 4096 -O ea_inode test.img 1m
+Exit status is 0
+Generate xattr value (8292 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (8292 bytes)
+stat /
+Blockcount: 32
+Exit status is 0
+ea_rm / user.test1
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 18/256 blocks
+Exit status is 0
+
+Generate xattr value (4097 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (4097 bytes)
+stat /
+Blockcount: 24
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 12/128 files (0.0% non-contiguous), 20/256 blocks
+Exit status is 0
+
+Generate xattr value (102 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test2
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test2
+Exit status is 0
+Compare xattr values (102 bytes)
+stat /
+Blockcount: 32
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 12/128 files (0.0% non-contiguous), 21/256 blocks
+Exit status is 0
+
+Generate xattr value (5005 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test2
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test2
+Exit status is 0
+Compare xattr values (5005 bytes)
+stat /
+Blockcount: 40
+Exit status is 0
+ea_rm / user.test2
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 12/128 files (0.0% non-contiguous), 20/256 blocks
+Exit status is 0
+
+Generate xattr value (512 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (512 bytes)
+stat /
+Blockcount: 16
+Exit status is 0
+ea_rm / user.test1
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 18/256 blocks
+Exit status is 0
+
+Generate xattr value (1024 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (1024 bytes)
+stat /
+Blockcount: 16
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 19/256 blocks
+Exit status is 0
+
+Generate xattr value (5000 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (5000 bytes)
+stat /
+Blockcount: 24
+Exit status is 0
+ea_rm / user.test1
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 18/256 blocks
+Exit status is 0
+
diff --git a/tests/d_xattr_ea_inode/name b/tests/d_xattr_ea_inode/name
new file mode 100644
index 000000000..9e36dc986
--- /dev/null
+++ b/tests/d_xattr_ea_inode/name
@@ -0,0 +1 @@
+edit extended attributes in debugfs with ea_inode feature
diff --git a/tests/d_xattr_ea_inode/script b/tests/d_xattr_ea_inode/script
new file mode 100644
index 000000000..84104549c
--- /dev/null
+++ b/tests/d_xattr_ea_inode/script
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+if ! test -x $DEBUGFS_EXE; then
+ echo "$test_name: $test_description: skipped (no debugfs)"
+ return 0
+fi
+
+OUT=$test_name.log
+EXP=$test_dir/expect
+VERIFY_FSCK_OPT=-yf
+
+TEST_DATA=$test_name.tmp
+VERIFY_DATA=$test_name.ver.tmp
+
+echo "debugfs edit extended attributes with ea_inode feature" > $OUT.new
+
+d_xattr_ea_inode_check() {
+ local xattr_size=$1
+ local xattr_name=$2
+ local ea_rm=$3
+
+ echo "Generate xattr value ($xattr_size bytes)" >> $OUT.new
+ echo $xattr_size |
+ awk '{srand();for(i=0;i<$1;i++) printf("%c",97+int(rand()*26));}' > $TEST_DATA
+
+ echo "ea_set -f $TEST_DATA / $xattr_name" >> $OUT.new
+ $DEBUGFS -w -R "ea_set -f $TEST_DATA / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+ echo Exit status is $? >> $OUT.new
+
+ echo "ea_get -f $VERIFY_DATA / $xattr_name" >> $OUT.new
+ $DEBUGFS -w -R "ea_get -f $VERIFY_DATA / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+ echo Exit status is $? >> $OUT.new
+
+ echo "Compare xattr values ($xattr_size bytes)" >> $OUT.new
+ diff -u $TEST_DATA $VERIFY_DATA >> $OUT.new
+
+ echo "stat /" >> $OUT.new
+ ($DEBUGFS -c -R "stat /" $TMPFILE | grep -Eo "Blockcount: [0-9]+") >> $OUT.new 2>&1
+ echo Exit status is $? >> $OUT.new
+
+ if $ea_rm; then
+ echo "ea_rm / $xattr_name" >> $OUT.new
+ $DEBUGFS -w -R "ea_rm / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+ echo Exit status is $? >> $OUT.new
+ fi
+
+ echo e2fsck $VERIFY_FSCK_OPT -N test_filesys >> $OUT.new
+ $FSCK $VERIFY_FSCK_OPT -N test_filesys $TMPFILE >> $OUT.new 2>&1
+ echo Exit status is $? >> $OUT.new
+ echo >> $OUT.new
+}
+
+truncate -s1M $TMPFILE 2>&1
+
+echo "mke2fs -Fq -b 4096 -O ea_inode test.img 1m" >> $OUT.new
+$MKE2FS -Fq -b 4096 -O ea_inode $TMPFILE 1m > /dev/null 2>&1
+echo Exit status is $? >> $OUT.new
+
+d_xattr_ea_inode_check 8292 user.test1 true
+
+d_xattr_ea_inode_check 4097 user.test1 false
+d_xattr_ea_inode_check 102 user.test2 false
+d_xattr_ea_inode_check 5005 user.test2 true
+d_xattr_ea_inode_check 512 user.test1 true
+
+d_xattr_ea_inode_check 1024 user.test1 false
+d_xattr_ea_inode_check 5000 user.test1 true
+
+sed -f $cmd_dir/filter.sed $OUT.new > $OUT
+
+#
+# Do the verification
+#
+
+rm -f $TMPFILE $TEST_DATA $VERIFY_DATA $OUT.new
+
+if cmp -s $OUT $EXP; then
+ echo "$test_name: $test_description: ok"
+ touch $test_name.ok
+else
+ echo "$test_name: $test_description: failed"
+ diff $DIFF_OPTS $EXP $OUT > $test_name.failed
+fi
+
+unset VERIFY_FSCK_OPT VERIFY_DATA TEST_DATA OUT EXP d_xattr_ea_inode_check
--
2.43.7
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox