* [PATCH RFC 15/17] ext4: use fast incremental CRC update in __ext4_new_inode()
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Merge the bitmap modification and group descriptor update into a single
group lock acquisition in __ext4_new_inode(). Previously the bitmap bit
was set under one lock/unlock pair, and the GDP fields (UNINIT,
itable_unused, free_inodes, dirs, csum) were updated under a separate
lock/unlock pair with a gap in between. Another thread could modify the
bitmap and update the checksum during that gap, leaving the stored
checksum out of step with the baseline that an incremental CRC update
relies on.
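Now the full sequence -- set bit, update free inodes and used dirs, clear
UNINIT, update itable_unused, and compute checksum -- happens atomically
under the same ext4_lock_group(). To make this possible, the journal write
access on the group descriptor buffer and the
ext4_might_init_block_bitmap() call are moved ahead of the group lock;
the latter takes the group lock itself, so it cannot run with the lock
held. The alloc_sem is acquired before the group lock to maintain correct
locking order with itable lazyinit.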
Use ext4_inode_bitmap_csum_set_fast() for the normal path where the
stored checksum is valid. When EXT4_BG_INODE_UNINIT is set, fall back
to ext4_inode_bitmap_csum_set() for a full recalculation to establish
a correct baseline (mkfs leaves the checksum as zero for UNINIT groups).
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/ialloc.c | 129 +++++++++++++++++++++++------------------------
1 file changed, 63 insertions(+), 66 deletions(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8b75b331b26e..9dd1cdb367ba 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1135,7 +1135,25 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
ext4_std_error(sb, err);
goto out;
}
+
+ BUFFER_TRACE(group_desc_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
+ EXT4_JTR_NONE);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
+ }
+
+ /* We may have to initialize the block bitmap if it isn't already */
+ err = ext4_might_init_block_bitmap(handle, sb, group, gdp);
+ if (err)
+ goto out;
+
+ if (ext4_has_group_desc_csum(sb) &&
+ !(sbi->s_mount_state & EXT4_FC_REPLAY))
+ down_read(&grp->alloc_sem);
ext4_lock_group(sb, group);
+
ret2 = ext4_test_and_set_bit(bit, inode_bitmap_bh->b_data);
if (ret2) {
/* Someone already took the bit. Repeat the search
@@ -1147,9 +1165,54 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
ret2 = 0;
} else {
ret2 = 1; /* we didn't grab the inode */
+ goto unlock_group;
+ }
+ }
+
+ /* Update the relevant bg descriptor fields */
+ ext4_free_inodes_set(sb, gdp,
+ ext4_free_inodes_count(sb, gdp) - 1);
+ if (S_ISDIR(mode)) {
+ ext4_used_dirs_set(sb, gdp,
+ ext4_used_dirs_count(sb, gdp) + 1);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+ atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ f)->used_dirs);
+ }
+ }
+
+ if (ext4_has_group_desc_csum(sb)) {
+ bool fast_crc = true;
+ int free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
+
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ free = 0;
+ /* Incremental CRC needs a valid csum baseline */
+ fast_crc = false;
}
+ /*
+ * Check the relative inode number against the
+ * last used relative inode number in this group.
+ * If it is greater we need to update the
+ * bg_itable_unused count.
+ */
+ if (bit >= free)
+ ext4_itable_unused_set(sb, gdp,
+ EXT4_INODES_PER_GROUP(sb) - bit - 1);
+ if (fast_crc)
+ ext4_inode_bitmap_csum_set_fast(sb, gdp, bit);
+ else
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
}
+unlock_group:
ext4_unlock_group(sb, group);
+ if (ext4_has_group_desc_csum(sb) &&
+ !(sbi->s_mount_state & EXT4_FC_REPLAY))
+ up_read(&grp->alloc_sem);
if (!ret2)
goto got; /* we grabbed the inode! */
@@ -1168,72 +1231,6 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
goto out;
}
- BUFFER_TRACE(group_desc_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
- EXT4_JTR_NONE);
- if (err) {
- ext4_std_error(sb, err);
- goto out;
- }
-
- /* We may have to initialize the block bitmap if it isn't already */
- err = ext4_might_init_block_bitmap(handle, sb, group, gdp);
- if (err)
- goto out;
-
- /* Update the relevant bg descriptor fields */
- if (ext4_has_group_desc_csum(sb)) {
- int free;
- struct ext4_group_info *grp = NULL;
-
- if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
- grp = ext4_get_group_info(sb, group);
- if (!grp) {
- err = -EFSCORRUPTED;
- goto out;
- }
- down_read(&grp->alloc_sem); /*
- * protect vs itable
- * lazyinit
- */
- }
- ext4_lock_group(sb, group); /* while we modify the bg desc */
- free = EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp);
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
- free = 0;
- }
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- */
- if (bit >= free)
- ext4_itable_unused_set(sb, gdp,
- (EXT4_INODES_PER_GROUP(sb) - bit - 1));
- if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
- up_read(&grp->alloc_sem);
- } else {
- ext4_lock_group(sb, group);
- }
-
- ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
- if (S_ISDIR(mode)) {
- ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, group);
-
- atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
- f)->used_dirs);
- }
- }
- if (ext4_has_group_desc_csum(sb)) {
- ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
- }
- ext4_unlock_group(sb, group);
-
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
if (err) {
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 12/17] ext4: factor out ext4_might_init_block_bitmap() helper
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Extract the BLOCK_UNINIT initialization logic from ext4_mark_inode_used()
and __ext4_new_inode() into a shared ext4_might_init_block_bitmap() helper.
Both call sites perform the same sequence: check EXT4_BG_BLOCK_UNINIT,
read the block bitmap, dirty it, then clear the flag and establish the
correct block bitmap checksum under the group lock. The only difference
is whether a journal handle is available: with a handle, journal write
access is taken on the bitmap buffer before it is dirtied; with a NULL
handle (fast commit replay in ext4_mark_inode_used()), the buffer is
written out with sync_dirty_buffer() instead.
No functional change.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/ialloc.c | 129 +++++++++++++++++++++--------------------------
1 file changed, 58 insertions(+), 71 deletions(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 5896cdfb2ccf..90896b7f8c73 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -756,6 +756,58 @@ static int find_inode_bit(struct super_block *sb, ext4_group_t group,
return 1;
}
+/*
+ * If the block bitmap for @group is not yet initialized (EXT4_BG_BLOCK_UNINIT),
+ * read it into memory, dirty it, and clear the UNINIT flag under the group lock
+ * so that the on-disk checksum is established. @handle may be NULL during fast
+ * commit replay (no journal credits needed in that path).
+ */
+static int ext4_might_init_block_bitmap(handle_t *handle,
+ struct super_block *sb,
+ ext4_group_t group,
+ struct ext4_group_desc *gdp)
+{
+ int err;
+ struct buffer_head *block_bitmap_bh;
+
+ if (!ext4_has_group_desc_csum(sb) ||
+ !(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))
+ return 0;
+
+ block_bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(block_bitmap_bh))
+ return PTR_ERR(block_bitmap_bh);
+
+ if (handle) {
+ BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
+ err = ext4_journal_get_write_access(handle, sb,
+ block_bitmap_bh, EXT4_JTR_NONE);
+ if (err)
+ goto out_brelse;
+ }
+
+ BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
+ err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
+ if (!handle)
+ sync_dirty_buffer(block_bitmap_bh);
+
+ /* recheck and clear flag under lock if we still need to */
+ ext4_lock_group(sb, group);
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+ }
+ ext4_unlock_group(sb, group);
+
+out_brelse:
+ brelse(block_bitmap_bh);
+ ext4_std_error(sb, err);
+ return err;
+}
+
int ext4_mark_inode_used(struct super_block *sb, int ino, umode_t mode)
{
unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
@@ -801,38 +853,9 @@ int ext4_mark_inode_used(struct super_block *sb, int ino, umode_t mode)
}
/* We may have to initialize the block bitmap if it isn't already */
- if (ext4_has_group_desc_csum(sb) &&
- gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bitmap_bh;
-
- block_bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(block_bitmap_bh)) {
- err = PTR_ERR(block_bitmap_bh);
- goto out;
- }
-
- BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
- err = ext4_handle_dirty_metadata(NULL, NULL, block_bitmap_bh);
- sync_dirty_buffer(block_bitmap_bh);
-
- /* recheck and clear flag under lock if we still need to */
- ext4_lock_group(sb, group);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb, group, gdp));
- ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
- }
- ext4_unlock_group(sb, group);
- brelse(block_bitmap_bh);
-
- if (err) {
- ext4_std_error(sb, err);
- goto out;
- }
- }
+ err = ext4_might_init_block_bitmap(NULL, sb, group, gdp);
+ if (err)
+ goto out;
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
@@ -1154,45 +1177,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
}
/* We may have to initialize the block bitmap if it isn't already */
- if (ext4_has_group_desc_csum(sb) &&
- gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- struct buffer_head *block_bitmap_bh;
-
- block_bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(block_bitmap_bh)) {
- err = PTR_ERR(block_bitmap_bh);
- goto out;
- }
- BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
- err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
- EXT4_JTR_NONE);
- if (err) {
- brelse(block_bitmap_bh);
- ext4_std_error(sb, err);
- goto out;
- }
-
- BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
- err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
-
- /* recheck and clear flag under lock if we still need to */
- ext4_lock_group(sb, group);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb, group, gdp));
- ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
- }
- ext4_unlock_group(sb, group);
- brelse(block_bitmap_bh);
-
- if (err) {
- ext4_std_error(sb, err);
- goto out;
- }
- }
+ err = ext4_might_init_block_bitmap(handle, sb, group, gdp);
+ if (err)
+ goto out;
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 13/17] ext4: use fast incremental CRC update in ext4_mark_inode_used()
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Move the bitmap modification, GDP update, and checksum update into a
single group lock acquisition in ext4_mark_inode_used(), eliminating the
race window where another thread could interleave a full recomputation
between bitmap modification and checksum update.
Add a fast_crc flag to select between incremental and full CRC update.
When EXT4_BG_INODE_UNINIT is set, the stored checksum in the group
descriptor is not a valid CRC of the bitmap -- mkfs leaves it as zero
for UNINIT groups, and ext4_read_inode_bitmap() memsets the buffer to
zero without updating the gdp checksum. So fast_crc is forced to false
to fall back to ext4_inode_bitmap_csum_set() for a full recalculation
that establishes a correct baseline.
For non-UNINIT groups, ext4_inode_bitmap_csum_set_fast() computes the
CRC delta for the single flipped bit in O(log N) time.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/ialloc.c | 69 ++++++++++++++++++++++++------------------------
1 file changed, 35 insertions(+), 34 deletions(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 90896b7f8c73..e209e27f827f 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -838,35 +838,37 @@ int ext4_mark_inode_used(struct super_block *sb, int ino, umode_t mode)
goto out;
}
- ext4_set_bit(bit, inode_bitmap_bh->b_data);
-
- BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
- if (err) {
- ext4_std_error(sb, err);
- goto out;
- }
- err = sync_dirty_buffer(inode_bitmap_bh);
- if (err) {
- ext4_std_error(sb, err);
- goto out;
- }
-
/* We may have to initialize the block bitmap if it isn't already */
err = ext4_might_init_block_bitmap(NULL, sb, group, gdp);
if (err)
goto out;
+ ext4_lock_group(sb, group);
+ /* Fast commit replay is single-threaded, no need for test_and_set */
+ ext4_set_bit(bit, inode_bitmap_bh->b_data);
+
/* Update the relevant bg descriptor fields */
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (S_ISDIR(mode)) {
+ ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+ atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ f)->used_dirs);
+ }
+ }
+
if (ext4_has_group_desc_csum(sb)) {
- int free;
+ bool fast_crc = true;
+ int free = EXT4_INODES_PER_GROUP(sb) -
+ ext4_itable_unused_count(sb, gdp);
- ext4_lock_group(sb, group); /* while we modify the bg desc */
- free = EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
free = 0;
+ /* Incremental CRC needs a valid checksum baseline */
+ fast_crc = false;
}
/*
@@ -877,27 +879,26 @@ int ext4_mark_inode_used(struct super_block *sb, int ino, umode_t mode)
if (bit >= free)
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - bit - 1));
- } else {
- ext4_lock_group(sb, group);
+ if (fast_crc)
+ ext4_inode_bitmap_csum_set_fast(sb, gdp, bit);
+ else
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
}
- ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
- if (S_ISDIR(mode)) {
- ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, group);
+ ext4_unlock_group(sb, group);
- atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
- f)->used_dirs);
- }
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(NULL, NULL, inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
}
-
- if (ext4_has_group_desc_csum(sb)) {
- ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
+ err = sync_dirty_buffer(inode_bitmap_bh);
+ if (err) {
+ ext4_std_error(sb, err);
+ goto out;
}
-
- ext4_unlock_group(sb, group);
err = ext4_handle_dirty_metadata(NULL, NULL, group_desc_bh);
sync_dirty_buffer(group_desc_bh);
out:
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 08/17] ext4: extract inode bitmap checksum get and store helpers
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Add ext4_inode_bitmap_csum_get() and ext4_inode_bitmap_csum_store()
helpers, and use EXT4_DESC_SIZE(sb) instead of sbi->s_desc_size for
consistency. No functional change.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/bitmap.c | 31 ++++++++++++++++++++++---------
1 file changed, 22 insertions(+), 9 deletions(-)
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 00b0a3c74859..008acc439301 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -16,11 +16,29 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars)
return numchars * BITS_PER_BYTE - memweight(bitmap, numchars);
}
+static inline __u32 ext4_inode_bitmap_csum_get(struct super_block *sb,
+ struct ext4_group_desc *gdp)
+{
+ __u32 csum = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
+
+ if (EXT4_DESC_SIZE(sb) >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+ csum |= (__u32)le16_to_cpu(gdp->bg_inode_bitmap_csum_hi) << 16;
+ return csum;
+}
+
+static inline void ext4_inode_bitmap_csum_store(struct super_block *sb,
+ struct ext4_group_desc *gdp,
+ __u32 csum)
+{
+ gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+ gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
+
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh)
{
- __u32 hi;
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz;
@@ -29,12 +47,9 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
return 1;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
- provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
+ provided = ext4_inode_bitmap_csum_get(sb, gdp);
calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
- if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
- hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
- provided |= (hi << 16);
- } else
+ if (EXT4_DESC_SIZE(sb) < EXT4_BG_INODE_BITMAP_CSUM_HI_END)
calculated &= 0xFFFF;
return provided == calculated;
@@ -53,9 +68,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
- gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
- if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
- gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+ ext4_inode_bitmap_csum_store(sb, gdp, csum);
}
static inline __u32 ext4_block_bitmap_csum_get(struct super_block *sb,
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 11/17] ext4: fix missing bg_used_dirs_count update in fast commit replay
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
ext4_mark_inode_used() did not update bg_used_dirs_count (or the
per-flex-group used_dirs counter) for directory inodes during fast commit
replay because it lacked the inode mode.
Add a mode parameter and pass it from both ext4_fc_replay_inode() (from
raw_fc_inode) and ext4_fc_replay_create(). In the latter, the
ext4_mark_inode_used() call is moved after ext4_iget() so that
inode->i_mode is available.
Fixes: 8016e29f4362 ("ext4: fast commit recovery path")
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/ext4.h | 2 +-
fs/ext4/fast_commit.c | 13 +++++++------
fs/ext4/ialloc.c | 13 ++++++++++++-
3 files changed, 20 insertions(+), 8 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e6739d5af490..f48cb9d998ab 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2941,7 +2941,7 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
struct dx_hash_info *hinfo);
/* ialloc.c */
-extern int ext4_mark_inode_used(struct super_block *sb, int ino);
+extern int ext4_mark_inode_used(struct super_block *sb, int ino, umode_t mode);
extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *,
struct inode *, umode_t,
const struct qstr *qstr, __u32 goal,
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b3c22636251d..f68d7b2eb0db 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -1578,7 +1578,7 @@ static int ext4_fc_replay_inode(struct super_block *sb,
ret = sync_dirty_buffer(iloc.bh);
if (ret)
goto out_brelse;
- ret = ext4_mark_inode_used(sb, ino);
+ ret = ext4_mark_inode_used(sb, ino, le16_to_cpu(raw_fc_inode->i_mode));
if (ret)
goto out_brelse;
@@ -1635,11 +1635,7 @@ static int ext4_fc_replay_create(struct super_block *sb,
trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino,
darg.parent_ino, darg.dname_len);
- /* This takes care of update group descriptor and other metadata */
- ret = ext4_mark_inode_used(sb, darg.ino);
- if (ret)
- goto out;
-
+ /* Inode already on disk from TAG_INODE replay; iget first for mode. */
inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL);
if (IS_ERR(inode)) {
ext4_debug("inode %d not found.", darg.ino);
@@ -1648,6 +1644,11 @@ static int ext4_fc_replay_create(struct super_block *sb,
goto out;
}
+ /* This takes care of update group descriptor and other metadata */
+ ret = ext4_mark_inode_used(sb, darg.ino, inode->i_mode);
+ if (ret)
+ goto out;
+
if (S_ISDIR(inode->i_mode)) {
/*
* If we are creating a directory, we need to make sure that the
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 55eb69fbb4c9..5896cdfb2ccf 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -756,11 +756,12 @@ static int find_inode_bit(struct super_block *sb, ext4_group_t group,
return 1;
}
-int ext4_mark_inode_used(struct super_block *sb, int ino)
+int ext4_mark_inode_used(struct super_block *sb, int ino, umode_t mode)
{
unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
struct ext4_group_desc *gdp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
int bit;
int err;
@@ -858,6 +859,16 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
}
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (S_ISDIR(mode)) {
+ ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+ atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ f)->used_dirs);
+ }
+ }
+
if (ext4_has_group_desc_csum(sb)) {
ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 10/17] ext4: use fast incremental CRC update in ext4_free_inode()
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Replace ext4_inode_bitmap_csum_set() with the newly added
ext4_inode_bitmap_csum_set_fast() in ext4_free_inode() for incremental
inode bitmap checksum update.
This is safe because:
- At inode free time, the inode bitmap checksum has already been
initialized, so the old checksum is always valid.
- The bitmap buffer modification and checksum update are protected
by the same group lock, ensuring the old checksum is consistent
with the bitmap content before the bit flip.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/ialloc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3fd8f0099852..55eb69fbb4c9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -327,7 +327,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (percpu_counter_initialized(&sbi->s_dirs_counter))
percpu_counter_dec(&sbi->s_dirs_counter);
}
- ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh);
+ ext4_inode_bitmap_csum_set_fast(sb, gdp, bit);
ext4_group_desc_csum_set(sb, block_group, gdp);
ext4_unlock_group(sb, block_group);
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 17/17] ext4: add ext4_get_flex_group() helper to simplify flex group lookups
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Introduce ext4_get_flex_group() that combines ext4_flex_group() and
sbi_array_rcu_deref() into a single call, replacing the repeated
pattern across ialloc.c, mballoc.c, resize.c, and super.c.
No functional change.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/ext4.h | 7 +++++++
fs/ext4/ialloc.c | 19 +++++--------------
fs/ext4/mballoc.c | 4 +---
fs/ext4/resize.c | 4 +---
fs/ext4/super.c | 4 +---
5 files changed, 15 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f48cb9d998ab..e38ada51972a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3457,6 +3457,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline struct flex_groups *ext4_get_flex_group(struct ext4_sb_info *sbi,
+ ext4_group_t block_group)
+{
+ return sbi_array_rcu_deref(sbi, s_flex_groups,
+ ext4_flex_group(sbi, block_group));
+}
+
static inline loff_t ext4_get_maxbytes(struct inode *inode)
{
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25430c572818..d88160afd6b6 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -336,8 +336,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (sbi->s_log_groups_per_flex) {
struct flex_groups *fg;
- fg = sbi_array_rcu_deref(sbi, s_flex_groups,
- ext4_flex_group(sbi, block_group));
+ fg = ext4_get_flex_group(sbi, block_group);
atomic_inc(&fg->free_inodes);
if (is_directory)
atomic_dec(&fg->used_dirs);
@@ -826,12 +825,8 @@ static void ext4_update_inode_group_desc(struct super_block *sb,
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (S_ISDIR(mode)) {
ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, group);
-
- atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
- f)->used_dirs);
- }
+ if (sbi->s_log_groups_per_flex)
+ atomic_inc(&ext4_get_flex_group(sbi, group)->used_dirs);
}
if (!ext4_has_group_desc_csum(sb))
@@ -997,7 +992,6 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
int ret2, err;
struct inode *ret;
ext4_group_t i;
- ext4_group_t flex_group;
struct ext4_group_info *grp = NULL;
bool encrypt = false;
@@ -1220,11 +1214,8 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
- if (sbi->s_log_groups_per_flex) {
- flex_group = ext4_flex_group(sbi, group);
- atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_inodes);
- }
+ if (sbi->s_log_groups_per_flex)
+ atomic_dec(&ext4_get_flex_group(sbi, group)->free_inodes);
/* the inode bitmap is zero-based */
inode->i_ino = bit + 1 + group * EXT4_INODES_PER_GROUP(sb);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 77f6309916d1..9e30c9eefd35 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4181,9 +4181,7 @@ ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
*ret_changed = changed;
if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, group);
- struct flex_groups *fg = sbi_array_rcu_deref(sbi,
- s_flex_groups, flex_group);
+ struct flex_groups *fg = ext4_get_flex_group(sbi, group);
if (state)
atomic64_sub(changed, &fg->free_clusters);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 2c5b851c552a..8d2cd1bc17bb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1495,11 +1495,9 @@ static void ext4_update_super(struct super_block *sb,
ext4_debug("free blocks count %llu",
percpu_counter_read(&sbi->s_freeclusters_counter));
if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group;
struct flex_groups *fg;
- flex_group = ext4_flex_group(sbi, group_data[0].group);
- fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
+ fg = ext4_get_flex_group(sbi, group_data[0].group);
atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
&fg->free_clusters);
atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..064e06163716 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3211,7 +3211,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
struct flex_groups *fg;
- ext4_group_t flex_group;
int i, err;
sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -3227,8 +3226,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
for (i = 0; i < sbi->s_groups_count; i++) {
gdp = ext4_get_group_desc(sb, i, NULL);
- flex_group = ext4_flex_group(sbi, i);
- fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
+ fg = ext4_get_flex_group(sbi, i);
atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
atomic64_add(ext4_free_group_clusters(sb, gdp),
&fg->free_clusters);
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 09/17] ext4: add ext4_inode_bitmap_csum_set_fast() for incremental checksum update
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Add a helper function ext4_inode_bitmap_csum_set_fast() that uses
crc32c_flip_range() to incrementally update the inode bitmap checksum
after flipping a single bit at the given offset. This avoids a full
bitmap CRC rescan, computing the CRC delta in O(log N) time.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/bitmap.c | 23 +++++++++++++++++++++++
fs/ext4/ext4.h | 3 +++
2 files changed, 26 insertions(+)
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 008acc439301..ea47ca0d7046 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -71,6 +71,29 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
ext4_inode_bitmap_csum_store(sb, gdp, csum);
}
+/*
+ * Update inode bitmap checksum for a single flipped bit.
+ *
+ * Use crc32c_flip_range() to incrementally update the checksum after
+ * flipping the bit at @offset, avoiding a full bitmap CRC rescan.
+ * The csum_seed cancels out in the XOR delta, so it is not needed here.
+ */
+void ext4_inode_bitmap_csum_set_fast(struct super_block *sb,
+ struct ext4_group_desc *gdp,
+ ext4_grpblk_t offset)
+{
+ __u32 new_csum, old_csum;
+
+ if (!ext4_has_feature_metadata_csum(sb))
+ return;
+
+ old_csum = ext4_inode_bitmap_csum_get(sb, gdp);
+ new_csum = crc32c_flip_range(old_csum, EXT4_INODES_PER_GROUP(sb),
+ offset, 1);
+
+ ext4_inode_bitmap_csum_store(sb, gdp, new_csum);
+}
+
static inline __u32 ext4_block_bitmap_csum_get(struct super_block *sb,
struct ext4_group_desc *gdp)
{
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c423a9a04047..e6739d5af490 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2764,6 +2764,9 @@ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh);
+void ext4_inode_bitmap_csum_set_fast(struct super_block *sb,
+ struct ext4_group_desc *gdp,
+ ext4_grpblk_t offset);
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh);
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 16/17] ext4: extract ext4_update_inode_group_desc() to reduce duplication
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
ext4_mark_inode_used() and __ext4_new_inode() contain nearly
identical code blocks for updating group descriptor fields after
inode allocation (UNINIT flag clearing, itable_unused update,
inode bitmap checksum, group desc checksum). Extract the common
logic into ext4_update_inode_group_desc() to eliminate duplication.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/ialloc.c | 136 ++++++++++++++++++++---------------------------
1 file changed, 57 insertions(+), 79 deletions(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 9dd1cdb367ba..25430c572818 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -808,12 +808,63 @@ static int ext4_might_init_block_bitmap(handle_t *handle,
return err;
}
+/*
+ * After allocating inode @bit (0-based within the group), update the free
+ * inode/used dirs counts and, if group desc checksums are enabled, clear
+ * INODE_UNINIT, update itable_unused and checksums. Group lock must be held.
+ */
+static void ext4_update_inode_group_desc(struct super_block *sb,
+ ext4_group_t group,
+ struct ext4_group_desc *gdp,
+ struct buffer_head *inode_bitmap_bh,
+ int bit, umode_t mode)
+{
+ int free;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ bool fast_crc = true;
+
+ ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+ if (S_ISDIR(mode)) {
+ ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+ atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ f)->used_dirs);
+ }
+ }
+
+ if (!ext4_has_group_desc_csum(sb))
+ return;
+
+ free = EXT4_INODES_PER_GROUP(sb) - ext4_itable_unused_count(sb, gdp);
+ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+ free = 0;
+ /* Incremental CRC needs a valid checksum baseline */
+ fast_crc = false;
+ }
+
+ /*
+ * Check the relative inode number against the last used
+ * relative inode number in this group. If it is greater
+ * we need to update the bg_itable_unused count.
+ */
+ if (bit >= free)
+ ext4_itable_unused_set(sb, gdp,
+ EXT4_INODES_PER_GROUP(sb) - bit - 1);
+ if (fast_crc)
+ ext4_inode_bitmap_csum_set_fast(sb, gdp, bit);
+ else
+ ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+}
+
int ext4_mark_inode_used(struct super_block *sb, int ino, umode_t mode)
{
unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
struct buffer_head *inode_bitmap_bh = NULL, *group_desc_bh = NULL;
struct ext4_group_desc *gdp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
int bit;
int err;
@@ -848,44 +899,8 @@ int ext4_mark_inode_used(struct super_block *sb, int ino, umode_t mode)
ext4_set_bit(bit, inode_bitmap_bh->b_data);
/* Update the relevant bg descriptor fields */
- ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
- if (S_ISDIR(mode)) {
- ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, group);
-
- atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
- f)->used_dirs);
- }
- }
-
- if (ext4_has_group_desc_csum(sb)) {
- bool fast_crc = true;
- int free = EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp);
-
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
- free = 0;
- /* Incremental CRC needs a valid checksum baseline */
- fast_crc = false;
- }
-
- /*
- * Check the relative inode number against the last used
- * relative inode number in this group. if it is greater
- * we need to update the bg_itable_unused count
- */
- if (bit >= free)
- ext4_itable_unused_set(sb, gdp,
- (EXT4_INODES_PER_GROUP(sb) - bit - 1));
- if (fast_crc)
- ext4_inode_bitmap_csum_set_fast(sb, gdp, bit);
- else
- ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
- }
-
+ ext4_update_inode_group_desc(sb, group, gdp,
+ inode_bitmap_bh, bit, mode);
ext4_unlock_group(sb, group);
BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
@@ -1165,50 +1180,13 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
ret2 = 0;
} else {
ret2 = 1; /* we didn't grab the inode */
- goto unlock_group;
}
}
/* Update the relevant bg descriptor fields */
- ext4_free_inodes_set(sb, gdp,
- ext4_free_inodes_count(sb, gdp) - 1);
- if (S_ISDIR(mode)) {
- ext4_used_dirs_set(sb, gdp,
- ext4_used_dirs_count(sb, gdp) + 1);
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, group);
- atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
- f)->used_dirs);
- }
- }
-
- if (ext4_has_group_desc_csum(sb)) {
- bool fast_crc = true;
- int free = EXT4_INODES_PER_GROUP(sb) -
- ext4_itable_unused_count(sb, gdp);
-
- if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
- free = 0;
- /* Incremental CRC needs a valid csum baseline */
- fast_crc = false;
- }
- /*
- * Check the relative inode number against the
- * last used relative inode number in this group.
- * If it is greater we need to update the
- * bg_itable_unused count.
- */
- if (bit >= free)
- ext4_itable_unused_set(sb, gdp,
- EXT4_INODES_PER_GROUP(sb) - bit - 1);
- if (fast_crc)
- ext4_inode_bitmap_csum_set_fast(sb, gdp, bit);
- else
- ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
- }
-unlock_group:
+ if (!ret2)
+ ext4_update_inode_group_desc(sb, group, gdp,
+ inode_bitmap_bh, bit, mode);
ext4_unlock_group(sb, group);
if (ext4_has_group_desc_csum(sb) &&
!(sbi->s_mount_state & EXT4_FC_REPLAY))
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 14/17] ext4: rename ino to bit in __ext4_new_inode()
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
In __ext4_new_inode(), the variable 'ino' actually holds a zero-based
bit position within the inode bitmap, not an absolute inode number.
Rename it to 'bit' to better reflect its semantics and avoid confusion
with inode->i_ino.
With this rename, the previous 'ino++' before calculating i_ino is no
longer needed; instead compute i_ino directly as 'bit + 1'.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/ialloc.c | 26 +++++++++++++-------------
1 file changed, 13 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e209e27f827f..8b75b331b26e 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -974,7 +974,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
struct buffer_head *inode_bitmap_bh = NULL;
struct buffer_head *group_desc_bh;
ext4_group_t ngroups, group = 0;
- unsigned long ino = 0;
+ unsigned long bit = 0;
struct inode *inode;
struct ext4_group_desc *gdp = NULL;
struct ext4_inode_info *ei;
@@ -1050,7 +1050,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
- ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
+ bit = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
ret2 = 0;
goto got_group;
}
@@ -1071,7 +1071,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
* unless we get unlucky and it turns out the group we selected
* had its last inode grabbed by someone else.
*/
- for (i = 0; i < ngroups; i++, ino = 0) {
+ for (i = 0; i < ngroups; i++, bit = 0) {
err = -EIO;
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -1105,13 +1105,13 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
goto next_group;
- ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
+ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &bit);
if (!ret2)
goto next_group;
- if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
+ if (group == 0 && (bit + 1) < EXT4_FIRST_INO(sb)) {
ext4_error(sb, "reserved inode found cleared - "
- "inode=%lu", ino + 1);
+ "inode=%lu", bit + 1);
ext4_mark_group_bitmap_corrupted(sb, group,
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto next_group;
@@ -1136,21 +1136,20 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
goto out;
}
ext4_lock_group(sb, group);
- ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+ ret2 = ext4_test_and_set_bit(bit, inode_bitmap_bh->b_data);
if (ret2) {
/* Someone already took the bit. Repeat the search
* with lock held.
*/
- ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
+ ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &bit);
if (ret2) {
- ext4_set_bit(ino, inode_bitmap_bh->b_data);
+ ext4_set_bit(bit, inode_bitmap_bh->b_data);
ret2 = 0;
} else {
ret2 = 1; /* we didn't grab the inode */
}
}
ext4_unlock_group(sb, group);
- ino++; /* the inode bitmap is zero-based */
if (!ret2)
goto got; /* we grabbed the inode! */
@@ -1210,9 +1209,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
* relative inode number in this group. if it is greater
* we need to update the bg_itable_unused count
*/
- if (ino > free)
+ if (bit >= free)
ext4_itable_unused_set(sb, gdp,
- (EXT4_INODES_PER_GROUP(sb) - ino));
+ (EXT4_INODES_PER_GROUP(sb) - bit - 1));
if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
up_read(&grp->alloc_sem);
} else {
@@ -1252,7 +1251,8 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
flex_group)->free_inodes);
}
- inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
+ /* the inode bitmap is zero-based */
+ inode->i_ino = bit + 1 + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
inode->i_blocks = 0;
simple_inode_init_ts(inode);
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 07/17] ext4: use fast incremental CRC update in ext4_mb_mark_context()
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Use ext4_block_bitmap_csum_set_range() in ext4_mb_mark_context() for
fast incremental block bitmap checksum updates. Instead of re-scanning
the entire bitmap after every allocation or free, the incremental update
computes the CRC delta for the modified bit range in O(log N) time.
Add a fast_crc flag that is set when EXT4_MB_BITMAP_MARKED_CHECK is not
used. When fast_crc is true, all bits in the range are guaranteed to flip,
so the incremental CRC via ext4_block_bitmap_csum_set_range() is correct.
Otherwise, fall back to ext4_block_bitmap_csum_set() for a full CRC
recalculation, since idempotent operations (mb_set_bits/mb_clear_bits
with EXT4_MB_BITMAP_MARKED_CHECK) may leave some bits unchanged.
For the BLOCK_UNINIT case, the bitmap was just initialized and there is
no valid old checksum, so fast_crc is forced to false to ensure a full
CRC recalculation establishes a correct baseline.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/mballoc.c | 22 +++++++++++++++++++++-
1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ff2023c9f52c..77f6309916d1 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4095,6 +4095,7 @@ ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
struct buffer_head *gdp_bh;
int err;
unsigned int i, already, changed = len;
+ bool fast_crc;
KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context,
handle, sb, state, group, blkoff, len,
@@ -4127,12 +4128,28 @@ ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
goto out_err;
}
+ /*
+ * fast_crc: Use incremental CRC update via crc32c_flip_range().
+ * This is only valid when all bits in [blkoff, blkoff+len) are
+ * guaranteed to be in the opposite state (i.e., every bit will
+ * actually flip). When EXT4_MB_BITMAP_MARKED_CHECK is set,
+ * mb_set_bits/mb_clear_bits are idempotent, so some bits may not
+ * change and incremental CRC would produce incorrect results.
+ */
+ fast_crc = !(flags & EXT4_MB_BITMAP_MARKED_CHECK);
+
ext4_lock_group(sb, group);
if (ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
ext4_free_group_clusters_set(sb, gdp,
ext4_free_clusters_after_init(sb, group, gdp));
+ /*
+ * The bitmap was just initialized, so the old checksum
+ * is invalid for incremental CRC update. Fall back to
+ * full recalculation.
+ */
+ fast_crc = false;
}
if (flags & EXT4_MB_BITMAP_MARKED_CHECK) {
@@ -4154,7 +4171,10 @@ ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state,
ext4_free_group_clusters(sb, gdp) + changed);
}
- ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
+ if (fast_crc)
+ ext4_block_bitmap_csum_set_range(sb, gdp, blkoff, len);
+ else
+ ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
ext4_unlock_group(sb, group);
if (ret_changed)
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 05/17] ext4: extract block bitmap checksum get and store helpers
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Add ext4_block_bitmap_csum_get() and ext4_block_bitmap_csum_store()
helpers, and use EXT4_DESC_SIZE(sb) instead of sbi->s_desc_size for
consistency. No functional change.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/bitmap.c | 31 ++++++++++++++++++++++---------
1 file changed, 22 insertions(+), 9 deletions(-)
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 87760fabdd2e..46affc9e80ca 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -58,11 +58,29 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
}
+static inline __u32 ext4_block_bitmap_csum_get(struct super_block *sb,
+ struct ext4_group_desc *gdp)
+{
+ __u32 csum = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
+
+ if (EXT4_DESC_SIZE(sb) >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
+ csum |= (__u32)le16_to_cpu(gdp->bg_block_bitmap_csum_hi) << 16;
+ return csum;
+}
+
+static inline void ext4_block_bitmap_csum_store(struct super_block *sb,
+ struct ext4_group_desc *gdp,
+ __u32 csum)
+{
+ gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+ if (EXT4_DESC_SIZE(sb) >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
+ gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
+
int ext4_block_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh)
{
- __u32 hi;
__u32 provided, calculated;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8;
@@ -70,12 +88,9 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
if (!ext4_has_feature_metadata_csum(sb))
return 1;
- provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
+ provided = ext4_block_bitmap_csum_get(sb, gdp);
calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
- if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
- hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
- provided |= (hi << 16);
- } else
+ if (EXT4_DESC_SIZE(sb) < EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
calculated &= 0xFFFF;
return provided == calculated;
@@ -93,7 +108,5 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
return;
csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
- gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
- if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
- gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+ ext4_block_bitmap_csum_store(sb, gdp, csum);
}
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 06/17] ext4: add ext4_block_bitmap_csum_set_range() for incremental checksum update
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Add a helper function ext4_block_bitmap_csum_set_range() that updates
the block bitmap checksum using crc32c_flip_range() for incremental CRC
calculation. Unlike ext4_block_bitmap_csum_set() which re-scans the
entire bitmap buffer, this function efficiently computes the CRC delta
for a range of flipped bits, avoiding the cost of a full CRC
recalculation.
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/bitmap.c | 24 ++++++++++++++++++++++++
fs/ext4/ext4.h | 3 +++
2 files changed, 27 insertions(+)
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 46affc9e80ca..00b0a3c74859 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -110,3 +110,27 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
ext4_block_bitmap_csum_store(sb, gdp, csum);
}
+
+/*
+ * Update block bitmap checksum using incremental CRC calculation.
+ *
+ * This function assumes that ALL bits in the range [offset, offset+len)
+ * have been flipped (XORed with 1). It uses crc32c_flip_range() to
+ * efficiently compute the CRC delta without re-scanning the entire bitmap.
+ * The csum_seed cancels out in the XOR delta, so it is not needed here.
+ */
+void ext4_block_bitmap_csum_set_range(struct super_block *sb,
+ struct ext4_group_desc *gdp,
+ ext4_grpblk_t offset, ext4_grpblk_t len)
+{
+ __u32 new_csum, old_csum;
+
+ if (!ext4_has_feature_metadata_csum(sb))
+ return;
+
+ old_csum = ext4_block_bitmap_csum_get(sb, gdp);
+ new_csum = crc32c_flip_range(old_csum, EXT4_CLUSTERS_PER_GROUP(sb),
+ offset, len);
+
+ ext4_block_bitmap_csum_store(sb, gdp, new_csum);
+}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..c423a9a04047 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2770,6 +2770,9 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
void ext4_block_bitmap_csum_set(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh);
+void ext4_block_bitmap_csum_set_range(struct super_block *sb,
+ struct ext4_group_desc *gdp,
+ ext4_grpblk_t offset, ext4_grpblk_t len);
int ext4_block_bitmap_csum_verify(struct super_block *sb,
struct ext4_group_desc *gdp,
struct buffer_head *bh);
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 03/17] lib/crc: crc_kunit: add benchmark for crc32c_flip_range()
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Add a kunit benchmark comparing crc32c_flip_range() against full crc32c
recomputation across bitmap sizes from 1KB to 64KB. The benchmark reports
per-call latency in nanoseconds and the speedup ratio.
Sample results (x86_64, Intel(R) Xeon(R) Platinum 8331C):
bitmap=1024: flip_range=48 ns, full_crc=45 ns, speedup=0.9x
bitmap=2048: flip_range=53 ns, full_crc=88 ns, speedup=1.6x
bitmap=4096: flip_range=57 ns, full_crc=182 ns, speedup=3.1x
bitmap=8192: flip_range=63 ns, full_crc=357 ns, speedup=5.6x
bitmap=16384: flip_range=68 ns, full_crc=709 ns, speedup=10.3x
bitmap=32768: flip_range=73 ns, full_crc=1421 ns, speedup=19.3x
bitmap=65536: flip_range=78 ns, full_crc=2853 ns, speedup=36.3x
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
lib/crc/tests/crc_kunit.c | 52 +++++++++++++++++++++++++++++++++++++++
1 file changed, 52 insertions(+)
diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c
index 46f9df5b58e4..8e8b541b37d3 100644
--- a/lib/crc/tests/crc_kunit.c
+++ b/lib/crc/tests/crc_kunit.c
@@ -554,6 +554,57 @@ static void crc32c_flip_range_test(struct kunit *test)
}
}
+/*
+ * Benchmark crc32c_flip_range vs full crc32c recomputation
+ */
+static void crc32c_flip_range_benchmark(struct kunit *test)
+{
+ static const size_t bitmap_sizes[] = {
+ 1024, 2048, 4096, 8192, 16384, 32768, 65536,
+ };
+ size_t i, j, num_iters, buflen, total_bits;
+ volatile u32 crc;
+ u64 t_flip, t_full;
+ u8 *buf;
+
+ if (!IS_ENABLED(CONFIG_CRC_BENCHMARK))
+ kunit_skip(test, "not enabled");
+
+ buf = kunit_kzalloc(test, 65536, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, buf);
+
+ for (i = 0; i < ARRAY_SIZE(bitmap_sizes); i++) {
+ buflen = bitmap_sizes[i];
+ total_bits = buflen * 8;
+ num_iters = 10000000 / (buflen + 128);
+
+ /* Benchmark crc32c_flip_range */
+ crc = crc32c(0, buf, buflen);
+ preempt_disable();
+ t_flip = ktime_get_ns();
+ for (j = 0; j < num_iters; j++)
+ crc = crc32c_flip_range(crc, total_bits, 100, 100);
+ t_flip = ktime_get_ns() - t_flip;
+ preempt_enable();
+
+ /* Benchmark full crc32c recomputation */
+ preempt_disable();
+ t_full = ktime_get_ns();
+ for (j = 0; j < num_iters; j++)
+ crc = crc32c(0, buf, buflen);
+ t_full = ktime_get_ns() - t_full;
+ preempt_enable();
+
+ kunit_info(test,
+ "bitmap=%zu: flip_range=%llu ns, full_crc=%llu ns, speedup=%llu.%01llux\n",
+ buflen,
+ div64_u64(t_flip, num_iters),
+ div64_u64(t_full, num_iters),
+ div64_u64(t_full * 10, t_flip ? t_flip : 1) / 10,
+ div64_u64(t_full * 10, t_flip ? t_flip : 1) % 10);
+ }
+}
+
static struct kunit_case crc_test_cases[] = {
#if IS_REACHABLE(CONFIG_CRC7)
KUNIT_CASE(crc7_be_test),
@@ -575,6 +626,7 @@ static struct kunit_case crc_test_cases[] = {
KUNIT_CASE(crc32c_test),
KUNIT_CASE(crc32c_benchmark),
KUNIT_CASE(crc32c_flip_range_test),
+ KUNIT_CASE(crc32c_flip_range_benchmark),
#endif
#if IS_REACHABLE(CONFIG_CRC64)
KUNIT_CASE(crc64_be_test),
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 02/17] lib/crc: crc_kunit: add kunit test for crc32c_flip_range()
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
Add kunit tests for crc32c_flip_range(), validating correctness
against naive full-buffer CRC recomputation. All tests use a 64KB
buffer and a non-zero CRC seed to match real-world usage (e.g. ext4
metadata checksums):
- ones_lookup[0] single-bit verification.
- num_bits=0 no-op, first/last byte, full 64KB flip.
- Random single-bit flips (100 iterations).
- Random multi-bit contiguous ranges (100 iterations).
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
lib/crc/tests/crc_kunit.c | 85 +++++++++++++++++++++++++++++++++++++++
1 file changed, 85 insertions(+)
diff --git a/lib/crc/tests/crc_kunit.c b/lib/crc/tests/crc_kunit.c
index 9428cd913625..46f9df5b58e4 100644
--- a/lib/crc/tests/crc_kunit.c
+++ b/lib/crc/tests/crc_kunit.c
@@ -470,6 +470,90 @@ static void crc64_nvme_benchmark(struct kunit *test)
}
#endif /* CONFIG_CRC64 */
+/*
+ * Test crc32c_flip_range() against naive full-buffer CRC recomputation.
+ * All tests use a 64KB buffer (2^19 bits = INCR_MAX_ORDER limit)
+ * and a non-zero seed to match real-world usage (e.g. ext4 checksums).
+ */
+static void crc32c_flip_range_test(struct kunit *test)
+{
+ size_t buflen = 65536;
+ size_t total_bits = buflen * 8;
+ u32 seed = 0x12345678;
+ u32 expected, flip_crc;
+ size_t start, num_bits, b, pos;
+ u8 *buf;
+ int i;
+
+ buf = kunit_kmalloc(test, buflen, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, buf);
+
+ /* Test 1: Single bit at bit 0 (verifies ones_lookup[0]) */
+ buf[0] = 0x00;
+ expected = crc32c(seed, buf, 1);
+ buf[0] = 0x01;
+ flip_crc = crc32c_flip_range(expected, 8, 0, 1);
+ expected = crc32c(seed, buf, 1);
+ KUNIT_ASSERT_EQ_MSG(test, expected, flip_crc, "Single bit at bit 0");
+
+ /* Test 2: num_bits=0 should be a no-op */
+ memset(buf, 0, buflen);
+ expected = crc32c(seed, buf, buflen);
+ flip_crc = crc32c_flip_range(expected, total_bits, 0, 0);
+ KUNIT_ASSERT_EQ_MSG(test, expected, flip_crc,
+ "num_bits=0: expected=0x%08x got=0x%08x",
+ expected, flip_crc);
+
+ /* Test 3: Boundary flips - first byte, last byte, all bits */
+ buf[0] = 0xFF;
+ flip_crc = crc32c_flip_range(expected, total_bits, 0, 8);
+ expected = crc32c(seed, buf, buflen);
+ KUNIT_ASSERT_EQ_MSG(test, expected, flip_crc, "Flip first byte");
+
+ buf[buflen - 1] = 0xFF;
+ flip_crc = crc32c_flip_range(expected, total_bits, (buflen - 1) * 8, 8);
+ expected = crc32c(seed, buf, buflen);
+ KUNIT_ASSERT_EQ_MSG(test, expected, flip_crc, "Flip last byte");
+
+ memset(buf, 0, buflen);
+ expected = crc32c(seed, buf, buflen);
+ memset(buf, 0xFF, buflen);
+ flip_crc = crc32c_flip_range(expected, total_bits, 0, total_bits);
+ expected = crc32c(seed, buf, buflen);
+ KUNIT_ASSERT_EQ_MSG(test, expected, flip_crc, "Flip all 64KB bits");
+
+ /* Test 4: Random single-bit flips (100 iterations) */
+ memset(buf, 0, buflen);
+ expected = crc32c(seed, buf, buflen);
+ for (i = 0; i < 100; i++) {
+ start = rand32() % total_bits;
+ buf[start / 8] ^= (1 << (start % 8));
+
+ flip_crc = crc32c_flip_range(expected, total_bits, start, 1);
+ expected = crc32c(seed, buf, buflen);
+ KUNIT_ASSERT_EQ_MSG(test, expected, flip_crc,
+ "Single bit at %zu: expected=0x%08x got=0x%08x",
+ start, expected, flip_crc);
+ }
+
+ /* Test 5: Random multi-bit ranges (100 iterations) */
+ for (i = 0; i < 100; i++) {
+ num_bits = (rand32() % (total_bits - 1)) + 1;
+ start = rand32() % (total_bits - num_bits + 1);
+ for (b = 0; b < num_bits; b++) {
+ pos = start + b;
+ buf[pos / 8] ^= (1 << (pos % 8));
+ }
+
+ flip_crc = crc32c_flip_range(expected, total_bits, start, num_bits);
+ expected = crc32c(seed, buf, buflen);
+
+ KUNIT_ASSERT_EQ_MSG(test, expected, flip_crc,
+ "Range [%zu, +%zu): expected=0x%08x got=0x%08x",
+ start, num_bits, expected, flip_crc);
+ }
+}
+
static struct kunit_case crc_test_cases[] = {
#if IS_REACHABLE(CONFIG_CRC7)
KUNIT_CASE(crc7_be_test),
@@ -490,6 +574,7 @@ static struct kunit_case crc_test_cases[] = {
KUNIT_CASE(crc32_be_benchmark),
KUNIT_CASE(crc32c_test),
KUNIT_CASE(crc32c_benchmark),
+ KUNIT_CASE(crc32c_flip_range_test),
#endif
#if IS_REACHABLE(CONFIG_CRC64)
KUNIT_CASE(crc64_be_test),
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 01/17] lib/crc: add crc32c_flip_range() for incremental CRC update
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
When a contiguous range of bits in a buffer is flipped, the CRC32c
checksum can be updated incrementally without re-scanning the entire
buffer, by exploiting the linearity of CRCs over GF(2):
New_CRC = Old_CRC ^ CRC(flip_mask << trailing_bits)
Introduce crc32c_flip_range() which computes this delta using
precomputed GF(2) shift matrices and nibble-indexed lookup tables.
The implementation decomposes nbits and trailing_bits into
power-of-2 components and combines them via the CRC concatenation
property:
CRC(A || B) = shift(CRC(A), len(B)) ^ CRC(B)
This gives O(log N) complexity with only 9808 bytes (~9.6KB) of static
tables (fits in L1 cache). The current maximum supported buffer size is
64KB (INCR_MAX_ORDER = 19, i.e. 2^19 bits = 524288 bits = 64KB).
This is useful for filesystems like ext4, where bitmap updates
involve flipping a contiguous range of bits, and recalculating
the full CRC after every update is wasteful.
Benchmark results on Intel Xeon (Ice Lake) with CRC32c hardware
acceleration:
bitmap: 1024 2048 4096 8192 16384 32768 65536
flip(ns): 48 53 57 63 68 73 78
full(ns): 45 88 182 357 709 1421 2853
speedup: 0.9x 1.6x 3.1x 5.6x 10.3x 19.3x 36.3x
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
include/linux/crc32.h | 25 ++++++
lib/crc/.gitignore | 2 +
lib/crc/Makefile | 13 ++-
lib/crc/crc32c-incr.c | 140 +++++++++++++++++++++++++++++++
lib/crc/gen_crc32c_incr_table.c | 141 ++++++++++++++++++++++++++++++++
5 files changed, 318 insertions(+), 3 deletions(-)
create mode 100644 lib/crc/crc32c-incr.c
create mode 100644 lib/crc/gen_crc32c_incr_table.c
diff --git a/include/linux/crc32.h b/include/linux/crc32.h
index da78b215ff2e..034f73f0f5dc 100644
--- a/include/linux/crc32.h
+++ b/include/linux/crc32.h
@@ -81,6 +81,31 @@ u32 crc32_be(u32 crc, const void *p, size_t len);
*/
u32 crc32c(u32 crc, const void *p, size_t len);
+/**
+ * crc32c_flip_range - Update CRC32c after flipping a range of bits
+ * @old_crc: Existing CRC32c value of the buffer (pre-flip).
+ * @total_bits: Total size of the buffer in bits (e.g., 524288 for 64KB).
+ * @bit_off: Starting bit offset of the modified range.
+ * @nbits: Length of the flipped bit sequence.
+ *
+ * This function calculates the new CRC32c value when a contiguous range of
+ * bits is flipped (XORed with 1s) without re-scanning the entire buffer.
+ * It leverages the linearity of CRCs in Galois Field GF(2):
+ *
+ * New_CRC = Old_CRC ^ CRC(Mask_of_Ones << Trailing_Bits)
+ *
+ * The complexity is O(log nbits + log trailing_bits), making it
+ * significantly faster than recomputing the CRC for large buffers.
+ *
+ * Note: @total_bits must not exceed 524288 (2^19 bits = 64KB). Callers
+ * must ensure that @bit_off + @nbits <= @total_bits. Behavior is
+ * undefined if these constraints are violated.
+ *
+ * Return: The updated CRC32c value.
+ */
+u32 crc32c_flip_range(u32 old_crc, u32 total_bits,
+ u32 bit_off, u32 nbits);
+
/*
* crc32_optimizations() returns flags that indicate which CRC32 library
* functions are using architecture-specific optimizations. Unlike
diff --git a/lib/crc/.gitignore b/lib/crc/.gitignore
index a9e48103c9fb..4e2b9524426d 100644
--- a/lib/crc/.gitignore
+++ b/lib/crc/.gitignore
@@ -1,5 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
/crc32table.h
+/crc32c-incr-table.h
/crc64table.h
/gen_crc32table
+/gen_crc32c_incr_table
/gen_crc64table
diff --git a/lib/crc/Makefile b/lib/crc/Makefile
index ff213590e4e3..2c255ac029d0 100644
--- a/lib/crc/Makefile
+++ b/lib/crc/Makefile
@@ -21,7 +21,7 @@ crc-t10dif-$(CONFIG_X86) += x86/crc16-msb-pclmul.o
endif
obj-$(CONFIG_CRC32) += crc32.o
-crc32-y := crc32-main.o
+crc32-y := crc32-main.o crc32c-incr.o
ifeq ($(CONFIG_CRC32_ARCH),y)
CFLAGS_crc32-main.o += -I$(src)/$(SRCARCH)
crc32-$(CONFIG_ARM) += arm/crc32-core.o
@@ -49,20 +49,27 @@ endif # CONFIG_CRC64_ARCH
obj-y += tests/
-hostprogs := gen_crc32table gen_crc64table
-clean-files := crc32table.h crc64table.h
+hostprogs := gen_crc32table gen_crc32c_incr_table gen_crc64table
+clean-files := crc32table.h crc32c-incr-table.h crc64table.h
$(obj)/crc32-main.o: $(obj)/crc32table.h
+$(obj)/crc32c-incr.o: $(obj)/crc32c-incr-table.h
$(obj)/crc64-main.o: $(obj)/crc64table.h
quiet_cmd_crc32 = GEN $@
cmd_crc32 = $< > $@
+quiet_cmd_crc32c_incr = GEN $@
+ cmd_crc32c_incr = $< > $@
+
quiet_cmd_crc64 = GEN $@
cmd_crc64 = $< > $@
$(obj)/crc32table.h: $(obj)/gen_crc32table
$(call cmd,crc32)
+$(obj)/crc32c-incr-table.h: $(obj)/gen_crc32c_incr_table
+ $(call cmd,crc32c_incr)
+
$(obj)/crc64table.h: $(obj)/gen_crc64table
$(call cmd,crc64)
diff --git a/lib/crc/crc32c-incr.c b/lib/crc/crc32c-incr.c
new file mode 100644
index 000000000000..b6258231cc0d
--- /dev/null
+++ b/lib/crc/crc32c-incr.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * GF(2) matrix-based CRC32c incremental update.
+ *
+ * When a contiguous range of bits is flipped, the new CRC can be
+ * derived from the old one without re-scanning the buffer:
+ * New_CRC = Old_CRC ^ CRC(flip_mask << trailing_bits)
+ *
+ * The delta CRC is computed by decomposing num_bits and trailing_bits
+ * into power-of-2 components and combining them via the CRC
+ * concatenation property, giving O(log N) complexity.
+ *
+ * Memory usage: ~9.8KB
+ * - crc32c_incr_nibble_table: 19 * 8 * 16 * 4 = 9728 bytes
+ * - crc32c_incr_ones_lookup: 20 * 4 = 80 bytes
+ *
+ * Tables are generated at compile time by gen_crc32c_incr_table.
+ * INCR_MAX_ORDER 19 supports up to 64KB buffers (2^19 bits).
+ *
+ * Copyright (C) 2026 Alibaba Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/export.h>
+#include <linux/crc32.h>
+
+#include "crc32c-incr-table.h"
+
+#define INCR_MAX_ORDER 19
+
+/**
+ * gf2_xform - Apply one precomputed GF(2) shift matrix to a CRC state
+ * @order: Index of the precomputed matrix, i.e. M^(2^order).
+ * @v: The 32-bit CRC state vector to transform.
+ *
+ * Each of the eight nibbles of @v selects one entry from its own
+ * 16-entry row of the table for @order; XOR-ing the eight entries
+ * gives v * M^(2^order) without iterating over individual bits.
+ */
+static inline u32 gf2_xform(int order, u32 v)
+{
+	const u32 (*rows)[16] = crc32c_incr_nibble_table[order];
+	u32 out = 0;
+	int nib;
+
+	for (nib = 0; nib < 8; nib++)
+		out ^= rows[nib][(v >> (4 * nib)) & 0xf];
+
+	return out;
+}
+
+/**
+ * crc32c_incr_get_ones_delta - CRC of a run of @num_bits one-bits
+ * @num_bits: Length of the all-ones sequence.
+ *
+ * Returns CRC(0, [111...1] of length num_bits), or 0 for an empty run.
+ * The length is split into its power-of-2 components, consumed from
+ * the most significant one downwards, and the pieces are joined with
+ *   CRC(A || B) = shift(CRC(A), len(B)) ^ CRC(B)
+ *
+ * The leading component is taken straight from the lookup table, so
+ * only (popcount - 1) gf2_xform calls of 8 table lookups each are
+ * needed.
+ *
+ * Caller must ensure num_bits <= (1UL << INCR_MAX_ORDER).
+ */
+static u32 crc32c_incr_get_ones_delta(size_t num_bits)
+{
+	u32 acc;
+	int order;
+
+	if (num_bits == 0)
+		return 0;
+
+	/* Seed with the largest power-of-2 block; it needs no shifting yet */
+	order = __fls(num_bits);
+	acc = crc32c_incr_ones_lookup[order];
+	num_bits &= ~(1UL << order);
+
+	/* Append each remaining block: shift what we have, then XOR it in */
+	for (; num_bits; num_bits &= ~(1UL << order)) {
+		order = __fls(num_bits);
+		acc = gf2_xform(order, acc) ^ crc32c_incr_ones_lookup[order];
+	}
+
+	return acc;
+}
+
+/**
+ * gf2_shift_crc - Advance a CRC state past @trailing_bits zero bits
+ * @crc: The CRC state vector.
+ * @trailing_bits: Number of zero bits to shift through.
+ *
+ * Same result as feeding @trailing_bits zero bits into the CRC after
+ * @crc. The count is walked bit by bit from the least significant end;
+ * every set bit applies the matching precomputed matrix. Only the low
+ * INCR_MAX_ORDER bits of @trailing_bits are examined.
+ */
+static u32 gf2_shift_crc(u32 crc, size_t trailing_bits)
+{
+	int order = 0;
+
+	while (trailing_bits && order < INCR_MAX_ORDER) {
+		if (trailing_bits & 1)
+			crc = gf2_xform(order, crc);
+		trailing_bits >>= 1;
+		order++;
+	}
+
+	return crc;
+}
+
+/*
+ * See full kernel-doc in include/linux/crc32.h.
+ *
+ * Returns the CRC32c of the buffer after bits [bit_off, bit_off + nbits)
+ * have been flipped, given @old_crc of the buffer before the flip.
+ * Invalid arguments trigger a one-time warning and return @old_crc
+ * unchanged.
+ */
+u32 crc32c_flip_range(u32 old_crc, u32 total_bits,
+		      u32 bit_off, u32 nbits)
+{
+	u32 delta, trailing_bits;
+
+	/* Flipping nothing leaves the CRC untouched */
+	if (!nbits)
+		return old_crc;
+
+	/* total_bits must not exceed 2^INCR_MAX_ORDER bits (64KB) */
+	if (WARN_ON_ONCE(total_bits > (1UL << INCR_MAX_ORDER)))
+		return old_crc;
+
+	/*
+	 * bit_off + nbits must not exceed total_bits. Compare without
+	 * forming the sum: in u32 arithmetic bit_off + nbits can wrap
+	 * around and slip past the check, after which nbits would index
+	 * the lookup tables out of bounds.
+	 */
+	if (WARN_ON_ONCE(nbits > total_bits || bit_off > total_bits - nbits))
+		return old_crc;
+
+	/* Cannot underflow or wrap: the range was validated above */
+	trailing_bits = total_bits - (bit_off + nbits);
+
+	/* 1. Calculate CRC of the flip-mask (all 1s of length nbits) */
+	delta = crc32c_incr_get_ones_delta(nbits);
+
+	/* 2. Shift the mask-CRC to the correct bit position */
+	delta = gf2_shift_crc(delta, trailing_bits);
+
+	/* 3. Apply the delta to the existing CRC */
+	return old_crc ^ delta;
+}
+EXPORT_SYMBOL(crc32c_flip_range);
diff --git a/lib/crc/gen_crc32c_incr_table.c b/lib/crc/gen_crc32c_incr_table.c
new file mode 100644
index 000000000000..f906506282cc
--- /dev/null
+++ b/lib/crc/gen_crc32c_incr_table.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Generate GF(2) nibble-based lookup tables for incremental CRC32c updates.
+ * MAX_ORDER 19 supports up to 64KB buffers (2^19 bits = 524288 bits).
+ *
+ * Instead of storing raw 32x32 bit matrices (32 rows per order),
+ * we precompute nibble (4-bit) indexed tables. This reduces gf2_xform
+ * to 8 table lookups instead of 32 branchless mask-and-XOR iterations.
+ *
+ * Memory layout:
+ * - crc32c_incr_nibble_table[19][8][16]: 19 * 8 * 16 * 4 = 9728 bytes
+ * - crc32c_incr_ones_lookup[20]: 20 * 4 = 80 bytes
+ * Total: ~9.8KB (fits comfortably in L1 cache)
+ *
+ * Copyright (C) 2026 Alibaba Inc.
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "../../include/linux/crc32poly.h"
+
+#define CRC32C_INCR_MAX_ORDER 19
+#define NIBBLES_PER_U32 8
+
+static uint32_t bit_matrix[CRC32C_INCR_MAX_ORDER][32];
+static uint32_t nibble_table[CRC32C_INCR_MAX_ORDER][NIBBLES_PER_U32][16];
+static uint32_t ones_lookup[CRC32C_INCR_MAX_ORDER + 1];
+
+/*
+ * Multiply the 32-bit vector @v by the GF(2) matrix @mat, where
+ * mat[k] is the image of basis vector e_k: XOR together the rows
+ * selected by the set bits of @v.
+ */
+static uint32_t gf2_matrix_apply(const uint32_t mat[32], uint32_t v)
+{
+	uint32_t acc = 0;
+	int bit;
+
+	for (bit = 0; bit < 32; bit++) {
+		if (v & (1U << bit))
+			acc ^= mat[bit];
+	}
+	return acc;
+}
+
+/* Fill bit_matrix[], nibble_table[] and ones_lookup[] */
+static void crc32c_incr_init(void)
+{
+	int order, row, nib, val;
+
+	/*
+	 * Step 1: order-0 matrix M. Shifting e_0 by one bit position
+	 * pushes the set bit out and folds in the polynomial; every
+	 * other basis vector e_i simply moves down to e_(i-1).
+	 */
+	bit_matrix[0][0] = CRC32C_POLY_LE;
+	for (row = 1; row < 32; row++)
+		bit_matrix[0][row] = 1U << (row - 1);
+
+	/* Step 2: M^(2^n) is the square of M^(2^(n-1)) */
+	for (order = 1; order < CRC32C_INCR_MAX_ORDER; order++) {
+		const uint32_t *prev = bit_matrix[order - 1];
+
+		for (row = 0; row < 32; row++)
+			bit_matrix[order][row] =
+				gf2_matrix_apply(prev, prev[row]);
+	}
+
+	/*
+	 * Step 3: nibble-indexed tables. Entry [nib][val] is the matrix
+	 * applied to a vector whose only non-zero nibble is number @nib
+	 * with value @val; val == 0 therefore yields 0.
+	 */
+	for (order = 0; order < CRC32C_INCR_MAX_ORDER; order++) {
+		for (nib = 0; nib < NIBBLES_PER_U32; nib++) {
+			for (val = 0; val < 16; val++)
+				nibble_table[order][nib][val] =
+					gf2_matrix_apply(bit_matrix[order],
+						(uint32_t)val << (4 * nib));
+		}
+	}
+
+	/*
+	 * Step 4: ones_lookup[n] = CRC(0, all-ones of 2^n bits).
+	 * A run of 2^n ones is two runs of 2^(n-1) ones back to back, so
+	 * CRC(A||B) = shift(CRC(A), len(B)) ^ CRC(B) doubles the length
+	 * at each step. ones_lookup[0] is the CRC of a single 1-bit,
+	 * which equals the generator polynomial.
+	 */
+	ones_lookup[0] = CRC32C_POLY_LE;
+	for (order = 1; order <= CRC32C_INCR_MAX_ORDER; order++) {
+		uint32_t half = ones_lookup[order - 1];
+
+		ones_lookup[order] =
+			half ^ gf2_matrix_apply(bit_matrix[order - 1], half);
+	}
+}
+
+/* Print nibble_table[][][] as a C initializer, four words per line */
+static void emit_nibble_table(void)
+{
+	int order, nib, col;
+
+	printf("static const u32 crc32c_incr_nibble_table[%d][%d][16] = {\n",
+	       CRC32C_INCR_MAX_ORDER, NIBBLES_PER_U32);
+	for (order = 0; order < CRC32C_INCR_MAX_ORDER; order++) {
+		printf("\t{\n");
+		for (nib = 0; nib < NIBBLES_PER_U32; nib++) {
+			const uint32_t *row = nibble_table[order][nib];
+
+			printf("\t\t{\n");
+			for (col = 0; col < 16; col += 4) {
+				printf("\t\t\t0x%08x, 0x%08x, 0x%08x, 0x%08x,\n",
+				       row[col], row[col + 1],
+				       row[col + 2], row[col + 3]);
+			}
+			printf("\t\t},\n");
+		}
+		printf("\t},\n");
+	}
+	printf("};\n\n");
+}
+
+/*
+ * Print ones_lookup[] as a C initializer: full rows of four words,
+ * then one shorter row if the entry count is not a multiple of four.
+ */
+static void emit_ones_lookup(void)
+{
+	const int count = CRC32C_INCR_MAX_ORDER + 1;
+	int base, j;
+
+	printf("static const u32 crc32c_incr_ones_lookup[%d] = {\n",
+	       CRC32C_INCR_MAX_ORDER + 1);
+	for (base = 0; base < count; base += 4) {
+		int left = count - base;
+
+		if (left < 4) {
+			printf("\t");
+			for (j = 0; j < left; j++)
+				printf("0x%08x, ", ones_lookup[base + j]);
+			printf("\n");
+			continue;
+		}
+		printf("\t0x%08x, 0x%08x, 0x%08x, 0x%08x,\n",
+		       ones_lookup[base], ones_lookup[base + 1],
+		       ones_lookup[base + 2], ones_lookup[base + 3]);
+	}
+	printf("};\n");
+}
+
+/* Build the tables and write the generated header to stdout */
+int main(int argc, char **argv)
+{
+	crc32c_incr_init();
+
+	printf("/* this file is generated - do not edit */\n\n");
+
+	emit_nibble_table();
+	emit_ones_lookup();
+
+	return 0;
+}
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 04/17] ext4: fix incorrect block bitmap free clusters update on metadata overlap
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
In-Reply-To: <20260508121539.4174601-1-libaokun@linux.alibaba.com>
In ext4_mb_mark_diskspace_used(), when the allocator detects that the
allocated blocks overlap with filesystem metadata, it enters an error
recovery path that marks these blocks as used in the bitmap via
ext4_mb_mark_context() with flags=0.
Without EXT4_MB_BITMAP_MARKED_CHECK, ext4_mb_mark_context() assumes all
bits in the range will be flipped, so it sets changed=len unconditionally.
However, in a corrupted filesystem, some of these metadata blocks may
already be marked as used (bit=1) in the bitmap. Since mb_set_bits() is
idempotent (sets bits to 1 regardless of current state), bits that are
already set won't actually change, but the free clusters count is still
decremented by the full range length, leading to an inaccurate free
clusters count.
Fix this by passing EXT4_MB_BITMAP_MARKED_CHECK, which correctly counts
only the bits that actually changed state.
Fixes: 2f94711b098b ("ext4: call ext4_mb_mark_context in ext4_mb_mark_diskspace_used")
Signed-off-by: Baokun Li <libaokun@linux.alibaba.com>
---
fs/ext4/mballoc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index ed1bd00e11cd..ff2023c9f52c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4228,7 +4228,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle
ac->ac_b_ex.fe_group,
ac->ac_b_ex.fe_start,
ac->ac_b_ex.fe_len,
- 0, NULL);
+ EXT4_MB_BITMAP_MARKED_CHECK, NULL);
if (!err)
err = -EFSCORRUPTED;
return err;
--
2.43.7
^ permalink raw reply related
* [PATCH RFC 00/17] ext4/lib-crc: LBS performance part 1 - incremental CRC32c for bitmap checksums
From: Baokun Li @ 2026-05-08 12:15 UTC (permalink / raw)
To: linux-ext4
Cc: linux-crypto, ebiggers, ardb, tytso, adilger.kernel, jack,
yi.zhang, ojaswin, ritesh.list, Baokun Li
Motivation
==========
In [1] we added large block size (LBS) support to ext4. After enabling
LBS we observed several performance bottlenecks:
1. Checksum computation (bitmap, extent block, dir block) becomes
significantly more expensive as the block size grows.
2. Free-bit searches (_find_next_bit) over large bitmaps become costly.
CRC32c is linear over GF(2), so when a contiguous range of bits in a
buffer is flipped the new checksum can be derived from the old one
without re-scanning the entire buffer:
New_CRC = Old_CRC ^ CRC(flip_mask << trailing_bits)
This series introduces crc32c_flip_range() in lib/crc and applies it to
ext4's inode and block bitmap checksum paths, reducing checksum overhead
from O(N) to O(log N) per update.
For dir blocks and extent blocks, each modification touches a 12-264
byte region; by computing the CRC of the modified region before and
after the change we can derive the delta to the overall checksum. A
crc32c_splice() API implementing this approach has been developed
locally and will be posted shortly.
For the _find_next_bit bottleneck under LBS, a per-block-group free
space rb-tree can accelerate lookups. A local prototype exists and is
still being tested; feedback and alternative approaches are welcome.
[1]: https://lore.kernel.org/all/20251121090654.631996-1-libaokun@huaweicloud.com
Benchmark (full roadmap projection)
====================================
Single-process sequential fallocate of 64K blocks. All throughput
values are in GB/s; percentages in parentheses show improvement over
the unpatched baseline. "+crc_splice" and "+free_space_tree" columns
show expected gains from follow-up series (not included here).
* Blocks per group up to 65528 (default e2fsprogs limit)
+--------+---------+-----------------+-----------------+---------------------+
| Blksz | Before | +crc_flip_range | +crc_splice | +free_space_tree |
+--------+---------+-----------------+-----------------+---------------------+
| 1k | 14.9 | 15.0 (+0.7%) | 15.2 (+2.0%) | 15.3 (+2.7%) |
| 2k | 17.5 | 17.8 (+1.7%) | 18.2 (+4.0%) | 18.7 (+6.9%) |
| 4k | 16.8 | 17.4 (+3.6%) | 18.3 (+8.9%) | 18.4 (+9.5%) |
| 8k | 15.5 | 16.5 (+6.5%) | 18.3 (+18.1%) | 18.2 (+17.4%) |
| 16k | 12.6 | 13.2 (+4.8%) | 15.9 (+26.2%) | 15.9 (+26.2%) |
| 32k | 8.99 | 9.60 (+6.8%) | 12.3 (+36.8%) | 12.5 (+39.0%) |
| 64k | 8.24 | 8.54 (+3.6%) | 14.0 (+69.9%) | 19.4 (+135%) |
+--------+---------+-----------------+-----------------+---------------------+
* Blocks per group up to 524288 (e2fsprogs limit lifted)
+--------+---------+-----------------+-----------------+---------------------+
| Blksz | Before | +crc_flip_range | +crc_splice | +free_space_tree |
+--------+---------+-----------------+-----------------+---------------------+
| 1k | 15.0 | 14.9 (-0.7%) | 15.5 (+3.3%) | 15.6 (+4.0%) |
| 2k | 17.4 | 17.7 (+1.7%) | 17.9 (+2.9%) | 18.2 (+4.6%) |
| 4k | 16.7 | 17.3 (+3.6%) | 18.4 (+10.2%) | 18.7 (+12.0%) |
| 8k | 15.7 | 16.4 (+4.5%) | 19.1 (+21.7%) | 19.3 (+22.9%) |
| 16k | 13.5 | 15.4 (+14.1%) | 18.7 (+38.5%) | 19.0 (+40.7%) |
| 32k | 9.64 | 12.3 (+27.6%) | 17.7 (+83.5%) | 17.7 (+83.5%) |
| 64k | 2.84 | 3.17 (+11.6%) | 3.48 (+22.5%) | 19.8 (+597%) |
+--------+---------+-----------------+-----------------+---------------------+
Patch Overview
==============
* Patches 1-3 (lib/crc): Introduce crc32c_flip_range() with O(log N)
complexity using precomputed GF(2) shift matrices and nibble-indexed
lookup tables (~9.8KB, fits in L1 cache). Add kunit tests and
benchmarks.
* Patch 4: Fix incorrect free clusters accounting when allocated blocks
overlap with filesystem metadata on a corrupted filesystem.
* Patches 5-7: Extract block bitmap checksum helpers, add the
incremental update wrapper ext4_block_bitmap_csum_set_range(), and
use it in ext4_mb_mark_context().
* Patches 8-10: Extract inode bitmap checksum helpers, add
ext4_inode_bitmap_csum_set_fast(), and use it in ext4_free_inode().
* Patch 11: Fix missing bg_used_dirs_count update during fast commit
replay.
* Patches 12-13: Factor out ext4_might_init_block_bitmap() and merge
bitmap modification with GDP update under a single group lock in
ext4_mark_inode_used(), eliminating a race window.
* Patches 14-15: Rename 'ino' to 'bit' in __ext4_new_inode() for
clarity, then merge bitmap modification and GDP update under a
single group lock with incremental CRC.
* Patches 16-17: Extract ext4_update_inode_group_desc() and
ext4_get_flex_group() helpers to reduce code duplication.
Testing
=======
"kvm-xfstests -c ext4/all -g auto" has been executed with no new failures.
crc32c_flip_range() micro-benchmark on Intel Xeon (Ice Lake) with
CRC32c hardware acceleration:
bitmap: 1024 2048 4096 8192 16384 32768 65536
flip(ns): 48 53 57 63 68 73 78
full(ns): 45 88 182 357 709 1421 2853
speedup: 0.9x 1.6x 3.1x 5.6x 10.3x 19.3x 36.3x
Comments and questions are, as always, welcome.
Thanks,
Baokun
Baokun Li (17):
lib/crc: add crc32c_flip_range() for incremental CRC update
lib/crc: crc_kunit: add kunit test for crc32c_flip_range()
lib/crc: crc_kunit: add benchmark for crc32c_flip_range()
ext4: fix incorrect block bitmap free clusters update on metadata
overlap
ext4: extract block bitmap checksum get and store helpers
ext4: add ext4_block_bitmap_csum_set_range() for incremental checksum
update
ext4: use fast incremental CRC update in ext4_mb_mark_context()
ext4: extract inode bitmap checksum get and store helpers
ext4: add ext4_inode_bitmap_csum_set_fast() for incremental checksum
update
ext4: use fast incremental CRC update in ext4_free_inode()
ext4: fix missing bg_used_dirs_count update in fast commit replay
ext4: factor out ext4_might_init_block_bitmap() helper
ext4: use fast incremental CRC update in ext4_mark_inode_used()
ext4: rename ino to bit in __ext4_new_inode()
ext4: use fast incremental CRC update in __ext4_new_inode()
ext4: extract ext4_update_inode_group_desc() to reduce duplication
ext4: add ext4_get_flex_group() helper to simplify flex group lookups
fs/ext4/bitmap.c | 109 ++++++++--
fs/ext4/ext4.h | 15 +-
fs/ext4/fast_commit.c | 13 +-
fs/ext4/ialloc.c | 343 ++++++++++++++------------------
fs/ext4/mballoc.c | 28 ++-
fs/ext4/resize.c | 4 +-
fs/ext4/super.c | 4 +-
include/linux/crc32.h | 25 +++
lib/crc/.gitignore | 2 +
lib/crc/Makefile | 13 +-
lib/crc/crc32c-incr.c | 140 +++++++++++++
lib/crc/gen_crc32c_incr_table.c | 141 +++++++++++++
lib/crc/tests/crc_kunit.c | 137 +++++++++++++
13 files changed, 746 insertions(+), 228 deletions(-)
create mode 100644 lib/crc/crc32c-incr.c
create mode 100644 lib/crc/gen_crc32c_incr_table.c
--
2.43.7
^ permalink raw reply
* Re: [PATCH v4 v4 0/3] ext4: improve mballoc statistics reporting and control
From: liubaolin @ 2026-05-08 10:31 UTC (permalink / raw)
To: tytso, libaokun
Cc: ojaswin, ritesh.list, yi.zhang, linux-ext4, linux-kernel,
wangguanyu, adilger
In-Reply-To: <20260508093428.5814-1-liubaolin12138@163.com>
Dear Ted, Baokun,
I have revised the patch according to your previous comments.
There are three additional points I would like to clarify:
1.Ted, you previously suggested considering concurrency issues.
After thinking about it, I decided to use an atomic variable for
s_mb_stats to address the concurrency problem.
Using a spinlock would require much larger code changes.
2.Regarding the modification to Documentation/filesystems/proc.rst, I
previously sent an email to Jonathan, the maintainer of proc.rst:
https://lore.kernel.org/all/87ik9bmphp.fsf@trenco.lwn.net/
Jonathan replied and agreed with my idea that it is sufficient to
simply add the following note in proc.rst:
“See Documentation/admin-guide/ext4.rst for ext4-specific /proc entries.”
If you also agree, Ted, then in the future we will no longer
duplicate ext4 proc file documentation in proc.rst.
All ext4 proc file related documentation will instead be maintained
centrally in Documentation/admin-guide/ext4.rst.
3.The functionality of the sys mb_stats file will be retained,
but the following note will be added to
Documentation/ABI/testing/sysfs-fs-ext4:
“This sysfs entry is deprecated, and users should prefer
/proc/fs/ext4/<disk>/mb_stats.”
Thank you for your comments and reviews.
Thanks,
Baolin
在 2026/5/8 17:34, Baolin Liu 写道:
> This series improves ext4 mballoc statistics reporting and control.
> Compared with v3, this version adds a new patch to convert s_mb_stats
> to atomic_t and use atomic operations for its accesses, so as to avoid
> potential concurrent accesses to this variable.
>
> Patch 3 is updated according to comments from Ted and BaoKun. Writing 0
> to /proc/fs/ext4/<dev>/mb_stats disables statistics collection, writing
> 1 enables it, and writing -1 clears the current statistics and enables
> collection. The related documentation is updated accordingly, and the
> sysfs mb_stats entry is documented as deprecated in favor of the proc
> mb_stats entry.
>
> Changes since v3:
> - add a new patch to convert s_mb_stats to atomic_t and use atomic
> operations for its accesses
> - update /proc/fs/ext4/<dev>/mb_stats write semantics based on comments
> from Ted and BaoKun
> - update related documentation and document sysfs mb_stats as deprecated
>
> Baolin Liu (3):
> ext4: add blocks_allocated to mb_stats output
> ext4: use atomic operations for s_mb_stats accesses
> ext4: allow controlling mballoc stats through proc mb_stats
>
> Documentation/ABI/testing/sysfs-fs-ext4 | 3 +-
> Documentation/admin-guide/ext4.rst | 9 +++-
> Documentation/filesystems/proc.rst | 13 +-----
> fs/ext4/ext4.h | 3 +-
> fs/ext4/mballoc.c | 57 +++++++++++++++++++------
> fs/ext4/sysfs.c | 55 ++++++++++++++++++++++--
> 6 files changed, 109 insertions(+), 31 deletions(-)
>
^ permalink raw reply
* [PATCH v4 v4 2/3] ext4: use atomic operations for s_mb_stats accesses
From: Baolin Liu @ 2026-05-08 9:34 UTC (permalink / raw)
To: tytso, adilger.kernel, libaokun
Cc: ojaswin, ritesh.list, yi.zhang, linux-ext4, linux-kernel,
wangguanyu, adilger, liubaolin12138, Baolin Liu
In-Reply-To: <20260508093428.5814-1-liubaolin12138@163.com>
From: Baolin Liu <liubaolin@kylinos.cn>
s_mb_stats can be read from mballoc paths while being updated
through sysfs, which can race.
Convert it to atomic_t and use atomic_read()/atomic_set()
for all accesses.
Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
fs/ext4/ext4.h | 2 +-
fs/ext4/mballoc.c | 24 ++++++++++++------------
fs/ext4/sysfs.c | 9 ++++++++-
3 files changed, 21 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 293f698b7042..04bccfcb018e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1624,7 +1624,7 @@ struct ext4_sb_info {
unsigned int s_mb_stream_request;
unsigned int s_mb_max_to_scan;
unsigned int s_mb_min_to_scan;
- unsigned int s_mb_stats;
+ atomic_t s_mb_stats;
unsigned int s_mb_order2_reqs;
unsigned int s_mb_group_prealloc;
unsigned int s_max_dir_size_kb;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1e13ef62cb9d..95103fbc1583 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -924,7 +924,7 @@ static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac,
xa_for_each_range(xa, group, grp, start, end - 1) {
int err;
- if (sbi->s_mb_stats)
+ if (atomic_read(&sbi->s_mb_stats))
atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
err = ext4_mb_scan_group(ac, grp->bb_group);
@@ -980,7 +980,7 @@ static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac,
goto wrap_around;
}
- if (sbi->s_mb_stats)
+ if (atomic_read(&sbi->s_mb_stats))
atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
/* Increment cr and search again if no group is found */
@@ -1031,7 +1031,7 @@ static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac,
goto wrap_around;
}
- if (sbi->s_mb_stats)
+ if (atomic_read(&sbi->s_mb_stats))
atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
/*
* CR_BEST_AVAIL_LEN works based on the concept that we have
@@ -1135,7 +1135,7 @@ static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac,
/* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
- if (sbi->s_mb_stats)
+ if (atomic_read(&sbi->s_mb_stats))
atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]);
ac->ac_criteria = CR_GOAL_LEN_SLOW;
@@ -1184,7 +1184,7 @@ static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac,
ac->ac_criteria++;
/* Processed all groups and haven't found blocks */
- if (sbi->s_mb_stats && i == ngroups)
+ if (atomic_read(&sbi->s_mb_stats) && i == ngroups)
atomic64_inc(&sbi->s_bal_cX_failed[cr]);
return 0;
@@ -2535,7 +2535,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
- if (EXT4_SB(sb)->s_mb_stats)
+ if (atomic_read(&EXT4_SB(sb)->s_mb_stats))
atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
break;
@@ -2780,7 +2780,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
if (!grp)
return -EFSCORRUPTED;
- if (sbi->s_mb_stats)
+ if (atomic_read(&sbi->s_mb_stats))
atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]);
if (should_lock) {
ext4_lock_group(sb, group);
@@ -3091,7 +3091,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
}
}
- if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) {
+ if (atomic_read(&sbi->s_mb_stats) && ac->ac_status == AC_STATUS_FOUND) {
atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]);
if (ac->ac_flags & EXT4_MB_STREAM_ALLOC &&
ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group)
@@ -3204,7 +3204,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
struct ext4_sb_info *sbi = EXT4_SB(sb);
seq_puts(seq, "mballoc:\n");
- if (!sbi->s_mb_stats) {
+ if (!atomic_read(&sbi->s_mb_stats)) {
seq_puts(seq, "\tmb stats collection turned off.\n");
seq_puts(
seq,
@@ -3783,7 +3783,7 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
- sbi->s_mb_stats = MB_DEFAULT_STATS;
+ atomic_set(&sbi->s_mb_stats, MB_DEFAULT_STATS);
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
@@ -3929,7 +3929,7 @@ void ext4_mb_release(struct super_block *sb)
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
iput(sbi->s_buddy_cache);
- if (sbi->s_mb_stats) {
+ if (atomic_read(&sbi->s_mb_stats)) {
ext4_msg(sb, KERN_INFO,
"mballoc: %u blocks %u reqs (%u success)",
atomic_read(&sbi->s_bal_allocated),
@@ -4694,7 +4694,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) {
+ if (atomic_read(&sbi->s_mb_stats) && ac->ac_g_ex.fe_len >= 1) {
atomic_inc(&sbi->s_bal_reqs);
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index b87d7bdab06a..0f65ab372dee 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -250,7 +250,7 @@ EXT4_ATTR_OFFSET(mb_best_avail_max_trim_order, 0644, mb_order,
ext4_sb_info, s_mb_best_avail_max_trim_order);
EXT4_ATTR_OFFSET(err_report_sec, 0644, err_report_sec, ext4_sb_info, s_err_report_sec);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
-EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_ATTR_OFFSET(mb_stats, 0644, pointer_atomic, ext4_sb_info, s_mb_stats);
EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
@@ -493,6 +493,7 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
const char *buf, size_t len)
{
int ret;
+ int i;
unsigned int t;
unsigned long lt;
void *ptr = calc_ptr(a, sbi);
@@ -540,6 +541,12 @@ static ssize_t ext4_generic_attr_store(struct ext4_attr *a,
return ret;
*((unsigned long *) ptr) = lt;
return len;
+ case attr_pointer_atomic:
+ ret = kstrtoint(skip_spaces(buf), 0, &i);
+ if (ret)
+ return ret;
+ atomic_set((atomic_t *)ptr, i);
+ return len;
}
return 0;
}
--
2.51.0
^ permalink raw reply related
* [PATCH v4 v4 3/3] ext4: allow controlling mballoc stats through proc mb_stats
From: Baolin Liu @ 2026-05-08 9:34 UTC (permalink / raw)
To: tytso, adilger.kernel, libaokun
Cc: ojaswin, ritesh.list, yi.zhang, linux-ext4, linux-kernel,
wangguanyu, adilger, liubaolin12138, Baolin Liu
In-Reply-To: <20260508093428.5814-1-liubaolin12138@163.com>
From: Baolin Liu <liubaolin@kylinos.cn>
Make /proc/fs/ext4/<dev>/mb_stats writable. Writing 0 disables mballoc
statistics collection, writing 1 enables it, and writing -1 clears the
current statistics before enabling collection.
Update the ext4 documentation for proc mb_stats, document that the
sysfs mb_stats entry is deprecated, and point proc.rst to
Documentation/admin-guide/ext4.rst for ext4-specific /proc entries.
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>
Reviewed-by: Ted Tso <tytso@mit.edu>
Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
Documentation/ABI/testing/sysfs-fs-ext4 | 3 +-
Documentation/admin-guide/ext4.rst | 9 ++++-
Documentation/filesystems/proc.rst | 13 +------
fs/ext4/ext4.h | 1 +
fs/ext4/mballoc.c | 31 ++++++++++++++++-
fs/ext4/sysfs.c | 46 +++++++++++++++++++++++--
6 files changed, 86 insertions(+), 17 deletions(-)
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
index 2edd0a6672d3..7bf06c533343 100644
--- a/Documentation/ABI/testing/sysfs-fs-ext4
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -5,7 +5,8 @@ Description:
Controls whether the multiblock allocator should
collect statistics, which are shown during the unmount.
1 means to collect statistics, 0 means not to collect
- statistics
+ statistics. This sysfs entry is deprecated, and users
+ should prefer /proc/fs/ext4/<disk>/mb_stats.
What: /sys/fs/ext4/<disk>/mb_group_prealloc
Date: March 2008
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst
index ac0c709ea9e7..ca76e981b2aa 100644
--- a/Documentation/admin-guide/ext4.rst
+++ b/Documentation/admin-guide/ext4.rst
@@ -436,6 +436,12 @@ Files in /proc/fs/ext4/<devname>
mb_groups
details of multiblock allocator buddy cache of free blocks
+ mb_stats
+ reports runtime statistics from the multiblock allocator
+ (mballoc). Writing 0 disables statistics collection, writing
+ 1 enables statistics collection, and writing -1 clears the
+ current statistics and enables statistics collection.
+
/sys entries
============
@@ -493,7 +499,8 @@ Files in /sys/fs/ext4/<devname>:
mb_stats
Controls whether the multiblock allocator should collect statistics,
which are shown during the unmount. 1 means to collect statistics, 0
- means not to collect statistics.
+ means not to collect statistics. This sysfs entry is deprecated, and
+ users should prefer /proc/fs/ext4/<devname>/mb_stats.
mb_stream_req
Files which have fewer blocks than this tunable parameter will have
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index b0c0d1b45b99..dd487004b862 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1623,18 +1623,7 @@ softirq.
1.8 Ext4 file system parameters
-------------------------------
-Information about mounted ext4 file systems can be found in
-/proc/fs/ext4. Each mounted filesystem will have a directory in
-/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
-/proc/fs/ext4/sda9 or /proc/fs/ext4/dm-0). The files in each per-device
-directory are shown in Table 1-12, below.
-
-.. table:: Table 1-12: Files in /proc/fs/ext4/<devname>
-
- ============== ==========================================================
- File Content
- mb_groups details of multiblock allocator buddy cache of free blocks
- ============== ==========================================================
+See Documentation/admin-guide/ext4.rst for ext4-specific /proc entries.
1.9 /proc/consoles
-------------------
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 04bccfcb018e..536589dda8d1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2994,6 +2994,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
+extern void ext4_mb_stats_clear(struct ext4_sb_info *sbi);
extern int ext4_mb_init(struct super_block *);
extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 95103fbc1583..69ee737f8655 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3208,7 +3208,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
seq_puts(seq, "\tmb stats collection turned off.\n");
seq_puts(
seq,
- "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+ "\tTo enable, please write \"1\" to proc file mb_stats.\n");
return 0;
}
seq_printf(seq, "\tblocks_allocated: %u\n",
@@ -4723,6 +4723,35 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
trace_ext4_mballoc_prealloc(ac);
}
+/*
+ * ext4_mb_stats_clear - zero the mballoc statistics counters of @sbi
+ *
+ * Every counter is reset with its own atomic store; the counters are
+ * not reset as a single atomic group.
+ */
+void ext4_mb_stats_clear(struct ext4_sb_info *sbi)
+{
+	int cr;
+
+	/* Per-criteria counters */
+	for (cr = 0; cr < EXT4_MB_NUM_CRS; cr++) {
+		atomic64_set(&sbi->s_bal_cX_hits[cr], 0);
+		atomic64_set(&sbi->s_bal_cX_groups_considered[cr], 0);
+		atomic_set(&sbi->s_bal_cX_ex_scanned[cr], 0);
+		atomic64_set(&sbi->s_bal_cX_failed[cr], 0);
+	}
+
+	/* Allocation request counters */
+	atomic_set(&sbi->s_bal_reqs, 0);
+	atomic_set(&sbi->s_bal_success, 0);
+	atomic_set(&sbi->s_bal_allocated, 0);
+	atomic_set(&sbi->s_bal_groups_scanned, 0);
+	atomic_set(&sbi->s_bal_ex_scanned, 0);
+	atomic_set(&sbi->s_bal_goals, 0);
+	atomic_set(&sbi->s_bal_stream_goals, 0);
+	atomic_set(&sbi->s_bal_len_goals, 0);
+	atomic_set(&sbi->s_bal_2orders, 0);
+	atomic_set(&sbi->s_bal_breaks, 0);
+
+	/* Remaining s_mb_* counters */
+	atomic_set(&sbi->s_mb_lost_chunks, 0);
+	atomic_set(&sbi->s_mb_buddies_generated, 0);
+	atomic64_set(&sbi->s_mb_generation_time, 0);
+	atomic_set(&sbi->s_mb_preallocated, 0);
+	atomic_set(&sbi->s_mb_discarded, 0);
+}
+
/*
* Called on failure; free up any blocks from the inode PA for this
* context. We don't need this for MB_GROUP_PA because we only change
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 0f65ab372dee..86e2ae022659 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -52,6 +52,48 @@ typedef enum {
static const char proc_dirname[] = "fs/ext4";
static struct proc_dir_entry *ext4_proc_root;
+static int ext4_mb_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ext4_seq_mb_stats_show, pde_data(inode));
+}
+
+static ssize_t ext4_mb_stats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct super_block *sb = pde_data(file_inode(file));
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int val;
+ int ret;
+
+ ret = kstrtoint_from_user(buf, count, 0, &val);
+ if (ret)
+ return ret;
+
+ switch (val) {
+ case -1:
+ ext4_mb_stats_clear(sbi);
+ fallthrough;
+ case 1:
+ atomic_set(&sbi->s_mb_stats, 1);
+ break;
+ case 0:
+ atomic_set(&sbi->s_mb_stats, 0);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return count;
+}
+
+static const struct proc_ops ext4_mb_stats_proc_ops = {
+ .proc_open = ext4_mb_stats_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = ext4_mb_stats_write,
+};
+
struct ext4_attr {
struct attribute attr;
short attr_id;
@@ -637,8 +679,8 @@ int ext4_register_sysfs(struct super_block *sb)
ext4_fc_info_show, sb);
proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_ops, sb);
- proc_create_single_data("mb_stats", 0444, sbi->s_proc,
- ext4_seq_mb_stats_show, sb);
+ proc_create_data("mb_stats", 0644, sbi->s_proc,
+ &ext4_mb_stats_proc_ops, sb);
proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc,
&ext4_mb_seq_structs_summary_ops, sb);
}
--
2.51.0
^ permalink raw reply related
* [PATCH v4 1/3] ext4: add blocks_allocated to mb_stats output
From: Baolin Liu @ 2026-05-08 9:34 UTC (permalink / raw)
To: tytso, adilger.kernel, libaokun
Cc: ojaswin, ritesh.list, yi.zhang, linux-ext4, linux-kernel,
wangguanyu, adilger, liubaolin12138, Baolin Liu
In-Reply-To: <20260508093428.5814-1-liubaolin12138@163.com>
From: Baolin Liu <liubaolin@kylinos.cn>
Add blocks_allocated to /proc/fs/ext4/<dev>/mb_stats so that the
reported statistics match the mballoc summary printed at unmount time.
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
fs/ext4/mballoc.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 20e9fdaf4301..1e13ef62cb9d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3211,6 +3211,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
"\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
return 0;
}
+ seq_printf(seq, "\tblocks_allocated: %u\n",
+ atomic_read(&sbi->s_bal_allocated));
seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
--
2.51.0
^ permalink raw reply related
* [PATCH v4 0/3] ext4: improve mballoc statistics reporting and control
From: Baolin Liu @ 2026-05-08 9:34 UTC (permalink / raw)
To: tytso, adilger.kernel, libaokun
Cc: ojaswin, ritesh.list, yi.zhang, linux-ext4, linux-kernel,
wangguanyu, adilger, liubaolin12138
This series improves ext4 mballoc statistics reporting and control.
Compared with v3, this version adds a new patch to convert s_mb_stats
to atomic_t and use atomic operations for its accesses, so as to avoid
potential concurrent accesses to this variable.
Patch 3 is updated according to comments from Ted and BaoKun. Writing 0
to /proc/fs/ext4/<dev>/mb_stats disables statistics collection, writing
1 enables it, and writing -1 clears the current statistics and enables
collection. The related documentation is updated accordingly, and the
sysfs mb_stats entry is documented as deprecated in favor of the proc
mb_stats entry.
Changes since v3:
- add a new patch to convert s_mb_stats to atomic_t and use atomic
operations for its accesses
- update /proc/fs/ext4/<dev>/mb_stats write semantics based on comments
from Ted and BaoKun
- update related documentation and document sysfs mb_stats as deprecated
Baolin Liu (3):
ext4: add blocks_allocated to mb_stats output
ext4: use atomic operations for s_mb_stats accesses
ext4: allow controlling mballoc stats through proc mb_stats
Documentation/ABI/testing/sysfs-fs-ext4 | 3 +-
Documentation/admin-guide/ext4.rst | 9 +++-
Documentation/filesystems/proc.rst | 13 +-----
fs/ext4/ext4.h | 3 +-
fs/ext4/mballoc.c | 57 +++++++++++++++++++------
fs/ext4/sysfs.c | 55 ++++++++++++++++++++++--
6 files changed, 109 insertions(+), 31 deletions(-)
--
2.51.0
^ permalink raw reply
* Re: [PATCH v3 03/22] ext4: simplify error handling in ext4_setattr()
From: Zhang Yi @ 2026-05-08 8:48 UTC (permalink / raw)
To: Jan Kara
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
libaokun, ojaswin, ritesh.list, djwong, hch, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <yueuhejhnqlffyjth5u56japyrgeoa74pasp5zydwrxgbnfrco@iv3gpx33esvu>
On 4/30/2026 9:03 PM, Jan Kara wrote:
> On Wed 22-04-26 10:10:23, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Remove the redundant rc variable and consolidate error handling.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>
> One comment below. Otherwise the changes look good.
>
>> @@ -6073,8 +6073,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
>>
>> filemap_invalidate_lock(inode->i_mapping);
>>
>> - rc = ext4_break_layouts(inode);
>> - if (rc) {
>> + error = ext4_break_layouts(inode);
>> + if (error) {
>
> This is wrong. Errors from ext4_break_layouts() just need to be returned
> but they shouldn't be logged with ext4_std_error().
Yeah, I got it, will fix.
Thanks,
Yi.
>
>> filemap_invalidate_unlock(inode->i_mapping);
>> goto err_out;
>> }
>
> Honza
^ permalink raw reply
* Re: [PATCH v3 02/22] ext4: factor out ext4_truncate_[up|down]()
From: Zhang Yi @ 2026-05-08 8:47 UTC (permalink / raw)
To: Jan Kara
Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
libaokun, ojaswin, ritesh.list, djwong, hch, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <ucwp5hqcp6yghqbvpnmfsznzzuw6g7uerajm5x2zqtcyvsv33f@5ppt5hcffnsf>
On 4/30/2026 8:55 PM, Jan Kara wrote:
> On Wed 22-04-26 10:10:22, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Refactor ext4_setattr() by introducing two helper functions,
>> ext4_truncate_up() and ext4_truncate_down(), to handle size changes. The
>> current ATTR_SIZE processing consolidates checks for both shrinking and
>> non-shrinking cases, leading to cluttered code. Separating the
>> truncation paths improves readability.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>
> Looks good. Just a few nits below.
Thank you for reviewing this series!
>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 94283a991e5c..9e4353432325 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -3501,6 +3501,23 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
>> return changed;
>> }
>>
>> +/*
>> + * Set i_size and i_disksize to 'newsize'.
>> + *
>> + * Both i_rwsem and i_data_sem are required here to avoid races between
>> + * generic append writeback and concurrent truncate that also modify
>> + * i_size and i_disksize.
>> + */
>> +static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
>> +{
>> + WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));
>> +
>> + down_write(&EXT4_I(inode)->i_data_sem);
>> + i_size_write(inode, newsize);
>> + EXT4_I(inode)->i_disksize = newsize;
>> + up_write(&EXT4_I(inode)->i_data_sem);
>> +}
>> +
>
> Do we need this in the header later or can we keep it local to inode.c?
In the current version of patch 21, this helper is called by
ext4_collapse_range() and ext4_insert_range(). However, after analyzing
sashiko's review comments, I believe this is unnecessary, so I will move
it to inode.c in my next iteration.
>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 0751dc55e94f..5e913aca6499 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -5855,6 +5855,83 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
>> }
>> }
>>
>> +static int ext4_truncate_up(struct inode *inode, loff_t oldsize, loff_t newsize)
>> +{
>> + ext4_lblk_t old_lblk, new_lblk;
>> + handle_t *handle;
>> + int ret;
>> +
>> + if (!IS_ALIGNED(oldsize | newsize, i_blocksize(inode))) {
>> + ret = ext4_inode_attach_jinode(inode);
>> + if (ret)
>> + return ret;
>> + }
>> +
>> + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
>> + if (oldsize & (i_blocksize(inode) - 1)) {
>
> When you transitioned to IS_ALIGNED above, use it here as well?
Ha, right, I missed it.
>
>> + ret = ext4_block_zero_eof(inode, oldsize, LLONG_MAX);
>> + if (ret)
>> + return ret;
>> + }
>
> ...
>
>> + if (attr->ia_size > oldsize)
>> + error = ext4_truncate_up(inode, oldsize, attr->ia_size);
>> + else if (shrink)
>> + error = ext4_truncate_down(inode, oldsize,
>> + attr->ia_size, &orphan);
>> + if (error)
>> + goto out_mmap_sem;
>>
>> /*
>> * Truncate pagecache after we've waited for commit
>
> Hum, why not move the truncate_pagecache() call and ext4_truncate() call
> into ext4_truncate_down()? They are not needed in the truncate up case...
Yeah, agreed. These two functions are also needed in cases where
the file size remains unchanged. We can move them into ext4_truncate_down()
and extend it to handle the unchanged-file-size scenario.
Thanks,
Yi.
>
> Honza
>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox