* [PATCH v3 08/22] ext4: implement buffered write path using iomap
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Introduce two new iomap_ops instances, ext4_iomap_buffered_write_ops and
ext4_iomap_buffered_da_write_ops, to implement the iomap write paths for
ext4. ext4_iomap_buffered_da_write_begin() invokes ext4_da_map_blocks()
to map delayed allocation extents, and ext4_iomap_buffered_write_begin()
invokes ext4_iomap_get_blocks() to directly allocate blocks in
non-delayed allocation mode. Additionally, add ext4_iomap_valid() so
that the iomap infrastructure can check whether a cached extent mapping
is still valid.
Key changes:
- Since we don't use data=ordered mode to prevent exposing stale data
in the non-delayed allocation path, we always allocate unwritten
extents for new blocks.
- The iomap write path maps multiple blocks at a time in the
iomap_begin() callbacks, so we must remove the stale delayed
allocation range in case of short writes and write failures.
Otherwise, this could result in a range of delayed extents being
covered by a clean folio, which would lead to inaccurate space
reservation.
- The lock ordering of the folio lock and transaction start is the
opposite of that in the buffer_head buffered write path. So we have
to stop the journal handle in the iomap_begin() callbacks. The lock
ordering documentation in super.c has been updated accordingly.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 4 ++
fs/ext4/file.c | 20 +++++-
fs/ext4/inode.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++-
fs/ext4/super.c | 10 ++-
4 files changed, 191 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fe3491ad2129..be92ff648362 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3057,6 +3057,7 @@ int ext4_walk_page_buffers(handle_t *handle,
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
void ext4_set_inode_mapping_order(struct inode *inode);
+int ext4_nonda_switch(struct super_block *sb);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3943,6 +3944,9 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
+extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
+extern const struct iomap_write_ops ext4_iomap_write_ops;
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index eb1a323962b1..7f9bfbbc4a4e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -299,6 +299,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return count;
}
+static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ const struct iomap_ops *iomap_ops;
+
+ if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+ iomap_ops = &ext4_iomap_buffered_da_write_ops;
+ else
+ iomap_ops = &ext4_iomap_buffered_write_ops;
+
+ return iomap_file_buffered_write(iocb, from, iomap_ops,
+ &ext4_iomap_write_ops, NULL);
+}
+
static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
@@ -313,7 +328,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
- ret = generic_perform_write(iocb, from);
+ if (ext4_inode_buffered_iomap(inode))
+ ret = ext4_iomap_buffered_write(iocb, from);
+ else
+ ret = generic_perform_write(iocb, from);
out:
inode_unlock(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5ffd6aeb3485..0ca303a90249 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3097,7 +3097,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
return ret;
}
-static int ext4_nonda_switch(struct super_block *sb)
+int ext4_nonda_switch(struct super_block *sb)
{
s64 free_clusters, dirty_clusters;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3467,6 +3467,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
}
+static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
+{
+ return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
+}
+
+const struct iomap_write_ops ext4_iomap_write_ops = {
+ .iomap_valid = ext4_iomap_valid,
+};
+
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
struct ext4_map_blocks *map, loff_t offset,
loff_t length, unsigned int flags)
@@ -3501,6 +3510,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
iomap->flags |= IOMAP_F_MERGED;
+ iomap->validity_cookie = map->m_seq;
+
/*
* Flags passed to ext4_map_blocks() for direct I/O writes can result
* in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3908,8 +3919,12 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
+/* Map blocks */
+typedef int (ext4_get_blocks_t)(struct inode *, struct ext4_map_blocks *);
+
static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
- loff_t length, struct ext4_map_blocks *map)
+ loff_t length, ext4_get_blocks_t get_blocks,
+ struct ext4_map_blocks *map)
{
u8 blkbits = inode->i_blkbits;
@@ -3921,6 +3936,9 @@ static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
+ if (get_blocks)
+ return get_blocks(inode, map);
+
return ext4_map_blocks(NULL, inode, map, 0);
}
@@ -3938,7 +3956,7 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
return -ERANGE;
- ret = ext4_iomap_map_blocks(inode, offset, length, &map);
+ ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
if (ret < 0)
return ret;
@@ -3946,6 +3964,146 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
return 0;
}
+static int ext4_iomap_get_blocks(struct inode *inode,
+ struct ext4_map_blocks *map)
+{
+ loff_t i_size = i_size_read(inode);
+ handle_t *handle;
+ int ret, needed_blocks;
+
+ /*
+ * Check if the blocks have already been allocated, this could
+ * avoid initiating a new journal transaction and return the
+ * mapping information directly.
+ */
+ if ((map->m_lblk + map->m_len) <=
+ round_up(i_size, i_blocksize(inode)) >> inode->i_blkbits) {
+ ret = ext4_map_blocks(NULL, inode, map, 0);
+ if (ret < 0)
+ return ret;
+ if (map->m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN |
+ EXT4_MAP_DELAYED))
+ return 0;
+ }
+
+ /*
+ * Reserve one block more for addition to orphan list in case
+ * we allocate blocks but write fails for some reason.
+ */
+ needed_blocks = ext4_chunk_trans_blocks(inode, map->m_len) + 1;
+ handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+ /*
+ * Stop handle here following the lock ordering of the folio lock
+ * and the transaction start.
+ */
+ ext4_journal_stop(handle);
+
+ return ret;
+}
+
+static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap, bool delalloc)
+{
+ int ret, retries = 0;
+ struct ext4_map_blocks map;
+ ext4_get_blocks_t *get_blocks;
+
+ ret = ext4_emergency_state(inode->i_sb);
+ if (unlikely(ret))
+ return ret;
+
+ /* Inline data support is not yet available. */
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
+ if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
+ return -EINVAL;
+
+ if (delalloc)
+ get_blocks = ext4_da_map_blocks;
+ else
+ get_blocks = ext4_iomap_get_blocks;
+retry:
+ ret = ext4_iomap_map_blocks(inode, offset, length, get_blocks, &map);
+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+ goto retry;
+ if (ret < 0)
+ return ret;
+
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+ return 0;
+}
+
+static int ext4_iomap_buffered_write_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
+ iomap, srcmap, false);
+}
+
+static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
+ loff_t offset, loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
+ iomap, srcmap, true);
+}
+
+/*
+ * Drop the staled delayed allocation range from the write failure,
+ * including both start and end blocks. If not, we could leave a range
+ * of delayed extents covered by a clean folio, it could lead to
+ * inaccurate space reservation.
+ */
+static void ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
+ loff_t length, struct iomap *iomap)
+{
+ down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
+ DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
+ up_write(&EXT4_I(inode)->i_data_sem);
+}
+
+static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
+ loff_t length, ssize_t written,
+ unsigned int flags,
+ struct iomap *iomap)
+{
+ loff_t start_byte, end_byte;
+
+ /* If we didn't reserve the blocks, we're not allowed to punch them. */
+ if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
+ return 0;
+
+ /* Nothing to do if we've written the entire delalloc extent */
+ start_byte = iomap_last_written_block(inode, offset, written);
+ end_byte = round_up(offset + length, i_blocksize(inode));
+ if (start_byte >= end_byte)
+ return 0;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
+ iomap, ext4_iomap_punch_delalloc);
+ filemap_invalidate_unlock(inode->i_mapping);
+ return 0;
+}
+
+
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_write_begin,
+};
+
+const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
+ .iomap_begin = ext4_iomap_buffered_da_write_begin,
+ .iomap_end = ext4_iomap_buffered_da_write_end,
+};
+
const struct iomap_ops ext4_iomap_buffered_read_ops = {
.iomap_begin = ext4_iomap_buffered_read_begin,
};
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..9bc294b769db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -104,9 +104,13 @@ static const struct fs_parameter_spec ext4_param_specs[];
* -> page lock -> i_data_sem (rw)
*
* buffered write path:
- * sb_start_write -> i_mutex -> mmap_lock
- * sb_start_write -> i_mutex -> transaction start -> page lock ->
- * i_data_sem (rw)
+ * sb_start_write -> i_rwsem (w) -> mmap_lock
+ * - buffer_head path:
+ * sb_start_write -> i_rwsem (w) -> transaction start -> folio lock ->
+ * i_data_sem (rw)
+ * - iomap path:
+ * sb_start_write -> i_rwsem (w) -> transaction start -> i_data_sem (rw)
+ * sb_start_write -> i_rwsem (w) -> folio lock (not under an active handle)
*
* truncate:
* sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
--
2.52.0
^ permalink raw reply related
* [PATCH v3 05/22] ext4: implement buffered read path using iomap
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Implement the iomap read path for ext4 by introducing a new
ext4_iomap_buffered_read_ops instance. This provides the read_folio()
and readahead() callbacks for ext4_iomap_aops. The implementation
introduces:
- ext4_iomap_map_blocks(): Helper function to query extent mappings for
a given read range using ext4_map_blocks() and convert the mapping
information to iomap type
- ext4_iomap_buffered_read_begin(): The iomap_begin callback that maps
blocks, validates filesystem state, and populates the iomap. It
returns -ERANGE for inline data which is not yet supported.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 44 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9e9f421888ed..df21f6870ec4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3908,14 +3908,57 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
+static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
+ loff_t length, struct ext4_map_blocks *map)
+{
+ u8 blkbits = inode->i_blkbits;
+
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+
+ /* Calculate the first and last logical blocks respectively. */
+ map->m_lblk = offset >> blkbits;
+ map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
+
+ return ext4_map_blocks(NULL, inode, map, 0);
+}
+
+static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags, struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct ext4_map_blocks map;
+ int ret;
+
+ if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ /* Inline data support is not yet available. */
+ if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+ return -ERANGE;
+
+ ret = ext4_iomap_map_blocks(inode, offset, length, &map);
+ if (ret < 0)
+ return ret;
+
+ ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+ return 0;
+}
+
+const struct iomap_ops ext4_iomap_buffered_read_ops = {
+ .iomap_begin = ext4_iomap_buffered_read_begin,
+};
+
static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
{
+ iomap_bio_read_folio(folio, &ext4_iomap_buffered_read_ops);
return 0;
}
static void ext4_iomap_readahead(struct readahead_control *rac)
{
-
+ iomap_bio_readahead(rac, &ext4_iomap_buffered_read_ops);
}
static int ext4_iomap_writepages(struct address_space *mapping,
--
2.52.0
^ permalink raw reply related
* [PATCH v3 07/22] ext4: do not use data=ordered mode for inodes using buffered iomap path
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Do not use data=ordered mode for inodes using the buffered iomap path.
There are two reasons:
1. The lock ordering of the folio lock and starting transactions
conflicts with the data=ordered mode. In the writeback path of the
iomap, it processes each folio one by one. It first holds the folio
lock and then starts a transaction to create the block mapping. In
the data=ordered mode, if we perform writeback through the journal
commit process, it may try to acquire the folio lock of a folio
already locked by iomap, and the iomap could start a new transaction
under this folio lock, which may also wait for the current committing
transaction to finish, finally triggering a deadlock.
2. The iomap writeback path doesn't support partial folio submission. In
the data=ordered mode, when the journal process is waiting for a
folio to be written back, and the folio may also contain unmapped
blocks with a block size smaller than the folio size, if the regular
writeback process has already started committing this folio (and set
the writeback flag), then a deadlock may occur while mapping the
remaining unmapped blocks. This is because the writeback flag is
cleared only after the entire folio is processed and committed.
To support the data=ordered mode, we need to modify the iomap
infrastructure by grabbing the transaction handle before we lock any
folio for writeback. In addition, we need to add support for submitting
partial folios, which is complicated and tricky, and may also cause
performance regressions. Therefore, we need to get rid of the
data=ordered mode when doing the conversion.
Currently, there are three scenarios where the data=ordered mode is used:
- Append write
- Post-EOF partial block truncate up and append write
- Online defragmentation
For append write, we can get rid of it by always allocating unwritten
blocks, which retains the behavior of the current extents-type inode. For
post-EOF partial block truncate up and append write, we can get rid of
it by postponing the i_disksize update until after the zeroed partial
block is written back. Online defragmentation is not yet supported for
these inodes; we can find other solutions later.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4_jbd2.h | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 63d17c5201b5..26999f173870 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -383,7 +383,12 @@ static inline int ext4_should_journal_data(struct inode *inode)
static inline int ext4_should_order_data(struct inode *inode)
{
- return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
+ /*
+ * inodes using the iomap buffered I/O path do not use the
+ * data=ordered mode.
+ */
+ return !ext4_inode_buffered_iomap(inode) &&
+ (ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE);
}
static inline int ext4_should_writeback_data(struct inode *inode)
--
2.52.0
^ permalink raw reply related
* [PATCH v3 00/22] ext4: use iomap for regular file's buffered I/O path
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
From: Zhang Yi <yi.zhang@huawei.com>
This series adds the iomap buffered I/O path support for regular files,
based on the latest upstream kernel. It implements the core iomap APIs
on ext4 and introduces the 'buffered_iomap' mount option to enable the
iomap buffered I/O path. It supports default features, default mount
options and the bigalloc feature. However, it does not support online
defragmentation, inline data, fsverity, fscrypt, non-extent inodes, or
data=journal mode; it will fall back to the buffer_head I/O path
automatically if these features or options are used.
This iomap buffered I/O path is not enabled by default because the
preceding features are not supported. Users can explicitly enable or
disable it via 'buffered_iomap' and 'nobuffered_iomap' mount options.
Key notes
=========
1. Lock ordering difference
The lock ordering of folio lock and transaction start in the iomap
path is the opposite of that in the buffer_head path.
2. data=ordered mode is not used
Two main reasons:
a) The lock ordering of folio lock and transaction start for
data=ordered mode is opposite to the iomap path, which would cause
a deadlock.
b) The iomap writeback path does not support partial folio submission
(required by data=ordered mode when block size < folio size, and
it is currently handled by ext4_bio_write_folio()), which would
also cause a deadlock.
To replace data=ordered mode functionality:
- For append write: Always allocate unwritten extents (dioread_nolock
behavior) to prevent stale data exposure.
- For post-EOF partial block zeroing: Issue zeroing I/O immediately
and wait for completion before updating i_disksize. On ordered I/O
completion, set i_disksize = i_size to avoid lost updates in the
truncate up case. (Jan suggested).
- For online defragmentation: Not supported yet, needs further
consideration.
3. Always enable dioread_nolock
Two main reasons:
a) Since data=ordered mode cannot be used, allocating written blocks
directly would expose stale data.
b) To optimize writeback, we should allocate blocks based on writeback
length rather than per-folio mapping. Direct written allocation
would over-allocate blocks.
dioread_nolock has been the default mount option for many years, and
Jan pointed out that we may no longer need to disable it, so we can
gradually remove this mount option in the future.
Series structure
================
- Patch 01-03: Simplify truncate operations and prepare for conversion.
- Patch 04-18: Implement core iomap buffered read/write, writeback,
mmap, and partial block zeroing paths.
- Patch 19-22: Handle ordered I/O for zeroing post-EOF partial block.
Testing and Performance
=======================
Tested with xfstests-bld using -g auto, fast_commit, and 64k
configurations. No new regressions were observed.
For the special case of zeroing the post-EOF partial block, I added a new
test, generic/790, to cover this scenario.
https://lore.kernel.org/fstests/20260422015246.4132376-1-yi.zhang@huaweicloud.com/
Performance tested with fio on a 150 GB memory-backed virtual machine
(not much difference compared to v2, so the numbers are not updated):
Buffered write (MiB/s)
===
bs write cache uncached write
bh iomap bh iomap
1k 423 403 36.3 57
4k 1067 1093 58.4 61
64k 4321 6488 869 1206
1M 4640 7378 3158 4818
Buffered read (MiB/s)
===
bs read hole read pre-cache read ondisk data
bh iomap bh iomap bh iomap
1k 635 643 661 653 605 602
4k 1987 2075 2128 2159 1761 1716
64k 6068 6267 9472 9545 4475 4451
1M 5471 6072 8657 9191 4405 4467
Large I/O write performance improved by approximately 30% to 50%.
Read performance showed no significant difference.
Changes since v2:
- Rebased on the latest upstream kernel (7.1-rc1).
- Added patches 01-03 to simplify truncate operations.
- Added patch 13 to fix incorrect did_zero parameter in
iomap_zero_range().
- Added patches 19-22 to handle ordered I/O for zeroing post-EOF
partial block.
- Minor code and comment optimizations.
Changes since v1:
- Rebase this series on linux-next 20260122.
- Refactor partial block zero range, stop passing handle to
ext4_block_truncate_page() and ext4_zero_partial_blocks(), and move
partial block zeroing operation outside an active journal transaction
to prevent potential deadlocks because of the lock ordering of folio
and transaction start.
- Clarify the lock ordering of folio lock and transaction start, update
the comments accordingly.
- Fix some issues related to fast commit and polluted post-EOF folios.
- Some minor code and comment optimizations.
v2: https://lore.kernel.org/linux-ext4/20260203062523.3869120-1-yi.zhang@huawei.com/
v1: https://lore.kernel.org/linux-ext4/20241022111059.2566137-1-yi.zhang@huaweicloud.com/
RFC v4: https://lore.kernel.org/linux-ext4/20240410142948.2817554-1-yi.zhang@huaweicloud.com/
RFC v3: https://lore.kernel.org/linux-ext4/20240127015825.1608160-1-yi.zhang@huaweicloud.com/
RFC v2: https://lore.kernel.org/linux-ext4/20240102123918.799062-1-yi.zhang@huaweicloud.com/
RFC v1: https://lore.kernel.org/linux-ext4/20231123125121.4064694-1-yi.zhang@huaweicloud.com/
Comments and suggestions are welcome!
Thanks,
Yi.
Zhang Yi (22):
ext4: simplify size updating in ext4_setattr()
ext4: factor out ext4_truncate_[up|down]()
ext4: simplify error handling in ext4_setattr()
ext4: add iomap address space operations for buffered I/O
ext4: implement buffered read path using iomap
ext4: pass out extent seq counter when mapping da blocks
ext4: do not use data=ordered mode for inodes using buffered iomap
path
ext4: implement buffered write path using iomap
ext4: implement writeback path using iomap
ext4: implement mmap path using iomap
iomap: correct the range of a partial dirty clear
iomap: support invalidating partial folios
iomap: fix incorrect did_zero setting in iomap_zero_iter()
ext4: implement partial block zero range path using iomap
ext4: add block mapping tracepoints for iomap buffered I/O path
ext4: disable online defrag when inode using iomap buffered I/O path
ext4: partially enable iomap for the buffered I/O path of regular
files
ext4: introduce a mount option for iomap buffered I/O path
ext4: submit zeroed post-EOF data immediately in the iomap buffered
I/O path
ext4: wait for ordered I/O in the iomap buffered I/O path
ext4: update i_disksize to i_size on ordered I/O completion
ext4: add tracepoints for ordered I/O in the iomap buffered I/O path
fs/ext4/ext4.h | 73 ++-
fs/ext4/ext4_jbd2.c | 1 +
fs/ext4/ext4_jbd2.h | 7 +-
fs/ext4/extents.c | 9 +-
fs/ext4/file.c | 20 +-
fs/ext4/ialloc.c | 1 +
fs/ext4/inode.c | 911 +++++++++++++++++++++++++++++++-----
fs/ext4/move_extent.c | 11 +
fs/ext4/page-io.c | 203 ++++++++
fs/ext4/super.c | 55 ++-
fs/iomap/buffered-io.c | 20 +-
include/trace/events/ext4.h | 142 ++++++
12 files changed, 1313 insertions(+), 140 deletions(-)
--
2.52.0
^ permalink raw reply
* [PATCH v3 02/22] ext4: factor out ext4_truncate_[up|down]()
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Refactor ext4_setattr() by introducing two helper functions,
ext4_truncate_up() and ext4_truncate_down(), to handle size changes. The
current ATTR_SIZE processing consolidates checks for both shrinking and
non-shrinking cases, leading to cluttered code. Separating the
truncation paths improves readability.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 17 ++++++
fs/ext4/inode.c | 157 ++++++++++++++++++++++++++----------------------
2 files changed, 101 insertions(+), 73 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..9e4353432325 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3501,6 +3501,23 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
return changed;
}
+/*
+ * Set i_size and i_disksize to 'newsize'.
+ *
+ * Both i_rwsem and i_data_sem are required here to avoid races between
+ * generic append writeback and concurrent truncate that also modify
+ * i_size and i_disksize.
+ */
+static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
+{
+ WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));
+
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size_write(inode, newsize);
+ EXT4_I(inode)->i_disksize = newsize;
+ up_write(&EXT4_I(inode)->i_data_sem);
+}
+
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
loff_t len);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0751dc55e94f..5e913aca6499 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5855,6 +5855,83 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
}
}
+static int ext4_truncate_up(struct inode *inode, loff_t oldsize, loff_t newsize)
+{
+ ext4_lblk_t old_lblk, new_lblk;
+ handle_t *handle;
+ int ret;
+
+ if (!IS_ALIGNED(oldsize | newsize, i_blocksize(inode))) {
+ ret = ext4_inode_attach_jinode(inode);
+ if (ret)
+ return ret;
+ }
+
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ if (oldsize & (i_blocksize(inode) - 1)) {
+ ret = ext4_block_zero_eof(inode, oldsize, LLONG_MAX);
+ if (ret)
+ return ret;
+ }
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ old_lblk = oldsize > 0 ? (oldsize - 1) >> inode->i_blkbits : 0;
+ new_lblk = newsize > 0 ? (newsize - 1) >> inode->i_blkbits : 0;
+ ext4_fc_track_range(handle, inode, old_lblk, new_lblk);
+
+ ext4_set_inode_size(inode, newsize);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ if (ret)
+ return ret;
+ /*
+ * isize extend must be called outside an active handle due to
+ * the lock ordering of transaction start and folio lock in the
+ * iomap buffered I/O path (folio lock -> transaction start).
+ */
+ pagecache_isize_extended(inode, oldsize, newsize);
+ return 0;
+}
+
+static int ext4_truncate_down(struct inode *inode, loff_t oldsize,
+ loff_t newsize, int *orphan)
+{
+ ext4_lblk_t start_lblk;
+ handle_t *handle;
+ int ret;
+
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (ext4_handle_valid(handle)) {
+ ret = ext4_orphan_add(handle, inode);
+ *orphan = 1;
+ if (ret) {
+ ext4_journal_stop(handle);
+ return ret;
+ }
+ }
+
+ start_lblk = newsize > 0 ? (newsize - 1) >> inode->i_blkbits : 0;
+ ext4_fc_track_range(handle, inode, start_lblk, EXT_MAX_BLOCKS - 1);
+
+ ext4_set_inode_size(inode, newsize);
+
+ ret = ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
+ if (ret)
+ return ret;
+
+ if (ext4_should_journal_data(inode))
+ ext4_wait_for_tail_page_commit(inode);
+ return 0;
+}
+
/*
* ext4_setattr()
*
@@ -5951,7 +6028,6 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_valid & ATTR_SIZE) {
- handle_t *handle;
loff_t oldsize = inode->i_size;
int shrink = (attr->ia_size < inode->i_size);
@@ -6003,78 +6079,13 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto err_out;
}
- if (attr->ia_size != inode->i_size) {
- /* attach jbd2 jinode for EOF folio tail zeroing */
- if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
- oldsize & (inode->i_sb->s_blocksize - 1)) {
- error = ext4_inode_attach_jinode(inode);
- if (error)
- goto out_mmap_sem;
- }
-
- /*
- * Update c/mtime and tail zero the EOF folio on
- * truncate up. ext4_truncate() handles the shrink case
- * below.
- */
- if (!shrink) {
- inode_set_mtime_to_ts(inode,
- inode_set_ctime_current(inode));
- if (oldsize & (inode->i_sb->s_blocksize - 1)) {
- error = ext4_block_zero_eof(inode,
- oldsize, LLONG_MAX);
- if (error)
- goto out_mmap_sem;
- }
- }
-
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto out_mmap_sem;
- }
- if (ext4_handle_valid(handle) && shrink) {
- error = ext4_orphan_add(handle, inode);
- orphan = 1;
- if (error)
- goto out_handle;
- }
-
- if (shrink)
- ext4_fc_track_range(handle, inode,
- (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
- inode->i_sb->s_blocksize_bits,
- EXT_MAX_BLOCKS - 1);
- else
- ext4_fc_track_range(
- handle, inode,
- (oldsize > 0 ? oldsize - 1 : oldsize) >>
- inode->i_sb->s_blocksize_bits,
- (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
- inode->i_sb->s_blocksize_bits);
-
- /*
- * We have to update i_size under i_data_sem together
- * with i_disksize to avoid races with writeback code
- * updating disksize in mpage_map_and_submit_extent().
- */
- down_write(&EXT4_I(inode)->i_data_sem);
- i_size_write(inode, attr->ia_size);
- EXT4_I(inode)->i_disksize = attr->ia_size;
- up_write(&EXT4_I(inode)->i_data_sem);
-
- error = ext4_mark_inode_dirty(handle, inode);
-out_handle:
- ext4_journal_stop(handle);
- if (error)
- goto out_mmap_sem;
- if (!shrink) {
- pagecache_isize_extended(inode, oldsize,
- inode->i_size);
- } else if (ext4_should_journal_data(inode)) {
- ext4_wait_for_tail_page_commit(inode);
- }
- }
+ if (attr->ia_size > oldsize)
+ error = ext4_truncate_up(inode, oldsize, attr->ia_size);
+ else if (shrink)
+ error = ext4_truncate_down(inode, oldsize,
+ attr->ia_size, &orphan);
+ if (error)
+ goto out_mmap_sem;
/*
* Truncate pagecache after we've waited for commit
--
2.52.0
^ permalink raw reply related
* [PATCH v3 11/22] iomap: correct the range of a partial dirty clear
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
The block range calculation in ifs_clear_range_dirty() is incorrect when
partially clearing a range in a folio. We cannot clear the dirty bit of
the first block or the last block if the start or end offset is not
blocksize-aligned. This has not yet caused any issues since we always
clear a whole folio in iomap_writeback_folio().
Fix this by rounding up the first block to blocksize alignment and
calculating the last block by rounding down (using truncation). Correct
the nr_blks calculation accordingly.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
This is modified from:
https://lore.kernel.org/linux-fsdevel/20240812121159.3775074-2-yi.zhang@huaweicloud.com/
Changes:
- Use round_up() instead of DIV_ROUND_UP() to prevent wasted integer
division.
fs/iomap/buffered-io.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d7b648421a70..7e7d5b776d35 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -176,11 +176,15 @@ static void ifs_clear_range_dirty(struct folio *folio,
{
struct inode *inode = folio->mapping->host;
unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
- unsigned int first_blk = (off >> inode->i_blkbits);
- unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
- unsigned int nr_blks = last_blk - first_blk + 1;
+ unsigned int first_blk = round_up(off, i_blocksize(inode)) >>
+ inode->i_blkbits;
+ unsigned int last_blk = (off + len) >> inode->i_blkbits;
+ unsigned int nr_blks = last_blk - first_blk;
unsigned long flags;
+ if (!nr_blks)
+ return;
+
spin_lock_irqsave(&ifs->state_lock, flags);
bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
spin_unlock_irqrestore(&ifs->state_lock, flags);
--
2.52.0
^ permalink raw reply related
* [PATCH v3 01/22] ext4: simplify size updating in ext4_setattr()
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
The logic for updating the file size in ext4_setattr() is currently
somewhat messy. By directly entering the error-handling path after
failing to add an orphan inode, the unnecessary recovery process
involving old_disksize and the file size can be avoided.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 22 +++++++++-------------
1 file changed, 9 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..0751dc55e94f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5953,7 +5953,6 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
loff_t oldsize = inode->i_size;
- loff_t old_disksize;
int shrink = (attr->ia_size < inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
@@ -6037,6 +6036,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (ext4_handle_valid(handle) && shrink) {
error = ext4_orphan_add(handle, inode);
orphan = 1;
+ if (error)
+ goto out_handle;
}
if (shrink)
@@ -6052,23 +6053,18 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
inode->i_sb->s_blocksize_bits);
- down_write(&EXT4_I(inode)->i_data_sem);
- old_disksize = EXT4_I(inode)->i_disksize;
- EXT4_I(inode)->i_disksize = attr->ia_size;
-
/*
* We have to update i_size under i_data_sem together
* with i_disksize to avoid races with writeback code
- * running ext4_wb_update_i_disksize().
+ * updating disksize in mpage_map_and_submit_extent().
*/
- if (!error)
- i_size_write(inode, attr->ia_size);
- else
- EXT4_I(inode)->i_disksize = old_disksize;
+ down_write(&EXT4_I(inode)->i_data_sem);
+ i_size_write(inode, attr->ia_size);
+ EXT4_I(inode)->i_disksize = attr->ia_size;
up_write(&EXT4_I(inode)->i_data_sem);
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+
+ error = ext4_mark_inode_dirty(handle, inode);
+out_handle:
ext4_journal_stop(handle);
if (error)
goto out_mmap_sem;
--
2.52.0
^ permalink raw reply related
* [PATCH v3 03/22] ext4: simplify error handling in ext4_setattr()
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Remove the redundant rc variable and consolidate error handling.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/inode.c | 37 ++++++++++++++++---------------------
1 file changed, 16 insertions(+), 21 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5e913aca6499..59405a95ecfc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5960,7 +5960,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
- int error, rc = 0;
+ int error;
int orphan = 0;
const unsigned int ia_valid = attr->ia_valid;
bool inc_ivers = true;
@@ -6073,8 +6073,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
filemap_invalidate_lock(inode->i_mapping);
- rc = ext4_break_layouts(inode);
- if (rc) {
+ error = ext4_break_layouts(inode);
+ if (error) {
filemap_invalidate_unlock(inode->i_mapping);
goto err_out;
}
@@ -6096,22 +6096,23 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
* Call ext4_truncate() even if i_size didn't change to
* truncate possible preallocated blocks.
*/
- if (attr->ia_size <= oldsize) {
- rc = ext4_truncate(inode);
- if (rc)
- error = rc;
- }
+ if (attr->ia_size <= oldsize)
+ error = ext4_truncate(inode);
out_mmap_sem:
filemap_invalidate_unlock(inode->i_mapping);
+ if (error)
+ goto err_out;
}
- if (!error) {
- if (inc_ivers)
- inode_inc_iversion(inode);
- setattr_copy(idmap, inode, attr);
- mark_inode_dirty(inode);
- }
+ if (inc_ivers)
+ inode_inc_iversion(inode);
+ setattr_copy(idmap, inode, attr);
+ mark_inode_dirty(inode);
+ if (ia_valid & ATTR_MODE)
+ error = posix_acl_chmod(idmap, dentry, inode->i_mode);
+
+err_out:
/*
* If the call to ext4_truncate failed to get a transaction handle at
* all, we need to clean up the in-core orphan list manually.
@@ -6119,14 +6120,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
if (orphan && inode->i_nlink)
ext4_orphan_del(NULL, inode);
- if (!error && (ia_valid & ATTR_MODE))
- rc = posix_acl_chmod(idmap, dentry, inode->i_mode);
-
-err_out:
- if (error)
+ if (error)
ext4_std_error(inode->i_sb, error);
- if (!error)
- error = rc;
return error;
}
--
2.52.0
^ permalink raw reply related
* [PATCH v3 04/22] ext4: add iomap address space operations for buffered I/O
From: Zhang Yi @ 2026-04-22 2:10 UTC (permalink / raw)
To: linux-ext4, linux-fsdevel
Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Introduce initial support for iomap in the buffered I/O path for regular
files on ext4.
- Add a new inode state flag EXT4_STATE_BUFFERED_IOMAP to indicate the
inode uses iomap instead of buffer_head for buffered I/O
- Add helper ext4_inode_buffered_iomap() to check the flag
- Add new address space operations ext4_iomap_aops with callbacks that
will use generic iomap implementations
- Add ext4_iomap_aops to ext4_set_aops() when the flag is set
The following callbacks (read_folio(), readahead(), writepages()) are
provided as placeholders and will be implemented in later patches.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/ext4.h | 7 +++++++
fs/ext4/inode.c | 32 ++++++++++++++++++++++++++++++++
2 files changed, 39 insertions(+)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9e4353432325..fe3491ad2129 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1972,6 +1972,7 @@ enum {
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
+ EXT4_STATE_BUFFERED_IOMAP, /* Inode use iomap for buffered IO */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
@@ -2040,6 +2041,12 @@ static inline bool ext4_inode_orphan_tracked(struct inode *inode)
!list_empty(&EXT4_I(inode)->i_orphan);
}
+/* Whether the inode passes through the iomap infrastructure for buffered I/O */
+static inline bool ext4_inode_buffered_iomap(struct inode *inode)
+{
+ return ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+}
+
/*
* Codes for operating systems
*/
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 59405a95ecfc..9e9f421888ed 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3908,6 +3908,22 @@ const struct iomap_ops ext4_iomap_report_ops = {
.iomap_begin = ext4_iomap_begin_report,
};
+static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
+{
+ return 0;
+}
+
+static void ext4_iomap_readahead(struct readahead_control *rac)
+{
+
+}
+
+static int ext4_iomap_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ return 0;
+}
+
/*
* For data=journal mode, folio should be marked dirty only when it was
* writeably mapped. When that happens, it was already attached to the
@@ -3994,6 +4010,20 @@ static const struct address_space_operations ext4_da_aops = {
.swap_activate = ext4_iomap_swap_activate,
};
+static const struct address_space_operations ext4_iomap_aops = {
+ .read_folio = ext4_iomap_read_folio,
+ .readahead = ext4_iomap_readahead,
+ .writepages = ext4_iomap_writepages,
+ .dirty_folio = iomap_dirty_folio,
+ .bmap = ext4_bmap,
+ .invalidate_folio = iomap_invalidate_folio,
+ .release_folio = iomap_release_folio,
+ .migrate_folio = filemap_migrate_folio,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_folio = generic_error_remove_folio,
+ .swap_activate = ext4_iomap_swap_activate,
+};
+
static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.dirty_folio = noop_dirty_folio,
@@ -4015,6 +4045,8 @@ void ext4_set_aops(struct inode *inode)
}
if (IS_DAX(inode))
inode->i_mapping->a_ops = &ext4_dax_aops;
+ else if (ext4_inode_buffered_iomap(inode))
+ inode->i_mapping->a_ops = &ext4_iomap_aops;
else if (test_opt(inode->i_sb, DELALLOC))
inode->i_mapping->a_ops = &ext4_da_aops;
else
--
2.52.0
^ permalink raw reply related
* [PATCH] generic/790: test post-EOF gap zeroing persistence
From: Zhang Yi @ 2026-04-22 1:52 UTC (permalink / raw)
To: fstests, zlang
Cc: linux-ext4, linux-fsdevel, jack, yi.zhang, yi.zhang, yizhang089,
yangerkun
From: Zhang Yi <yi.zhang@huawei.com>
Test that extending a file past a non-block-aligned EOF correctly
zero-fills the gap [old_EOF, block_boundary), and that this zeroing
persists through a filesystem shutdown+remount cycle.
Stale data beyond EOF can persist on disk when append write data blocks
are flushed before the i_size metadata update, or when concurrent append
writeback and mmap writes persist non-zero data past EOF. Subsequent
post-EOF operations (append write, fallocate, truncate up) must
zero-fill and persist the gap to prevent exposing stale data.
The test pollutes the file's last physical block (via FIEMAP + raw
device write) with a sentinel pattern beyond i_size, then performs each
extend operation and verifies the gap is zeroed both in memory and on
disk.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
This is the case Jan Kara pointed out during my work on the ext4
buffered I/O to iomap conversion. This case is similar to generic/363,
but generic/363 doesn't provide persistent testing. For details:
https://lore.kernel.org/linux-ext4/jgotl7vzzuzm6dvz5zfgk6haodxvunb4hq556pzh4hqqwvnhxq@lr3jiedhqh7c/
tests/generic/790 | 155 ++++++++++++++++++++++++++++++++++++++++++
tests/generic/790.out | 4 ++
2 files changed, 159 insertions(+)
create mode 100755 tests/generic/790
create mode 100644 tests/generic/790.out
diff --git a/tests/generic/790 b/tests/generic/790
new file mode 100755
index 00000000..5d8f61f9
--- /dev/null
+++ b/tests/generic/790
@@ -0,0 +1,155 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2026 Huawei. All Rights Reserved.
+#
+# FS QA Test No. 790
+#
+# Test that extending a file past a non-block-aligned EOF correctly zero-fills
+# the gap [old_EOF, block_boundary), and that this zeroing persists through a
+# filesystem shutdown+remount cycle.
+#
+# Stale data beyond EOF can persist on disk when:
+# 1) append write data blocks are flushed before the i_size metadata update,
+# and the system crashes in this window.
+# 2) concurrent append writeback and mmap writes persist non-zero data past EOF.
+#
+# Subsequent post-EOF operations (append write, fallocate, truncate up) must
+# zero-fill and persist the gap to prevent exposing stale data.
+#
+# The test pollutes the file's last physical block (via FIEMAP + raw device
+# write) with a sentinel pattern beyond i_size, then performs each extend
+# operation and verifies the gap is zeroed both in memory and on disk.
+#
+. ./common/preamble
+_begin_fstest auto quick rw shutdown
+
+. ./common/filter
+
+_require_scratch
+_require_block_device $SCRATCH_DEV
+_require_scratch_shutdown
+_require_metadata_journaling $SCRATCH_DEV
+_require_xfs_io_command "fiemap"
+_require_xfs_io_command "falloc"
+_require_xfs_io_command "pwrite"
+_require_xfs_io_command "truncate"
+_require_xfs_io_command "sync_range"
+
+# Check that gap region [offset, offset+nbytes) is entirely zero
+_check_gap_zero()
+{
+ local file="$1"
+ local offset="$2"
+ local nbytes="$3"
+ local label="$4"
+ local data
+ local stripped
+
+ data=$(od -A n -t x1 -j $offset -N $nbytes "$file" 2>/dev/null)
+
+ # Remove whitespace and check if any byte is non-zero
+ stripped=$(echo "$data" | tr -d ' \n\t')
+ if [ -n "$stripped" ] && ! echo "$stripped" | grep -qE "^0+$"; then
+ echo "FAIL: non-zero data in gap [$offset,$((offset + nbytes))) $label"
+ _hexdump -N $((offset + nbytes)) "$file"
+ return 1
+ fi
+ return 0
+}
+
+# Get the physical block offset (in bytes) of the file's first block on device
+_get_phys_offset()
+{
+ local file="$1"
+ local fiemap_output
+ local phys_blk
+
+ fiemap_output=$($XFS_IO_PROG -r -c "fiemap -v" "$file" 2>/dev/null)
+ phys_blk=$(echo "$fiemap_output" | _filter_xfs_io_fiemap | head -1 | awk '{print $3}')
+ if [ -z "$phys_blk" ]; then
+ echo ""
+ return
+ fi
+ # Convert 512-byte blocks to bytes
+ echo $((phys_blk * 512))
+}
+
+_test_eof_zeroing()
+{
+ local test_name="$1"
+ local extend_cmd="$2"
+ local file=$SCRATCH_MNT/testfile_${test_name}
+
+ echo "$test_name" | tee -a $seqres.full
+
+ # Compute non-block-aligned EOF offset
+ local gap_bytes=16
+ local eof_offset=$((blksz - gap_bytes))
+
+ # Step 1: Write one full block to ensure the filesystem allocates a
+ # physical block for the file instead of using inline data.
+ $XFS_IO_PROG -f -c "pwrite -S 0x5a 0 $blksz" -c fsync \
+ "$file" >> $seqres.full 2>&1
+
+ # Step 2: Get physical block offset on device via FIEMAP
+ local phys_offset
+ phys_offset=$(_get_phys_offset "$file")
+ if [ -z "$phys_offset" ]; then
+ _fail "$test_name: failed to get physical block offset via fiemap"
+ fi
+
+ # Step 3: Truncate file to non-block-aligned size and fsync.
+ # The on-disk region [eof_offset, blksz) may or may not be
+ # zeroed by the filesystem at this point.
+ $XFS_IO_PROG -c "truncate $eof_offset" -c fsync \
+ "$file" >> $seqres.full 2>&1
+
+ # Step 4: Unmount and restore the physical block to all-0x5a on disk.
+ # This bypasses the kernel's pagecache EOF-zeroing to ensure
+ # the stale pattern is present on disk. Then remount.
+ _scratch_unmount
+ $XFS_IO_PROG -d -c "pwrite -S 0x5a $phys_offset $blksz" \
+ $SCRATCH_DEV >> $seqres.full 2>&1
+ _scratch_mount >> $seqres.full 2>&1
+
+ # Verify file size is still eof_offset after remount
+ local sz
+ sz=$(stat -c %s "$file")
+ if [ "$sz" -ne "$eof_offset" ]; then
+ _fail "$test_name: file size wrong after remount: $sz != $eof_offset"
+ fi
+
+ # Step 5: Execute the extend operation.
+ $XFS_IO_PROG -c "$extend_cmd" "$file" >> $seqres.full 2>&1
+
+ # Step 6: Verify gap [eof_offset, blksz) is zeroed BEFORE shutdown
+ _check_gap_zero "$file" $eof_offset $gap_bytes "before shutdown" || return 1
+
+ # Step 7: Sync the extended range and shutdown the filesystem with
+ # journal flush. This persists the file size extending, and
+ # the filesystem should persist the zeroed data in the gap
+ # range as well.
+ if [ "$extend_cmd" != "${extend_cmd#pwrite}" ]; then
+ $XFS_IO_PROG -c "sync_range -w $blksz $blksz" \
+ "$file" >> $seqres.full 2>&1
+ fi
+ _scratch_shutdown -f
+
+ # Step 8: Remount and verify gap is still zeroed
+ _scratch_cycle_mount
+ _check_gap_zero "$file" $eof_offset $gap_bytes "after shutdown+remount" || return 1
+}
+
+_scratch_mkfs >> $seqres.full 2>&1
+_scratch_mount
+
+blksz=$(_get_block_size $SCRATCH_MNT)
+
+# Test three variants of EOF-extending operations
+_test_eof_zeroing "append_write" "pwrite -S 0x42 $blksz $blksz"
+_test_eof_zeroing "truncate_up" "truncate $((blksz * 2))"
+_test_eof_zeroing "fallocate" "falloc $blksz $blksz"
+
+# success, all done
+status=0
+exit
diff --git a/tests/generic/790.out b/tests/generic/790.out
new file mode 100644
index 00000000..e5e2cc09
--- /dev/null
+++ b/tests/generic/790.out
@@ -0,0 +1,4 @@
+QA output created by 790
+append_write
+truncate_up
+fallocate
--
2.52.0
^ permalink raw reply related
* [PATCH v3 2/2] ext4: allow clearing mballoc stats through mb_stats
From: Baolin Liu @ 2026-04-22 1:50 UTC (permalink / raw)
To: tytso, adilger.kernel, ojaswin, ritesh.list, yi.zhang
Cc: linux-ext4, linux-kernel, wangguanyu, liubaolin12138, Baolin Liu,
Andreas Dilger
In-Reply-To: <20260422015026.7170-1-liubaolin12138@163.com>
From: Baolin Liu <liubaolin@kylinos.cn>
Make /proc/fs/ext4/<dev>/mb_stats writable and clear the runtime
mballoc statistics when 0 is written.
Update the related documentation accordingly.
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
Documentation/admin-guide/ext4.rst | 5 ++++
Documentation/filesystems/proc.rst | 3 +++
fs/ext4/ext4.h | 1 +
fs/ext4/mballoc.c | 29 ++++++++++++++++++++++
fs/ext4/sysfs.c | 40 ++++++++++++++++++++++++++++--
5 files changed, 76 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst
index ac0c709ea9e7..fb3887cd5e44 100644
--- a/Documentation/admin-guide/ext4.rst
+++ b/Documentation/admin-guide/ext4.rst
@@ -436,6 +436,11 @@ Files in /proc/fs/ext4/<devname>
mb_groups
details of multiblock allocator buddy cache of free blocks
+ mb_stats
+ reports runtime statistics from the multiblock allocator
+ (mballoc). Writing 0 to this file clears the current
+ statistics.
+
/sys entries
============
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index b0c0d1b45b99..7ce02573a3d9 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -1634,6 +1634,9 @@ directory are shown in Table 1-12, below.
============== ==========================================================
File Content
mb_groups details of multiblock allocator buddy cache of free blocks
+ mb_stats reports runtime statistics from the multiblock allocator
+ (mballoc). Writing 0 to this file clears the current
+ statistics.
============== ==========================================================
1.9 /proc/consoles
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 293f698b7042..3223e73612ae 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2994,6 +2994,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
+extern void ext4_mb_stats_clear(struct ext4_sb_info *sbi);
extern int ext4_mb_init(struct super_block *);
extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 1e13ef62cb9d..79ddfa935813 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4723,6 +4723,35 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
trace_ext4_mballoc_prealloc(ac);
}
+void ext4_mb_stats_clear(struct ext4_sb_info *sbi)
+{
+ int i;
+
+ atomic_set(&sbi->s_bal_reqs, 0);
+ atomic_set(&sbi->s_bal_success, 0);
+ atomic_set(&sbi->s_bal_allocated, 0);
+ atomic_set(&sbi->s_bal_groups_scanned, 0);
+
+ for (i = 0; i < EXT4_MB_NUM_CRS; i++) {
+ atomic64_set(&sbi->s_bal_cX_hits[i], 0);
+ atomic64_set(&sbi->s_bal_cX_groups_considered[i], 0);
+ atomic_set(&sbi->s_bal_cX_ex_scanned[i], 0);
+ atomic64_set(&sbi->s_bal_cX_failed[i], 0);
+ }
+
+ atomic_set(&sbi->s_bal_ex_scanned, 0);
+ atomic_set(&sbi->s_bal_goals, 0);
+ atomic_set(&sbi->s_bal_stream_goals, 0);
+ atomic_set(&sbi->s_bal_len_goals, 0);
+ atomic_set(&sbi->s_bal_2orders, 0);
+ atomic_set(&sbi->s_bal_breaks, 0);
+ atomic_set(&sbi->s_mb_lost_chunks, 0);
+ atomic_set(&sbi->s_mb_buddies_generated, 0);
+ atomic64_set(&sbi->s_mb_generation_time, 0);
+ atomic_set(&sbi->s_mb_preallocated, 0);
+ atomic_set(&sbi->s_mb_discarded, 0);
+}
+
/*
* Called on failure; free up any blocks from the inode PA for this
* context. We don't need this for MB_GROUP_PA because we only change
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index b87d7bdab06a..e90885d470ab 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -52,6 +52,42 @@ typedef enum {
static const char proc_dirname[] = "fs/ext4";
static struct proc_dir_entry *ext4_proc_root;
+static int ext4_mb_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ext4_seq_mb_stats_show, pde_data(inode));
+}
+
+static ssize_t ext4_mb_stats_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct super_block *sb = pde_data(file_inode(file));
+ char kbuf[2];
+
+ if (count == 0 || count > sizeof(kbuf))
+ return -EINVAL;
+
+ if (copy_from_user(kbuf, buf, count))
+ return -EFAULT;
+
+ if (count == 2) {
+ if (kbuf[0] != '0' || kbuf[1] != '\n')
+ return -EINVAL;
+ } else if (kbuf[0] != '0') {
+ return -EINVAL;
+ }
+
+ ext4_mb_stats_clear(EXT4_SB(sb));
+ return count;
+}
+
+static const struct proc_ops ext4_mb_stats_proc_ops = {
+ .proc_open = ext4_mb_stats_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+ .proc_write = ext4_mb_stats_write,
+};
+
struct ext4_attr {
struct attribute attr;
short attr_id;
@@ -630,8 +666,8 @@ int ext4_register_sysfs(struct super_block *sb)
ext4_fc_info_show, sb);
proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc,
&ext4_mb_seq_groups_ops, sb);
- proc_create_single_data("mb_stats", 0444, sbi->s_proc,
- ext4_seq_mb_stats_show, sb);
+ proc_create_data("mb_stats", 0644, sbi->s_proc,
+ &ext4_mb_stats_proc_ops, sb);
proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc,
&ext4_mb_seq_structs_summary_ops, sb);
}
--
2.51.0
^ permalink raw reply related
* [PATCH v3 1/2] ext4: add blocks_allocated to mb_stats output
From: Baolin Liu @ 2026-04-22 1:50 UTC (permalink / raw)
To: tytso, adilger.kernel, ojaswin, ritesh.list, yi.zhang
Cc: linux-ext4, linux-kernel, wangguanyu, liubaolin12138, Baolin Liu,
Andreas Dilger
In-Reply-To: <20260422015026.7170-1-liubaolin12138@163.com>
From: Baolin Liu <liubaolin@kylinos.cn>
Add blocks_allocated to /proc/fs/ext4/<dev>/mb_stats so that the
reported statistics match the mballoc summary printed at unmount time.
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Reviewed-by: Andreas Dilger <adilger@dilger.ca>
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Baolin Liu <liubaolin@kylinos.cn>
---
fs/ext4/mballoc.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 20e9fdaf4301..1e13ef62cb9d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3211,6 +3211,8 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
"\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
return 0;
}
+ seq_printf(seq, "\tblocks_allocated: %u\n",
+ atomic_read(&sbi->s_bal_allocated));
seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
--
2.51.0
^ permalink raw reply related
* [PATCH v3 0/2] add blocks_allocated to mb_stats and clear mb_stats
From: Baolin Liu @ 2026-04-22 1:50 UTC (permalink / raw)
To: tytso, adilger.kernel, ojaswin, ritesh.list, yi.zhang
Cc: linux-ext4, linux-kernel, wangguanyu, liubaolin12138
The series contains two patches:
- add blocks_allocated to /proc/fs/ext4/<dev>/mb_stats
- allow writing 0 to /proc/fs/ext4/<dev>/mb_stats to clear the current
mballoc statistics
Changes since v2:
- Add mb_stats documentation to patch 2
- Add Reviewed-by tags
Baolin Liu (2):
ext4: add blocks_allocated to mb_stats output
ext4: allow clearing mballoc stats through mb_stats
Documentation/admin-guide/ext4.rst | 5 ++++
Documentation/filesystems/proc.rst | 3 +++
fs/ext4/ext4.h | 1 +
fs/ext4/mballoc.c | 31 +++++++++++++++++++++++
fs/ext4/sysfs.c | 40 ++++++++++++++++++++++++++++--
5 files changed, 78 insertions(+), 2 deletions(-)
--
2.51.0
^ permalink raw reply
* Re: [RFC PATCH] iomap: add fast read path for small direct I/O
From: Dave Chinner @ 2026-04-21 22:36 UTC (permalink / raw)
To: Fengnan
Cc: Ojaswin Mujoo, Fengnan Chang, brauner, djwong, linux-xfs,
linux-fsdevel, linux-ext4, lidiangang
In-Reply-To: <87674d63-c8cb-4135-8d76-84f52e90ac2e@bytedance.com>
On Tue, Apr 21, 2026 at 11:19:31AM +0800, Fengnan wrote:
> 在 2026/4/21 07:59, Dave Chinner 写道:
> > I'm clearly missing something here. I'm trying to work out why the
> > profiles show what they do, but there's differences between them
> > that don't make obvious sense to me.
> >
> > It would also be useful to have XFS profiles, because it has a
> > larger CPU cache footprint than ext4. If what the profiles are
> > showing is a result of CPU cache residency artifacts, then we'll see
> > different profile (and, potentially, performance) artifacts with
> > XFS...
> The XFS flame graph is also attached now.
> IOPS: 1.92M->2.3M.
The callchains in both XFS flame graphs are completely bogus:
<io_uring entry>
....
io_read
__io_read
xfs_inode_free_eofblocks
xfs_prep_free_cowblocks
iomap_dio_rw
iomap_dio_simple_read
xfs_mountfs
....
Can you regenerate the profiles, please, and this time check that
they make sense before posting them?
-Dave.
--
Dave Chinner
dgc@kernel.org
^ permalink raw reply
* Re: [PATCH v8 04/22] fsverity: generate and store zero-block hash
From: Eric Biggers @ 2026-04-21 21:47 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong
In-Reply-To: <20260420114714.1621982-5-aalbersh@kernel.org>
On Mon, Apr 20, 2026 at 01:46:51PM +0200, Andrey Albershteyn wrote:
> +void fsverity_fill_zerohash(struct folio *folio, size_t offset, size_t len,
> + struct fsverity_info *vi)
[...]
> +void fsverity_fill_zerohash(struct folio *folio, size_t poff, size_t plen,
> + struct fsverity_info *vi);
[...]
> +static inline void fsverity_fill_zerohash(struct folio *folio, size_t poff,
> + size_t plen, struct fsverity_info *vi)
The parameters should have the same name at each declaration site.
- Eric
^ permalink raw reply
* Re: [PATCH v8 03/22] ovl: use core fsverity ensure info interface
From: Eric Biggers @ 2026-04-21 21:44 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong,
Amir Goldstein
In-Reply-To: <20260420114714.1621982-4-aalbersh@kernel.org>
On Mon, Apr 20, 2026 at 01:46:50PM +0200, Andrey Albershteyn wrote:
> int ovl_ensure_verity_loaded(const struct path *datapath)
> {
> struct inode *inode = d_inode(datapath->dentry);
> - struct file *filp;
>
> - if (!fsverity_active(inode) && IS_VERITY(inode)) {
> - /*
> - * If this inode was not yet opened, the verity info hasn't been
> - * loaded yet, so we need to do that here to force it into memory.
> - */
> - filp = kernel_file_open(datapath, O_RDONLY, current_cred());
> - if (IS_ERR(filp))
> - return PTR_ERR(filp);
> - fput(filp);
> - }
> + if (fsverity_active(inode))
> + return fsverity_ensure_verity_info(inode);
Not sure whether I should review this version or the version in git, but
both seem wrong. The 'if (!fsverity_active(inode) && IS_VERITY(inode))
{' condition should stay, but fsverity_ensure_verity_info() will need to
gain a !CONFIG_FS_VERITY stub to fix the build error.
- Eric
^ permalink raw reply
* Re: [PATCH v8 00/22] fs-verity support for XFS with post EOF merkle tree
From: Eric Biggers @ 2026-04-21 21:43 UTC (permalink / raw)
To: Andrey Albershteyn
Cc: linux-xfs, fsverity, linux-fsdevel, hch, linux-ext4,
linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong, david
In-Reply-To: <20260420114714.1621982-1-aalbersh@kernel.org>
On Mon, Apr 20, 2026 at 01:46:47PM +0200, Andrey Albershteyn wrote:
> This series is based on v7.0 with Christoph's read ioends patchset [1].
>
> kernel:
> https://git.kernel.org/pub/scm/linux/kernel/git/aalbersh/xfs-linux.git/log/?h=b4/fsverity
FYI: the git repository doesn't match what was actually sent out. For
example the patch "ovl: use core fsverity ensure info interface" is a
bit different. The version in git (incorrectly, I think) ignores the
error code, while the patch returns it.
- Eric
^ permalink raw reply
* Re: [BUG] ext4: BUG_ON in ext4_write_inline_data (fs/ext4/inline.c:240)
From: Jan Kara @ 2026-04-21 12:25 UTC (permalink / raw)
To: Zw Tang
Cc: tytso, Andreas Dilger, libaokun, jack, ojaswin, linux-ext4,
linux-kernel, yi.zhang
In-Reply-To: <CAPHJ_VJeBAL_fk+P79guYTABZgW1hkcAz8t=c_nVK1mbn3_FYw@mail.gmail.com>
Hello!
On Tue 21-04-26 19:32:43, Zw Tang wrote:
> I am reporting a kernel BUG in ext4 triggered by a syzkaller reproducer on
> Linux 7.0.0-08391-g1d51b370a0f8.
Sorry but we don't generally debug fuzzer issues not being reported by
syzbot. It provides much better tracking, deduplication and general
handling of issues which significantly speeds up debugging. If you have
some interesting modification of syzkaller, please contribute it upstream
so that everybody can benefit.
Honza
>
> The first fatal issue happens in the ext4 inline data write path:
> sendfile64 -> ext4_file_write_iter -> ext4_da_write_end ->
> ext4_write_inline_data_end -> ext4_write_inline_data.
>
> The crash is reported as:
>
> kernel BUG at fs/ext4/inline.c:240
>
> and RIP points to:
>
> ext4_write_inline_data+0x3d0/0x490
>
> This looks like an ext4 inline-data boundary/state inconsistency triggered
> while writing to an ext4 image crafted by syzkaller. The later
> KASAN: slab-use-after-free in rwsem_down_write_slowpath() appears to be a
> secondary effect after the primary ext4 BUG, likely during teardown/unlink
> after the filesystem failure.
>
> Reproducer:
> C reproducer: pastebin.com/raw/3LmK5Kxg
> console output: pastebin.com/raw/C0XjNMXp
> kernel config: pastebin.com/raw/aq1V3cLk
>
> Kernel:
> HEAD commit:
> git tree: <e.g. torvalds/linux>
> kernel version: 7.0.0-08391-g1d51b370a0f8 #1 PREEMPT(lazy) (QEMU)
>
> Relevant log:
>
> [ 1329.147750] kernel BUG at fs/ext4/inline.c:240!
> [ 1329.148692] Oops: invalid opcode: 0000 [#1] SMP KASAN
> [ 1329.149543] CPU: 0 UID: 0 PID: 334 Comm: repro1 Tainted: G W
> 7.0.0-08391-g1d51b370a0f8 #1 PREEMPT(lazy)
> [ 1329.153249] RIP: 0010:ext4_write_inline_data+0x3d0/0x490
> [ 1329.167978] ext4_write_inline_data_end+0x293/0xc90
> [ 1329.170566] ext4_da_write_end+0x521/0xec0
> [ 1329.176842] ext4_buffered_write_iter+0x11a/0x430
> [ 1329.177610] ext4_file_write_iter+0x561/0x1840
> [ 1329.185052] iter_file_splice_write+0xa33/0x11c0
> [ 1329.190820] direct_splice_actor+0x18f/0x7a0
> [ 1329.198893] do_splice_direct+0x41/0x50
> [ 1329.200276] do_sendfile+0xa86/0xda0
> [ 1329.203110] __x64_sys_sendfile64+0x1cf/0x210
>
> There is also an ext4 metadata inconsistency message right after the BUG:
>
> [ 1329.221770] EXT4-fs error (device loop1):
> ext4_mb_generate_buddy:1314: group 0, block bitmap and bg descriptor
> inconsistent: 25 vs 150994969 free clusters
>
> and later a secondary report:
>
> [ 1329.274881] BUG: KASAN: slab-use-after-free in
> rwsem_down_write_slowpath+0x15e9/0x1640
>
> Based on the log, I believe the primary issue to investigate is the BUG in
> fs/ext4/inline.c, while the later rwsem report is probably fallout after
> the ext4 failure.
>
> Please let me know if more information is needed.
>
> Thanks.
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
^ permalink raw reply
* Re: [BUG] ext4: BUG_ON in ext4_write_inline_data (fs/ext4/inline.c:240)
From: Theodore Tso @ 2026-04-21 12:20 UTC (permalink / raw)
To: Zw Tang
Cc: Andreas Dilger, libaokun, jack, ojaswin, linux-ext4, linux-kernel,
yi.zhang, syzkaller-bugs
In-Reply-To: <CAPHJ_VJeBAL_fk+P79guYTABZgW1hkcAz8t=c_nVK1mbn3_FYw@mail.gmail.com>
On Tue, Apr 21, 2026 at 07:32:43PM +0800, Zw Tang wrote:
> This looks like an ext4 inline-data boundary/state inconsistency triggered
> while writing to an ext4 image crafted by syzkaller. The later
> KASAN: slab-use-after-free in rwsem_down_write_slowpath() appears to be a
> secondary effect after the primary ext4 BUG, likely during teardown/unlink
> after the filesystem failure.
Writing to a mounted image is not something that we consider a valid
threat model. If you can write to a mounted image, there are a
zillion different ways that you can crash the kernel, or you can
create a setuid shell, etc.
The upstream syzkaller bot makes sure that CONFIG_BLK_DEV_WRITE_MOUNTED
is not defined to avoid syzkaller noise.
Out of curiosity, why are you running your own syzkaller instance? To
the extent that you duplicate findings done by the upstream syzkaller,
or use bad configurations that waste the time of upstream maintainers,
you are simply accelerating the time when we consider all syzkaller
reports as a denial of service attack on upstream maintainers.
To the upstream syzkaller folks, can you fix syzkaller so that
disabling CONFIG_BLK_DEV_WRITE_MOUNTED is hard-coded so that the many
people who insist on duplicating syzkaller instances without enabling
the syzkaller dashboard functionality don't make this configuration
mistake?
Thanks, regards,
- Ted
^ permalink raw reply
* [PATCH 6.6 3/3] ext4: get rid of ppath in convert_initialized_extent()
From: Yang Erkun @ 2026-04-21 11:34 UTC (permalink / raw)
To: stable, linux-ext4
Cc: tytso, libaokun, adilger.kernel, ojaswin, ritesh.list, jack,
gregkh, sashal, yangerkun, yi.zhang, zhangxiaoxu5
In-Reply-To: <20260421113416.4040274-1-yangerkun@huawei.com>
From: Baokun Li <libaokun1@huawei.com>
[ Upstream commit 4191eefef978d734fa8249bede3f9b02a85aa3c0 ]
The use of path and ppath is now very confusing, so to make the code more
readable, pass path between functions uniformly, and get rid of ppath.
To get rid of the ppath in convert_initialized_extent(), the following is
done here:
* Free the extents path when an error is encountered.
No functional changes.
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Tested-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Link: https://patch.msgid.link/20240822023545.1994557-23-libaokun@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
fs/ext4/extents.c | 37 +++++++++++++++++++------------------
1 file changed, 19 insertions(+), 18 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0406dac7fbf1..1c55f498ce4f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3830,13 +3830,12 @@ ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode,
return ERR_PTR(err);
}
-static int
+static struct ext4_ext_path *
convert_initialized_extent(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map,
- struct ext4_ext_path **ppath,
+ struct ext4_ext_path *path,
unsigned int *allocated)
{
- struct ext4_ext_path *path = *ppath;
struct ext4_extent *ex;
ext4_lblk_t ee_block;
unsigned int ee_len;
@@ -3861,29 +3860,25 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (ee_block != map->m_lblk || ee_len > map->m_len) {
path = ext4_split_convert_extents(handle, inode, map, path,
EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, NULL);
- if (IS_ERR(path)) {
- *ppath = NULL;
- return PTR_ERR(path);
- }
+ if (IS_ERR(path))
+ return path;
path = ext4_find_extent(inode, map->m_lblk, path, 0);
- if (IS_ERR(path)) {
- *ppath = NULL;
- return PTR_ERR(path);
- }
- *ppath = path;
+ if (IS_ERR(path))
+ return path;
depth = ext_depth(inode);
ex = path[depth].p_ext;
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EFSCORRUPTED;
+ err = -EFSCORRUPTED;
+ goto errout;
}
}
err = ext4_ext_get_access(handle, inode, path + depth);
if (err)
- return err;
+ goto errout;
/* first mark the extent as unwritten */
ext4_ext_mark_unwritten(ex);
@@ -3895,7 +3890,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
/* Mark modified extent as dirty */
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (err)
- return err;
+ goto errout;
ext4_ext_show_leaf(inode, path);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -3904,7 +3899,11 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (*allocated > map->m_len)
*allocated = map->m_len;
map->m_len = *allocated;
- return 0;
+ return path;
+
+errout:
+ ext4_free_ext_path(path);
+ return ERR_PTR(err);
}
static struct ext4_ext_path *
@@ -4271,8 +4270,10 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
*/
if ((!ext4_ext_is_unwritten(ex)) &&
(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
- err = convert_initialized_extent(handle,
- inode, map, &path, &allocated);
+ path = convert_initialized_extent(handle,
+ inode, map, path, &allocated);
+ if (IS_ERR(path))
+ err = PTR_ERR(path);
goto out;
} else if (!ext4_ext_is_unwritten(ex)) {
map->m_flags |= EXT4_MAP_MAPPED;
--
2.39.2
^ permalink raw reply related
* [PATCH 6.6 2/3] ext4: get rid of ppath in ext4_force_split_extent_at()
From: Yang Erkun @ 2026-04-21 11:34 UTC (permalink / raw)
To: stable, linux-ext4
Cc: tytso, libaokun, adilger.kernel, ojaswin, ritesh.list, jack,
gregkh, sashal, yangerkun, yi.zhang, zhangxiaoxu5
In-Reply-To: <20260421113416.4040274-1-yangerkun@huawei.com>
From: Baokun Li <libaokun1@huawei.com>
[ Upstream commit f07be1c367369636d7d338d7994473d6eae283c5 ]
The use of path and ppath is now very confusing, so to make the code more
readable, pass path between functions uniformly, and get rid of ppath.
To get rid of the ppath in ext4_force_split_extent_at(), the following is
done here:
* Free the extents path when an error is encountered.
No functional changes.
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Tested-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Link: https://patch.msgid.link/20240822023545.1994557-17-libaokun@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
fs/ext4/extents.c | 69 ++++++++++++++++++++++++++---------------------
1 file changed, 38 insertions(+), 31 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 300bf2289bc1..0406dac7fbf1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -329,27 +329,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
return size;
}
-static inline int
+static inline struct ext4_ext_path *
ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
- struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+ struct ext4_ext_path *path, ext4_lblk_t lblk,
int nofail)
{
- struct ext4_ext_path *path = *ppath;
int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO;
if (nofail)
flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL;
- path = ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
+ return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
flags);
- if (IS_ERR(path)) {
- *ppath = NULL;
- return PTR_ERR(path);
- }
- *ppath = path;
- return 0;
}
static int
@@ -2890,11 +2883,12 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
* fail removing space due to ENOSPC so try to use
* reserved block if that happens.
*/
- err = ext4_force_split_extent_at(handle, inode, &path,
- end + 1, 1);
- if (err < 0)
+ path = ext4_force_split_extent_at(handle, inode, path,
+ end + 1, 1);
+ if (IS_ERR(path)) {
+ err = PTR_ERR(path);
goto out;
-
+ }
} else if (sbi->s_cluster_ratio > 1 && end >= ex_end &&
partial.state == initial) {
/*
@@ -5772,17 +5766,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
/* Prepare left boundary */
if (e1_blk < lblk1) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode1,
- &path1, lblk1, 0);
- if (unlikely(*erp))
+ path1 = ext4_force_split_extent_at(handle, inode1,
+ path1, lblk1, 0);
+ if (IS_ERR(path1)) {
+ *erp = PTR_ERR(path1);
goto finish;
+ }
}
if (e2_blk < lblk2) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode2,
- &path2, lblk2, 0);
- if (unlikely(*erp))
+ path2 = ext4_force_split_extent_at(handle, inode2,
+ path2, lblk2, 0);
+ if (IS_ERR(path2)) {
+ *erp = PTR_ERR(path2);
goto finish;
+ }
}
/* ext4_split_extent_at() may result in leaf extent split,
* path must to be revalidated. */
@@ -5798,17 +5796,21 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
if (len != e1_len) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode1,
- &path1, lblk1 + len, 0);
- if (unlikely(*erp))
+ path1 = ext4_force_split_extent_at(handle, inode1,
+ path1, lblk1 + len, 0);
+ if (IS_ERR(path1)) {
+ *erp = PTR_ERR(path1);
goto finish;
+ }
}
if (len != e2_len) {
split = 1;
- *erp = ext4_force_split_extent_at(handle, inode2,
- &path2, lblk2 + len, 0);
- if (*erp)
+ path2 = ext4_force_split_extent_at(handle, inode2,
+ path2, lblk2 + len, 0);
+ if (IS_ERR(path2)) {
+ *erp = PTR_ERR(path2);
goto finish;
+ }
}
/* ext4_split_extent_at() may result in leaf extent split,
* path must to be revalidated. */
@@ -5974,24 +5976,29 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
ext4_ext_get_actual_len(ex) != len) {
/* We need to split this extent to match our extent first */
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_force_split_extent_at(NULL, inode, &path, start, 1);
+ path = ext4_force_split_extent_at(NULL, inode, path, start, 1);
up_write(&EXT4_I(inode)->i_data_sem);
- if (ret)
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
+ }
path = ext4_find_extent(inode, start, path, 0);
if (IS_ERR(path))
return PTR_ERR(path);
+
ex = path[path->p_depth].p_ext;
WARN_ON(le32_to_cpu(ex->ee_block) != start);
if (ext4_ext_get_actual_len(ex) != len) {
down_write(&EXT4_I(inode)->i_data_sem);
- ret = ext4_force_split_extent_at(NULL, inode, &path,
- start + len, 1);
+ path = ext4_force_split_extent_at(NULL, inode, path,
+ start + len, 1);
up_write(&EXT4_I(inode)->i_data_sem);
- if (ret)
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
+ }
path = ext4_find_extent(inode, start, path, 0);
if (IS_ERR(path))
--
2.39.2
^ permalink raw reply related
* [PATCH 6.6 1/3] ext4: get rid of ppath in get_ext_path()
From: Yang Erkun @ 2026-04-21 11:34 UTC (permalink / raw)
To: stable, linux-ext4
Cc: tytso, libaokun, adilger.kernel, ojaswin, ritesh.list, jack,
gregkh, sashal, yangerkun, yi.zhang, zhangxiaoxu5
In-Reply-To: <20260421113416.4040274-1-yangerkun@huawei.com>
From: Baokun Li <libaokun1@huawei.com>
[ Upstream commit 6b854d552711aa33f59eda334e6d94a00d8825bb ]
The use of path and ppath is now very confusing, so to make the code more
readable, pass path between functions uniformly, and get rid of ppath.
After getting rid of ppath in get_ext_path(), its caller may pass an error
pointer to ext4_free_ext_path(), so it needs to teach ext4_free_ext_path()
and ext4_ext_drop_refs() to skip the error pointer. No functional changes.
Signed-off-by: Baokun Li <libaokun1@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Tested-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
Link: https://patch.msgid.link/20240822023545.1994557-13-libaokun@huaweicloud.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
[we need IS_ERR_OR_NULL to prevent oops]
Reported-by: Hulk Robot <hulkrobot@huawei.com>
Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
fs/ext4/extents.c | 5 +++--
fs/ext4/move_extent.c | 34 +++++++++++++++++-----------------
2 files changed, 20 insertions(+), 19 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7626cf2b07f1..300bf2289bc1 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -114,7 +114,7 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path)
{
int depth, i;
- if (!path)
+ if (IS_ERR_OR_NULL(path))
return;
depth = path->p_depth;
for (i = 0; i <= depth; i++, path++) {
@@ -125,6 +125,8 @@ static void ext4_ext_drop_refs(struct ext4_ext_path *path)
void ext4_free_ext_path(struct ext4_ext_path *path)
{
+ if (IS_ERR_OR_NULL(path))
+ return;
ext4_ext_drop_refs(path);
kfree(path);
}
@@ -4227,7 +4229,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
if (IS_ERR(path)) {
err = PTR_ERR(path);
- path = NULL;
goto out;
}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d5636a2a718a..96a84de32169 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -17,27 +17,23 @@
* get_ext_path() - Find an extent path for designated logical block number.
* @inode: inode to be searched
* @lblock: logical block number to find an extent path
- * @ppath: pointer to an extent path pointer (for output)
+ * @path: pointer to an extent path
*
- * ext4_find_extent wrapper. Return 0 on success, or a negative error value
- * on failure.
+ * ext4_find_extent wrapper. Return an extent path pointer on success,
+ * or an error pointer on failure.
*/
-static inline int
+static inline struct ext4_ext_path *
get_ext_path(struct inode *inode, ext4_lblk_t lblock,
- struct ext4_ext_path **ppath)
+ struct ext4_ext_path *path)
{
- struct ext4_ext_path *path = *ppath;
-
- *ppath = NULL;
path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE);
if (IS_ERR(path))
- return PTR_ERR(path);
+ return path;
if (path[ext_depth(inode)].p_ext == NULL) {
ext4_free_ext_path(path);
- return -ENODATA;
+ return ERR_PTR(-ENODATA);
}
- *ppath = path;
- return 0;
+ return path;
}
/**
@@ -95,9 +91,11 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
int ret = 0;
ext4_lblk_t last = from + count;
while (from < last) {
- *err = get_ext_path(inode, from, &path);
- if (*err)
- goto out;
+ path = get_ext_path(inode, from, path);
+ if (IS_ERR(path)) {
+ *err = PTR_ERR(path);
+ return ret;
+ }
ext = path[ext_depth(inode)].p_ext;
if (unwritten != ext4_ext_is_unwritten(ext))
goto out;
@@ -634,9 +632,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
int offset_in_page;
int unwritten, cur_len;
- ret = get_ext_path(orig_inode, o_start, &path);
- if (ret)
+ path = get_ext_path(orig_inode, o_start, path);
+ if (IS_ERR(path)) {
+ ret = PTR_ERR(path);
goto out;
+ }
ex = path[path->p_depth].p_ext;
cur_blk = le32_to_cpu(ex->ee_block);
cur_len = ext4_ext_get_actual_len(ex);
--
2.39.2
^ permalink raw reply related
* [PATCH 6.6 0/3] fix potential ext4 null pointer
From: Yang Erkun @ 2026-04-21 11:34 UTC (permalink / raw)
To: stable, linux-ext4
Cc: tytso, libaokun, adilger.kernel, ojaswin, ritesh.list, jack,
gregkh, sashal, yangerkun, yi.zhang, zhangxiaoxu5
Our Hulk robot discovered that commit 6b854d552711
("ext4: get rid of ppath in get_ext_path()") was missed when
backporting the mainline patch set[1] to linux-6.6.y, so some error
branches in ext4 could potentially trigger a panic. I also verified
this while testing with xfstests-bld[2]. Along with this fix, we
have also backported the other related patches from the same
patch set[1].
[1]. https://lore.kernel.org/all/20240822023545.1994557-1-libaokun@huaweicloud.com/
[2]. https://github.com/tytso/xfstests-bld
generic/051 81s ... [09:20:24][ 509.012535] run fstests generic/051 at 2026-04-21 09:20:24
[ 509.314026] EXT4-fs (vdb): mounted filesystem d225342d-c437-4a7d-893b-5d02903a5ea4 r/w with ordered data mode. Quota mode: none.
[ 509.397019] EXT4-fs (vdc): mounted filesystem 72b9e6ee-4b56-45ee-b71e-ca491d2fd7e9 r/w with ordered data mode. Quota mode: none.
[ 509.399614] EXT4-fs (vdc): shut down requested (1)
[ 509.400082] Aborting journal on device vdc-8.
[ 509.402378] EXT4-fs (vdc): unmounting filesystem 72b9e6ee-4b56-45ee-b71e-ca491d2fd7e9.
[ 509.443140] EXT4-fs (vdc): mounted filesystem 03f15f3c-5938-41ea-bbf8-321de40d01ff r/w with ordered data mode. Quota mode: none.
[ 539.831842] EXT4-fs (vdc): unmounting filesystem 03f15f3c-5938-41ea-bbf8-321de40d01ff.
[ 539.868710] EXT4-fs (vdc): mounted filesystem 03f15f3c-5938-41ea-bbf8-321de40d01ff r/w with ordered data mode. Quota mode: none.
[ 552.967466] BUG: unable to handle page fault for address: ffffffffffffffec
[ 552.968455] #PF: supervisor read access in kernel mode
[ 552.969157] #PF: error_code(0x0000) - not-present page
[ 552.969859] PGD 282c067 P4D 282d067 PUD 282f067 PMD 0
[ 552.970575] Oops: 0000 [#1] PREEMPT SMP NOPTI
[ 552.971179] CPU: 0 PID: 292843 Comm: fsstress Not tainted 6.6.135-xfstests #2
[ 552.972143] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.1-2.fc37 04/01/2014
[ 552.973284] RIP: 0010:ext4_ext_map_blocks+0x191/0xab0
[ 552.973986] Code: 4c 89 e6 48 89 ef 48 8d 54 24 60 e8 89 6c ff ff 85 c0 89 44 24 28 0f 84 59 02 00 00 48 8b 44 24 30 48 85 c0 0f 84 09 06 00 00 <44> 0f b7 78 08 45 31 f6 48 89 1c 24 49 89 c4 44 89 f3 49 89 c6 49
[ 552.976362] RSP: 0018:ffa0000006ab3c78 EFLAGS: 00010286
[ 552.976862] RAX: ffffffffffffffe4 RBX: ffa0000006ab3de0 RCX: 0000000000000000
[ 552.977525] RDX: ffffffff82244590 RSI: ffffffff825d3cfc RDI: ff1100002e5d5068
[ 552.978189] RBP: ff110000064f6628 R08: ffffffff825d3ddd R09: ff1100006b74a618
[ 552.978850] R10: 00000000d8f693c7 R11: ff11000077f49ff0 R12: ff110000062040c0
[ 552.979511] R13: 0000000000000043 R14: 0000000000025b80 R15: ff110000069f1000
[ 552.980159] FS: 00007f7685ec8740(0000) GS:ff1100007dc00000(0000) knlGS:0000000000000000
[ 552.980906] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 552.981445] CR2: ffffffffffffffec CR3: 0000000006a44005 CR4: 0000000000771ef0
[ 552.982091] PKRU: 55555554
[ 552.982351] Call Trace:
[ 552.982606] <TASK>
[ 552.982818] ext4_map_blocks+0x23e/0x6b0
[ 552.983191] ext4_alloc_file_blocks.isra.0+0x12b/0x370
[ 552.983671] ext4_fallocate+0x150/0x310
[ 552.984034] vfs_fallocate+0x13e/0x380
[ 552.984391] ioctl_preallocate+0xa4/0xd0
[ 552.984769] __x64_sys_ioctl+0x71/0xd0
[ 552.985126] do_syscall_64+0x38/0x80
[ 552.985479] entry_SYSCALL_64_after_hwframe+0x78/0xe2
[ 552.985956] RIP: 0033:0x7f7685fc8c5b
[ 552.986291] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1c 48 8b 44 24 18 64 48 2b 04 25 28 00 00
[ 552.987925] RSP: 002b:00007fff16838290 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ 552.988571] RAX: ffffffffffffffda RBX: 000000000000991b RCX: 00007f7685fc8c5b
[ 552.989186] RDX: 00007fff16838310 RSI: 000000004030582a RDI: 0000000000000003
[ 552.989812] RBP: 0000000000000003 R08: 0000000000000002 R09: 00007fff168382fc
[ 552.990428] R10: 0000000000001000 R11: 0000000000000246 R12: 0000000000000000
[ 552.991043] R13: 00000000001a6829 R14: 8f5c28f5c28f5c29 R15: 000055c9bd970650
[ 552.991655] </TASK>
[ 552.991857] CR2: ffffffffffffffec
[ 552.992154] ---[ end trace 0000000000000000 ]---
[ 552.992557] RIP: 0010:ext4_ext_map_blocks+0x191/0xab0
Baokun Li (3):
ext4: get rid of ppath in get_ext_path()
ext4: get rid of ppath in ext4_force_split_extent_at()
ext4: get rid of ppath in convert_initialized_extent()
fs/ext4/extents.c | 111 +++++++++++++++++++++++-------------------
fs/ext4/move_extent.c | 34 ++++++-------
2 files changed, 77 insertions(+), 68 deletions(-)
--
2.39.2
^ permalink raw reply
* [BUG] ext4: BUG_ON in ext4_write_inline_data (fs/ext4/inline.c:240)
From: Zw Tang @ 2026-04-21 11:32 UTC (permalink / raw)
To: tytso
Cc: Andreas Dilger, libaokun, jack, ojaswin, linux-ext4, linux-kernel,
yi.zhang
Hi,
I am reporting a kernel BUG in ext4 triggered by a syzkaller reproducer on
Linux 7.0.0-08391-g1d51b370a0f8.
The first fatal issue happens in the ext4 inline data write path:
sendfile64 -> ext4_file_write_iter -> ext4_da_write_end ->
ext4_write_inline_data_end -> ext4_write_inline_data.
The crash is reported as:
kernel BUG at fs/ext4/inline.c:240
and RIP points to:
ext4_write_inline_data+0x3d0/0x490
This looks like an ext4 inline-data boundary/state inconsistency triggered
while writing to an ext4 image crafted by syzkaller. The later
KASAN: slab-use-after-free in rwsem_down_write_slowpath() appears to be a
secondary effect after the primary ext4 BUG, likely during teardown/unlink
after the filesystem failure.
Reproducer:
C reproducer: pastebin.com/raw/3LmK5Kxg
console output: pastebin.com/raw/C0XjNMXp
kernel config: pastebin.com/raw/aq1V3cLk
Kernel:
HEAD commit:
git tree: <e.g. torvalds/linux>
kernel version: 7.0.0-08391-g1d51b370a0f8 #1 PREEMPT(lazy) (QEMU)
Relevant log:
[ 1329.147750] kernel BUG at fs/ext4/inline.c:240!
[ 1329.148692] Oops: invalid opcode: 0000 [#1] SMP KASAN
[ 1329.149543] CPU: 0 UID: 0 PID: 334 Comm: repro1 Tainted: G W
7.0.0-08391-g1d51b370a0f8 #1 PREEMPT(lazy)
[ 1329.153249] RIP: 0010:ext4_write_inline_data+0x3d0/0x490
[ 1329.167978] ext4_write_inline_data_end+0x293/0xc90
[ 1329.170566] ext4_da_write_end+0x521/0xec0
[ 1329.176842] ext4_buffered_write_iter+0x11a/0x430
[ 1329.177610] ext4_file_write_iter+0x561/0x1840
[ 1329.185052] iter_file_splice_write+0xa33/0x11c0
[ 1329.190820] direct_splice_actor+0x18f/0x7a0
[ 1329.198893] do_splice_direct+0x41/0x50
[ 1329.200276] do_sendfile+0xa86/0xda0
[ 1329.203110] __x64_sys_sendfile64+0x1cf/0x210
There is also an ext4 metadata inconsistency message right after the BUG:
[ 1329.221770] EXT4-fs error (device loop1):
ext4_mb_generate_buddy:1314: group 0, block bitmap and bg descriptor
inconsistent: 25 vs 150994969 free clusters
and later a secondary report:
[ 1329.274881] BUG: KASAN: slab-use-after-free in
rwsem_down_write_slowpath+0x15e9/0x1640
Based on the log, I believe the primary issue to investigate is the BUG in
fs/ext4/inline.c, while the later rwsem report is probably fallout after
the ext4 failure.
Please let me know if more information is needed.
Thanks.
^ permalink raw reply
* Re: [patch 33/38] powerpc: Select ARCH_HAS_RANDOM_ENTROPY
From: Mukesh Kumar Chaurasiya @ 2026-04-21 11:22 UTC (permalink / raw)
To: Thomas Gleixner
Cc: LKML, Michael Ellerman, linuxppc-dev, Arnd Bergmann, x86,
Lu Baolu, iommu, Michael Grzeschik, netdev, linux-wireless,
Herbert Xu, linux-crypto, Vlastimil Babka, linux-mm,
David Woodhouse, Bernie Thompson, linux-fbdev, Theodore Tso,
linux-ext4, Andrew Morton, Uladzislau Rezki, Marco Elver,
Dmitry Vyukov, kasan-dev, Andrey Ryabinin, Thomas Sailer,
linux-hams, Jason A. Donenfeld, Richard Henderson, linux-alpha,
Russell King, linux-arm-kernel, Catalin Marinas, Huacai Chen,
loongarch, Geert Uytterhoeven, linux-m68k, Dinh Nguyen,
Jonas Bonn, linux-openrisc, Helge Deller, linux-parisc,
Paul Walmsley, linux-riscv, Heiko Carstens, linux-s390,
David S. Miller, sparclinux
In-Reply-To: <20260410120319.789114053@kernel.org>
On Fri, Apr 10, 2026 at 02:21:09PM +0200, Thomas Gleixner wrote:
> The only remaining usage of get_cycles() is to provide random_get_entropy().
>
> Switch powerpc over to the new scheme of selecting ARCH_HAS_RANDOM_ENTROPY
> and providing random_get_entropy() in asm/random.h.
>
> Remove asm/timex.h as it has no functionality anymore.
>
> Signed-off-by: Thomas Gleixner <tglx@kernel.org>
> Cc: Michael Ellerman <mpe@ellerman.id.au>
> Cc: linuxppc-dev@lists.ozlabs.org
> ---
> arch/powerpc/Kconfig | 1 +
> arch/powerpc/include/asm/random.h | 13 +++++++++++++
> arch/powerpc/include/asm/timex.h | 21 ---------------------
> 3 files changed, 14 insertions(+), 21 deletions(-)
>
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -150,6 +150,7 @@ config PPC
> select ARCH_HAS_PREEMPT_LAZY
> select ARCH_HAS_PTDUMP
> select ARCH_HAS_PTE_SPECIAL
> + select ARCH_HAS_RANDOM_ENTROPY
> select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
> select ARCH_HAS_SET_MEMORY
> select ARCH_HAS_STRICT_KERNEL_RWX if (PPC_BOOK3S || PPC_8xx) && !HIBERNATION
> --- /dev/null
> +++ b/arch/powerpc/include/asm/random.h
> @@ -0,0 +1,13 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_RANDOM_H
> +#define _ASM_POWERPC_RANDOM_H
> +
> +#include <asm/cputable.h>
> +#include <asm/vdso/timebase.h>
> +
> +static inline unsigned long random_get_entropy(void)
> +{
> + return mftb();
> +}
> +
> +#endif /* _ASM_POWERPC_RANDOM_H */
> --- a/arch/powerpc/include/asm/timex.h
> +++ b/arch/powerpc/include/asm/timex.h
> @@ -1,21 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0 */
> -#ifndef _ASM_POWERPC_TIMEX_H
> -#define _ASM_POWERPC_TIMEX_H
> -
> -#ifdef __KERNEL__
> -
> -/*
> - * PowerPC architecture timex specifications
> - */
> -
> -#include <asm/cputable.h>
> -#include <asm/vdso/timebase.h>
> -
> -static inline cycles_t get_cycles(void)
> -{
> - return mftb();
> -}
> -#define get_cycles get_cycles
> -
> -#endif /* __KERNEL__ */
> -#endif /* _ASM_POWERPC_TIMEX_H */
>
Build tested this series with allmodconfig and allyesconfig for ppc64le
on a ppc64le machine.
tree: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git getcycles-v1
Boot tested for this series on powernv9 qemu, powernv10 qemu and pSeries
power11 hardware.
Tested-by: Mukesh Kumar Chaurasiya (IBM) <mkchauras@gmail.com>
Reviewed-by: Mukesh Kumar Chaurasiya (IBM) <mkchauras@gmail.com>
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox