Linux EXT4 FS development
 help / color / mirror / Atom feed
* Re: [PATCH] generic/790: test post-EOF gap zeroing persistence
From: Brian Foster @ 2026-04-22 13:22 UTC (permalink / raw)
  To: Zhang Yi
  Cc: fstests, zlang, linux-ext4, linux-fsdevel, jack, yi.zhang,
	yizhang089, yangerkun
In-Reply-To: <20260422015246.4132376-1-yi.zhang@huaweicloud.com>

On Wed, Apr 22, 2026 at 09:52:46AM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Test that extending a file past a non-block-aligned EOF correctly
> zero-fills the gap [old_EOF, block_boundary), and that this zeroing
> persists through a filesystem shutdown+remount cycle.
> 
> Stale data beyond EOF can persist on disk when append write data blocks
> are flushed before the i_size metadata update, or when concurrent append
> writeback and mmap writes persist non-zero data past EOF. Subsequent
> post-EOF operations (append write, fallocate, truncate up) must
> zero-fill and persist the gap to prevent exposing stale data.
> 
> The test pollutes the file's last physical block (via FIEMAP + raw
> device write) with a sentinel pattern beyond i_size, then performs each
> extend operation and verifies the gap is zeroed both in memory and on
> disk.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
> This is the case Jan Kara pointed out during my work on the ext4
> buffered I/O to iomap conversion. This case is similar to generic/363,
> but generic/363 doesn't provide persistent testing. For details:
> 
>  https://lore.kernel.org/linux-ext4/jgotl7vzzuzm6dvz5zfgk6haodxvunb4hq556pzh4hqqwvnhxq@lr3jiedhqh7c/
> 
>  tests/generic/790     | 155 ++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/790.out |   4 ++
>  2 files changed, 159 insertions(+)
>  create mode 100755 tests/generic/790
>  create mode 100644 tests/generic/790.out
> 
> diff --git a/tests/generic/790 b/tests/generic/790
> new file mode 100755
> index 00000000..5d8f61f9
> --- /dev/null
> +++ b/tests/generic/790
> @@ -0,0 +1,155 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Huawei.  All Rights Reserved.
> +#
> +# FS QA Test No. 790
> +#
> +# Test that extending a file past a non-block-aligned EOF correctly zero-fills
> +# the gap [old_EOF, block_boundary), and that this zeroing persists through a
> +# filesystem shutdown+remount cycle.
> +#

Nice test! This is a great idea.

> +# Stale data beyond EOF can persist on disk when:
> +# 1) append write data blocks are flushed before the i_size metadata update,
> +#    and the system crashes in this window.

Maybe it's wording or I'm missing something, but how would "append write
data blocks" be flushed before i_size updates? Wouldn't writeback toss
them or zero the post-eof range of a folio? Do you mean to refer to
"on-disk size update" specifically (where I'm reading it as
inode->i_size)?

> +# 2) concurrent append writeback and mmap writes persist non-zero data past EOF.
> +#
> +# Subsequent post-EOF operations (append write, fallocate, truncate up) must
> +# zero-fill and persist the gap to prevent exposing stale data.
> +#
> +# The test pollutes the file's last physical block (via FIEMAP + raw device
> +# write) with a sentinel pattern beyond i_size, then performs each extend
> +# operation and verifies the gap is zeroed both in memory and on disk.
> +#
...
> +_test_eof_zeroing()
> +{
> +	local test_name="$1"
> +	local extend_cmd="$2"
> +	local file=$SCRATCH_MNT/testfile_${test_name}
> +
> +	echo "$test_name" | tee -a $seqres.full
> +
> +	# Compute non-block-aligned EOF offset
> +	local gap_bytes=16
> +	local eof_offset=$((blksz - gap_bytes))
> +
> +	# Step 1: Write one full block to ensure the filesystem allocates a
> +	#         physical block for the file instead of using inline data.
> +	$XFS_IO_PROG -f -c "pwrite -S 0x5a 0 $blksz" -c fsync \
> +		"$file" >> $seqres.full 2>&1
> +
> +	# Step 2: Get physical block offset on device via FIEMAP
> +	local phys_offset
> +	phys_offset=$(_get_phys_offset "$file")
> +	if [ -z "$phys_offset" ]; then
> +		_fail "$test_name: failed to get physical block offset via fiemap"
> +	fi
> +
> +	# Step 3: Truncate file to non-block-aligned size and fsync.
> +	#         The on-disk region [eof_offset, blksz) may or may not be
> +	#         zeroed by the filesystem at this point.
> +	$XFS_IO_PROG -c "truncate $eof_offset" -c fsync \
> +		"$file" >> $seqres.full 2>&1
> +
> +	# Step 4: Unmount and restore the physical block to all-0x5a on disk.
> +	#         This bypasses the kernel's pagecache EOF-zeroing to ensure
> +	#         the stale pattern is present on disk. Then remount.
> +	_scratch_unmount
> +	$XFS_IO_PROG -d -c "pwrite -S 0x5a $phys_offset $blksz" \
> +		$SCRATCH_DEV >> $seqres.full 2>&1
> +	_scratch_mount >> $seqres.full 2>&1
> +
> +	# Verify file size is still eof_offset after remount
> +	local sz
> +	sz=$(stat -c %s "$file")
> +	if [ "$sz" -ne "$eof_offset" ]; then
> +		_fail "$test_name: file size wrong after remount: $sz != $eof_offset"
> +	fi

I was initially curious why we'd want to do this, but after further
thought I wonder if it might make more sense to check file size against
the extended size after the shutdown/mount cycle below (but before
checking the gap range). That way we know the size update was
logged/recovered correctly and we're about to read from a file range
within eof. Hm?

Those couple nits aside this all looks pretty good to me.

Brian

> +
> +	# Step 5: Execute the extend operation.
> +	$XFS_IO_PROG -c "$extend_cmd" "$file" >> $seqres.full 2>&1
> +
> +	# Step 6: Verify gap [eof_offset, blksz) is zeroed BEFORE shutdown
> +	_check_gap_zero "$file" $eof_offset $gap_bytes "before shutdown" || return 1
> +
> +	# Step 7: Sync the extended range and shutdown the filesystem with
> +	#         journal flush. This persists the file size extending, and
> +	#         the filesystem should persist the zeroed data in the gap
> +	#         range as well.
> +	if [ "$extend_cmd" != "${extend_cmd#pwrite}" ]; then
> +		$XFS_IO_PROG -c "sync_range -w $blksz $blksz" \
> +			"$file" >> $seqres.full 2>&1
> +	fi
> +	_scratch_shutdown -f
> +
> +	# Step 8: Remount and verify gap is still zeroed
> +	_scratch_cycle_mount
> +	_check_gap_zero "$file" $eof_offset $gap_bytes "after shutdown+remount" || return 1
> +}
> +
> +_scratch_mkfs >> $seqres.full 2>&1
> +_scratch_mount
> +
> +blksz=$(_get_block_size $SCRATCH_MNT)
> +
> +# Test three variants of EOF-extending operations
> +_test_eof_zeroing "append_write" "pwrite -S 0x42 $blksz $blksz"
> +_test_eof_zeroing "truncate_up" "truncate $((blksz * 2))"
> +_test_eof_zeroing "fallocate" "falloc $blksz $blksz"
> +
> +# success, all done
> +status=0
> +exit
> diff --git a/tests/generic/790.out b/tests/generic/790.out
> new file mode 100644
> index 00000000..e5e2cc09
> --- /dev/null
> +++ b/tests/generic/790.out
> @@ -0,0 +1,4 @@
> +QA output created by 790
> +append_write
> +truncate_up
> +fallocate
> -- 
> 2.52.0
> 
> 


^ permalink raw reply

* Re: [PATCH v2] iomap: avoid memset iomap when iter is done
From: Christian Brauner @ 2026-04-22 12:56 UTC (permalink / raw)
  To: djwong, hch, linux-xfs, linux-fsdevel, linux-ext4, Fengnan Chang
  Cc: Christian Brauner, lidiangang, Fengnan Chang
In-Reply-To: <20260420061630.62077-1-changfengnan@bytedance.com>

On Mon, 20 Apr 2026 14:16:30 +0800, Fengnan Chang wrote:
> When iomap_iter() finishes its iteration (returns <= 0), it is no longer
> necessary to memset the entire iomap and srcmap structures.
> 
> In high-IOPS scenarios (like 4k randread NVMe polling with io_uring),
> where the majority of I/Os complete in a single extent map, this wasted
> memory write bandwidth, as the caller will just discard the iterator.
> Use this command to test:
> taskset -c 30 ./t/io_uring -p1 -d512 -b4096 -s32 -c32 -F1 -B1 -R1 -X1
> -n1 -P1 /mnt/testfile
> IOPS improve about 5% on ext4 and XFS.
> 
> [...]

Applied to the vfs-7.2.iomap branch of the vfs/vfs.git tree.
Patches in the vfs-7.2.iomap branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.2.iomap

[1/1] iomap: avoid memset iomap when iter is done
      https://git.kernel.org/vfs/vfs/c/72fa5c7e5c81

^ permalink raw reply

* Re: [PATCH v2 3/3] ext4: derive f_fsid from block device to avoid collisions
From: Anand Jain @ 2026-04-22 11:39 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Theodore Tso, Darrick J. Wong, linux-ext4, linux-btrfs, linux-xfs,
	Anand Jain, dsterba
In-Reply-To: <aeHikJwRmFdDg5FW@infradead.org>



>> It seems we've reached the functional limits of f_fsid.
>> If we want to solve this properly for Overlayfs, NFS handles,
>> complex system monitoring, etc., we need a new identifier (let's call
>> it f_fsid_v2) that meets the following requirements:
>>
>>   System-wide Uniqueness: Must distinguish between cloned filesystems.
>>
>>   Persistence: Must remain consistent across reboots/HW re-enumeration.

 My mistake- this should be:

  Consistency: Must remain consistent across reboots/HW re-enumeration.


>>
>>   Non-On-Disk: Must not be stored on-disk.
> 
> The third requirement doesn't make much sense to me.  If it is
> persistent, it or something it can be derived from must be stored
> on-disk.

 I hope it makes sense now with "Persistence" replaced by
 "Consistency" for the 2nd requirement (above).


>> One possible implementation for f_fsid_v2 could be:
>>
>>    f_fsid_v2 =  hash(s_uuid, block_device_serial, [subvol_id])
>>
>> For pseudo block devices (virtio-blk, loop, nbd, brd,..),
>> the serial could be derived recursively:
>>
>>    serial_number = hash(backing_file.f_fsid_v2, backing_file.ino)
> 
> What is the point in this?  All of this seems to be better served
> by s_uuid.

The goal is to fix duplicate f_fsid issues in cloned filesystems
by creating a unique, reboot-consistent ID. This allows the
source and clone to remain identical (sharing same uuid) while
still being individually identifiable.

>> Note on Hardware Serials:
>>  Standard storage protocols (T10, NVMe, SAS) mandate unique,
>>  persistent serials per LUN. While I've seen T10 protocol
>>  violations during my time authoring Solaris HBA drivers, I
>>  believe these outliers shouldn't dictate the design.
> 
> No, T10 does not actually mandate unique identifiers, NVMe does, but the
> implementations are often totally broken.

Right. Newer SPC-3 (and above) compliant devices must support
the Inquiry CDB EVPD flag and provide page 0x83 for identification,
which is what we typically use for multipathing.
These are globally unique. And, we can overlook legacy
drives, as they've probably been past their EOSL for a while now.

We can tweak the algorithm as follows:

  ID = hash(s_uuid, device_identifier_id,
            partition_start_lba, partition_end_lba,
            [subvol_id], [file.ino])

This is an ID which remains consistent across reboots while
staying unique within the system, which we can use for f_fsid.

Thanks.

^ permalink raw reply

* Re: [PATCH v2] iomap: avoid memset iomap when iter is done
From: Brian Foster @ 2026-04-22 10:59 UTC (permalink / raw)
  To: Fengnan Chang
  Cc: brauner, djwong, hch, linux-xfs, linux-fsdevel, linux-ext4,
	lidiangang, Fengnan Chang
In-Reply-To: <20260420061630.62077-1-changfengnan@bytedance.com>

On Mon, Apr 20, 2026 at 02:16:30PM +0800, Fengnan Chang wrote:
> When iomap_iter() finishes its iteration (returns <= 0), it is no longer
> necessary to memset the entire iomap and srcmap structures.
> 
> In high-IOPS scenarios (like 4k randread NVMe polling with io_uring),
> where the majority of I/Os complete in a single extent map, this wasted
> memory write bandwidth, as the caller will just discard the iterator.
> Use this command to test:
> taskset -c 30 ./t/io_uring -p1 -d512 -b4096 -s32 -c32 -F1 -B1 -R1 -X1
> -n1 -P1 /mnt/testfile
> IOPS improve about 5% on ext4 and XFS.
> 
> However, we MUST still call iomap_iter_reset_iomap() to release the
> folio_batch if IOMAP_F_FOLIO_BATCH is set, otherwise we leak page
> references. Therefore, split the cleanup logic: always release the
> folio_batch, but skip the memset() when ret <= 0.
> 
> Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/iomap/iter.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c
> index c04796f6e57f..e4a29829591a 100644
> --- a/fs/iomap/iter.c
> +++ b/fs/iomap/iter.c
> @@ -6,17 +6,13 @@
>  #include <linux/iomap.h>
>  #include "trace.h"
>  
> -static inline void iomap_iter_reset_iomap(struct iomap_iter *iter)
> +static inline void iomap_iter_clean_fbatch(struct iomap_iter *iter)
>  {
>  	if (iter->iomap.flags & IOMAP_F_FOLIO_BATCH) {
>  		folio_batch_release(iter->fbatch);
>  		folio_batch_reinit(iter->fbatch);
>  		iter->iomap.flags &= ~IOMAP_F_FOLIO_BATCH;
>  	}
> -
> -	iter->status = 0;
> -	memset(&iter->iomap, 0, sizeof(iter->iomap));
> -	memset(&iter->srcmap, 0, sizeof(iter->srcmap));
>  }
>  
>  /* Advance the current iterator position and decrement the remaining length */
> @@ -102,10 +98,14 @@ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops)
>  		ret = 0;
>  	else
>  		ret = 1;
> -	iomap_iter_reset_iomap(iter);
> +	iomap_iter_clean_fbatch(iter);
> +	iter->status = 0;
>  	if (ret <= 0)
>  		return ret;
>  
> +	memset(&iter->iomap, 0, sizeof(iter->iomap));
> +	memset(&iter->srcmap, 0, sizeof(iter->srcmap));
> +
>  begin:
>  	ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags,
>  			       &iter->iomap, &iter->srcmap);
> -- 
> 2.39.5 (Apple Git-154)
> 
> 


^ permalink raw reply

* Re: [PATCH v8 03/22] ovl: use core fsverity ensure info interface
From: Andrey Albershteyn @ 2026-04-22  9:59 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Andrey Albershteyn, linux-xfs, fsverity, linux-fsdevel, hch,
	linux-ext4, linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong,
	Amir Goldstein
In-Reply-To: <20260421214457.GC37143@quark>

On 2026-04-21 14:44:57, Eric Biggers wrote:
> On Mon, Apr 20, 2026 at 01:46:50PM +0200, Andrey Albershteyn wrote:
> >  int ovl_ensure_verity_loaded(const struct path *datapath)
> >  {
> >  	struct inode *inode = d_inode(datapath->dentry);
> > -	struct file *filp;
> >  
> > -	if (!fsverity_active(inode) && IS_VERITY(inode)) {
> > -		/*
> > -		 * If this inode was not yet opened, the verity info hasn't been
> > -		 * loaded yet, so we need to do that here to force it into memory.
> > -		 */
> > -		filp = kernel_file_open(datapath, O_RDONLY, current_cred());
> > -		if (IS_ERR(filp))
> > -			return PTR_ERR(filp);
> > -		fput(filp);
> > -	}
> > +	if (fsverity_active(inode))
> > +		return fsverity_ensure_verity_info(inode);
> 
> Not sure whether I should review this version or the version in git, but
> both seem wrong.  

Sorry, I forgot to push, this one is the latest, the one on git is
v7. I will push v8 now.

> The 'if (!fsverity_active(inode) && IS_VERITY(inode)) {' condition
> should stay

Why? With recent changes, the fsverity_active() now checks for
IS_VERITY() instead of verity_descriptor.

> , but fsverity_ensure_verity_info() will need to
> gain a !CONFIG_FS_VERITY stub to fix the build error.

With "if (fsverity_active(inode))" I think this is not necessary, as
fsverity_active() will always be false when CONFIG_FS_VERITY is
disabled, and the if-case is optimized out.

-- 
- Andrey


^ permalink raw reply

* Re: [PATCH v3 v3 0/2] add blocks_allocated to mb_stats and clear mb_stats
From: liubaolin @ 2026-04-22  9:55 UTC (permalink / raw)
  To: tytso, adilger.kernel, ojaswin, ritesh.list, yi.zhang
  Cc: linux-ext4, linux-kernel, wangguanyu
In-Reply-To: <20260422015026.7170-1-liubaolin12138@163.com>



在 2026/4/22 9:50, Baolin Liu 写道:
> The series contains two patches:
>   - add blocks_allocated to /proc/fs/ext4/<dev>/mb_stats
>   - allow writing 0 to /proc/fs/ext4/<dev>/mb_stats to clear the current
>     mballoc statistics
> 
> Changes since v2:
>   - Add mb_stats documentation to patch 2
>   - Add Reviewed-by tags
> 
> Baolin Liu (2):
>    ext4: add blocks_allocated to mb_stats output
>    ext4: allow clearing mballoc stats through mb_stats
> 
>   Documentation/admin-guide/ext4.rst |  5 ++++
>   Documentation/filesystems/proc.rst |  3 +++
>   fs/ext4/ext4.h                     |  1 +
>   fs/ext4/mballoc.c                  | 31 +++++++++++++++++++++++
>   fs/ext4/sysfs.c                    | 40 ++++++++++++++++++++++++++++--
>   5 files changed, 78 insertions(+), 2 deletions(-)
> 
Dear All,
    This commit adds the description of the ext4 proc parameter mb_stats 
to the corresponding documentation.
    I noticed that the documentation also lacks descriptions for the 
es_shrinker_info, fc_info, mb_structs_summary, and options parameters.
    However, these parameters are irrelevant to the work done in this 
series of patches.
    I will commit a separate patch later to explain the other parameters.
    This time, I only explain mb_stats.

    Thanks,
    Baolin



^ permalink raw reply

* Re: [PATCH v8 00/22] fs-verity support for XFS with post EOF merkle tree
From: Andrey Albershteyn @ 2026-04-22  8:58 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Andrey Albershteyn, linux-xfs, fsverity, linux-fsdevel, hch,
	linux-ext4, linux-f2fs-devel, linux-btrfs, linux-unionfs, djwong,
	david
In-Reply-To: <20260421214316.GB37143@quark>

On 2026-04-21 14:43:16, Eric Biggers wrote:
> On Mon, Apr 20, 2026 at 01:46:47PM +0200, Andrey Albershteyn wrote:
> > This series based on v7.0 with Christoph's read ioends patchset [1].
> > 
> > kernel:
> > https://git.kernel.org/pub/scm/linux/kernel/git/aalbersh/xfs-linux.git/log/?h=b4/fsverity
> 
> FYI: the git repository doesn't match what was actually sent out.  For
> example the patch "ovl: use core fsverity ensure info interface" is a
> bit different.  The version in git (incorrectly, I think) ignores the
> error code, while the patch returns it.
> 
> - Eric
> 

Thanks, you're right, I forgot to push

-- 
- Andrey


^ permalink raw reply

* Re: [PATCH v2] iomap: avoid memset iomap when iter is done
From: Christoph Hellwig @ 2026-04-22  6:34 UTC (permalink / raw)
  To: Fengnan Chang
  Cc: brauner, djwong, hch, linux-xfs, linux-fsdevel, linux-ext4,
	lidiangang, Fengnan Chang
In-Reply-To: <20260420061630.62077-1-changfengnan@bytedance.com>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply

* Re: [RFC PATCH] iomap: add fast read path for small direct I/O
From: Fengnan @ 2026-04-22  2:43 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Ojaswin Mujoo, Fengnan Chang, brauner, djwong, linux-xfs,
	linux-fsdevel, linux-ext4, lidiangang
In-Reply-To: <aef79R24wFemmUdN@dread>

[-- Attachment #1: Type: text/plain, Size: 1163 bytes --]

在 2026/4/22 06:36, Dave Chinner 写道:
> On Tue, Apr 21, 2026 at 11:19:31AM +0800, Fengnan wrote:
>> 在 2026/4/21 07:59, Dave Chinner 写道:
>>> I'm clearly missing something here. I'm trying to work out why the
>>> profiles show what they do, but there's differences between them
>>> that don't make obvious sense to me.
>>>
>>> It would also be useful to have XFS profiles, because it has a
>>> larger CPU cache footprint than ext4. If what the profiles are
>>> showing is a result of CPU cache residency artifacts, then we'll see
>>> different profile (and, potentially, performance) artifacts with
>>> XFS...
>> The XFS flame graph is also attached now.
>> IOPS: 1.92M->2.3M.
> The callchains in both XFS flame graphs are completely bogus:
>
> <io_uring entry>
> ....
> io_read
> __io_read
> xfs_inode_free_eofblocks
> xfs_prep_free_cowblocks
> iomap_dio_rw
> iomap_dio_simple_read
> xfs_mountfs
> ....
>
> Can you regenerate the profiles, please, and this time check that
> they make sense before posting them?

Sorry, I didn't check the XFS graphs before posting. New flame graphs
are attached now, and the callchains make sense.

>
> -Dave.

[-- Attachment #2: xfs_base.svg --]
[-- Type: image/svg+xml, Size: 220396 bytes --]

[-- Attachment #3: xfs_patch.svg --]
[-- Type: image/svg+xml, Size: 201551 bytes --]

^ permalink raw reply

* [PATCH v3 21/22] ext4: update i_disksize to i_size on ordered I/O completion
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Currently, i_disksize is updated after ordered data writeback to prevent
exposing stale data in the post-EOF block. However, operations like
fallocate and truncate update i_disksize directly. If the new i_disksize
exceeds the original value, metadata may be written back before the
zeroed data is persisted. To avoid this, we defer i_disksize updates
when i_ordered_len is non-zero, only applying them after ordered I/O
completes.

But this deferral introduces a new problem: on ordered I/O completion,
i_disksize is updated only to the end of that specific I/O, discarding
any later updates (e.g., from fallocate) and causing filesystem
inconsistency. A potential fix would involve scanning for dirty or
writeback folios beyond the current position, then updating i_disksize
to the start of the first such folio or to i_size. However, folio
scanning is expensive and concurrency with operations like fallocate
makes this approach prohibitively complex.

Instead, update i_disksize directly to i_size upon ordered I/O
completion. This may expose zeroed data if dirty data within the range
is not yet written to disk after crash recovery, but it will never
expose stale data. This is limited to unaligned append writes and is
deemed acceptable.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h    | 40 +++++++++++++++++++++++++++++++---------
 fs/ext4/extents.c |  9 +++------
 fs/ext4/inode.c   |  3 ---
 fs/ext4/page-io.c | 23 ++++++++++++++++++-----
 4 files changed, 52 insertions(+), 23 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 760400395cb7..59dcec47675f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3495,13 +3495,21 @@ do {								\
 #define EXT4_FREECLUSTERS_WATERMARK 0
 #endif
 
-/* Update i_disksize. Requires i_rwsem to avoid races with truncate */
+/*
+ * Update i_disksize. Requires i_rwsem to avoid races with truncate.
+ *
+ * In the iomap buffered I/O path, a non-zero i_ordered_len indicates that
+ * an ordered I/O (zeroing the EOF partial block) is still in progress.
+ * In that case, i_disksize will be updated after the ordered data has
+ * been written out.
+ */
 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 {
 	WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
 		     !inode_is_locked(inode));
 	down_write(&EXT4_I(inode)->i_data_sem);
-	if (newsize > EXT4_I(inode)->i_disksize)
+	if (newsize > EXT4_I(inode)->i_disksize &&
+	    READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0)
 		WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
 	up_write(&EXT4_I(inode)->i_data_sem);
 }
@@ -3515,7 +3523,8 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
 		i_size_write(inode, newsize);
 		changed = 1;
 	}
-	if (newsize > EXT4_I(inode)->i_disksize) {
+	if (newsize > EXT4_I(inode)->i_disksize &&
+	    READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0) {
 		ext4_update_i_disksize(inode, newsize);
 		changed |= 2;
 	}
@@ -3523,19 +3532,32 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
 }
 
 /*
- * Set i_size and i_disksize to 'newsize'.
+ * Set i_size and i_disksize to 'newsize'.  In the iomap buffered I/O path,
+ * if i_ordered_len is non-zero and newsize exceeds the current i_disksize,
+ * the actual i_disksize update is deferred until after the ordered data is
+ * written out. In that case, i_disksize will be set to i_size upon I/O
+ * completion.
  *
  * Both i_rwsem and i_data_sem are required here to avoid races between
- * generic append writeback and concurrent truncate that also modify
- * i_size and i_disksize.
+ * generic append writeback (or ordered I/O writeback) and concurrent
+ * operations like fallocate and truncate that also modify i_size and
+ * i_disksize.
  */
-static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
+static inline void __ext4_set_inode_size(struct inode *inode, loff_t newsize)
 {
 	WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));
+	WARN_ON_ONCE(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));
 
-	down_write(&EXT4_I(inode)->i_data_sem);
 	i_size_write(inode, newsize);
-	EXT4_I(inode)->i_disksize = newsize;
+	if (READ_ONCE(EXT4_I(inode)->i_ordered_len) == 0 ||
+	    newsize < EXT4_I(inode)->i_disksize)
+		EXT4_I(inode)->i_disksize = newsize;
+}
+
+static inline void ext4_set_inode_size(struct inode *inode, loff_t newsize)
+{
+	down_write(&EXT4_I(inode)->i_data_sem);
+	__ext4_set_inode_size(inode, newsize);
 	up_write(&EXT4_I(inode)->i_data_sem);
 }
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 125f628e738a..e0c36cd920bf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5531,7 +5531,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
 	ext4_lblk_t start_lblk, end_lblk;
 	handle_t *handle;
 	unsigned int credits;
-	loff_t start, new_size;
+	loff_t start;
 	int ret;
 
 	trace_ext4_collapse_range(inode, offset, len);
@@ -5597,9 +5597,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
 		goto out_handle;
 	}
 
-	new_size = inode->i_size - len;
-	i_size_write(inode, new_size);
-	EXT4_I(inode)->i_disksize = new_size;
+	__ext4_set_inode_size(inode, inode->i_size - len);
 
 	up_write(&EXT4_I(inode)->i_data_sem);
 	ret = ext4_mark_inode_dirty(handle, inode);
@@ -5671,8 +5669,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
 	ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
 
 	/* Expand file to avoid data loss if there is error while shifting */
-	inode->i_size += len;
-	EXT4_I(inode)->i_disksize += len;
+	ext4_set_inode_size(inode, inode->i_size + len);
 	ret = ext4_mark_inode_dirty(handle, inode);
 	if (ret)
 		goto out_handle;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 17bd4403c782..d983336390c7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4805,9 +4805,6 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 	 * truncating up or performing an append write, because there might be
 	 * exposing stale on-disk data which may caused by concurrent post-EOF
 	 * mmap write during folio writeback.
-	 *
-	 * TODO: In the iomap path, handle this by updating i_disksize to
-	 * i_size after the zeroed data has been written back.
 	 */
 	if (did_zero && zero_written && !IS_DAX(inode)) {
 		if (ext4_should_order_data(inode)) {
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9c88671836fe..589c74b9f8a3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -647,13 +647,13 @@ static void ext4_iomap_wb_ordered_wait(struct inode *inode,
 }
 
 static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
-					 loff_t end)
+					 loff_t end, bool is_ordered)
 {
-	loff_t new_disksize = end;
+	loff_t new_disksize, i_size;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	int ret;
 
-	if (new_disksize <= READ_ONCE(ei->i_disksize))
+	if (end <= READ_ONCE(ei->i_disksize) && !is_ordered)
 		return 0;
 
 	/*
@@ -661,7 +661,18 @@ static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
 	 * are avoided by checking i_size under i_data_sem.
 	 */
 	down_write(&ei->i_data_sem);
-	new_disksize = min(new_disksize, i_size_read(inode));
+	i_size = i_size_read(inode);
+
+	/*
+	 * Update i_disksize to i_size when completing an ordered I/O that
+	 * zeroes the old EOF partial block. This ensures i_disksize is
+	 * correctly advanced during truncate-up on a blocksize-unaligned
+	 * file, preventing it from remaining stale. A downside is that
+	 * zeroed data may be exposed after crash recovery if the dirty
+	 * data in this range is not yet on disk, but stale data will
+	 * never be exposed.
+	 */
+	new_disksize = is_ordered ? i_size : min(end, i_size);
 	if (new_disksize > ei->i_disksize)
 		ei->i_disksize = new_disksize;
 	up_write(&ei->i_data_sem);
@@ -678,6 +689,7 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
 	struct super_block *sb = inode->i_sb;
 	loff_t pos = ioend->io_offset;
 	size_t size = ioend->io_size;
+	unsigned long io_mode = (unsigned long)ioend->io_private;
 	handle_t *handle;
 	int credits;
 	int ret, err;
@@ -707,7 +719,8 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
 			goto out_journal;
 	}
 
-	ret = ext4_iomap_wb_update_disksize(handle, inode, pos + size);
+	ret = ext4_iomap_wb_update_disksize(handle, inode, pos + size,
+			io_mode == EXT4_IOMAP_IOEND_ORDER_IO);
 out_journal:
 	err = ext4_journal_stop(handle);
 	if (!ret)
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 14/22] ext4: implement partial block zero range path using iomap
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with
ext4_iomap_block_zero_range() to implement the iomap block zeroing range
for ext4. ext4_iomap_block_zero_range() invokes iomap_zero_range() and
passes ext4_iomap_zero_begin() to locate and zero out a mapped partial
block or a dirty, unwritten partial block.

Note that zeroing out under an active handle can cause deadlock since
the order of acquiring the folio lock and starting a handle is
inconsistent with the iomap writeback procedure. Therefore,
ext4_iomap_block_zero_range() cannot be called under an active handle,
and we also cannot use data=ordered mode to ensure that zeroed data is
written back before updating i_disksize when performing a post-EOF
append write or a truncate up.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c | 91 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 26e1366b85fd..701b912db6fb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4103,6 +4103,50 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
 	return 0;
 }
 
+static int ext4_iomap_zero_begin(struct inode *inode,
+		loff_t offset, loff_t length, unsigned int flags,
+		struct iomap *iomap, struct iomap *srcmap)
+{
+	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
+	struct ext4_map_blocks map;
+	u8 blkbits = inode->i_blkbits;
+	unsigned int iomap_flags = 0;
+	int ret;
+
+	ret = ext4_emergency_state(inode->i_sb);
+	if (unlikely(ret))
+		return ret;
+
+	if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
+		return -EINVAL;
+
+	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * Look up dirty folios for unwritten mappings within EOF. Providing
+	 * this bypasses the flush iomap uses to trigger extent conversion
+	 * when unwritten mappings have dirty pagecache in need of zeroing.
+	 */
+	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+		loff_t offset = ((loff_t)map.m_lblk) << blkbits;
+		loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;
+
+		iomap_fill_dirty_folios(iter, &offset, end, &iomap_flags);
+		if ((offset >> blkbits) < map.m_lblk + map.m_len)
+			map.m_len = (offset >> blkbits) - map.m_lblk;
+	}
+
+	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+	iomap->flags |= iomap_flags;
+
+	return 0;
+}
+
+static const struct iomap_ops ext4_iomap_zero_ops = {
+	.iomap_begin = ext4_iomap_zero_begin,
+};
 
 const struct iomap_ops ext4_iomap_buffered_write_ops = {
 	.iomap_begin = ext4_iomap_buffered_write_begin,
@@ -4609,6 +4653,47 @@ static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from,
 	return err;
 }
 
+static int ext4_block_iomap_zero_range(struct inode *inode, loff_t from,
+				       loff_t length, bool *did_zero,
+				       bool *zero_written)
+{
+	int ret;
+
+	/*
+	 * Zeroing out under an active handle can cause deadlock since
+	 * the order of acquiring the folio lock and starting a handle is
+	 * inconsistent with the iomap writeback procedure.
+	 */
+	if (WARN_ON_ONCE(ext4_handle_valid(journal_current_handle())))
+		return -EINVAL;
+
+	/* The zeroing scope should not extend across a block. */
+	if (WARN_ON_ONCE((from >> inode->i_blkbits) !=
+			 ((from + length - 1) >> inode->i_blkbits)))
+		return -EINVAL;
+
+	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS) &&
+	    !(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
+		WARN_ON_ONCE(!inode_is_locked(inode) &&
+			!rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+
+	ret = iomap_zero_range(inode, from, length, did_zero,
+			       &ext4_iomap_zero_ops, &ext4_iomap_write_ops,
+			       NULL);
+	if (ret)
+		return ret;
+
+	/*
+	 * TODO: The iomap does not distinguish between different types of
+	 * zeroing and always sets zero_written if a zeroing operation is
+	 * performed, which may result in unnecessary order operations.
+	 */
+	if (did_zero && zero_written)
+		*zero_written = *did_zero;
+
+	return 0;
+}
+
 /*
  * Zeros out a mapping of length 'length' starting from file offset
  * 'from'.  The range to be zero'd must be contained with in one block.
@@ -4635,6 +4720,9 @@ static int ext4_block_zero_range(struct inode *inode,
 	} else if (ext4_should_journal_data(inode)) {
 		return ext4_block_journalled_zero_range(inode, from, length,
 							did_zero);
+	} else if (ext4_inode_buffered_iomap(inode)) {
+		return ext4_block_iomap_zero_range(inode, from, length,
+						   did_zero, zero_written);
 	}
 	return ext4_block_do_zero_range(inode, from, length, did_zero,
 					zero_written);
@@ -4675,6 +4763,9 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 	 * truncating up or performing an append write, because there might be
 	 * exposing stale on-disk data which may caused by concurrent post-EOF
 	 * mmap write during folio writeback.
+	 *
+	 * TODO: In the iomap path, handle this by updating i_disksize to
+	 * i_size after the zeroed data has been written back.
 	 */
 	if (ext4_should_order_data(inode) &&
 	    did_zero && zero_written && !IS_DAX(inode)) {
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 10/22] ext4: implement mmap path using iomap
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Introduce ext4_iomap_page_mkwrite() to implement the mmap iomap path for
ext4. Most of this work is delegated to iomap_page_mkwrite(), which only
needs to be called with either ext4_iomap_buffered_write_ops or
ext4_iomap_buffered_da_write_ops as its iomap ops to allocate and map
the blocks. However, the lock ordering of the folio lock and transaction
start is the opposite of that in the buffer_head buffered write path.
The locking documentation in super.c has been updated accordingly.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c | 32 +++++++++++++++++++++++++++++++-
 fs/ext4/super.c |  8 ++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 76ce43c64c30..26e1366b85fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4022,7 +4022,7 @@ static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
 	/* Inline data support is not yet available. */
 	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
 		return -ERANGE;
-	if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
+	if (WARN_ON_ONCE(!(flags & (IOMAP_WRITE | IOMAP_FAULT))))
 		return -EINVAL;
 
 	if (delalloc)
@@ -4082,6 +4082,14 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
 	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
 		return 0;
 
+	/*
+	 * iomap_page_mkwrite() will never fail in a way that requires delalloc
+	 * extents that it allocated to be revoked.  Hence never try to release
+	 * them here.
+	 */
+	if (flags & IOMAP_FAULT)
+		return 0;
+
 	/* Nothing to do if we've written the entire delalloc extent */
 	start_byte = iomap_last_written_block(inode, offset, written);
 	end_byte = round_up(offset + length, i_blocksize(inode));
@@ -7167,6 +7175,23 @@ static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio,
 	return ret;
 }
 
+static vm_fault_t ext4_iomap_page_mkwrite(struct vm_fault *vmf)
+{
+	struct inode *inode = file_inode(vmf->vma->vm_file);
+	const struct iomap_ops *iomap_ops;
+
+	/*
+	 * ext4_nonda_switch() could writeback this folio, so have to
+	 * call it before lock folio.
+	 */
+	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+		iomap_ops = &ext4_iomap_buffered_da_write_ops;
+	else
+		iomap_ops = &ext4_iomap_buffered_write_ops;
+
+	return iomap_page_mkwrite(vmf, iomap_ops, NULL);
+}
+
 vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -7189,6 +7214,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
 
 	filemap_invalidate_lock_shared(mapping);
 
+	if (ext4_inode_buffered_iomap(inode)) {
+		ret = ext4_iomap_page_mkwrite(vmf);
+		goto out;
+	}
+
 	err = ext4_convert_inline_data(inode);
 	if (err)
 		goto out_ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 51d87db53543..62bfe05a64bc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -100,8 +100,12 @@ static const struct fs_parameter_spec ext4_param_specs[];
  * Lock ordering
  *
  * page fault path:
- * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
- *   -> page lock -> i_data_sem (rw)
+ * - buffer_head path:
+ *   mmap_lock -> sb_start_pagefault -> invalidate_lock (r) ->
+ *     transaction start -> folio lock -> i_data_sem (rw)
+ * - iomap path:
+ *   mmap_lock -> sb_start_pagefault -> invalidate_lock (r) ->
+ *     folio lock -> transaction start -> i_data_sem (rw)
  *
  * buffered write path:
  * sb_start_write -> i_rwsem (w) -> mmap_lock
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 15/22] ext4: add block mapping tracepoints for iomap buffered I/O path
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Add tracepoints for iomap buffered read, write, partial block zeroing,
and writeback operations to help debug the iomap buffered I/O path.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c             |  6 +++++
 include/trace/events/ext4.h | 45 +++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 701b912db6fb..53fdcb50f3dd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3961,6 +3961,8 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
 	if (ret < 0)
 		return ret;
 
+	trace_ext4_iomap_buffered_read_begin(inode, &map, offset, length,
+					     flags);
 	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
 	return 0;
 }
@@ -4036,6 +4038,8 @@ static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
 	if (ret < 0)
 		return ret;
 
+	trace_ext4_iomap_buffered_write_begin(inode, &map, offset, length,
+					      flags);
 	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
 	return 0;
 }
@@ -4138,6 +4142,7 @@ static int ext4_iomap_zero_begin(struct inode *inode,
 			map.m_len = (offset >> blkbits) - map.m_lblk;
 	}
 
+	trace_ext4_iomap_zero_begin(inode, &map, offset, length, flags);
 	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
 	iomap->flags |= iomap_flags;
 
@@ -4306,6 +4311,7 @@ static int ext4_iomap_map_writeback_range(struct iomap_writepage_ctx *wpc,
 		return ret;
 	}
 out:
+	trace_ext4_iomap_map_writeback_range(inode, &map, offset, dirty_len, 0);
 	ext4_set_iomap(inode, &wpc->iomap, &map, offset, dirty_len, 0);
 	return 0;
 }
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f493642cf121..ebafa06cd191 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -3096,6 +3096,51 @@ TRACE_EVENT(ext4_move_extent_exit,
 		  __entry->ret)
 );
 
+DECLARE_EVENT_CLASS(ext4_set_iomap_class,
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
+		 loff_t offset, loff_t length, unsigned int flags),
+	TP_ARGS(inode, map, offset, length, flags),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u64, ino)
+		__field(ext4_lblk_t, m_lblk)
+		__field(unsigned int, m_len)
+		__field(unsigned int, m_flags)
+		__field(u64, m_seq)
+		__field(loff_t, offset)
+		__field(loff_t, length)
+		__field(unsigned int, iomap_flags)
+	),
+	TP_fast_assign(
+		__entry->dev		= inode->i_sb->s_dev;
+		__entry->ino		= inode->i_ino;
+		__entry->m_lblk		= map->m_lblk;
+		__entry->m_len		= map->m_len;
+		__entry->m_flags	= map->m_flags;
+		__entry->m_seq		= map->m_seq;
+		__entry->offset		= offset;
+		__entry->length		= length;
+		__entry->iomap_flags	= flags;
+
+	),
+	TP_printk("dev %d:%d ino %llu m_lblk %u m_len %u m_flags %s m_seq %llu orig_off 0x%llx orig_len 0x%llx iomap_flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->m_lblk, __entry->m_len,
+		  show_mflags(__entry->m_flags), __entry->m_seq,
+		  __entry->offset, __entry->length, __entry->iomap_flags)
+)
+
+#define DEFINE_SET_IOMAP_EVENT(name) \
+DEFINE_EVENT(ext4_set_iomap_class, name, \
+	TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, \
+		 loff_t offset, loff_t length, unsigned int flags), \
+	TP_ARGS(inode, map, offset, length, flags))
+
+DEFINE_SET_IOMAP_EVENT(ext4_iomap_buffered_read_begin);
+DEFINE_SET_IOMAP_EVENT(ext4_iomap_buffered_write_begin);
+DEFINE_SET_IOMAP_EVENT(ext4_iomap_map_writeback_range);
+DEFINE_SET_IOMAP_EVENT(ext4_iomap_zero_begin);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 18/22] ext4: introduce a mount option for iomap buffered I/O path
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Since the iomap buffered I/O path does not yet support all existing
features, it cannot be enabled by default. Introduce 'buffered_iomap'
and 'nobuffered_iomap' mount options to enable and disable the iomap
buffered I/O path for regular files.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h  | 1 +
 fs/ext4/inode.c | 2 ++
 fs/ext4/super.c | 7 +++++++
 3 files changed, 10 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 80d086d40990..60ba488b01c5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1281,6 +1281,7 @@ struct ext4_inode_info {
 						    * scanning in mballoc
 						    */
 #define EXT4_MOUNT2_ABORT		0x00000100 /* Abort filesystem */
+#define EXT4_MOUNT2_BUFFERED_IOMAP	0x00000200 /* Use iomap for buffered I/O */
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 57b5708235cf..d2f7af7922d7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5750,6 +5750,8 @@ void ext4_enable_buffered_iomap(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
+	if (!test_opt2(sb, BUFFERED_IOMAP))
+		return;
 	if (!S_ISREG(inode->i_mode))
 		return;
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 62bfe05a64bc..b2da4834b6bb 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1722,6 +1722,7 @@ enum {
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
 	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache,
 	Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan,
+	Opt_buffered_iomap, Opt_nobuffered_iomap,
 	Opt_errors, Opt_data, Opt_data_err, Opt_jqfmt, Opt_dax_type,
 #ifdef CONFIG_EXT4_DEBUG
 	Opt_fc_debug_max_replay, Opt_fc_debug_force
@@ -1860,6 +1861,8 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
 	fsparam_flag	("no_prefetch_block_bitmaps",
 						Opt_no_prefetch_block_bitmaps),
 	fsparam_s32	("mb_optimize_scan",	Opt_mb_optimize_scan),
+	fsparam_flag	("buffered_iomap",	Opt_buffered_iomap),
+	fsparam_flag	("nobuffered_iomap",	Opt_nobuffered_iomap),
 	fsparam_string	("check",		Opt_removed),	/* mount option from ext2/3 */
 	fsparam_flag	("nocheck",		Opt_removed),	/* mount option from ext2/3 */
 	fsparam_flag	("reservation",		Opt_removed),	/* mount option from ext2/3 */
@@ -1953,6 +1956,10 @@ static const struct mount_opts {
 	{Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
 	{Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS,
 	 MOPT_SET},
+	{Opt_buffered_iomap, EXT4_MOUNT2_BUFFERED_IOMAP,
+	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
+	{Opt_nobuffered_iomap, EXT4_MOUNT2_BUFFERED_IOMAP,
+	 MOPT_CLEAR | MOPT_2 | MOPT_EXT4_ONLY},
 #ifdef CONFIG_EXT4_DEBUG
 	{Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT,
 	 MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY},
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 17/22] ext4: partially enable iomap for the buffered I/O path of regular files
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Partially enable iomap for the buffered I/O path of regular files. We
now support default filesystem features, mount options, and the bigalloc
feature. However, inline data, fsverity, fscrypt, online
defragmentation, and data=journal mode are not yet supported. Some of
these features are expected to be gradually supported in the future. The
filesystem will automatically fall back to the original buffer_head path
if these mount options or features are enabled.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h      |  1 +
 fs/ext4/ext4_jbd2.c |  1 +
 fs/ext4/ialloc.c    |  1 +
 fs/ext4/inode.c     | 36 ++++++++++++++++++++++++++++++++++++
 4 files changed, 39 insertions(+)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0ffa81f86bc5..80d086d40990 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3059,6 +3059,7 @@ int ext4_walk_page_buffers(handle_t *handle,
 int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 				struct buffer_head *bh);
 void ext4_set_inode_mapping_order(struct inode *inode);
+void ext4_enable_buffered_iomap(struct inode *inode);
 int ext4_nonda_switch(struct super_block *sb);
 #define FALL_BACK_TO_NONDELALLOC 1
 #define CONVERT_INLINE_DATA	 2
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 9a8c225f2753..9b25a1c414b9 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,6 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
 	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
 	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
 	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+	    !ext4_inode_buffered_iomap(inode) &&
 	    !test_opt(inode->i_sb, DELALLOC))) {
 		/* We do not support data journalling for encrypted data */
 		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3fd8f0099852..ea64b9e9e382 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1340,6 +1340,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
 		}
 	}
 
+	ext4_enable_buffered_iomap(inode);
 	ext4_set_inode_mapping_order(inode);
 
 	ext4_update_inode_fsync_trans(handle, inode, 1);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 53fdcb50f3dd..57b5708235cf 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -918,6 +918,9 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 	if (ext4_has_inline_data(inode))
 		return -ERANGE;
+	/* inodes using the iomap buffered I/O path should not go here. */
+	if (WARN_ON_ONCE(ext4_inode_buffered_iomap(inode)))
+		return -EINVAL;
 
 	map.m_lblk = iblock;
 	map.m_len = bh->b_size >> inode->i_blkbits;
@@ -2797,6 +2800,12 @@ static int ext4_do_writepages(struct mpage_da_data *mpd)
 	if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		goto out_writepages;
 
+	/* inodes using the iomap buffered I/O path should not go here. */
+	if (WARN_ON_ONCE(ext4_inode_buffered_iomap(inode))) {
+		ret = -EINVAL;
+		goto out_writepages;
+	}
+
 	/*
 	 * If the filesystem has aborted, it is read-only, so return
 	 * right away instead of dumping stack traces later on that
@@ -5737,6 +5746,31 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
 	return -EFSCORRUPTED;
 }
 
+void ext4_enable_buffered_iomap(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+
+	if (!S_ISREG(inode->i_mode))
+		return;
+	if (ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE))
+		return;
+
+	/* Unsupported Features */
+	if (ext4_has_feature_inline_data(sb))
+		return;
+	if (ext4_has_feature_verity(sb))
+		return;
+	if (ext4_has_feature_encrypt(sb))
+		return;
+	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+		return;
+	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+		return;
+
+	ext4_set_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP);
+}
+
 void ext4_set_inode_mapping_order(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
@@ -6022,6 +6056,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
 	if (ret)
 		goto bad_inode;
 
+	ext4_enable_buffered_iomap(inode);
+
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 22/22] ext4: add tracepoints for ordered I/O in the iomap buffered I/O path
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

To facilitate the tracing of ordered I/Os in the iomap buffered I/O
path, add tracepoints to track the ordered I/O flow:

 - ext4_iomap_ordered_submit: trace when ordered I/O is being submitted;
 - ext4_iomap_ordered_complete: trace when ordered I/O completes;
 - ext4_iomap_disksize_update: trace when i_disksize is updated, either
   when appending I/O or when an ordered I/O completes;
 - ext4_block_zero_eof: trace zeroing of the partial block at EOF.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c             |  4 ++
 fs/ext4/page-io.c           |  8 +++
 include/trace/events/ext4.h | 97 +++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d983336390c7..ca4284da2a2b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4377,6 +4377,9 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
 				     ioend->io_offset + ioend->io_size);
 
 		if (start <= order_lblk && end >= order_lblk + order_len) {
+			trace_ext4_iomap_ordered_submit(ioend->io_inode,
+					ioend->io_offset, ioend->io_size,
+					order_lblk, order_len);
 			ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
 			ioend->io_private = (void *)EXT4_IOMAP_IOEND_ORDER_IO;
 			ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
@@ -4879,6 +4882,7 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 		}
 	}
 
+	trace_ext4_block_zero_eof(inode, from, length, did_zero, zero_written);
 	return 0;
 }
 
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 589c74b9f8a3..979a88c38fff 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -31,6 +31,8 @@
 #include "xattr.h"
 #include "acl.h"
 
+#include <trace/events/ext4.h>
+
 static struct kmem_cache *io_end_cachep;
 static struct kmem_cache *io_end_vec_cachep;
 
@@ -673,6 +675,9 @@ static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
 	 * never be exposed.
 	 */
 	new_disksize = is_ordered ? i_size : min(end, i_size);
+	trace_ext4_iomap_disksize_update(inode, end, i_size, ei->i_disksize,
+					 new_disksize, is_ordered);
+
 	if (new_disksize > ei->i_disksize)
 		ei->i_disksize = new_disksize;
 	up_write(&ei->i_data_sem);
@@ -782,6 +787,9 @@ void ext4_iomap_end_bio(struct bio *bio)
 		 * waiters.
 		 */
 		smp_store_release(&ei->i_ordered_len, 0);
+		trace_ext4_iomap_ordered_complete(inode, ioend->io_offset,
+				ioend->io_size, READ_ONCE(ei->i_ordered_lblk),
+				READ_ONCE(ei->i_ordered_len));
 		wake_up_all(&ei->i_ordered_wq);
 		goto defer;
 	}
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index ebafa06cd191..423aec6d09d1 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -3141,6 +3141,103 @@ DEFINE_SET_IOMAP_EVENT(ext4_iomap_buffered_write_begin);
 DEFINE_SET_IOMAP_EVENT(ext4_iomap_map_writeback_range);
 DEFINE_SET_IOMAP_EVENT(ext4_iomap_zero_begin);
 
+/* Ordered I/O tracepoints for iomap buffered I/O path */
+DECLARE_EVENT_CLASS(ext4_iomap_ordered_io,
+	TP_PROTO(struct inode *inode, loff_t io_offset, size_t io_size,
+		 ext4_lblk_t i_ordered_lblk, unsigned int i_ordered_len),
+	TP_ARGS(inode, io_offset, io_size, i_ordered_lblk, i_ordered_len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u64, ino)
+		__field(loff_t, io_offset)
+		__field(size_t, io_size)
+		__field(ext4_lblk_t, i_ordered_lblk)
+		__field(unsigned int, i_ordered_len)
+	),
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->io_offset = io_offset;
+		__entry->io_size = io_size;
+		__entry->i_ordered_lblk = i_ordered_lblk;
+		__entry->i_ordered_len = i_ordered_len;
+	),
+	TP_printk("dev %d:%d ino %llu io_offset %lld io_size %zu i_ordered_lblk %u i_ordered_len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->io_offset, __entry->io_size,
+		  __entry->i_ordered_lblk, __entry->i_ordered_len)
+);
+
+DEFINE_EVENT(ext4_iomap_ordered_io, ext4_iomap_ordered_submit,
+	TP_PROTO(struct inode *inode, loff_t io_offset, size_t io_size,
+		 ext4_lblk_t i_ordered_lblk, unsigned int i_ordered_len),
+	TP_ARGS(inode, io_offset, io_size, i_ordered_lblk, i_ordered_len)
+);
+
+DEFINE_EVENT(ext4_iomap_ordered_io, ext4_iomap_ordered_complete,
+	TP_PROTO(struct inode *inode, loff_t io_offset, size_t io_size,
+		 ext4_lblk_t i_ordered_lblk, unsigned int i_ordered_len),
+	TP_ARGS(inode, io_offset, io_size, i_ordered_lblk, i_ordered_len)
+);
+
+
+/* i_disksize update tracepoint */
+TRACE_EVENT(ext4_iomap_disksize_update,
+	TP_PROTO(struct inode *inode, loff_t end, loff_t i_size,
+		 loff_t i_disksize, loff_t new_disksize, bool is_ordered),
+	TP_ARGS(inode, end, i_size, i_disksize, new_disksize, is_ordered),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u64, ino)
+		__field(loff_t, end)
+		__field(loff_t, i_size)
+		__field(loff_t, i_disksize)
+		__field(loff_t, new_disksize)
+		__field(bool, is_ordered)
+	),
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->end = end;
+		__entry->i_size = i_size;
+		__entry->i_disksize = i_disksize;
+		__entry->new_disksize = new_disksize;
+		__entry->is_ordered = is_ordered;
+	),
+	TP_printk("dev %d:%d ino %llu end %lld i_size %lld i_disksize %lld new_disksize %lld is_ordered %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->end, __entry->i_size,
+		  __entry->i_disksize, __entry->new_disksize,
+		  __entry->is_ordered)
+);
+
+/* Block zero EOF tracepoint */
+TRACE_EVENT(ext4_block_zero_eof,
+	TP_PROTO(struct inode *inode, loff_t from, loff_t length,
+		 bool did_zero, bool zero_written),
+	TP_ARGS(inode, from, length, did_zero, zero_written),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u64, ino)
+		__field(loff_t, from)
+		__field(loff_t, length)
+		__field(bool, did_zero)
+		__field(bool, zero_written)
+	),
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->from = from;
+		__entry->length = length;
+		__entry->did_zero = did_zero;
+		__entry->zero_written = zero_written;
+	),
+	TP_printk("dev %d:%d ino %llu zero EOF from %lld length %lld did_zero %d zero_written %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->from, __entry->length,
+		  __entry->did_zero, __entry->zero_written)
+);
+
 #endif /* _TRACE_EXT4_H */
 
 /* This part must be outside protection */
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 19/22] ext4: submit zeroed post-EOF data immediately in the iomap buffered I/O path
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

In the generic buffer_head I/O path, we rely on the data=ordered mode to
ensure that the zeroed EOF block data is written before updating
i_disksize, thus preventing stale data from being exposed.

However, the iomap buffered I/O path cannot use this mechanism. Instead,
we issue the I/O immediately after performing the zero operation
(without synchronous waiting). This can reduce the risk of exposing
stale data, but it does not guarantee that the zero data will be flushed
to disk before the metadata of i_disksize is updated. The subsequent
patches will wait for this I/O to complete before updating i_disksize.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c | 58 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 47 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d2f7af7922d7..d55899c1ef4c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4766,8 +4766,10 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 	if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
 		return 0;
 
-	if (length > blocksize - offset)
+	if (length > blocksize - offset) {
 		length = blocksize - offset;
+		end = from + length;
+	}
 
 	err = ext4_block_zero_range(inode, from, length,
 				    &did_zero, &zero_written);
@@ -4782,18 +4784,52 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 	 * TODO: In the iomap path, handle this by updating i_disksize to
 	 * i_size after the zeroed data has been written back.
 	 */
-	if (ext4_should_order_data(inode) &&
-	    did_zero && zero_written && !IS_DAX(inode)) {
-		handle_t *handle;
+	if (did_zero && zero_written && !IS_DAX(inode)) {
+		if (ext4_should_order_data(inode)) {
+			handle_t *handle;
 
-		handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
-		if (IS_ERR(handle))
-			return PTR_ERR(handle);
+			handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+			if (IS_ERR(handle))
+				return PTR_ERR(handle);
 
-		err = ext4_jbd2_inode_add_write(handle, inode, from, length);
-		ext4_journal_stop(handle);
-		if (err)
-			return err;
+			err = ext4_jbd2_inode_add_write(handle, inode, from,
+							length);
+			ext4_journal_stop(handle);
+			if (err)
+				return err;
+		/*
+		 * inodes using the iomap buffered I/O path do not use the
+		 * data=ordered mode. We submit zeroed range here.
+		 *
+		 * TODO: The end_io process needs to wait for I/O to completes
+		 * before updating i_disksize.
+		 */
+		} else if (ext4_inode_buffered_iomap(inode)) {
+			struct folio *folio;
+			bool do_submit = false;
+
+			folio = filemap_lock_folio(inode->i_mapping,
+						   from >> PAGE_SHIFT);
+			if (IS_ERR(folio))
+				/* Already writeback and clear? */
+				return PTR_ERR(folio) == -ENOENT ? 0 :
+						PTR_ERR(folio);
+
+			folio_wait_writeback(folio);
+			WARN_ON_ONCE(folio_test_writeback(folio));
+
+			if (likely(folio_test_dirty(folio)))
+				do_submit = true;
+			folio_unlock(folio);
+			folio_put(folio);
+
+			if (do_submit) {
+				err = filemap_fdatawrite_range(inode->i_mapping,
+							       from, end - 1);
+				if (err)
+					return err;
+			}
+		}
 	}
 
 	return 0;
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 20/22] ext4: wait for ordered I/O in the iomap buffered I/O path
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Wait for ordered I/O to complete before updating i_disksize. This
ensures zeroed data is flushed to disk before the i_disksize metadata is
updated, preventing stale data exposure during unaligned post-EOF append
writes.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h    | 11 +++++++++
 fs/ext4/inode.c   | 62 ++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/page-io.c | 53 ++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c   | 23 +++++++++++++-----
 4 files changed, 137 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 60ba488b01c5..760400395cb7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1195,6 +1195,15 @@ struct ext4_inode_info {
 #ifdef CONFIG_FS_ENCRYPTION
 	struct fscrypt_inode_info *i_crypt_info;
 #endif
+
+	/*
+	 * Track ordered zeroed data during post-EOF append writes, fallocate,
+	 * and truncate-up operations. These parameters are used only in the
+	 * iomap buffered I/O path.
+	 */
+	ext4_lblk_t i_ordered_lblk;
+	ext4_lblk_t i_ordered_len;
+	wait_queue_head_t i_ordered_wq;
 };
 
 /*
@@ -3877,6 +3886,8 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 len, __u64 *moved_len);
 
 /* page-io.c */
+#define EXT4_IOMAP_IOEND_ORDER_IO	1UL	/* This I/O is an ordered one */
+
 extern int __init ext4_init_pageio(void);
 extern void ext4_exit_pageio(void);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d55899c1ef4c..17bd4403c782 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4352,12 +4352,37 @@ static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
 {
 	struct iomap_ioend *ioend = wpc->wb_ctx;
 	struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+	ext4_lblk_t start, end, order_lblk, order_len;
 
 	/* Need to convert unwritten extents when I/Os are completed. */
 	if ((ioend->io_flags & IOMAP_IOEND_UNWRITTEN) ||
 	    ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
 		ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
 
+	/*
+	 * Mark the I/O as ordered. Ordered I/O requires separate endio
+	 * handling and must not be merged with regular I/O operations.
+	 */
+	order_len = READ_ONCE(ei->i_ordered_len);
+	if (order_len) {
+		/*
+		 * Pair with smp_store_release() in ext4_block_zero_eof().
+		 * Ensure we see the updated i_ordered_lblk that was written
+		 * before the release store to i_ordered_len.
+		 */
+		smp_rmb();
+		order_lblk = READ_ONCE(ei->i_ordered_lblk);
+		start = ioend->io_offset >> ioend->io_inode->i_blkbits;
+		end = EXT4_B_TO_LBLK(ioend->io_inode,
+				     ioend->io_offset + ioend->io_size);
+
+		if (start <= order_lblk && end >= order_lblk + order_len) {
+			ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
+			ioend->io_private = (void *)EXT4_IOMAP_IOEND_ORDER_IO;
+			ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
+		}
+	}
+
 	return iomap_ioend_writeback_submit(wpc, error);
 }
 
@@ -4799,12 +4824,12 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 				return err;
 		/*
 		 * inodes using the iomap buffered I/O path do not use the
-		 * data=ordered mode. We submit zeroed range here.
-		 *
-		 * TODO: The end_io process needs to wait for I/O to completes
-		 * before updating i_disksize.
+		 * data=ordered mode. Submit zeroed range here. The end_io
+		 * handler ext4_iomap_wb_ordered_wait() will wait for I/O
+		 * completion before updating i_disksize.
 		 */
 		} else if (ext4_inode_buffered_iomap(inode)) {
+			struct ext4_inode_info *ei = EXT4_I(inode);
 			struct folio *folio;
 			bool do_submit = false;
 
@@ -4818,16 +4843,41 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
 			folio_wait_writeback(folio);
 			WARN_ON_ONCE(folio_test_writeback(folio));
 
-			if (likely(folio_test_dirty(folio)))
+			/*
+			 * Mark the ordered range. It will be cleared upon
+			 * I/O completion in ext4_iomap_end_bio().
+			 */
+			if (likely(folio_test_dirty(folio)) &&
+			    READ_ONCE(ei->i_ordered_len) == 0) {
+				WRITE_ONCE(ei->i_ordered_lblk,
+					   from >> inode->i_blkbits);
+				/*
+				 * Pairs with smp_rmb() in
+				 * ext4_iomap_writeback_submit() and
+				 * ext4_iomap_wb_ordered_wait(). Ensure the
+				 * updated i_ordered_lblk is visible when
+				 * i_ordered_len becomes non-zero.
+				 */
+				smp_store_release(&ei->i_ordered_len, 1);
 				do_submit = true;
+			}
 			folio_unlock(folio);
 			folio_put(folio);
 
 			if (do_submit) {
 				err = filemap_fdatawrite_range(inode->i_mapping,
 							       from, end - 1);
-				if (err)
+				if (err) {
+					/*
+					 * Pairs with wait_event() in
+					 * ext4_iomap_wb_ordered_wait(). Ensure
+					 * i_ordered_len = 0 is visible before
+					 * waking up waiters.
+					 */
+					smp_store_release(&ei->i_ordered_len, 0);
+					wake_up_all(&ei->i_ordered_wq);
 					return err;
+				}
 			}
 		}
 	}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 07978e2cd9c8..9c88671836fe 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -613,6 +613,39 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 	return 0;
 }
 
+/*
+ * If the old disk size is not block size aligned and the current
+ * writeback range is entirely beyond the old EOF block, we should
+ * wait for the zeroed data written in ext4_block_zero_eof() to be
+ * written out, otherwise, it may expose stale data in that block.
+ */
+static void ext4_iomap_wb_ordered_wait(struct inode *inode,
+				       loff_t pos, loff_t end)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int blocksize = i_blocksize(inode);
+	loff_t disksize = READ_ONCE(ei->i_disksize);
+	ext4_lblk_t order_lblk, order_len;
+
+	if (!(disksize & (blocksize - 1)) ||
+	    pos <= round_up(disksize, blocksize))
+		return;
+
+	order_len = READ_ONCE(ei->i_ordered_len);
+	if (!order_len)
+		return;
+
+	/*
+	 * Pair with smp_store_release() in ext4_iomap_end_bio() and
+	 * ext4_block_zero_eof(). Ensure we see the updated i_ordered_lblk
+	 * that was written before the release store to i_ordered_len.
+	 */
+	smp_rmb();
+	order_lblk = READ_ONCE(ei->i_ordered_lblk);
+	if ((pos >> inode->i_blkbits) >= order_lblk + order_len)
+		wait_event(ei->i_ordered_wq, READ_ONCE(ei->i_ordered_len) == 0);
+}
+
 static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
 					 loff_t end)
 {
@@ -656,6 +689,9 @@ static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
 		goto out;
 	}
 
+	/* Wait ordered zero data to be written out. */
+	ext4_iomap_wb_ordered_wait(inode, pos, pos + size);
+
 	/* We may need to convert one extent and dirty the inode. */
 	credits = ext4_chunk_trans_blocks(inode,
 			EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits));
@@ -717,9 +753,26 @@ void ext4_iomap_end_bio(struct bio *bio)
 	struct inode *inode = ioend->io_inode;
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned long io_mode = (unsigned long)ioend->io_private;
 	unsigned long flags;
 	int ret;
 
+	/*
+	 * This is an ordered I/O, clear the ordered range set in
+	 * ext4_block_zero_eof() and wake up all waiters that will update
+	 * the inode i_disksize.
+	 */
+	if (io_mode == EXT4_IOMAP_IOEND_ORDER_IO) {
+		/*
+		 * Pairs with wait_event() in ext4_iomap_wb_ordered_wait().
+		 * Ensure i_ordered_len = 0 is visible before waking up
+		 * waiters.
+		 */
+		smp_store_release(&ei->i_ordered_len, 0);
+		wake_up_all(&ei->i_ordered_wq);
+		goto defer;
+	}
+
 	/* Needs to convert unwritten extents or update the i_disksize. */
 	if ((ioend->io_flags & IOMAP_IOEND_UNWRITTEN) ||
 	    ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index b2da4834b6bb..2fc07739c9e8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1444,6 +1444,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ext4_fc_init_inode(&ei->vfs_inode);
 	spin_lock_init(&ei->i_fc_lock);
 	mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+	ei->i_ordered_lblk = 0;
+	ei->i_ordered_len = 0;
+	init_waitqueue_head(&ei->i_ordered_wq);
 	return &ei->vfs_inode;
 }
 
@@ -1480,12 +1483,20 @@ static void ext4_destroy_inode(struct inode *inode)
 		dump_stack();
 	}
 
-	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
-	    WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
-		ext4_msg(inode->i_sb, KERN_ERR,
-			 "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!",
-			 inode->i_ino, EXT4_I(inode),
-			 EXT4_I(inode)->i_reserved_data_blocks);
+	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS)) {
+		if (WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
+			ext4_msg(inode->i_sb, KERN_ERR,
+				 "Inode %llu (%p): i_reserved_data_blocks (%u) not cleared!",
+				 inode->i_ino, EXT4_I(inode),
+				 EXT4_I(inode)->i_reserved_data_blocks);
+
+		if (WARN_ON_ONCE(EXT4_I(inode)->i_ordered_len))
+			ext4_msg(inode->i_sb, KERN_ERR,
+				 "Inode %llu (%p): i_ordered_lblk (%u) and i_ordered_len (%u) not cleared!",
+				 inode->i_ino, EXT4_I(inode),
+				 EXT4_I(inode)->i_ordered_lblk,
+				 EXT4_I(inode)->i_ordered_len);
+	}
 }
 
 static void ext4_shutdown(struct super_block *sb)
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 12/22] iomap: support invalidating partial folios
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Currently, iomap_invalidate_folio() can only invalidate an entire folio.
If we truncate a partial folio on a filesystem where the block size is
smaller than the folio size, it leaves behind dirty bits for the
truncated or punched blocks. During writeback, those stale dirty bits
cause an attempt to map the invalid hole range. Fortunately, this has
not caused any real problems so far because the ->writeback_range()
function corrects the length.

However, the implementation of FALLOC_FL_ZERO_RANGE in ext4 depends on
the support for invalidating partial folios. When ext4 partially zeroes
out a dirty and unwritten folio, it does not perform a flush first like
XFS. Therefore, if the dirty bits of the corresponding area cannot be
cleared, the zeroed area after writeback remains in the written state
rather than reverting to the unwritten state. Fix this by supporting
invalidation of partial folios.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
This is cherry picked from:
 https://lore.kernel.org/linux-fsdevel/20240812121159.3775074-3-yi.zhang@huaweicloud.com/
No code changes, only update the commit message to explain why Ext4
needs this.

 fs/iomap/buffered-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 7e7d5b776d35..b17296b61a6e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -761,6 +761,8 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 		WARN_ON_ONCE(folio_test_writeback(folio));
 		folio_cancel_dirty(folio);
 		ifs_free(folio);
+	} else {
+		iomap_clear_range_dirty(folio, offset, len);
 	}
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 16/22] ext4: disable online defrag when inode using iomap buffered I/O path
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Online defragmentation does not currently support inodes using the
iomap buffered I/O path, as it still relies on buffer_head for the
management of sub-folio blocks and on the data=ordered mode for data
consistency.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/move_extent.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3329b7ad5dbd..f707a1096544 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -476,6 +476,17 @@ static int mext_check_validity(struct inode *orig_inode,
 		return -EOPNOTSUPP;
 	}
 
+	/*
+	 * TODO: support online defrag for inodes that using the buffered
+	 * I/O iomap path.
+	 */
+	if (ext4_inode_buffered_iomap(orig_inode) ||
+	    ext4_inode_buffered_iomap(donor_inode)) {
+		ext4_msg(sb, KERN_ERR,
+			 "Online defrag not supported for inode with iomap buffered IO path");
+		return -EOPNOTSUPP;
+	}
+
 	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
 		ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %llu, donor %llu]\n",
 			   orig_inode->i_ino, donor_inode->i_ino);
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 06/22] ext4: pass out extent seq counter when mapping da blocks
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

The iomap buffered write path does not hold any locks between querying
inode extent mapping information and performing buffered writes. It
relies on the sequence counter saved in the inode to detect stale
mappings.

Commit 07c440e8da8f ("ext4: pass out extent seq counter when mapping
blocks") added the m_seq field to ext4_map_blocks to pass out extent
sequence numbers, but it missed two callsites within
ext4_da_map_blocks(). These callsites are on the delayed allocation
path, which is also used in the iomap buffered write path. Pass out the
sequence counter to ensure stale mappings can be detected.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index df21f6870ec4..5ffd6aeb3485 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1929,7 +1929,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
 	ext4_check_map_extents_env(inode);
 
 	/* Lookup extent status tree firstly */
-	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
+	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
 		map->m_len = min_t(unsigned int, map->m_len,
 				   es.es_len - (map->m_lblk - es.es_lblk));
 
@@ -1982,7 +1982,7 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
 	 * is held in write mode, before inserting a new da entry in
 	 * the extent status tree.
 	 */
-	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) {
+	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
 		map->m_len = min_t(unsigned int, map->m_len,
 				   es.es_len - (map->m_lblk - es.es_lblk));
 
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 13/22] iomap: fix incorrect did_zero setting in iomap_zero_iter()
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

The did_zero output parameter was unconditionally set after the loop,
which is incorrect. It should only be set when the zeroing operation
actually completes, not when IOMAP_F_STALE is set or when
IOMAP_F_FOLIO_BATCH is set but !folio causes the loop to break early,
or when iomap_iter_advance() returns an error.

This causes did_zero to be incorrectly set when zeroing a clean
unwritten extent because the loop exits early without actually zeroing
any data.

Fix it by using a local variable to track whether any folio was actually
zeroed, and only set did_zero after the loop if zeroing happened.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
This is cherry picked from:
 https://lore.kernel.org/linux-fsdevel/20260310082250.3535486-1-yi.zhang@huaweicloud.com/
No changes.

 fs/iomap/buffered-io.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index b17296b61a6e..0ffc2c3230af 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1542,6 +1542,7 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 		const struct iomap_write_ops *write_ops)
 {
 	u64 bytes = iomap_length(iter);
+	bool zeroed = false;
 	int status;
 
 	do {
@@ -1560,6 +1561,8 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 		/* a NULL folio means we're done with a folio batch */
 		if (!folio) {
 			status = iomap_iter_advance_full(iter);
+			if (status)
+				return status;
 			break;
 		}
 
@@ -1570,6 +1573,7 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 				bytes);
 
 		folio_zero_range(folio, offset, bytes);
+		zeroed = true;
 		folio_mark_accessed(folio);
 
 		ret = iomap_write_end(iter, bytes, bytes, folio);
@@ -1579,10 +1583,10 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 
 		status = iomap_iter_advance(iter, bytes);
 		if (status)
-			break;
+			return status;
 	} while ((bytes = iomap_length(iter)) > 0);
 
-	if (did_zero)
+	if (did_zero && zeroed)
 		*did_zero = true;
 	return status;
 }
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 09/22] ext4: implement writeback path using iomap
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Implement the iomap writeback path for ext4. It implements
ext4_iomap_writepages(), introduces a new iomap_writeback_ops instance,
ext4_writeback_ops, and creates a new end I/O extent conversion worker
to convert unwritten extents after the I/O is completed.

In the ->writeback_range() callback, it first calls
ext4_iomap_map_writeback_range() to query the longest range of existing
mapped extents. For performance reasons, if the block range has not been
allocated, it attempts to allocate the longest possible range of blocks,
based on the writeback length and the delalloc extent length, rather
than allocating a single folio's worth of blocks at a time. Then, it
adds the folio to the iomap_ioend instance.

In the ->writeback_submit() callback, it registers a special end bio
callback, ext4_iomap_end_bio(), which starts a worker if we need to
convert unwritten extents or update i_disksize after the data has been
written back, or if we need to abort the journal because the I/O
failed.

Key changes:

 - Since we don't use data=ordered mode to prevent exposing stale data
   during append writebacks, we always allocate unwritten extents for
   new blocks and postpone updating the i_disksize until the I/O is
   done. In addition, the deadlock problem that was expected to be
   resolved through the reserve handle does not exist here. Therefore,
   we also do not need to use the reserve handle when converting the
   unwritten extent in the end I/O worker; we can start a normal
   journal handle instead.

 - Since ->writeback_range() is always executed under the folio lock,
   this means we need to start the handle under the folio lock as well.
   This is opposite to the order in the buffer_head writeback path. The
   lock ordering documentation in super.c has been updated accordingly.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h    |   4 +
 fs/ext4/inode.c   | 202 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/page-io.c | 129 +++++++++++++++++++++++++++++
 fs/ext4/super.c   |   7 +-
 4 files changed, 340 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index be92ff648362..0ffa81f86bc5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1173,6 +1173,8 @@ struct ext4_inode_info {
 	 */
 	struct list_head i_rsv_conversion_list;
 	struct work_struct i_rsv_conversion_work;
+	struct list_head i_iomap_ioend_list;
+	struct work_struct i_iomap_ioend_work;
 
 	/*
 	 * Transactions that contain inode's metadata needed to complete
@@ -3887,6 +3889,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
 		size_t len);
 extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
 extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
+extern void ext4_iomap_end_io(struct work_struct *work);
+extern void ext4_iomap_end_bio(struct bio *bio);
 
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0ca303a90249..76ce43c64c30 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -44,6 +44,7 @@
 #include <linux/iversion.h>
 
 #include "ext4_jbd2.h"
+#include "ext4_extents.h"
 #include "xattr.h"
 #include "acl.h"
 #include "truncate.h"
@@ -4119,10 +4120,209 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
 	iomap_bio_readahead(rac, &ext4_iomap_buffered_read_ops);
 }
 
+static int ext4_iomap_map_one_extent(struct inode *inode,
+				     struct ext4_map_blocks *map)
+{
+	struct extent_status es;
+	handle_t *handle = NULL;
+	int credits, map_flags;
+	int retval;
+
+	credits = ext4_chunk_trans_blocks(inode, map->m_len);
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	map->m_flags = 0;
+	/*
+	 * It is necessary to look up extent and map blocks under i_data_sem
+	 * in write mode, otherwise, the delalloc extent may become stale
+	 * during concurrent truncate operations.
+	 */
+	ext4_fc_track_inode(handle, inode);
+	down_write(&EXT4_I(inode)->i_data_sem);
+	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
+		retval = es.es_len - (map->m_lblk - es.es_lblk);
+		map->m_len = min_t(unsigned int, retval, map->m_len);
+
+		if (ext4_es_is_delayed(&es)) {
+			map->m_flags |= EXT4_MAP_DELAYED;
+			trace_ext4_da_write_pages_extent(inode, map);
+			/*
+			 * Call ext4_map_create_blocks() to allocate any
+			 * delayed allocation blocks. It is possible that
+			 * we're going to need more metadata blocks, however
+			 * we must not fail because we're in writeback and
+			 * there is nothing we can do so it might result in
+			 * data loss. So use reserved blocks to allocate
+			 * metadata if possible.
+			 */
+			map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
+				    EXT4_GET_BLOCKS_METADATA_NOFAIL |
+				    EXT4_EX_NOCACHE;
+
+			retval = ext4_map_create_blocks(handle, inode, map,
+							map_flags);
+			if (retval > 0)
+				ext4_fc_track_range(handle, inode, map->m_lblk,
+						map->m_lblk + map->m_len - 1);
+			goto out;
+		} else if (unlikely(ext4_es_is_hole(&es)))
+			goto out;
+
+		/* Found written or unwritten extent. */
+		map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
+		map->m_flags = ext4_es_is_written(&es) ?
+			       EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
+		goto out;
+	}
+
+	retval = ext4_map_query_blocks(handle, inode, map, EXT4_EX_NOCACHE);
+out:
+	up_write(&EXT4_I(inode)->i_data_sem);
+	ext4_journal_stop(handle);
+	return retval < 0 ? retval : 0;
+}
+
+static int ext4_iomap_map_writeback_range(struct iomap_writepage_ctx *wpc,
+					  loff_t offset, unsigned int dirty_len)
+{
+	struct inode *inode = wpc->inode;
+	struct super_block *sb = inode->i_sb;
+	struct journal_s *journal = EXT4_SB(sb)->s_journal;
+	struct ext4_map_blocks map;
+	unsigned int blkbits = inode->i_blkbits;
+	unsigned int index = offset >> blkbits;
+	unsigned int blk_end, blk_len;
+	int ret;
+
+	ret = ext4_emergency_state(sb);
+	if (unlikely(ret))
+		return ret;
+
+	/* Check validity of the cached writeback mapping. */
+	if (offset >= wpc->iomap.offset &&
+	    offset < wpc->iomap.offset + wpc->iomap.length &&
+	    ext4_iomap_valid(inode, &wpc->iomap))
+		return 0;
+
+	blk_len = dirty_len >> blkbits;
+	blk_end = min_t(unsigned int, (wpc->wbc->range_end >> blkbits),
+				      (UINT_MAX - 1));
+	if (blk_end > index + blk_len)
+		blk_len = blk_end - index + 1;
+
+retry:
+	map.m_lblk = index;
+	map.m_len = min_t(unsigned int, MAX_WRITEPAGES_EXTENT_LEN, blk_len);
+	ret = ext4_map_blocks(NULL, inode, &map,
+			      EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE);
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * The map is not a delalloc extent, it must either be a hole
+	 * or an extent which have already been allocated.
+	 */
+	if (!(map.m_flags & EXT4_MAP_DELAYED))
+		goto out;
+
+	/* Map one delalloc extent. */
+	ret = ext4_iomap_map_one_extent(inode, &map);
+	if (ret < 0) {
+		if (ext4_emergency_state(sb))
+			return ret;
+
+		/*
+		 * Retry transient ENOSPC errors, if
+		 * ext4_count_free_blocks() is non-zero, a commit
+		 * should free up blocks.
+		 */
+		if (ret == -ENOSPC && journal && ext4_count_free_clusters(sb)) {
+			jbd2_journal_force_commit_nested(journal);
+			goto retry;
+		}
+
+		ext4_msg(sb, KERN_CRIT,
+			 "Delayed block allocation failed for inode %llu at logical offset %llu with max blocks %u with error %d",
+			 inode->i_ino, (unsigned long long)map.m_lblk,
+			 (unsigned int)map.m_len, -ret);
+		ext4_msg(sb, KERN_CRIT,
+			 "This should not happen!! Data will be lost\n");
+		if (ret == -ENOSPC)
+			ext4_print_free_blocks(inode);
+		return ret;
+	}
+out:
+	ext4_set_iomap(inode, &wpc->iomap, &map, offset, dirty_len, 0);
+	return 0;
+}
+
+static void ext4_iomap_discard_folio(struct folio *folio, loff_t pos)
+{
+	struct inode *inode = folio->mapping->host;
+	loff_t length = folio_pos(folio) + folio_size(folio) - pos;
+
+	ext4_iomap_punch_delalloc(inode, pos, length, NULL);
+}
+
+static ssize_t ext4_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
+					  struct folio *folio, u64 offset,
+					  unsigned int len, u64 end_pos)
+{
+	ssize_t ret;
+
+	ret = ext4_iomap_map_writeback_range(wpc, offset, len);
+	if (!ret)
+		ret = iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
+	if (ret < 0)
+		ext4_iomap_discard_folio(folio, offset);
+	return ret;
+}
+
+static int ext4_iomap_writeback_submit(struct iomap_writepage_ctx *wpc,
+				       int error)
+{
+	struct iomap_ioend *ioend = wpc->wb_ctx;
+	struct ext4_inode_info *ei = EXT4_I(ioend->io_inode);
+
+	/* Need to convert unwritten extents when I/Os are completed. */
+	if ((ioend->io_flags & IOMAP_IOEND_UNWRITTEN) ||
+	    ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
+		ioend->io_bio.bi_end_io = ext4_iomap_end_bio;
+
+	return iomap_ioend_writeback_submit(wpc, error);
+}
+
+static const struct iomap_writeback_ops ext4_writeback_ops = {
+	.writeback_range = ext4_iomap_writeback_range,
+	.writeback_submit = ext4_iomap_writeback_submit,
+};
+
 static int ext4_iomap_writepages(struct address_space *mapping,
 				 struct writeback_control *wbc)
 {
-	return 0;
+	struct inode *inode = mapping->host;
+	struct super_block *sb = inode->i_sb;
+	long nr = wbc->nr_to_write;
+	int alloc_ctx, ret;
+	struct iomap_writepage_ctx wpc = {
+		.inode = inode,
+		.wbc = wbc,
+		.ops = &ext4_writeback_ops,
+	};
+
+	ret = ext4_emergency_state(sb);
+	if (unlikely(ret))
+		return ret;
+
+	alloc_ctx = ext4_writepages_down_read(sb);
+	trace_ext4_writepages(inode, wbc);
+	ret = iomap_writepages(&wpc);
+	trace_ext4_writepages_result(inode, wbc, ret, nr - wbc->nr_to_write);
+	ext4_writepages_up_read(sb, alloc_ctx);
+
+	return ret;
 }
 
 /*
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dc82e7b57e75..07978e2cd9c8 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -22,6 +22,7 @@
 #include <linux/bio.h>
 #include <linux/workqueue.h>
 #include <linux/kernel.h>
+#include <linux/iomap.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
@@ -611,3 +612,131 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
 
 	return 0;
 }
+
+static int ext4_iomap_wb_update_disksize(handle_t *handle, struct inode *inode,
+					 loff_t end)
+{
+	loff_t new_disksize = end;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int ret;
+
+	if (new_disksize <= READ_ONCE(ei->i_disksize))
+		return 0;
+
+	/*
+	 * Update on-disk size after IO is completed. Races with truncate
+	 * are avoided by checking i_size under i_data_sem.
+	 */
+	down_write(&ei->i_data_sem);
+	new_disksize = min(new_disksize, i_size_read(inode));
+	if (new_disksize > ei->i_disksize)
+		ei->i_disksize = new_disksize;
+	up_write(&ei->i_data_sem);
+	ret = ext4_mark_inode_dirty(handle, inode);
+	if (ret)
+		EXT4_ERROR_INODE_ERR(inode, -ret, "Failed to mark inode dirty");
+
+	return ret;
+}
+
+static void ext4_iomap_finish_ioend(struct iomap_ioend *ioend)
+{
+	struct inode *inode = ioend->io_inode;
+	struct super_block *sb = inode->i_sb;
+	loff_t pos = ioend->io_offset;
+	size_t size = ioend->io_size;
+	handle_t *handle;
+	int credits;
+	int ret, err;
+
+	ret = blk_status_to_errno(ioend->io_bio.bi_status);
+	if (unlikely(ret)) {
+		if (test_opt(sb, DATA_ERR_ABORT))
+			jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret);
+		goto out;
+	}
+
+	/* We may need to convert one extent and dirty the inode. */
+	credits = ext4_chunk_trans_blocks(inode,
+			EXT4_MAX_BLOCKS(size, pos, inode->i_blkbits));
+	handle = ext4_journal_start(inode, EXT4_HT_EXT_CONVERT, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out_err;
+	}
+
+	if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) {
+		ret = ext4_convert_unwritten_extents(handle, inode, pos, size);
+		if (ret)
+			goto out_journal;
+	}
+
+	ret = ext4_iomap_wb_update_disksize(handle, inode, pos + size);
+out_journal:
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+out_err:
+	if (ret < 0 && !ext4_emergency_state(sb)) {
+		ext4_msg(sb, KERN_EMERG,
+			 "failed to convert unwritten extents to written extents or update inode size -- potential data loss! (inode %llu, error %d)",
+			 inode->i_ino, ret);
+	}
+out:
+	iomap_finish_ioends(ioend, ret);
+}
+
+/*
+ * Work on buffered iomap completed IO, to convert unwritten extents to
+ * mapped extents
+ */
+void ext4_iomap_end_io(struct work_struct *work)
+{
+	struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
+						  i_iomap_ioend_work);
+	struct iomap_ioend *ioend;
+	struct list_head ioend_list;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	list_replace_init(&ei->i_iomap_ioend_list, &ioend_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+
+	iomap_sort_ioends(&ioend_list);
+	while (!list_empty(&ioend_list)) {
+		ioend = list_entry(ioend_list.next, struct iomap_ioend, io_list);
+		list_del_init(&ioend->io_list);
+		iomap_ioend_try_merge(ioend, &ioend_list);
+		ext4_iomap_finish_ioend(ioend);
+	}
+}
+
+void ext4_iomap_end_bio(struct bio *bio)
+{
+	struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+	struct inode *inode = ioend->io_inode;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	unsigned long flags;
+	int ret;
+
+	/* Needs to convert unwritten extents or update the i_disksize. */
+	if ((ioend->io_flags & IOMAP_IOEND_UNWRITTEN) ||
+	    ioend->io_offset + ioend->io_size > READ_ONCE(ei->i_disksize))
+		goto defer;
+
+	/* Needs to abort the journal on data_err=abort.  */
+	ret = blk_status_to_errno(ioend->io_bio.bi_status);
+	if (unlikely(ret) && test_opt(inode->i_sb, DATA_ERR_ABORT) &&
+	    !ext4_emergency_state(inode->i_sb))
+		goto defer;
+
+	iomap_finish_ioends(ioend, ret);
+	return;
+defer:
+	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (list_empty(&ei->i_iomap_ioend_list))
+		queue_work(sbi->rsv_conversion_wq, &ei->i_iomap_ioend_work);
+	list_add_tail(&ioend->io_list, &ei->i_iomap_ioend_list);
+	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9bc294b769db..51d87db53543 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -123,7 +123,10 @@ static const struct fs_parameter_spec ext4_param_specs[];
  * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
  *
  * writepages:
- * transaction start -> page lock(s) -> i_data_sem (rw)
+ * - buffer_head path:
+ *   transaction start -> folio lock(s) -> i_data_sem (rw)
+ * - iomap path:
+ *   folio lock -> transaction start -> i_data_sem (rw)
  */
 
 static const struct fs_context_operations ext4_context_ops = {
@@ -1428,10 +1431,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 #endif
 	ei->jinode = NULL;
 	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
+	INIT_LIST_HEAD(&ei->i_iomap_ioend_list);
 	spin_lock_init(&ei->i_completed_io_lock);
 	ei->i_sync_tid = 0;
 	ei->i_datasync_tid = 0;
 	INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
+	INIT_WORK(&ei->i_iomap_ioend_work, ext4_iomap_end_io);
 	ext4_fc_init_inode(&ei->vfs_inode);
 	spin_lock_init(&ei->i_fc_lock);
 	mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 08/22] ext4: implement buffered write path using iomap
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Introduce two new iomap_ops instances, ext4_iomap_buffered_write_ops and
ext4_iomap_buffered_da_write_ops, to implement the iomap write paths for
ext4. ext4_iomap_buffered_da_write_begin() invokes ext4_da_map_blocks()
to map delayed allocation extents, and ext4_iomap_buffer_write_begin()
invokes ext4_iomap_get_blocks() to directly allocate blocks in
non-delayed allocation mode. Additionally, add ext4_iomap_valid() to
check the validity of extents by the iomap infrastructure.

Key changes:

 - Since we don't use data=ordered mode to prevent exposing stale data
   in the non-delayed allocation path, we always allocate unwritten
   extents for new blocks.

 - The iomap write path maps multiple blocks at a time in the
   iomap_begin() callbacks, so we must remove the stale delayed
   allocation range in case of short writes and write failures.
   Otherwise, this could result in a range of delayed extents being
   covered by a clean folio, which would lead to inaccurate space
   reservation.

 - The lock ordering of the folio lock and transaction start is the
   opposite of that in the buffer_head buffered write path. So we have
   to stop the journal handle in the iomap_begin() callbacks. The lock
   ordering documentation in super.c has been updated accordingly.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/ext4.h  |   4 ++
 fs/ext4/file.c  |  20 +++++-
 fs/ext4/inode.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/super.c |  10 ++-
 4 files changed, 191 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fe3491ad2129..be92ff648362 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3057,6 +3057,7 @@ int ext4_walk_page_buffers(handle_t *handle,
 int do_journal_get_write_access(handle_t *handle, struct inode *inode,
 				struct buffer_head *bh);
 void ext4_set_inode_mapping_order(struct inode *inode);
+int ext4_nonda_switch(struct super_block *sb);
 #define FALL_BACK_TO_NONDELALLOC 1
 #define CONVERT_INLINE_DATA	 2
 
@@ -3943,6 +3944,9 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 
 extern const struct iomap_ops ext4_iomap_ops;
 extern const struct iomap_ops ext4_iomap_report_ops;
+extern const struct iomap_ops ext4_iomap_buffered_write_ops;
+extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
+extern const struct iomap_write_ops ext4_iomap_write_ops;
 
 static inline int ext4_buffer_uptodate(struct buffer_head *bh)
 {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index eb1a323962b1..7f9bfbbc4a4e 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -299,6 +299,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
 	return count;
 }
 
+static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
+					 struct iov_iter *from)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	const struct iomap_ops *iomap_ops;
+
+	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
+		iomap_ops = &ext4_iomap_buffered_da_write_ops;
+	else
+		iomap_ops = &ext4_iomap_buffered_write_ops;
+
+	return iomap_file_buffered_write(iocb, from, iomap_ops,
+					 &ext4_iomap_write_ops, NULL);
+}
+
 static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 					struct iov_iter *from)
 {
@@ -313,7 +328,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 	if (ret <= 0)
 		goto out;
 
-	ret = generic_perform_write(iocb, from);
+	if (ext4_inode_buffered_iomap(inode))
+		ret = ext4_iomap_buffered_write(iocb, from);
+	else
+		ret = generic_perform_write(iocb, from);
 
 out:
 	inode_unlock(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5ffd6aeb3485..0ca303a90249 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3097,7 +3097,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
 	return ret;
 }
 
-static int ext4_nonda_switch(struct super_block *sb)
+int ext4_nonda_switch(struct super_block *sb)
 {
 	s64 free_clusters, dirty_clusters;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -3467,6 +3467,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
 	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
 }
 
+static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
+{
+	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
+}
+
+const struct iomap_write_ops ext4_iomap_write_ops = {
+	.iomap_valid = ext4_iomap_valid,
+};
+
 static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 			   struct ext4_map_blocks *map, loff_t offset,
 			   loff_t length, unsigned int flags)
@@ -3501,6 +3510,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
 	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		iomap->flags |= IOMAP_F_MERGED;
 
+	iomap->validity_cookie = map->m_seq;
+
 	/*
 	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
 	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
@@ -3908,8 +3919,12 @@ const struct iomap_ops ext4_iomap_report_ops = {
 	.iomap_begin = ext4_iomap_begin_report,
 };
 
+/* Map blocks */
+typedef int (ext4_get_blocks_t)(struct inode *, struct ext4_map_blocks *);
+
 static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
-		loff_t length, struct ext4_map_blocks *map)
+		loff_t length, ext4_get_blocks_t get_blocks,
+		struct ext4_map_blocks *map)
 {
 	u8 blkbits = inode->i_blkbits;
 
@@ -3921,6 +3936,9 @@ static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
 	map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
 			   EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
 
+	if (get_blocks)
+		return get_blocks(inode, map);
+
 	return ext4_map_blocks(NULL, inode, map, 0);
 }
 
@@ -3938,7 +3956,7 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
 	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
 		return -ERANGE;
 
-	ret = ext4_iomap_map_blocks(inode, offset, length, &map);
+	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
 	if (ret < 0)
 		return ret;
 
@@ -3946,6 +3964,146 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
 	return 0;
 }
 
+static int ext4_iomap_get_blocks(struct inode *inode,
+				 struct ext4_map_blocks *map)
+{
+	loff_t i_size = i_size_read(inode);
+	handle_t *handle;
+	int ret, needed_blocks;
+
+	/*
+	 * Check if the blocks have already been allocated, this could
+	 * avoid initiating a new journal transaction and return the
+	 * mapping information directly.
+	 */
+	if ((map->m_lblk + map->m_len) <=
+	    round_up(i_size, i_blocksize(inode)) >> inode->i_blkbits) {
+		ret = ext4_map_blocks(NULL, inode, map, 0);
+		if (ret < 0)
+			return ret;
+		if (map->m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN |
+				    EXT4_MAP_DELAYED))
+			return 0;
+	}
+
+	/*
+	 * Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason.
+	 */
+	needed_blocks = ext4_chunk_trans_blocks(inode, map->m_len) + 1;
+	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	ret = ext4_map_blocks(handle, inode, map,
+			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+	/*
+	 * Stop handle here following the lock ordering of the folio lock
+	 * and the transaction start.
+	 */
+	ext4_journal_stop(handle);
+
+	return ret;
+}
+
+static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
+		loff_t offset, loff_t length, unsigned int flags,
+		struct iomap *iomap, struct iomap *srcmap, bool delalloc)
+{
+	int ret, retries = 0;
+	struct ext4_map_blocks map;
+	ext4_get_blocks_t *get_blocks;
+
+	ret = ext4_emergency_state(inode->i_sb);
+	if (unlikely(ret))
+		return ret;
+
+	/* Inline data support is not yet available. */
+	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+		return -ERANGE;
+	if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
+		return -EINVAL;
+
+	if (delalloc)
+		get_blocks = ext4_da_map_blocks;
+	else
+		get_blocks = ext4_iomap_get_blocks;
+retry:
+	ret = ext4_iomap_map_blocks(inode, offset, length, get_blocks, &map);
+	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
+		goto retry;
+	if (ret < 0)
+		return ret;
+
+	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+	return 0;
+}
+
+static int ext4_iomap_buffered_write_begin(struct inode *inode,
+		loff_t offset, loff_t length, unsigned int flags,
+		struct iomap *iomap, struct iomap *srcmap)
+{
+	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
+						  iomap, srcmap, false);
+}
+
+static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
+		loff_t offset, loff_t length, unsigned int flags,
+		struct iomap *iomap, struct iomap *srcmap)
+{
+	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
+						  iomap, srcmap, true);
+}
+
+/*
+ * Drop the stale delayed allocation range left behind by a write failure,
+ * including both start and end blocks. Otherwise, we could leave a range
+ * of delayed extents covered by a clean folio, which could lead to
+ * inaccurate space reservation.
+ */
+static void ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
+				     loff_t length, struct iomap *iomap)
+{
+	down_write(&EXT4_I(inode)->i_data_sem);
+	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
+			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
+	up_write(&EXT4_I(inode)->i_data_sem);
+}
+
+static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
+					    loff_t length, ssize_t written,
+					    unsigned int flags,
+					    struct iomap *iomap)
+{
+	loff_t start_byte, end_byte;
+
+	/* If we didn't reserve the blocks, we're not allowed to punch them. */
+	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
+		return 0;
+
+	/* Nothing to do if we've written the entire delalloc extent */
+	start_byte = iomap_last_written_block(inode, offset, written);
+	end_byte = round_up(offset + length, i_blocksize(inode));
+	if (start_byte >= end_byte)
+		return 0;
+
+	filemap_invalidate_lock(inode->i_mapping);
+	iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
+				     iomap, ext4_iomap_punch_delalloc);
+	filemap_invalidate_unlock(inode->i_mapping);
+	return 0;
+}
+
+
+const struct iomap_ops ext4_iomap_buffered_write_ops = {
+	.iomap_begin = ext4_iomap_buffered_write_begin,
+};
+
+const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
+	.iomap_begin = ext4_iomap_buffered_da_write_begin,
+	.iomap_end = ext4_iomap_buffered_da_write_end,
+};
+
 const struct iomap_ops ext4_iomap_buffered_read_ops = {
 	.iomap_begin = ext4_iomap_buffered_read_begin,
 };
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..9bc294b769db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -104,9 +104,13 @@ static const struct fs_parameter_spec ext4_param_specs[];
  *   -> page lock -> i_data_sem (rw)
  *
  * buffered write path:
- * sb_start_write -> i_mutex -> mmap_lock
- * sb_start_write -> i_mutex -> transaction start -> page lock ->
- *   i_data_sem (rw)
+ * sb_start_write -> i_rwsem (w) -> mmap_lock
+ * - buffer_head path:
+ *   sb_start_write -> i_rwsem (w) -> transaction start -> folio lock ->
+ *     i_data_sem (rw)
+ * - iomap path:
+ *   sb_start_write -> i_rwsem (w) -> transaction start -> i_data_sem (rw)
+ *   sb_start_write -> i_rwsem (w) -> folio lock (not under an active handle)
  *
  * truncate:
  * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
-- 
2.52.0


^ permalink raw reply related

* [PATCH v3 05/22] ext4: implement buffered read path using iomap
From: Zhang Yi @ 2026-04-22  2:10 UTC (permalink / raw)
  To: linux-ext4, linux-fsdevel
  Cc: linux-kernel, tytso, adilger.kernel, libaokun, jack, ojaswin,
	ritesh.list, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260422021042.4157510-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Implement the iomap read path for ext4 by introducing a new
ext4_iomap_buffered_read_ops instance. This provides the read_folio()
and readahead() callbacks for ext4_iomap_aops. The implementation
introduces:

 - ext4_iomap_map_blocks(): Helper function to query extent mappings for
   a given read range using ext4_map_blocks() and convert the mapping
   information to iomap type.
 - ext4_iomap_buffered_read_begin(): The iomap_begin callback that maps
   blocks, validates filesystem state, and populates the iomap. It
   returns -ERANGE for inline data which is not yet supported.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/inode.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9e9f421888ed..df21f6870ec4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3908,14 +3908,57 @@ const struct iomap_ops ext4_iomap_report_ops = {
 	.iomap_begin = ext4_iomap_begin_report,
 };
 
+static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
+		loff_t length, struct ext4_map_blocks *map)
+{
+	u8 blkbits = inode->i_blkbits;
+
+	if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+		return -EINVAL;
+
+	/* Calculate the first and last logical blocks respectively. */
+	map->m_lblk = offset >> blkbits;
+	map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
+			   EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
+
+	return ext4_map_blocks(NULL, inode, map, 0);
+}
+
+static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
+		loff_t length, unsigned int flags, struct iomap *iomap,
+		struct iomap *srcmap)
+{
+	struct ext4_map_blocks map;
+	int ret;
+
+	if (unlikely(ext4_forced_shutdown(inode->i_sb)))
+		return -EIO;
+
+	/* Inline data support is not yet available. */
+	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
+		return -ERANGE;
+
+	ret = ext4_iomap_map_blocks(inode, offset, length, &map);
+	if (ret < 0)
+		return ret;
+
+	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
+	return 0;
+}
+
+const struct iomap_ops ext4_iomap_buffered_read_ops = {
+	.iomap_begin = ext4_iomap_buffered_read_begin,
+};
+
 static int ext4_iomap_read_folio(struct file *file, struct folio *folio)
 {
+	iomap_bio_read_folio(folio, &ext4_iomap_buffered_read_ops);
 	return 0;
 }
 
 static void ext4_iomap_readahead(struct readahead_control *rac)
 {
-
+	iomap_bio_readahead(rac, &ext4_iomap_buffered_read_ops);
 }
 
 static int ext4_iomap_writepages(struct address_space *mapping,
-- 
2.52.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox