Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* [PATCH v7 02/11] fstests: add _clone_mount_option() helper
From: Anand Jain @ 2026-06-17 11:20 UTC (permalink / raw)
  To: fstests
  Cc: linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel, zlang, hch,
	djwong
In-Reply-To: <cover.1781694879.git.asj@kernel.org>

Add a _clone_mount_option() helper function to handle filesystem-specific
requirements for mounting cloned devices. It abstracts the need for
-o nouuid on XFS.

Signed-off-by: Anand Jain <asj@kernel.org>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
 common/rc | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/common/rc b/common/rc
index d7e3e0bdfb1e..968ba33686f3 100644
--- a/common/rc
+++ b/common/rc
@@ -414,6 +414,19 @@ _scratch_mount_options()
 					$SCRATCH_DEV $SCRATCH_MNT
 }
 
+# Return filesystem-specific mount options required for mounting clone/snapshot
+# devices.
+_clone_mount_option()
+{
+	case "$FSTYP" in
+	xfs)
+		# Allow mounting a duplicate filesystem on the same host
+		echo "-o nouuid"
+		;;
+	*)
+	esac
+}
+
 _supports_filetype()
 {
 	local dir=$1
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 01/11] fstests: add _loop_image_create_clone() helper
From: Anand Jain @ 2026-06-17 11:20 UTC (permalink / raw)
  To: fstests
  Cc: linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel, zlang, hch,
	djwong
In-Reply-To: <cover.1781694879.git.asj@kernel.org>

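Introduce _loop_image_create_clone() and _loop_image_destroy().
_loop_image_create_clone() creates a filesystem on a loop-backed image file,
clones it to a second image file, and attaches a loop device to each.
_loop_image_destroy() detaches the loop devices and removes their backing
image files.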

Signed-off-by: Anand Jain <asj@kernel.org>
---
 common/rc | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/common/rc b/common/rc
index 79189e7e6e94..d7e3e0bdfb1e 100644
--- a/common/rc
+++ b/common/rc
@@ -1520,6 +1520,69 @@ _scratch_resvblks()
 	esac
 }
 
+# Create a small loop image, run an optional tuning function ($2) on it,
+# clone it, and attach both to loop devices, returned in ($1).
+# Args:
+#   $1: Nameref to return the array of allocated loop devices [base, clone].
+#   $2: Optional callback function to tune the base filesystem before cloning.
+_loop_image_create_clone()
+{
+	local -n _ret=$1
+	local pre_clone_tune_func="$2"
+	local img_file=$TEST_DIR/${seq}.img
+	local img_file_clone=$TEST_DIR/${seq}_clone.img
+	local size=$(_small_fs_size_mb 128) # Smallest possible
+	local loop_devs
+
+	# Since we copy the block device image, we keep its size small.
+	_require_fs_space $TEST_DIR $((size * 1024))
+
+	_create_file_sized $((size * 1024 * 1024)) $img_file ||
+				_fail "Failed: Create $img_file $size"
+
+	loop_devs=$(_create_loop_device $img_file)
+	_ret=($loop_devs)
+
+	case $FSTYP in
+	xfs)
+		_mkfs_dev "-s size=4096" ${loop_devs[0]}
+		;;
+	btrfs)
+		_mkfs_dev ${loop_devs[0]}
+		;;
+	*)
+		_mkfs_dev ${loop_devs[0]}
+		;;
+	esac
+
+	# Only execute if the function argument is not empty
+	if [ -n "$pre_clone_tune_func" ]; then
+		$pre_clone_tune_func ${loop_devs[0]}
+	fi
+
+	sync ${loop_devs[0]}
+	cp $img_file $img_file_clone
+
+	loop_devs="$loop_devs $(_create_loop_device $img_file_clone)"
+
+	_ret=($loop_devs)
+}
+
+# Teardown loop devices and delete their underlying backing image files.
+# Accepts a list of loop device paths (e.g., /dev/loop0 /dev/loop1).
+_loop_image_destroy()
+{
+	for d in "$@"; do
+		# Retrieve the path of the backing file
+		local f=$(losetup --noheadings --output BACK-FILE $d)
+
+		# Detach the loop device from the backing file
+		_destroy_loop_device "$d"
+
+		# Clean up the backing disk image file
+		[ -n "$f" ] && rm -f "$f"
+	done
+}
 
 # Repair scratch filesystem.  Returns 0 if the FS is good to go (either no
 # errors found or errors were fixed) and nonzero otherwise; also spits out
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 0/11] fstests: add test coverage for cloned filesystem ids
From: Anand Jain @ 2026-06-17 11:20 UTC (permalink / raw)
  To: fstests
  Cc: linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel, zlang, hch,
	djwong

v7:
. 803, 806: Trimmed down the UUID checks to only what is required, mountinfo
   and libblkid.
. 802: Dropped the unnecessary echo statements previously used for logical
   flow in the golden output. On second thought, it looks fine without them.
. Swapped _fixed_by_kernel_commit for _fixed_by_fs_commit.
. _clone_mount_option(): Now echoes directly from the case block itself.
. _require_unique_f_fsid(): add the link to the ref. discussions.

v6:
  https://lore.kernel.org/fstests/cover.1779939330.git.asj@kernel.org

v5:
  https://lore.kernel.org/fstests/cover.1779367627.git.asj@kernel.org

v4:
  https://lore.kernel.org/fstests/cover.1777357320.git.asj@kernel.org

v3:
  https://lore.kernel.org/fstests/cover.1777281778.git.asj@kernel.org

v2:
  https://lore.kernel.org/fstests/cover.1774090817.git.asj@kernel.org

v1:
  https://lore.kernel.org/fstests/cover.1772095513.git.asj@kernel.org

This series adds fstests infrastructure and test cases to verify correct
filesystem identity when a filesystem is cloned (block-level copy).
The tests cover inotify, fanotify, f_fsid, libblkid, IMA, exportfs file
handles, and libblkid verification with metadata_uuid.
  New helpers:
   _loop_image_create_clone() and _loop_image_destroy() to help create fs and clone
   _clone_mount_option() helper to apply per-filesystem clone mount options
   _change_metadata_uuid() changes the UUID before the clone

  New tests:
  - fanotify events are isolated between cloned filesystems
  - f_fsid is unique across cloned filesystem instances
  - libblkid correctly resolves duplicate UUIDs to distinct devices
    with and without metadata_uuid
  - IMA distinct identity for each cloned filesystem
  - exportfs file handles resolve correctly on cloned filesystems

Kernel Patches:
  Requires Btrfs kernel patches for all tests to pass.
   [1] https://lore.kernel.org/linux-btrfs/cover.1777281686.git.asj@kernel.org

Anand Jain (11):
  fstests: add _loop_image_create_clone() helper
  fstests: add _clone_mount_option() helper
  fstests: add FSNOTIFYWAIT_PROG
  fstests: add _require_unique_f_fsid() helper
  fstests: verify fanotify isolation on cloned filesystems
  fstests: verify f_fsid for cloned filesystems
  fstests: verify libblkid resolution of duplicate UUIDs
  fstests: verify IMA isolation on cloned filesystems
  fstests: verify exportfs file handles on cloned filesystems
  fstests: add _change_metadata_uuid helper
  fstests: test UUID consistency for clones with metadata_uuid

 common/config         |   1 +
 common/rc             | 120 +++++++++++++++++++++++++++++++++++++
 tests/generic/801     | 135 ++++++++++++++++++++++++++++++++++++++++++
 tests/generic/801.out |   7 +++
 tests/generic/802     |  64 ++++++++++++++++++++
 tests/generic/802.out |   4 ++
 tests/generic/803     |  72 ++++++++++++++++++++++
 tests/generic/803.out |   6 ++
 tests/generic/804     | 108 +++++++++++++++++++++++++++++++++
 tests/generic/804.out |  10 ++++
 tests/generic/805     |  80 +++++++++++++++++++++++++
 tests/generic/805.out |   2 +
 tests/generic/806     |  74 +++++++++++++++++++++++
 tests/generic/806.out |   6 ++
 14 files changed, 689 insertions(+)
 create mode 100644 tests/generic/801
 create mode 100644 tests/generic/801.out
 create mode 100644 tests/generic/802
 create mode 100644 tests/generic/802.out
 create mode 100644 tests/generic/803
 create mode 100644 tests/generic/803.out
 create mode 100644 tests/generic/804
 create mode 100644 tests/generic/804.out
 create mode 100644 tests/generic/805
 create mode 100644 tests/generic/805.out
 create mode 100644 tests/generic/806
 create mode 100644 tests/generic/806.out

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH 2/2] ext4: base unaligned DIO lock decision on partial block zeroing
From: Jan Kara @ 2026-06-17 11:08 UTC (permalink / raw)
  To: Baokun Li
  Cc: Zhang Yi, linux-ext4, tytso, adilger.kernel, jack, yi.zhang,
	ojaswin, ritesh.list, peng_wang
In-Reply-To: <4f4800dc-0a2c-48e2-9535-f32b21628bdb@linux.alibaba.com>

On Wed 17-06-26 15:52:24, Baokun Li wrote:
> On 2026/6/17 10:45, Zhang Yi wrote:
> > On 6/16/2026 9:10 PM, Baokun Li wrote:
> >> Thank you for your review!
> >>
> >> After extensive testing, I found that after merging this patch,
> >> generic/746
> >> started failing intermittently on ext3 (mkfs.ext4 -O ^extents).  The
> >> test
> >> triggers a "Page cache invalidation failure on direct I/O" warning, and
> >> subsequent fsync returns -EIO.
> >>
> >> The underlying race existed before this patch, but this patch appears to
> >> have widened the reproduction window considerably, so I thought it worth
> >> trying to address.  Here is my analysis:
> >>
> >> On no-extent inodes, DIO writes that hit holes cannot use unwritten
> >> extents.  ext4_iomap_alloc() leaves m_flags=0, so ext4_map_blocks()
> >> returns 0 for a hole, and:
> >>
> >>          if (!m_flags && !ret)
> >>                  ret = -ENOTBLK;
> >>
> >> The iomap layer returns -ENOTBLK to ext4, which falls back to buffered
> >> I/O.  The fallback path dirties pages in the page cache, then flushes
> >> and invalidates them.  However, concurrent async DIO completions to
> >> other blocks on the same inode can run
> >> kiocb_invalidate_post_direct_write()
> >> without holding the inode lock.
> >>
> >> Consider a file with two 4k extents: [hole][written].  Thread A does DIO
> >> to the written extent, while thread B does DIO spanning both extents:
> >>
> >>    kworker A (4k DIO, allocated block)    kworker B (8k DIO,
> >> hole->fallback)
> >>    -----------------------------------   
> >> -----------------------------------
> >>    inode_lock_shared()                    inode_lock_shared()
> >>    iomap_dio_rw():                        iomap_dio_rw():
> >>      kiocb_invalidate_pages -> clean        iomap_begin -> -ENOTBLK
> >>      submit_bio (async)                     dio->size = 0
> >>    inode_unlock_shared()                  inode_unlock_shared()
> >>
> >>    [bio pending in block layer]           /* fallback: inode lock
> >> released */
> >>                                           ext4_buffered_write_iter()
> >>                                             inode_lock(exclusive)
> >>                                             generic_perform_write()
> >>                                               -> dirty pages [0, 8k]
> >>                                             inode_unlock(exclusive)
> >>
> >>                                           /* pages still dirty here */
> >>    [bio completes]                        filemap_write_and_wait_range()
> >>    iomap_dio_complete()                     -> flush dirty pages
> >>      kiocb_invalidate_post_direct_write() invalidate_mapping_pages()
> >>        invalidate_inode_pages2_range()
> >>        -> finds dirty page!               /* window closed */
> >>        -> dio_warn_stale_pagecache()
> >>        -> errseq_set(-EIO)
> >>
> >
> > It looks like this issue occurs when invalidate_inode_pages2_range()
> > checks beyond the DIO write range, which may only happen when folio size
> > is larger than block size. Is that correct?
> Thanks for looking at this!
> 
> Not quite — the scenario involves an 8k file with layout
> 
>  [hole at 0-4k] [written extent at 4k-8k]
> 
> and two DIO threads. Thread A does a 4k DIO write at offset 4k; since
> the target block is a written extent, no fallback occurs. Thread B
> does an 8k DIO write at offset 0; since blocks 0-4k are a hole on an
> indirect-block inode and ext3 does not support unwritten extents,
> iomap returns -ENOTBLK and the entire 8k write falls back to buffered
> I/O.

Right, but for this to happen userspace had to submit two overlapping
direct IO writes. This always had undefined behavior so some inconsistent
content in the file is more or less acceptable. But as Zhang pointed out,
the same failure can also appear when block_size < folio_size and there we
should really strive to provide consistent data.

> >> The critical window is the gap between ext4_buffered_write_iter()
> >> dirtying
> >> pages and filemap_write_and_wait_range() flushing them.  In this
> >> window the
> >> inode lock is not held, so another thread's async DIO completion is
> >> free to
> >> invalidate the still-dirty pages in the page cache.
> >>
> >> This race has always existed on ext3 because indirect-block inodes lack
> >> unwritten-extent support.  However, the window was extremely narrow in
> >> practice, because the old ext4_overwrite_io() checked every block and
> >> would conservatively take an exclusive lock.  This patch replaced it
> >> with ext4_dio_needs_zeroing(), which only checks head and tail blocks,
> >> making unaligned DIO more likely to take a shared lock and
> >> proportionally increasing the chance of hitting the race.
> >>
> >> I tried a couple of alternatives before settling on the patch below:
> >>
> >> 1. Force exclusive lock + IOMAP_DIO_FORCE_WAIT for all no-extent DIO.
> >>     This closes the window for new DIO submissions, but does not protect
> >>     against bio completions from previously submitted async DIO, which
> >>     run independently of the inode lock.
> >>
> >> 2. Wrap the fallback dirty+flush+invalidate sequence in
> >>     filemap_invalidate_lock().  However, the ext4 DIO and iomap layers
> >>     do not use this lock, so it would not serialise against DIO
> >>     completions.
> >>
> >
> > Could we add a call to inode_dio_wait() before falling back to buffered
> > I/O? That is, in thread B, when falling back to buffered I/O, could we
> > acquire the exclusive inode lock and then call inode_dio_wait() to wait
> > for in-flight DIO to complete? This should close the race window. Since
> > scenarios where DIO writes to holes on ext3 are relatively rare, the
> > performance impact should be minimal (I suppose).
> >
> That's a great idea, thank you!
> 
> I had been trying to fix this on the DIO side and didn't consider
> waiting from the buffered fallback path.
> 
> I've tested the approach locally and it closes the race; I'll add a
> patch using it in the next version.

Yes, this looks like the best solution so far. The fallback doesn't have to
be fast. It was always - you are doing something stupid and we try to fixup
for you - kind of thing and bad performance is acceptable in that case.

> >> One straightforward approach that seems correct is to skip direct I/O
> >> for no-extent inodes entirely, by returning 0 from ext4_dio_alignment():
> >>
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -6131,6 +6131,8 @@ u32 ext4_dio_alignment(struct inode *inode)
> >>   {
> >>          if (fsverity_active(inode))
> >>                  return 0;
> >> +       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> >> +               return 0;
> >>          if (ext4_should_journal_data(inode))
> >>                  return 0;
> >>          if (ext4_has_inline_data(inode))
> >>
> >> With this, ext4_should_use_dio() returns false for no-extent inodes, and
> >> all I/O goes through ext4_buffered_write_iter() directly, bypassing the
> >> DIO path entirely.  On ext3, DIO to a hole already falls back to
> >> buffered
> >> I/O, so there is essentially no performance benefit to using DIO in the
> >> first place.
> >>
> >> Note that with this change, the fallback branch in
> >> ext4_dio_write_iter():
> >>
> >>          if (ret >= 0 && iov_iter_count(from)) {
> >>                  /* buffered fallback */
> >>          }
> >>
> >> would also become dead code for extent-based inodes (since unwritten
> >> extents guarantee iomap_dio_rw() never returns zero with unconsumed
> >> data), and could be removed in a follow-up cleanup.
> >>
> >> Thoughts?  Is there a reason to preserve DIO on no-extent inodes that
> >> I'm missing?
> >>
> >
> > Hmm, this would also cause DIO to fall back to buffered I/O in common
> > extending write cases, which I think would be unacceptable.
> 
> Fair point, the regression on extending writes is hard to justify.  That
> said, until we had a better fix, I'd argue a behavioural change was
> still preferable to potential data corruption. With the inode_dio_wait()
> approach above, this trade-off goes away. 

But heavily regressing performance for overwrites or extending DIO writes
even on indirect block based files is not really acceptable. There are
still users who for whatever reasons stay with old filesystems having
indirect block based files and they'd likely notice the regression.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v4 14/23] ext4: implement partial block zero range path using iomap
From: Brian Foster @ 2026-06-17 10:56 UTC (permalink / raw)
  To: Zhang Yi
  Cc: Jan Kara, linux-ext4, linux-fsdevel, linux-kernel, tytso,
	adilger.kernel, libaokun, ojaswin, ritesh.list, djwong, hch,
	yi.zhang, yizhang089, yangerkun, yukuai
In-Reply-To: <16cccb83-cdad-4113-8182-e8ea9e3049a2@huaweicloud.com>

On Wed, Jun 17, 2026 at 04:14:40PM +0800, Zhang Yi wrote:
> On 6/16/2026 8:28 PM, Jan Kara wrote:
> > On Mon 11-05-26 15:23:34, Zhang Yi wrote:
> >> From: Zhang Yi <yi.zhang@huawei.com>
> >>
> >> Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with
> >> ext4_iomap_block_zero_range() to implement block zeroing via the iomap
> >> infrastructure for ext4.
> >>
> >> ext4_iomap_block_zero_range() calls iomap_zero_range() with
> >> ext4_iomap_zero_begin() as the callback. The callback locates and zeros
> >> out either a mapped partial block or a dirty, unwritten partial block.
> >>
> >> Important constraints:
> >>
> >> Zeroing out under an active journal handle can cause deadlock, because
> >> the order of acquiring the folio lock and starting a handle is
> >> inconsistent with the iomap writeback path.
> >>
> >> Therefore, ext4_iomap_block_zero_range():
> >> - Must NOT be called under an active handle.
> >> - Cannot rely on data=ordered mode to ensure zeroed data persistence
> >>   before updating i_disksize (for the cases of post-EOF append write,
> >>   post-EOF fallocate, and truncate up). In subsequent patches, we will
> >>   address this by synchronizing commit I/O but not waiting for
> >>   completion, and updating i_disksize to i_size only after the zeroed
> >>   data has been written back.
> >>
> >> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> >> ---
> >>  fs/ext4/inode.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 92 insertions(+)
> >>
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index c6fe42d012fc..e0dae2501292 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -4101,6 +4101,51 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
> >>  	return 0;
> >>  }
> >>  
> >> +static int ext4_iomap_zero_begin(struct inode *inode,
> >> +		loff_t offset, loff_t length, unsigned int flags,
> >> +		struct iomap *iomap, struct iomap *srcmap)
> >> +{
> >> +	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
> > 
> > This looks like a layering violation to me. I don't think you can safely
> > assume the iomap you're passed is a part of iomap_iter...
> > 
> >> +	struct ext4_map_blocks map;
> >> +	u8 blkbits = inode->i_blkbits;
> >> +	unsigned int iomap_flags = 0;
> >> +	int ret;
> >> +
> >> +	ret = ext4_emergency_state(inode->i_sb);
> >> +	if (unlikely(ret))
> >> +		return ret;
> >> +
> >> +	if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
> >> +		return -EINVAL;
> >> +
> >> +	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
> >> +	if (ret < 0)
> >> +		return ret;
> >> +
> >> +	/*
> >> +	 * Look up dirty folios for unwritten mappings within EOF. Providing
> >> +	 * this bypasses the flush iomap uses to trigger extent conversion
> >> +	 * when unwritten mappings have dirty pagecache in need of zeroing.
> >> +	 */
> >> +	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> >> +		loff_t start = ((loff_t)map.m_lblk) << blkbits;
> >> +		loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;
> >> +
> >> +		iomap_fill_dirty_folios(iter, &start, end, &iomap_flags);
> >> +		if ((start >> blkbits) < map.m_lblk + map.m_len)
> >> +			map.m_len = (start >> blkbits) - map.m_lblk;
> >> +	}
> > 
> > ... and you need access to iter only for this which seems to be really a
> > hack that's trying to outsmart the iomap code. I have to admit I don't
> > fully understand what you are trying to achieve here. Are you trying to
> > avoid flushing of the range that will be zeroed out?
> 
> This logic is copied from the XFS and iomap infrastructure. Its primary
> purpose is to optimize the zeroing operations on dirty unwritten extents.
> It was introduced by Brian in [1].
> 
> The history as I understand it: originally, the iomap infrastructure
> could not zero dirty unwritten extents during zero range processing,
> which led to stale data exposure. XFS had to flush dirty ranges itself
> before zeroing — a workaround that was not generic.
> 
> In c5c810b94cf ("iomap: fix handling of dirty folios over unwritten
> extents"), Brian added an unconditional flush in the iomap
> infrastructure, ensuring that by the time zeroing runs the extent has
> already been converted to written so the zero can proceed correctly.
> However, this flush was too heavy and introduced noticeable performance
> overhead.
> 
> This was then optimized in 7d9b474ee4cc3 ("iomap: make zero range flush
> conditional on unwritten mappings"), which restricts flushing to only
> dirty pagecache over unwritten or hole mappings.
> 
> Brian later proposed a different approach: rather than relying on flush
> to convert the extent type, find dirty folios ahead of the zero range
> and zero the dirty unwritten extents directly. In [1] he added this
> lookup logic. The filesystem now supplies a folio batch (a collection of
> dirty folios) via the iomap begin callback, and zero range iterates over
> these dirty folios to perform zeroing. Clean regions not covered by the
> batch are simply skipped. This entirely eliminates the need to flush.
> 
> [1] https://lore.kernel.org/linux-xfs/20251003134642.604736-1-bfoster@redhat.com/
> 
> If I understand correctly, the current approach is a compromise, and
> Brian is still working on this. Perhaps ext4 and XFS could work together
> on improvements in the future?
> 

I think that about covers it!

I do agree wrt the iomap_iter thing in that it doesn't seem like the
most elegant thing. I considered that a bit of a roadblock when first
hacking on the batch stuff, but IIRC somebody pointed out that there was
precedent already so I didn't think too hard about it after that. Indeed
if you poke around, other filesystems use a similar pattern to access
iter->private for whatever private context is carried around.

FWIW, one of the longer term thoughts for the dirty folio stuff was to
eventually lift it out of the callback and just have iomap do it for the
fs. That would eliminate this particular pattern and probably clean
things up a bit, but there were also some other caveats with that that
aren't top of mind atm (IIRC, things like dealing with map trimming,
etc., but I haven't had a chance to think about it in a while).

Also note that this isn't necessarily a hard requirement. It's an
optional optimization. iomap will flush and retry in the dirty
pagecache+unwritten extent case if the fs hasn't otherwise provided
folios to make sure it zeroes properly, it's just that performance of
that may or may not be acceptable for your use case.

Brian

> > 
> >> +	ret = iomap_zero_range(inode, from, length, did_zero,
> >> +			       &ext4_iomap_zero_ops, &ext4_iomap_write_ops,
> >> +			       NULL);
> >> +	if (ret)
> >> +		return ret;
> >> +
> >> +	/*
> >> +	 * TODO: The iomap does not distinguish between different types of
> >> +	 * zeroing and always sets zero_written if a zeroing operation is
> >> +	 * performed, which may result in unnecessary order operations.
> >> +	 */
> > 
> > Is this still true after your fix to did_zero handling?
> 
> Yeah. Currently, iomap_zero_range() can only report whether a zeroing
> operation has occurred through did_zero parameter, but it cannot
> distinguish whether the zeroed range is a written extent that already
> exists on disk. That is, even if the zeroing is performed on a delalloc
> extent, did_zero will still return true.
> 
> Thanks,
> Yi.
> 
> > 
> >> +	if (did_zero && zero_written)
> >> +		*zero_written = *did_zero;
> >> +
> >> +	return 0;
> >> +}
> >> +
> >>  /*
> >>   * Zeros out a mapping of length 'length' starting from file offset
> >>   * 'from'.  The range to be zero'd must be contained with in one block.
> > 
> > 								Honza
> 


^ permalink raw reply

* Re: [PATCH 2/2] ext4: base unaligned DIO lock decision on partial block zeroing
From: Zhang Yi @ 2026-06-17 10:54 UTC (permalink / raw)
  To: Baokun Li, Zhang Yi
  Cc: linux-ext4, tytso, adilger.kernel, jack, ojaswin, ritesh.list,
	peng_wang
In-Reply-To: <4f4800dc-0a2c-48e2-9535-f32b21628bdb@linux.alibaba.com>

On 6/17/2026 3:52 PM, Baokun Li wrote:
> On 2026/6/17 10:45, Zhang Yi wrote:
>> Hi, Baokun!
>>
>> On 6/16/2026 9:10 PM, Baokun Li wrote:
>>> Hi all,
>>>
>>> Thank you for your review!
>>>
>>> After extensive testing, I found that after merging this patch,
>>> generic/746
>>> started failing intermittently on ext3 (mkfs.ext4 -O ^extents).  The
>>> test
>>> triggers a "Page cache invalidation failure on direct I/O" warning, and
>>> subsequent fsync returns -EIO.
>>>
>>> The underlying race existed before this patch, but this patch appears to
>>> have widened the reproduction window considerably, so I thought it worth
>>> trying to address.  Here is my analysis:
>>>
>>> On no-extent inodes, DIO writes that hit holes cannot use unwritten
>>> extents.  ext4_iomap_alloc() leaves m_flags=0, so ext4_map_blocks()
>>> returns 0 for a hole, and:
>>>
>>>          if (!m_flags && !ret)
>>>                  ret = -ENOTBLK;
>>>
>>> The iomap layer returns -ENOTBLK to ext4, which falls back to buffered
>>> I/O.  The fallback path dirties pages in the page cache, then flushes
>>> and invalidates them.  However, concurrent async DIO completions to
>>> other blocks on the same inode can run
>>> kiocb_invalidate_post_direct_write()
>>> without holding the inode lock.
>>>
>>> Consider a file with two 4k extents: [hole][written].  Thread A does DIO
>>> to the written extent, while thread B does DIO spanning both extents:
>>>
>>>    kworker A (4k DIO, allocated block)    kworker B (8k DIO,
>>> hole->fallback)
>>>    -----------------------------------   
>>> -----------------------------------
>>>    inode_lock_shared()                    inode_lock_shared()
>>>    iomap_dio_rw():                        iomap_dio_rw():
>>>      kiocb_invalidate_pages -> clean        iomap_begin -> -ENOTBLK
>>>      submit_bio (async)                     dio->size = 0
>>>    inode_unlock_shared()                  inode_unlock_shared()
>>>
>>>    [bio pending in block layer]           /* fallback: inode lock
>>> released */
>>>                                           ext4_buffered_write_iter()
>>>                                             inode_lock(exclusive)
>>>                                             generic_perform_write()
>>>                                               -> dirty pages [0, 8k]
>>>                                             inode_unlock(exclusive)
>>>
>>>                                           /* pages still dirty here */
>>>    [bio completes]                        filemap_write_and_wait_range()
>>>    iomap_dio_complete()                     -> flush dirty pages
>>>      kiocb_invalidate_post_direct_write() invalidate_mapping_pages()
>>>        invalidate_inode_pages2_range()
>>>        -> finds dirty page!               /* window closed */
>>>        -> dio_warn_stale_pagecache()
>>>        -> errseq_set(-EIO)
>>>
>>
>> It looks like this issue occurs when invalidate_inode_pages2_range()
>> checks beyond the DIO write range, which may only happen when folio size
>> is larger than block size. Is that correct?
> Thanks for looking at this!
> 
> Not quite — the scenario involves an 8k file with layout
> 
>  [hole at 0-4k] [written extent at 4k-8k]
> 
> and two DIO threads. Thread A does a 4k DIO write at offset 4k; since
> the target block is a written extent, no fallback occurs. Thread B
> does an 8k DIO write at offset 0; since blocks 0-4k are a hole on an
> indirect-block inode and ext3 does not support unwritten extents,
> iomap returns -ENOTBLK and the entire 8k write falls back to buffered
> I/O.
> 
> Normally the kernel would prevent concurrent BIO and DIO to
> overlapping ranges on the same file. But because Thread A holds only
> a shared inode lock (pure overwrite on a written extent), and
> Thread B's DIO has already returned -ENOTBLK before its buffered
> fallback begins, both paths can proceed concurrently:
> 
> 
> Thread A: 4k DIO at offset 4k     Thread B: 8k DIO at offset 0
> ─────────────────────────────     ─────────────────────────────
> kiocb_invalidate_pages            iomap_begin → -ENOTBLK
>   → page index 1 already clean      (indirect inode hole, m_flags=0)
> submit_bio (async)                dio->size = 0
> inode_unlock_shared()             inode_unlock_shared()
> 
>                                   ext4_buffered_write_iter()
> [bio pending]                       → dirty page 0 [0, 4k]
>                                     → dirty page 1 [4k, 8k]
>                                   inode_unlock()
>                                   // pages dirty, no lock
> 
> [bio completes]
> iomap_dio_complete():
>   kiocb_invalidate_post_direct_write()
>     start = 4096 >> 12 = 1
>     end = (8191) >> 12 = 1
>     invalidate_inode_pages2_range(1, 1)
>       → page 1 [4k,8k] is DIRTY
>       → -EBUSY → errseq_set(-EIO)
> 
> Page index 1 corresponds to file offset [4k, 8k], which is exactly
> Thread A's DIO range. The invalidation is not going beyond the DIO
> range — the dirty page was placed there by Thread B's buffered
> fallback, which wrote to [0, 8k] and dirtied the same page.
> 
> No large folio is needed; 4k pages and 4k blocks are sufficient.
> 
> From the user's perspective, when performing concurrent DIO on a
> holed ext3 file, the file contents can become corrupted with some
> probability. If the file is used as a loop device's backing file,
> this manifests as filesystem corruption inside the loop device.

Ha, fair enough. Thanks for the details.

Cheers,
Yi.


^ permalink raw reply

* Re: [PATCH v4 14/23] ext4: implement partial block zero range path using iomap
From: Jan Kara @ 2026-06-17 10:50 UTC (permalink / raw)
  To: Zhang Yi
  Cc: Jan Kara, linux-ext4, linux-fsdevel, linux-kernel, tytso,
	adilger.kernel, libaokun, ojaswin, ritesh.list, djwong, hch,
	yi.zhang, yizhang089, yangerkun, yukuai, Brian Foster
In-Reply-To: <16cccb83-cdad-4113-8182-e8ea9e3049a2@huaweicloud.com>

On Wed 17-06-26 16:14:40, Zhang Yi wrote:
> On 6/16/2026 8:28 PM, Jan Kara wrote:
> > On Mon 11-05-26 15:23:34, Zhang Yi wrote:
> >> From: Zhang Yi <yi.zhang@huawei.com>
> >>
> >> Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with
> >> ext4_iomap_block_zero_range() to implement block zeroing via the iomap
> >> infrastructure for ext4.
> >>
> >> ext4_iomap_block_zero_range() calls iomap_zero_range() with
> >> ext4_iomap_zero_begin() as the callback. The callback locates and zeros
> >> out either a mapped partial block or a dirty, unwritten partial block.
> >>
> >> Important constraints:
> >>
> >> Zeroing out under an active journal handle can cause deadlock, because
> >> the order of acquiring the folio lock and starting a handle is
> >> inconsistent with the iomap writeback path.
> >>
> >> Therefore, ext4_iomap_block_zero_range():
> >> - Must NOT be called under an active handle.
> >> - Cannot rely on data=ordered mode to ensure zeroed data persistence
> >>   before updating i_disksize (for the cases of post-EOF append write,
> >>   post-EOF fallocate, and truncate up). In subsequent patches, we will
> >>   address this by synchronizing commit I/O but not waiting for
> >>   completion, and updating i_disksize to i_size only after the zeroed
> >>   data has been written back.
> >>
> >> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> >> ---
> >>  fs/ext4/inode.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 92 insertions(+)
> >>
> >> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> >> index c6fe42d012fc..e0dae2501292 100644
> >> --- a/fs/ext4/inode.c
> >> +++ b/fs/ext4/inode.c
> >> @@ -4101,6 +4101,51 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
> >>  	return 0;
> >>  }
> >>  
> >> +static int ext4_iomap_zero_begin(struct inode *inode,
> >> +		loff_t offset, loff_t length, unsigned int flags,
> >> +		struct iomap *iomap, struct iomap *srcmap)
> >> +{
> >> +	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
> > 
> > This looks like a layering violation to me. I don't think you can safely
> > assume the iomap you're passed is a part of iomap_iter...
> > 
> >> +	struct ext4_map_blocks map;
> >> +	u8 blkbits = inode->i_blkbits;
> >> +	unsigned int iomap_flags = 0;
> >> +	int ret;
> >> +
> >> +	ret = ext4_emergency_state(inode->i_sb);
> >> +	if (unlikely(ret))
> >> +		return ret;
> >> +
> >> +	if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
> >> +		return -EINVAL;
> >> +
> >> +	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
> >> +	if (ret < 0)
> >> +		return ret;
> >> +
> >> +	/*
> >> +	 * Look up dirty folios for unwritten mappings within EOF. Providing
> >> +	 * this bypasses the flush iomap uses to trigger extent conversion
> >> +	 * when unwritten mappings have dirty pagecache in need of zeroing.
> >> +	 */
> >> +	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> >> +		loff_t start = ((loff_t)map.m_lblk) << blkbits;
> >> +		loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;
> >> +
> >> +		iomap_fill_dirty_folios(iter, &start, end, &iomap_flags);
> >> +		if ((start >> blkbits) < map.m_lblk + map.m_len)
> >> +			map.m_len = (start >> blkbits) - map.m_lblk;
> >> +	}
> > 
> > ... and you need access to iter only for this which seems to be really a
> > hack that's trying to outsmart the iomap code. I have to admit I don't
> > fully understand what you are trying to achieve here. Are you trying to
> > avoid flushing of the range that will be zeroed out?
> 
> This logic is copied from the XFS and iomap infrastructure. Its primary
> purpose is to optimize the zeroing operations on dirty unwritten extents.
> It was introduced by Brian in [1].

Ah, I see. I still find it hacky but apparently it is an established hack
in iomap :). Fair.

> The history as I understand it: originally, the iomap infrastructure
> could not zero dirty unwritten extents during zero range processing,
> which led to stale data exposure. XFS had to flush dirty ranges itself
> before zeroing — a workaround that was not generic.
> 
> In c5c810b94cf ("iomap: fix handling of dirty folios over unwritten
> extents"), Brian added an unconditional flush in the iomap
> infrastructure, ensuring that by the time zeroing runs the extent has
> already been converted to written so the zero can proceed correctly.
> However, this flush was too heavy and introduced noticeable performance
> overhead.
> 
> This was then optimized in 7d9b474ee4cc3 ("iomap: make zero range flush
> conditional on unwritten mappings"), which restricts flushing to only
> dirty pagecache over unwritten or hole mappings.
> 
> Brian later proposed a different approach: rather than relying on flush
> to convert the extent type, find dirty folios ahead of the zero range
> and zero the dirty unwritten extents directly. In [1] he added this
> lookup logic. The filesystem now supplies a folio batch (a collection of
> dirty folios) via the iomap begin callback, and zero range iterates over
> these dirty folios to perform zeroing. Clean regions not covered by the
> batch are simply skipped. This entirely eliminates the need to flush.
> 
> [1] https://lore.kernel.org/linux-xfs/20251003134642.604736-1-bfoster@redhat.com/

Thanks for the summary! So I was confused because somehow I thought this is
about fallocate(FALLOC_FL_ZERO_RANGE) and so I was wondering why we just
cannot evict the page cache and be done with that. Only after reading
everything again I've realized this is about zeroing partial blocks on hole
punch etc. And we may need to really handle multiple folios because XFS
also uses this mechanism to implement FALLOC_FL_ZERO_RANGE for zoned
storage. Ugh. OK, anyway for now this looks like your patch is following
how things are expected to be done so feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

> >> +	/*
> >> +	 * TODO: The iomap does not distinguish between different types of
> >> +	 * zeroing and always sets zero_written if a zeroing operation is
> >> +	 * performed, which may result in unnecessary order operations.
> >> +	 */
> > 
> > Is this still true after your fix to did_zero handling?
> 
> Yeah. Currently, iomap_zero_range() can only report whether a zeroing
> operation has occurred through did_zero parameter, but it cannot
> distinguish whether the zeroed range is a written extent that already
> exists on disk. That is, even if the zeroing is performed on a delalloc
> extent, did_zero will still return true.

So maybe write in the comment explicitly that this may result in
unnecessary flushing of folios if zeroing happened in
delayed-not-yet-allocated blocks?

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christian Brauner @ 2026-06-17  9:26 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260617062523.GA20041@lst.de>

> No, we don't need a secondary device number to sb mapping.  On the other
> hand we do need the device loss, freeze etc. upcalls to work for owners
> that are not file systems like mdraid or dm, even if they have been
> slow to pick this up.  The whole idea of the holder ops is to abstract
> away from who holds it instead of adding back the broken hard coding
> of the superblock.  Otherwise you're just badly reinventing get_super.

No, the expanded version works for all device numbers. There's also
no hard-coding. And non-fs users may do whatever they want with their
holder ops, of course. erofs always had the non-1:1 relationship between
devices and filesystems and for that case it seems sane. I'm happy to
let the series sit for a bit to gather input and do the security
mediation patches first. The series are complementary.

^ permalink raw reply

* Re: [PATCH v7 3/4] ext4: introduce ext4_put_ea_inode() for safe deferred iput
From: Zhou, Yun @ 2026-06-17  8:38 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-ext4, linux-kernel
In-Reply-To: <20260616151558.1728881-4-yun.zhou@windriver.com>

Hi Honza,
> Add ext4_put_ea_inode() which safely releases EA inode references:
> when SB_ACTIVE, it calls iput() directly (write_inode_now cannot be
> triggered); during mount (!SB_ACTIVE), it queues the inode on a per-sb
> lock-free llist and schedules a worker to call iput() in a clean
> context without holding any ext4 locks.
>
> Convert the iput in ext4_xattr_block_set()'s "Drop the previous xattr
> block" path to use ext4_xattr_inode_array_free_deferred(), which
> releases EA inodes via ext4_put_ea_inode().  This path previously called
> ext4_xattr_inode_array_free() (synchronous iput) while holding xattr_sem
> and a jbd2 handle.
>
> The worker is flushed in ext4_put_super() before journal destruction to
> ensure all pending EA inode cleanup completes while the journal is still
> available.
>
>   
> +static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
> +				struct ext4_xattr_inode_array *array)
> +{
> +	int idx;
> +
> +	if (array == NULL)
> +		return;
> +
> +	for (idx = 0; idx < array->count; ++idx)
> +		ext4_put_ea_inode(sb, array->inodes[idx]);
> +	kfree(array);
> +}
> +
> +struct ext4_ea_iput_entry {
> +	struct llist_node node;
> +	struct inode *inode;
> +};
> +
> +/*
> + * Worker function for deferred EA inode iput.  Processes all inodes queued
> + * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
> + */
> +void ext4_ea_inode_work(struct work_struct *work)
> +{
> +	struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
> +						s_ea_inode_work);
> +	struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free);
> +	struct llist_node *next;
> +
> +	while (node) {
> +		struct ext4_ea_iput_entry *entry = container_of(node,
> +				struct ext4_ea_iput_entry, node);
> +		next = node->next;
> +		iput(entry->inode);
> +		kfree(entry);
> +		node = next;
> +	}
> +}
> +
> +/*
> + * Release a VFS reference on an EA inode after ext4_xattr_inode_dec_ref()
> + * may have set i_nlink=0.  Must be used instead of iput() in any context
> + * where xattr_sem or a jbd2 handle is held, because eviction of a nlink=0
> + * inode can acquire those same locks.
> + *
> + * When SB_ACTIVE, eviction does not call write_inode_now() so direct
> + * iput() is safe.  During mount (!SB_ACTIVE), defer to a workqueue.
> + *
> + * For EA inode references dropped without a preceding dec_ref (e.g.,
> + * lookup-only paths where nlink remains >= 1), plain iput() is safe
> + * and preferred.
> + */
> +void ext4_put_ea_inode(struct super_block *sb, struct inode *inode)
> +{
> +	struct ext4_ea_iput_entry *entry;
> +
> +	if (!inode)
> +		return;
> +	if (sb->s_flags & SB_ACTIVE) {
> +		iput(inode);
> +		return;
> +	}
> +	entry = kmalloc(sizeof(*entry), GFP_NOFS | __GFP_NOFAIL);
> +	entry->inode = inode;
> +	llist_add(&entry->node, &EXT4_SB(sb)->s_ea_inode_to_free);
> +	schedule_work(&EXT4_SB(sb)->s_ea_inode_work);
> +}
> +
>
Could you check whether this is the approach you had in mind?

Thanks,
Yun

^ permalink raw reply

* Re: [PATCH v4 14/23] ext4: implement partial block zero range path using iomap
From: Zhang Yi @ 2026-06-17  8:14 UTC (permalink / raw)
  To: Jan Kara
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, ojaswin, ritesh.list, djwong, hch, yi.zhang, yizhang089,
	yangerkun, yukuai, Brian Foster
In-Reply-To: <c2q54d6u724xctkzwm6x7sbmg5cpvcackfz3toc47qts6iaj77@ci2czn4fqjik>

On 6/16/2026 8:28 PM, Jan Kara wrote:
> On Mon 11-05-26 15:23:34, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with
>> ext4_iomap_block_zero_range() to implement block zeroing via the iomap
>> infrastructure for ext4.
>>
>> ext4_iomap_block_zero_range() calls iomap_zero_range() with
>> ext4_iomap_zero_begin() as the callback. The callback locates and zeros
>> out either a mapped partial block or a dirty, unwritten partial block.
>>
>> Important constraints:
>>
>> Zeroing out under an active journal handle can cause deadlock, because
>> the order of acquiring the folio lock and starting a handle is
>> inconsistent with the iomap writeback path.
>>
>> Therefore, ext4_iomap_block_zero_range():
>> - Must NOT be called under an active handle.
>> - Cannot rely on data=ordered mode to ensure zeroed data persistence
>>   before updating i_disksize (for the cases of post-EOF append write,
>>   post-EOF fallocate, and truncate up). In subsequent patches, we will
>>   address this by synchronizing commit I/O but not waiting for
>>   completion, and updating i_disksize to i_size only after the zeroed
>>   data has been written back.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>  fs/ext4/inode.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 92 insertions(+)
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index c6fe42d012fc..e0dae2501292 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -4101,6 +4101,51 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
>>  	return 0;
>>  }
>>  
>> +static int ext4_iomap_zero_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap)
>> +{
>> +	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
> 
> This looks like a layering violation to me. I don't think you can safely
> assume the iomap you're passed is a part of iomap_iter...
> 
>> +	struct ext4_map_blocks map;
>> +	u8 blkbits = inode->i_blkbits;
>> +	unsigned int iomap_flags = 0;
>> +	int ret;
>> +
>> +	ret = ext4_emergency_state(inode->i_sb);
>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
>> +		return -EINVAL;
>> +
>> +	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	/*
>> +	 * Look up dirty folios for unwritten mappings within EOF. Providing
>> +	 * this bypasses the flush iomap uses to trigger extent conversion
>> +	 * when unwritten mappings have dirty pagecache in need of zeroing.
>> +	 */
>> +	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
>> +		loff_t start = ((loff_t)map.m_lblk) << blkbits;
>> +		loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;
>> +
>> +		iomap_fill_dirty_folios(iter, &start, end, &iomap_flags);
>> +		if ((start >> blkbits) < map.m_lblk + map.m_len)
>> +			map.m_len = (start >> blkbits) - map.m_lblk;
>> +	}
> 
> ... and you need access to iter only for this which seems to be really a
> hack that's trying to outsmart the iomap code. I have to admit I don't
> fully understand what you are trying to achieve here. Are you trying to
> avoid flushing of the range that will be zeroed out?

This logic is copied from the XFS and iomap infrastructure. Its primary
purpose is to optimize the zeroing operations on dirty unwritten extents.
It was introduced by Brian in [1].

The history as I understand it: originally, the iomap infrastructure
could not zero dirty unwritten extents during zero range processing,
which led to stale data exposure. XFS had to flush dirty ranges itself
before zeroing — a workaround that was not generic.

In c5c810b94cf ("iomap: fix handling of dirty folios over unwritten
extents"), Brian added an unconditional flush in the iomap
infrastructure, ensuring that by the time zeroing runs the extent has
already been converted to written so the zero can proceed correctly.
However, this flush was too heavy and introduced noticeable performance
overhead.

This was then optimized in 7d9b474ee4cc3 ("iomap: make zero range flush
conditional on unwritten mappings"), which restricts flushing to only
dirty pagecache over unwritten or hole mappings.

Brian later proposed a different approach: rather than relying on flush
to convert the extent type, find dirty folios ahead of the zero range
and zero the dirty unwritten extents directly. In [1] he added this
lookup logic. The filesystem now supplies a folio batch (a collection of
dirty folios) via the iomap begin callback, and zero range iterates over
these dirty folios to perform zeroing. Clean regions not covered by the
batch are simply skipped. This entirely eliminates the need to flush.

[1] https://lore.kernel.org/linux-xfs/20251003134642.604736-1-bfoster@redhat.com/

If I understand correctly, the current approach is a compromise, and
Brian is still working on this. Perhaps ext4 and XFS could work together
on improvements in the future?

> 
>> +	ret = iomap_zero_range(inode, from, length, did_zero,
>> +			       &ext4_iomap_zero_ops, &ext4_iomap_write_ops,
>> +			       NULL);
>> +	if (ret)
>> +		return ret;
>> +
>> +	/*
>> +	 * TODO: The iomap does not distinguish between different types of
>> +	 * zeroing and always sets zero_written if a zeroing operation is
>> +	 * performed, which may result in unnecessary order operations.
>> +	 */
> 
> Is this still true after your fix to did_zero handling?

Yeah. Currently, iomap_zero_range() can only report whether a zeroing
operation has occurred through did_zero parameter, but it cannot
distinguish whether the zeroed range is a written extent that already
exists on disk. That is, even if the zeroing is performed on a delalloc
extent, did_zero will still return true.

Thanks,
Yi.

> 
>> +	if (did_zero && zero_written)
>> +		*zero_written = *did_zero;
>> +
>> +	return 0;
>> +}
>> +
>>  /*
>>   * Zeros out a mapping of length 'length' starting from file offset
>>   * 'from'.  The range to be zero'd must be contained with in one block.
> 
> 								Honza

^ permalink raw reply

* Re: [PATCH 2/2] ext4: base unaligned DIO lock decision on partial block zeroing
From: Baokun Li @ 2026-06-17  7:52 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, tytso, adilger.kernel, jack, yi.zhang, ojaswin,
	ritesh.list, peng_wang
In-Reply-To: <d1adcf7c-c276-458d-9cac-68a4410f7626@gmail.com>

On 2026/6/17 10:45, Zhang Yi wrote:
> Hi, Baokun!
>
> On 6/16/2026 9:10 PM, Baokun Li wrote:
>> Hi all,
>>
>> Thank you for your review!
>>
>> After extensive testing, I found that after merging this patch,
>> generic/746
>> started failing intermittently on ext3 (mkfs.ext4 -O ^extents).  The
>> test
>> triggers a "Page cache invalidation failure on direct I/O" warning, and
>> subsequent fsync returns -EIO.
>>
>> The underlying race existed before this patch, but this patch appears to
>> have widened the reproduction window considerably, so I thought it worth
>> trying to address.  Here is my analysis:
>>
>> On no-extent inodes, DIO writes that hit holes cannot use unwritten
>> extents.  ext4_iomap_alloc() leaves m_flags=0, so ext4_map_blocks()
>> returns 0 for a hole, and:
>>
>>          if (!m_flags && !ret)
>>                  ret = -ENOTBLK;
>>
>> The iomap layer returns -ENOTBLK to ext4, which falls back to buffered
>> I/O.  The fallback path dirties pages in the page cache, then flushes
>> and invalidates them.  However, concurrent async DIO completions to
>> other blocks on the same inode can run
>> kiocb_invalidate_post_direct_write()
>> without holding the inode lock.
>>
>> Consider a file with two 4k extents: [hole][written].  Thread A does DIO
>> to the written extent, while thread B does DIO spanning both extents:
>>
>>    kworker A (4k DIO, allocated block)    kworker B (8k DIO, hole->fallback)
>>    -----------------------------------    -----------------------------------
>>    inode_lock_shared()                    inode_lock_shared()
>>    iomap_dio_rw():                        iomap_dio_rw():
>>      kiocb_invalidate_pages -> clean        iomap_begin -> -ENOTBLK
>>      submit_bio (async)                     dio->size = 0
>>    inode_unlock_shared()                  inode_unlock_shared()
>>
>>    [bio pending in block layer]           /* fallback: inode lock released */
>>                                           ext4_buffered_write_iter()
>>                                             inode_lock(exclusive)
>>                                             generic_perform_write()
>>                                               -> dirty pages [0, 8k]
>>                                             inode_unlock(exclusive)
>>
>>                                           /* pages still dirty here */
>>    [bio completes]                        filemap_write_and_wait_range()
>>    iomap_dio_complete()                     -> flush dirty pages
>>      kiocb_invalidate_post_direct_write() invalidate_mapping_pages()
>>        invalidate_inode_pages2_range()
>>        -> finds dirty page!               /* window closed */
>>        -> dio_warn_stale_pagecache()
>>        -> errseq_set(-EIO)
>>
>
> It looks like this issue occurs when invalidate_inode_pages2_range()
> checks beyond the DIO write range, which may only happen when folio size
> is larger than block size. Is that correct?
Thanks for looking at this!

Not quite — the scenario involves an 8k file with layout

 [hole at 0-4k] [written extent at 4k-8k]

and two DIO threads. Thread A does a 4k DIO write at offset 4k; since
the target block is a written extent, no fallback occurs. Thread B
does an 8k DIO write at offset 0; since blocks 0-4k are a hole on an
indirect-block inode and ext3 does not support unwritten extents,
iomap returns -ENOTBLK and the entire 8k write falls back to buffered
I/O.

Normally the kernel would prevent concurrent buffered I/O and DIO to
overlapping ranges on the same file. But because Thread A holds only
a shared inode lock (pure overwrite on a written extent), and
Thread B's DIO has already returned -ENOTBLK before its buffered
fallback begins, both paths can proceed concurrently:


Thread A: 4k DIO at offset 4k     Thread B: 8k DIO at offset 0
─────────────────────────────     ─────────────────────────────
kiocb_invalidate_pages            iomap_begin → -ENOTBLK
  → page index 1 already clean      (indirect inode hole, m_flags=0)
submit_bio (async)                dio->size = 0
inode_unlock_shared()             inode_unlock_shared()

                                  ext4_buffered_write_iter()
[bio pending]                       → dirty page 0 [0, 4k]
                                    → dirty page 1 [4k, 8k]
                                  inode_unlock()
                                  // pages dirty, no lock

[bio completes]
iomap_dio_complete():
  kiocb_invalidate_post_direct_write()
    start = 4096 >> 12 = 1
    end = (8191) >> 12 = 1
    invalidate_inode_pages2_range(1, 1)
      → page 1 [4k,8k] is DIRTY
      → -EBUSY → errseq_set(-EIO)

Page index 1 corresponds to file offset [4k, 8k], which is exactly
Thread A's DIO range. The invalidation is not going beyond the DIO
range — the dirty page was placed there by Thread B's buffered
fallback, which wrote to [0, 8k] and dirtied the same page.

No large folio is needed; 4k pages and 4k blocks are sufficient.

From the user's perspective, when performing concurrent DIO on a
holed ext3 file, the file contents can become corrupted with some
probability. If the file is used as a loop device's backing file,
this manifests as filesystem corruption inside the loop device.
>
>> The critical window is the gap between ext4_buffered_write_iter()
>> dirtying
>> pages and filemap_write_and_wait_range() flushing them.  In this
>> window the
>> inode lock is not held, so another thread's async DIO completion is
>> free to
>> invalidate the still-dirty pages in the page cache.
>>
>> This race has always existed on ext3 because indirect-block inodes lack
>> unwritten-extent support.  However, the window was extremely narrow in
>> practice, because the old ext4_overwrite_io() checked every block and
>> would conservatively take an exclusive lock.  This patch replaced it
>> with ext4_dio_needs_zeroing(), which only checks head and tail blocks,
>> making unaligned DIO more likely to take a shared lock and
>> proportionally increasing the chance of hitting the race.
>>
>> I tried a couple of alternatives before settling on the patch below:
>>
>> 1. Force exclusive lock + IOMAP_DIO_FORCE_WAIT for all no-extent DIO.
>>     This closes the window for new DIO submissions, but does not protect
>>     against bio completions from previously submitted async DIO, which
>>     run independently of the inode lock.
>>
>> 2. Wrap the fallback dirty+flush+invalidate sequence in
>>     filemap_invalidate_lock().  However, the ext4 DIO and iomap layers
>>     do not use this lock, so it would not serialise against DIO
>>     completions.
>>
>
> Could we add a call to inode_dio_wait() before falling back to buffered
> I/O? That is, in thread B, when falling back to buffered I/O, could we
> acquire the exclusive inode lock and then call inode_dio_wait() to wait
> for in-flight DIO to complete? This should close the race window. Since
> scenarios where DIO writes to holes on ext3 are relatively rare, the
> performance impact should be minimal (I suppose).
>
That's a great idea, thank you!

I had been trying to fix this on the DIO side and didn't consider
waiting from the buffered fallback path.

I've tested the approach locally and it closes the race; I'll add a
patch using it in the next version.
>> One straightforward approach that seems correct is to skip direct I/O
>> for no-extent inodes entirely, by returning 0 from ext4_dio_alignment():
>>
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -6131,6 +6131,8 @@ u32 ext4_dio_alignment(struct inode *inode)
>>   {
>>          if (fsverity_active(inode))
>>                  return 0;
>> +       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>> +               return 0;
>>          if (ext4_should_journal_data(inode))
>>                  return 0;
>>          if (ext4_has_inline_data(inode))
>>
>> With this, ext4_should_use_dio() returns false for no-extent inodes, and
>> all I/O goes through ext4_buffered_write_iter() directly, bypassing the
>> DIO path entirely.  On ext3, DIO to a hole already falls back to
>> buffered
>> I/O, so there is essentially no performance benefit to using DIO in the
>> first place.
>>
>> Note that with this change, the fallback branch in
>> ext4_dio_write_iter():
>>
>>          if (ret >= 0 && iov_iter_count(from)) {
>>                  /* buffered fallback */
>>          }
>>
>> would also become dead code for extent-based inodes (since unwritten
>> extents guarantee iomap_dio_rw() never returns zero with unconsumed
>> data), and could be removed in a follow-up cleanup.
>>
>> Thoughts?  Is there a reason to preserve DIO on no-extent inodes that
>> I'm missing?
>>
>
> Hmm, this would also cause DIO to fall back to buffered I/O in common
> extending write cases, which I think would be unacceptable.
>

Fair point, the regression on extending writes is hard to justify.  That
said, until we had a better fix, I'd argue a behavioural change was
still preferable to potential data corruption. With the inode_dio_wait()
approach above, this trade-off goes away. 


Thanks,
Baokun



^ permalink raw reply

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christoph Hellwig @ 2026-06-17  6:25 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Christoph Hellwig, Jan Kara, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260616-fragil-duktus-nachverfolgen-60f54584c206@brauner>

On Tue, Jun 16, 2026 at 04:59:53PM +0200, Christian Brauner wrote:
> > Err, no.  block devices need to have a specific owner.  If erofs wants
> > to share a device between superblock it needs to come up with an entity
> > that owns the block devices which is not a superblock.
> 
> It already did.
> 
> > IMHO sharing devices between superblocks is a bad idea, but that ship
> > has sailed, but please keep it contained inside of erofs.
> 
> We need a simple device number to superblock mapping anyway and that can
> simply be centralized in the vfs. And it can work with anon device
> numbers and block device numbers uniformly.

No, we don't need a secondary device number to sb mapping.  On the other
hand we do need the device loss, freeze etc. upcalls to work for owners
that are not file systems like mdraid or dm, even if they have been
slow to pick this up.  The whole idea of the holder ops is to abstract
away from who holds it instead of adding back the broken hard coding
of the superblock.  Otherwise you're just badly reinventing get_super.

If erofs already has an owner entity it just needs custom holder ops for
that.

^ permalink raw reply

* Re: [PATCH RFC v2 15/18] f2fs: open via dedicated fs bdev helpers
From: Chao Yu @ 2026-06-17  3:17 UTC (permalink / raw)
  To: Christian Brauner, Jan Kara
  Cc: chao, Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-15-7df6b864028e@kernel.org>

On 6/16/26 22:08, Christian Brauner wrote:
> Route the extra device opens of a multi-device f2fs through
> fs_bdev_file_open_by_path() so each device is registered against the
> superblock, and convert the matching release in destroy_device_list()
> to fs_bdev_file_release(). The first device aliases the main bdev file
> opened by setup_bdev_super() and is already registered through it.
> 
> f2fs opened its extra devices without holder ops, so a freeze, sync, or
> removal of one of them was never propagated to the superblock.
> Registering them wires those events up: every device now freezes,
> thaws, syncs, and shuts down the filesystem like the main device does.
> 
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

Acked-by: Chao Yu <chao@kernel.org>

Thanks,

^ permalink raw reply

* Re: [PATCH 2/2] ext4: base unaligned DIO lock decision on partial block zeroing
From: Zhang Yi @ 2026-06-17  2:45 UTC (permalink / raw)
  To: Baokun Li, linux-ext4
  Cc: tytso, adilger.kernel, jack, yi.zhang, ojaswin, ritesh.list,
	peng_wang
In-Reply-To: <060f63e0-d64f-40df-99a7-af53862049ee@linux.alibaba.com>

Hi, Baokun!

On 6/16/2026 9:10 PM, Baokun Li wrote:
> Hi all,
> 
> Thank you for your review!
> 
> After extensive testing, I found that after merging this patch, generic/746
> started failing intermittently on ext3 (mkfs.ext4 -O ^extents).  The test
> triggers a "Page cache invalidation failure on direct I/O" warning, and
> subsequent fsync returns -EIO.
> 
> The underlying race existed before this patch, but this patch appears to
> have widened the reproduction window considerably, so I thought it worth
> trying to address.  Here is my analysis:
> 
> On no-extent inodes, DIO writes that hit holes cannot use unwritten
> extents.  ext4_iomap_alloc() leaves m_flags=0, so ext4_map_blocks()
> returns 0 for a hole, and:
> 
>          if (!m_flags && !ret)
>                  ret = -ENOTBLK;
> 
> The iomap layer returns -ENOTBLK to ext4, which falls back to buffered
> I/O.  The fallback path dirties pages in the page cache, then flushes
> and invalidates them.  However, concurrent async DIO completions to
> other blocks on the same inode can run kiocb_invalidate_post_direct_write()
> without holding the inode lock.
> 
> Consider a file with two 4k extents: [hole][written].  Thread A does DIO
> to the written extent, while thread B does DIO spanning both extents:
> 
>    kworker A (4k DIO, allocated block)    kworker B (8k DIO, hole->fallback)
>    -----------------------------------    -----------------------------------
>    inode_lock_shared()                    inode_lock_shared()
>    iomap_dio_rw():                        iomap_dio_rw():
>      kiocb_invalidate_pages -> clean        iomap_begin -> -ENOTBLK
>      submit_bio (async)                     dio->size = 0
>    inode_unlock_shared()                  inode_unlock_shared()
> 
>    [bio pending in block layer]           /* fallback: inode lock released */
>                                           ext4_buffered_write_iter()
>                                             inode_lock(exclusive)
>                                             generic_perform_write()
>                                               -> dirty pages [0, 8k]
>                                             inode_unlock(exclusive)
> 
>                                           /* pages still dirty here */
>    [bio completes]                        filemap_write_and_wait_range()
>    iomap_dio_complete()                     -> flush dirty pages
>      kiocb_invalidate_post_direct_write() invalidate_mapping_pages()
>        invalidate_inode_pages2_range()
>        -> finds dirty page!               /* window closed */
>        -> dio_warn_stale_pagecache()
>        -> errseq_set(-EIO)
> 

It looks like this issue occurs when invalidate_inode_pages2_range()
checks beyond the DIO write range, which may only happen when folio size
is larger than block size. Is that correct?

> The critical window is the gap between ext4_buffered_write_iter() dirtying
> pages and filemap_write_and_wait_range() flushing them.  In this window the
> inode lock is not held, so another thread's async DIO completion is free to
> invalidate the still-dirty pages in the page cache.
> 
> This race has always existed on ext3 because indirect-block inodes lack
> unwritten-extent support.  However, the window was extremely narrow in
> practice, because the old ext4_overwrite_io() checked every block and
> would conservatively take an exclusive lock.  This patch replaced it
> with ext4_dio_needs_zeroing(), which only checks head and tail blocks,
> making unaligned DIO more likely to take a shared lock and
> proportionally increasing the chance of hitting the race.
> 
> I tried a couple of alternatives before settling on the patch below:
> 
> 1. Force exclusive lock + IOMAP_DIO_FORCE_WAIT for all no-extent DIO.
>     This closes the window for new DIO submissions, but does not protect
>     against bio completions from previously submitted async DIO, which
>     run independently of the inode lock.
> 
> 2. Wrap the fallback dirty+flush+invalidate sequence in
>     filemap_invalidate_lock().  However, the ext4 DIO and iomap layers
>     do not use this lock, so it would not serialise against DIO
>     completions.
> 

Could we add a call to inode_dio_wait() before falling back to buffered
I/O? That is, in thread B, when falling back to buffered I/O, could we
acquire the exclusive inode lock and then call inode_dio_wait() to wait
for in-flight DIO to complete? This should close the race window. Since
scenarios where DIO writes to holes on ext3 are relatively rare, the
performance impact should be minimal (I suppose).

> One straightforward approach that seems correct is to skip direct I/O
> for no-extent inodes entirely, by returning 0 from ext4_dio_alignment():
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -6131,6 +6131,8 @@ u32 ext4_dio_alignment(struct inode *inode)
>   {
>          if (fsverity_active(inode))
>                  return 0;
> +       if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
> +               return 0;
>          if (ext4_should_journal_data(inode))
>                  return 0;
>          if (ext4_has_inline_data(inode))
> 
> With this, ext4_should_use_dio() returns false for no-extent inodes, and
> all I/O goes through ext4_buffered_write_iter() directly, bypassing the
> DIO path entirely.  On ext3, DIO to a hole already falls back to buffered
> I/O, so there is essentially no performance benefit to using DIO in the
> first place.
> 
> Note that with this change, the fallback branch in ext4_dio_write_iter():
> 
>          if (ret >= 0 && iov_iter_count(from)) {
>                  /* buffered fallback */
>          }
> 
> would also become dead code for extent-based inodes (since unwritten
> extents guarantee iomap_dio_rw() never returns zero with unconsumed
> data), and could be removed in a follow-up cleanup.
> 
> Thoughts?  Is there a reason to preserve DIO on no-extent inodes that
> I'm missing?
> 

Hmm, this would also cause DIO to fall back to buffered I/O in common
extending write cases, which I think would be unacceptable.

Cheers,
Yi.

> Looking forward to your feedback.
> 
> 
> Thanks,
> Baokun
> 
> 
> 


^ permalink raw reply

* Re: [PATCH v3 0/3] f2fs: support encrypted inline data
From: Eric Biggers @ 2026-06-16 23:02 UTC (permalink / raw)
  To: LiaoYuanhong-vivo
  Cc: chao, corbet, jaegeuk, linux-doc, linux-ext4, linux-f2fs-devel,
	linux-fscrypt, linux-kernel, skhan, tytso
In-Reply-To: <20260616094612.45505-1-liaoyuanhong@vivo.com>

On Tue, Jun 16, 2026 at 05:46:12PM +0800, LiaoYuanhong-vivo wrote:
> Could you share more about the direction you have in mind for simplifying
> f2fs/ext4 contents encryption around blk-crypto?

Currently ext4 and f2fs each have two implementations of file contents
encryption and decryption:

- One where the en/decryption is done in the filesystem layer

- One where the filesystem attaches a bio_crypt_ctx to the bios and the
  en/decryption is done either in the block layer by blk-crypto-fallback
  or by inline encryption hardware

I'd like to drop the first one, for simplicity and to reduce the burden
on ongoing developments like large folio support.

> For f2fs inline_data, there is still a real space-saving benefit on phones,
> since many encrypted files are smaller than 4K. Is there any acceptable
> future direction to support this kind of inode-resident data with
> blk-crypto or hardware-wrapped keys?

It is incompatible with inline encryption hardware.  A CPU-based
solution like Intel Key Locker or RISC-V High Assurance Cryptography
could provide similar security properties.  But there's nothing for
arm64 yet.  And I should mention that no one has wanted to use Key
Locker anyway because it's really slow.

- Eric

^ permalink raw reply

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Gao Xiang @ 2026-06-16 16:35 UTC (permalink / raw)
  To: Christoph Hellwig, Christian Brauner
  Cc: Jan Kara, Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260616123443.GA21024@lst.de>

On 2026/6/16 20:34, Christoph Hellwig wrote:

> IMHO sharing devices between superblocks is a bad idea, but that ship
> has sailed, but please keep it contained inside of erofs.

I'm not sure why it's a bad idea, for example,
the immutable layer model is already applied to layered virtual
block formats (such as qcow2) and layered fs like overlayfs.

and I think device mappers may have some similar immutable
approaches as shared layers but work in a slightly different
way.

The principle is that each instance uses shared blobs in a
read-only way, and that is almost the simplest and safest way
to share data among filesystem instances.

Yet I don't want to argue with that since it's been pretty common
for years and I've seen no practical risk using this model.

Thanks,
Gao Xiang

^ permalink raw reply

* [PATCH v7 0/4] ext4: fix xattr iput deadlock with s_writepages_rwsem
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou

This series fixes a circular lock dependency reported by syzbot:

  s_writepages_rwsem --> jbd2_handle --> xattr_sem --> s_writepages_rwsem

The deadlock occurs when iput() on an EA inode triggers write_inode_now()
while xattr_sem and a jbd2 handle are held.  The triggering path is
during mount-time orphan cleanup (!SB_ACTIVE) where iput_final() calls
write_inode_now() synchronously.

Patch 1 blocks the deadlock by skipping extra isize expansion when
!SB_ACTIVE -- this prevents the xattr manipulation path from being
entered during mount.

Patch 2 is a belt-and-suspenders semantic improvement: an inode under
eviction never needs extra isize expansion.

Patches 3-4 are a structural improvement using a per-sb workqueue:

  Patch 3 introduces ext4_put_ea_inode(), which does direct iput() when
  SB_ACTIVE (zero overhead) and defers to a workqueue when !SB_ACTIVE.
  It also converts the first call site (ext4_xattr_block_set release
  path) which previously called iput under xattr_sem + jbd2 handle.

  Patch 4 converts the remaining EA inode iput() calls that execute
  under locks.  Sites where direct iput() is provably safe (i_nlink=0
  after dec_ref, or lookup-only paths) are left unchanged with comments.

Link: https://syzkaller.appspot.com/bug?extid=5d19358d7eb30ffb0cc5

v7:
 - Replaced the deferred-iput array threading approach (v4-v6) with a
   simpler per-sb workqueue + lock-free llist design.  No function
   signature changes needed.  ext4_put_ea_inode() does direct iput when
   SB_ACTIVE (zero overhead in normal operation) and defers to the
   workqueue only during mount (!SB_ACTIVE).
 - Converted the iput in ext4_xattr_delete_inode()'s quota accounting
   loop to ext4_put_ea_inode() to eliminate a lockdep-reportable lock
   ordering violation (jbd2_handle -> iput -> s_writepages_rwsem).
 - Moved flush_work() before the if (sbi->s_journal) check in
   ext4_put_super() to cover nojournal mode.

v6:
 - ext4_inline_data_truncate(): use local ea_inode_array instead of
   passing NULL, freed after ext4_journal_stop().  Fixes a deadlock
   reachable via crafted filesystem where inline data xattr entry has
   e_value_inum set: orphan cleanup -> ext4_truncate ->
   ext4_inline_data_truncate -> iput under !SB_ACTIVE.

v5:
 - Split into 3 patches for easier review.
 - Add explicit !SB_ACTIVE early-return in ext4_try_to_expand_extra_isize()
   to block ALL mount-time paths (ext4_process_orphan -> ext4_truncate ->
   ext4_mark_inode_dirty), not just the eviction path. v4 only relied on
   EXT4_STATE_NO_EXPAND which doesn't cover orphan truncation.

v4:
 - Comprehensive rewrite of the deferred iput mechanism.
 - Thread ea_inode_array through ext4_expand_extra_isize_ea() and
   ext4_xattr_move_to_block() so ALL ea_inode iputs in the expand
   path are deferred, not just those in ext4_xattr_block_set().
 - Add NULL safety to ext4_expand_inode_array(): when ea_inode_array
   pointer is NULL, fall back to synchronous iput (for callers like
   ext4_initxattrs that only run with SB_ACTIVE).
 - Use __GFP_NOFAIL to guarantee deferred array growth, eliminating
   fallback to synchronous iput under locks.
 - Update ext4_xattr_ibody_set() and ext4_xattr_set_entry() signatures
   to accept ea_inode_array, converting ALL iput(ea_inode) calls.
 - Set EXT4_STATE_NO_EXPAND in ext4_evict_inode() before
   ext4_mark_inode_dirty().

v3:
 - Check ext4_expand_inode_array() return value; fallback to
   direct iput() on ENOMEM to prevent inode leak.
 - Make ext4_xattr_set_handle() take an optional ea_inode_array
   output parameter so callers can free after ext4_journal_stop(),
   avoiding the jbd2_handle vs s_writepages_rwsem AB-BA.
 - Pass ea_inode_array directly to ext4_xattr_release_block()
   instead of using a local array freed under xattr_sem.
 - Move ext4_xattr_inode_array_free() after ext4_journal_stop()

v2:
 - Defer iput() in ext4_xattr_block_set() via ea_inode_array,
   freed after xattr_sem is released. Fixes the root cause.

v1:
 - Set EXT4_STATE_NO_EXPAND in ext4_evict_inode() to skip expand
   on inodes being deleted. Only fixes the syzbot reproducer, not
   the underlying lock ordering violation.

Yun Zhou (4):
  ext4: skip extra isize expansion during mount to prevent deadlock
  ext4: set EXT4_STATE_NO_EXPAND in ext4_evict_inode
  ext4: introduce ext4_put_ea_inode() for safe deferred iput
  ext4: convert remaining EA inode iput() calls to ext4_put_ea_inode()

 fs/ext4/ext4.h  |   5 +++
 fs/ext4/inode.c |  11 +++++
 fs/ext4/super.c |   6 +++
 fs/ext4/xattr.c | 105 +++++++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/xattr.h |   2 +
 5 files changed, 120 insertions(+), 9 deletions(-)

-- 
2.43.0


^ permalink raw reply

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christian Brauner @ 2026-06-16 15:19 UTC (permalink / raw)
  To: Christoph Hellwig, Jan Kara
  Cc: Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260616-fragil-duktus-nachverfolgen-60f54584c206@brauner>

On Tue, Jun 16, 2026 at 04:59:53PM +0200, Christian Brauner wrote:
> On Tue, Jun 16, 2026 at 02:34:43PM +0200, Christoph Hellwig wrote:
> > On Tue, Jun 02, 2026 at 12:10:08PM +0200, Christian Brauner wrote:
> > > fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> > > forces the holder to be exactly one superblock and prevents several
> > > superblocks from sharing one block device. That's what erofs is doing.
> > > 
> > > Introduce a global dev_t-keyed rhltable mapping each block device to the
> > > superblock(s) using it. The holder argument becomes purely the block
> > > layer's exclusivity token (a superblock, or a file_system_type for
> > > shared devices) and is no longer needed by the fs specific callbacks.
> > 
> > Err, no.  block devices need to have a specific owner.  If erofs wants
> > to share a device between superblock it needs to come up with an entity
> > that owns the block devices which is not a superblock.
> 
> It already did.
> 
> > IMHO sharing devices between superblocks is a bad idea, but that ship
> > has sailed, but please keep it contained inside of erofs.
> 
> We need a simple device number to superblock mapping anyway and that can
> simply be centralized in the vfs. And it can work with anon device
> numbers and block device numbers uniformly.

Plus, after we're done we then also have a central place where we can
intercept what devices can be mounted by a filesystem uniformly.

My first approach for this was of course to just add fs_file_open_by_*()
wrappers and move the relevant security hook into there. But while doing
this - ignoring the ton of bugs I found - I realized that having a
mapping so we can go from device number to superblock is very helpful.

We could of course keep the mapping just local to erofs but I see no
reason why the vfs cannot just provide this ability natively given that
it has all the required machinery. I'll let Jan chime in as well.

^ permalink raw reply

* [PATCH v7 4/4] ext4: convert remaining EA inode iput() calls to ext4_put_ea_inode()
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260616151558.1728881-1-yun.zhou@windriver.com>

Convert all remaining iput() calls on EA inodes that execute under
xattr_sem or a jbd2 handle to use ext4_put_ea_inode().  With i_nlink>=1
and !SB_ACTIVE, a direct iput() would trigger write_inode_now() ->
s_writepages_rwsem, creating a lock ordering violation with the caller's
active jbd2 handle.

Converted sites and why defer is necessary:

- ext4_xattr_inode_inc_ref_all() cleanup: dec_ref undoes the failed
  inc_ref, but the EA inode may be shared so i_nlink remains 1.

- ext4_xattr_inode_lookup_create() out_err: may be a cache-found inode
  where inc_ref failed; i_nlink remains 1.

- ext4_xattr_set_entry() old_ea_inode: dec_ref was called but the EA
  inode may be shared by other xattr blocks, so i_nlink remains 1.

- ext4_xattr_block_set() new block path: dec_ref drops the "extra" ref
  but inc_ref_all added another, so i_nlink stays 1.

- ext4_xattr_block_set() cleanup: on success no dec_ref was called
  (i_nlink=1); on error dec_ref may leave i_nlink=1 if shared.

- ext4_xattr_ibody_set() error path: dec_ref on a cache-found EA inode
  may leave i_nlink=1 if shared.

- ext4_xattr_ibody_set() success path: newly stored EA inode with
  i_nlink=1, just releasing the lookup reference.

- ext4_xattr_delete_inode() quota loop: iget for quota accounting only,
  no dec_ref called, i_nlink=1, jbd2 handle is active.

Sites where direct iput() is provably safe are left unchanged with a
comment: ext4_xattr_inode_create() error path (dec_ref guarantees
i_nlink=0, so eviction skips write_inode_now).

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/xattr.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 04e7f674340d..b8a2ccd0a958 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1079,6 +1079,13 @@ static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode)
 	return ext4_xattr_inode_update_ref(handle, ea_inode, 1);
 }
 
+/*
+ * Decrement on-disk reference count of an EA inode.  If refcount reaches 0,
+ * i_nlink is cleared and the inode is added to the orphan list.  Callers
+ * must use ext4_put_ea_inode() (not iput) to release the VFS reference
+ * afterwards, since iput on a nlink=0 inode triggers eviction which may
+ * deadlock if called under xattr_sem or an active jbd2 handle.
+ */
 static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode)
 {
 	return ext4_xattr_inode_update_ref(handle, ea_inode, -1);
@@ -1135,7 +1142,8 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
 		if (err)
 			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
 					   err);
-		iput(ea_inode);
+		/* i_nlink may remain 1 if shared; defer for !SB_ACTIVE safety */
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 	}
 	return saved_err;
 }
@@ -1507,6 +1515,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 			if (ext4_xattr_inode_dec_ref(handle, ea_inode))
 				ext4_warning_inode(ea_inode,
 					"cleanup dec ref error %d", err);
+			/* dec_ref set i_nlink=0; iput won't trigger write_inode_now */
 			iput(ea_inode);
 			return ERR_PTR(err);
 		}
@@ -1617,7 +1626,8 @@ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
 				      ea_inode->i_ino, true /* reusable */);
 	return ea_inode;
 out_err:
-	iput(ea_inode);
+	/* May be cache-found inode with i_nlink=1 (inc_ref failed) */
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	ext4_xattr_inode_free_quota(inode, NULL, value_len);
 	return ERR_PTR(err);
 }
@@ -1850,7 +1860,8 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 
 	ret = 0;
 out:
-	iput(old_ea_inode);
+	/* old_ea_inode had dec_ref; may still have i_nlink=1 if shared */
+	ext4_put_ea_inode(inode->i_sb, old_ea_inode);
 	return ret;
 }
 
@@ -2152,7 +2163,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 					ext4_warning_inode(ea_inode,
 							   "dec ref error=%d",
 							   error);
-				iput(ea_inode);
+				/* i_nlink stays 1 (inc_ref_all added a ref) */
+				ext4_put_ea_inode(inode->i_sb, ea_inode);
 				ea_inode = NULL;
 			}
 
@@ -2206,7 +2218,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
 		}
-		iput(ea_inode);
+		/* success: i_nlink=1; error+dec_ref: may still be 1 if shared */
+		ext4_put_ea_inode(inode->i_sb, ea_inode);
 	}
 	if (ce)
 		mb_cache_entry_put(ea_block_cache, ce);
@@ -2288,7 +2301,8 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
-			iput(ea_inode);
+			/* cache-found ea_inode may retain i_nlink=1 */
+			ext4_put_ea_inode(inode->i_sb, ea_inode);
 		}
 		return error;
 	}
@@ -2300,7 +2314,8 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 		header->h_magic = cpu_to_le32(0);
 		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
 	}
-	iput(ea_inode);
+	/* ea_inode has i_nlink=1 (new ref just stored in xattr entry) */
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	return 0;
 }
 
@@ -2989,7 +3004,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 					continue;
 				ext4_xattr_inode_free_quota(inode, ea_inode,
 					      le32_to_cpu(entry->e_value_size));
-				iput(ea_inode);
+				/* no dec_ref yet but i_nlink=1; handle is active */
+				ext4_put_ea_inode(inode->i_sb, ea_inode);
 			}
 
 		}
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 2/4] ext4: set EXT4_STATE_NO_EXPAND in ext4_evict_inode
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260616151558.1728881-1-yun.zhou@windriver.com>

An inode being evicted will never need its extra isize expanded.  Set
EXT4_STATE_NO_EXPAND before ext4_mark_inode_dirty() in ext4_evict_inode()
to make this explicit and prevent any unnecessary work in
ext4_try_to_expand_extra_isize().

This also provides defense-in-depth for the s_writepages_rwsem deadlock
during mount-time orphan cleanup, ensuring the expand path is blocked
for inodes under eviction regardless of how they are reached.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/inode.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 09dcfb6bf48c..1de0aaa28e63 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -264,6 +264,7 @@ void ext4_evict_inode(struct inode *inode)
 	if (ext4_inode_is_fast_symlink(inode))
 		memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
 	inode->i_size = 0;
+	ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND);
 	err = ext4_mark_inode_dirty(handle, inode);
 	if (err) {
 		ext4_warning(inode->i_sb,
-- 
2.43.0

^ permalink raw reply related

* [PATCH v7 1/4] ext4: skip extra isize expansion during mount to prevent deadlock
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260616151558.1728881-1-yun.zhou@windriver.com>

ext4_try_to_expand_extra_isize() is called from __ext4_mark_inode_dirty()
while holding an active jbd2 handle.  During mount (!SB_ACTIVE), the
expand path may move xattrs to external blocks and release ea_inodes via
iput().  When !SB_ACTIVE, iput() calls write_inode_now() which acquires
s_writepages_rwsem, creating a circular lock dependency:

  s_writepages_rwsem --> jbd2_handle --> xattr_sem --> s_writepages_rwsem

This can be triggered via:

  ext4_process_orphan() -> ext4_truncate() -> ext4_mark_inode_dirty()
    -> ext4_try_to_expand_extra_isize()

or:

  ext4_evict_inode() -> ext4_mark_inode_dirty()
    -> ext4_try_to_expand_extra_isize()

Skip expansion when !SB_ACTIVE.  This is a minor loss of functionality
(extra isize won't grow for these inodes during mount), which e2fsck
can resolve later if needed.

Reported-by: syzbot+5d19358d7eb30ffb0cc5@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=5d19358d7eb30ffb0cc5
Fixes: c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages")
Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/inode.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..09dcfb6bf48c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -6458,6 +6458,16 @@ static int ext4_try_to_expand_extra_isize(struct inode *inode,
 	if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
 		return -EOVERFLOW;
 
+	/*
+	 * Skip expansion during mount (!SB_ACTIVE).  Expanding extra isize
+	 * may move xattrs to external blocks and release ea_inodes via iput.
+	 * When !SB_ACTIVE, iput triggers write_inode_now() which acquires
+	 * s_writepages_rwsem, causing a deadlock with the caller's active
+	 * jbd2 handle (lock order: s_writepages_rwsem -> jbd2_handle).
+	 */
+	if (unlikely(!(inode->i_sb->s_flags & SB_ACTIVE)))
+		return -EBUSY;
+
 	/*
 	 * In nojournal mode, we can immediately attempt to expand
 	 * the inode.  When journaled, we first need to obtain extra
-- 
2.43.0


^ permalink raw reply related

* [PATCH v7 3/4] ext4: introduce ext4_put_ea_inode() for safe deferred iput
From: Yun Zhou @ 2026-06-16 15:15 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260616151558.1728881-1-yun.zhou@windriver.com>

Calling iput() on EA inodes while holding xattr_sem or a jbd2 handle
can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem,
creating a lock ordering issue during mount (!SB_ACTIVE).

Add ext4_put_ea_inode() which safely releases EA inode references:
when SB_ACTIVE, it calls iput() directly (write_inode_now cannot be
triggered); during mount (!SB_ACTIVE), it queues the inode on a per-sb
lock-free llist and schedules a worker to call iput() in a clean
context without holding any ext4 locks.

Convert the iput in ext4_xattr_block_set()'s "Drop the previous xattr
block" path to use ext4_xattr_inode_array_free_deferred(), which
releases EA inodes via ext4_put_ea_inode().  This path previously called
ext4_xattr_inode_array_free() (synchronous iput) while holding xattr_sem
and a jbd2 handle.

The worker is flushed in ext4_put_super() before journal destruction to
ensure all pending EA inode cleanup completes while the journal is still
available.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/ext4.h  |  5 ++++
 fs/ext4/super.c |  6 ++++
 fs/ext4/xattr.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/xattr.h |  2 ++
 4 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..690202303269 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1706,6 +1706,11 @@ struct ext4_sb_info {
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_ea_block_cache;
 	struct mb_cache *s_ea_inode_cache;
+
+	/* Deferred iput for EA inodes to avoid lock ordering issues */
+	struct llist_head s_ea_inode_to_free;
+	struct work_struct s_ea_inode_work;
+
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Journal triggers for checksum computation */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..b777bb0a81ea 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1308,6 +1308,9 @@ static void ext4_put_super(struct super_block *sb)
 	destroy_workqueue(sbi->rsv_conversion_wq);
 	ext4_release_orphan_info(sb);
 
+	/* Flush deferred EA inode iputs before destroying journal */
+	flush_work(&sbi->s_ea_inode_work);
+
 	if (sbi->s_journal) {
 		aborted = is_journal_aborted(sbi->s_journal);
 		err = ext4_journal_destroy(sbi, sbi->s_journal);
@@ -5535,6 +5538,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		needs_recovery = 0;
 	}
 
+	init_llist_head(&sbi->s_ea_inode_to_free);
+	INIT_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work);
+
 	if (!test_opt(sb, NO_MBCACHE)) {
 		sbi->s_ea_block_cache = ext4_xattr_create_cache();
 		if (!sbi->s_ea_block_cache) {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 982a1f831e22..04e7f674340d 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -117,6 +117,8 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+				struct ext4_xattr_inode_array *array);
 
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
@@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		ext4_xattr_release_block(handle, inode, bs->bh,
 					 &ea_inode_array,
 					 0 /* extra_credits */);
-		ext4_xattr_inode_array_free(ea_inode_array);
+		ext4_xattr_inode_array_free_deferred(inode->i_sb,
+						     ea_inode_array);
 	}
 	error = 0;
 
@@ -3025,6 +3028,74 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 	kfree(ea_inode_array);
 }
 
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+				struct ext4_xattr_inode_array *array)
+{
+	int idx;
+
+	if (array == NULL)
+		return;
+
+	for (idx = 0; idx < array->count; ++idx)
+		ext4_put_ea_inode(sb, array->inodes[idx]);
+	kfree(array);
+}
+
+struct ext4_ea_iput_entry {
+	struct llist_node node;
+	struct inode *inode;
+};
+
+/*
+ * Worker function for deferred EA inode iput.  Processes all inodes queued
+ * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
+ */
+void ext4_ea_inode_work(struct work_struct *work)
+{
+	struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info,
+						s_ea_inode_work);
+	struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free);
+	struct llist_node *next;
+
+	while (node) {
+		struct ext4_ea_iput_entry *entry = container_of(node,
+				struct ext4_ea_iput_entry, node);
+		next = node->next;
+		iput(entry->inode);
+		kfree(entry);
+		node = next;
+	}
+}
+
+/*
+ * Release a VFS reference on an EA inode after ext4_xattr_inode_dec_ref()
+ * may have set i_nlink=0.  Must be used instead of iput() in any context
+ * where xattr_sem or a jbd2 handle is held, because eviction of a nlink=0
+ * inode can acquire those same locks.
+ *
+ * When SB_ACTIVE, eviction does not call write_inode_now() so direct
+ * iput() is safe.  During mount (!SB_ACTIVE), defer to a workqueue.
+ *
+ * For EA inode references dropped without a preceding dec_ref (e.g.,
+ * lookup-only paths where nlink remains >= 1), plain iput() is safe
+ * and preferred.
+ */
+void ext4_put_ea_inode(struct super_block *sb, struct inode *inode)
+{
+	struct ext4_ea_iput_entry *entry;
+
+	if (!inode)
+		return;
+	if (sb->s_flags & SB_ACTIVE) {
+		iput(inode);
+		return;
+	}
+	entry = kmalloc(sizeof(*entry), GFP_NOFS | __GFP_NOFAIL);
+	entry->inode = inode;
+	llist_add(&entry->node, &EXT4_SB(sb)->s_ea_inode_to_free);
+	schedule_work(&EXT4_SB(sb)->s_ea_inode_work);
+}
+
 /*
  * ext4_xattr_block_cache_insert()
  *
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1fedf44d4fb6..52074537dce5 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -190,6 +190,8 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
 extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
+extern void ext4_ea_inode_work(struct work_struct *work);
+extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH RFC 2/8] fs: add a global device to super block hash table
From: Christian Brauner @ 2026-06-16 14:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jan Kara, Jens Axboe, Alexander Viro, linux-block, linux-kernel,
	linux-fsdevel, Carlos Maiolino, linux-xfs, Chris Mason,
	David Sterba, linux-btrfs, Theodore Ts'o, linux-ext4,
	Gao Xiang, linux-erofs
In-Reply-To: <20260616123443.GA21024@lst.de>

On Tue, Jun 16, 2026 at 02:34:43PM +0200, Christoph Hellwig wrote:
> On Tue, Jun 02, 2026 at 12:10:08PM +0200, Christian Brauner wrote:
> > fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> > forces the holder to be exactly one superblock and prevents several
> > superblocks from sharing one block device. That's what erofs is doing.
> > 
> > Introduce a global dev_t-keyed rhltable mapping each block device to the
> > superblock(s) using it. The holder argument becomes purely the block
> > layer's exclusivity token (a superblock, or a file_system_type for
> > shared devices) and is no longer needed by the fs specific callbacks.
> 
> Err, no.  block devices need to have a specific owner.  If erofs wants
> to share a device between superblocks it needs to come up with an entity
> that owns the block devices which is not a superblock.

It already did.

> IMHO sharing devices between superblocks is a bad idea. That ship has
> sailed, but please keep it contained inside of erofs.

We need a simple device number to superblock mapping anyway and that can
simply be centralized in the vfs. And it can work with anon device
numbers and block device numbers uniformly.

^ permalink raw reply

* Re: [PATCH v4 08/23] ext4: implement buffered write path using iomap
From: Zhang Yi @ 2026-06-16 14:42 UTC (permalink / raw)
  To: Jan Kara, Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, ojaswin, ritesh.list, djwong, hch, yi.zhang, yangerkun,
	yukuai
In-Reply-To: <xuxrr3ls4gttesznypeplbawvnxwa4mqcbt7fkpdtdn3cfcfrv@paiigv322v3n>

On 6/16/2026 6:45 PM, Jan Kara wrote:
> On Mon 11-05-26 15:23:28, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Introduce two new iomap_ops instances for ext4 buffered writes:
>>
>>   - ext4_iomap_buffered_da_write_ops: for delayed allocation mode, using
>>     ext4_da_map_blocks() to map delalloc extents.
>>   - ext4_iomap_buffered_write_ops: for non-delayed allocation mode, using
>>     ext4_iomap_get_blocks() to directly allocate blocks.
>>
>> Also add ext4_iomap_valid() for the iomap infrastructure to check extent
>> validity.
>>
>> Key changes and considerations:
>>
>>   - Unwritten extents for new blocks (dioread_nolock always on)
>>     Since data=ordered mode is not used to prevent stale data exposure in
>>     the non-delayed allocation path, new blocks are always allocated as
>>     unwritten extents.
>>
>>   - Short write and write failure handling
>>     a. Delalloc path: On short write or failure, the stale delalloc range
>>        must be dropped and its space reservation released. Otherwise, a
>>        clean folio may cover leftover delalloc extents, causing
>>        inaccurate space reservation accounting.
>>     b. Non-delalloc path: No cleanup of allocated blocks is needed on
>>        short write.
>>
>>   - Lock ordering reversal
>>     The folio lock and transaction start ordering is reversed compared to
>>     the buffer_head buffered write path. To handle this, the journal
>>     handle must be stopped in iomap_begin() callbacks. The lock ordering
>>     documentation in super.c has been updated accordingly.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> 
> Looks good to me - besides the IOMAP_F_NEW bugs Ojaswin found. One
> observation I have here is that since the old indirect block based on-disk
> format doesn't support unwritten extents we can never transition it to the
> iomap scheme used here. So we'll have to figure out some way to avoid
> maintaining two (actually three if we count data=journal) buffered write /
> writeback paths in the long term. But let's address that once things settle
> for the common paths.
> 
> 								Honza

Yes, I agree. In the future, we need to convert the buffered I/O path
to iomap as much as possible to reduce maintenance costs. For the ext3
filesystem, which does not support unwritten extents, I haven't given
much thought to a conversion plan at the moment. Perhaps we could
implement it through the "delay map" we discussed earlier in v2. After
this patch set is finished, we can take a closer look at the solution.
:-)

Best Regards,
Yi.

> 
>> ---
>>   fs/ext4/ext4.h  |   4 ++
>>   fs/ext4/file.c  |  20 +++++-
>>   fs/ext4/inode.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++-
>>   fs/ext4/super.c |  10 ++-
>>   4 files changed, 192 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1e27d73d7427..4832e7f7db82 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -3057,6 +3057,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>>   int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>>   				struct buffer_head *bh);
>>   void ext4_set_inode_mapping_order(struct inode *inode);
>> +int ext4_nonda_switch(struct super_block *sb);
>>   #define FALL_BACK_TO_NONDELALLOC 1
>>   #define CONVERT_INLINE_DATA	 2
>>   
>> @@ -3926,6 +3927,9 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
>>   
>>   extern const struct iomap_ops ext4_iomap_ops;
>>   extern const struct iomap_ops ext4_iomap_report_ops;
>> +extern const struct iomap_ops ext4_iomap_buffered_write_ops;
>> +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
>> +extern const struct iomap_write_ops ext4_iomap_write_ops;
>>   
>>   static inline int ext4_buffer_uptodate(struct buffer_head *bh)
>>   {
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index eb1a323962b1..7f9bfbbc4a4e 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -299,6 +299,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>>   	return count;
>>   }
>>   
>> +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
>> +					 struct iov_iter *from)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	const struct iomap_ops *iomap_ops;
>> +
>> +	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
>> +		iomap_ops = &ext4_iomap_buffered_da_write_ops;
>> +	else
>> +		iomap_ops = &ext4_iomap_buffered_write_ops;
>> +
>> +	return iomap_file_buffered_write(iocb, from, iomap_ops,
>> +					 &ext4_iomap_write_ops, NULL);
>> +}
>> +
>>   static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>>   					struct iov_iter *from)
>>   {
>> @@ -313,7 +328,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>>   	if (ret <= 0)
>>   		goto out;
>>   
>> -	ret = generic_perform_write(iocb, from);
>> +	if (ext4_inode_buffered_iomap(inode))
>> +		ret = ext4_iomap_buffered_write(iocb, from);
>> +	else
>> +		ret = generic_perform_write(iocb, from);
>>   
>>   out:
>>   	inode_unlock(inode);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 39577a6b65b9..1ae7d3f4a1c8 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -3097,7 +3097,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
>>   	return ret;
>>   }
>>   
>> -static int ext4_nonda_switch(struct super_block *sb)
>> +int ext4_nonda_switch(struct super_block *sb)
>>   {
>>   	s64 free_clusters, dirty_clusters;
>>   	struct ext4_sb_info *sbi = EXT4_SB(sb);
>> @@ -3467,6 +3467,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
>>   	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
>>   }
>>   
>> +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
>> +{
>> +	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
>> +}
>> +
>> +const struct iomap_write_ops ext4_iomap_write_ops = {
>> +	.iomap_valid = ext4_iomap_valid,
>> +};
>> +
>>   static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>>   			   struct ext4_map_blocks *map, loff_t offset,
>>   			   loff_t length, unsigned int flags)
>> @@ -3501,6 +3510,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>>   	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>>   		iomap->flags |= IOMAP_F_MERGED;
>>   
>> +	iomap->validity_cookie = map->m_seq;
>> +
>>   	/*
>>   	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
>>   	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
>> @@ -3908,8 +3919,12 @@ const struct iomap_ops ext4_iomap_report_ops = {
>>   	.iomap_begin = ext4_iomap_begin_report,
>>   };
>>   
>> +/* Map blocks */
>> +typedef int (ext4_get_blocks_t)(struct inode *, struct ext4_map_blocks *);
>> +
>>   static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
>> -		loff_t length, struct ext4_map_blocks *map)
>> +		loff_t length, ext4_get_blocks_t get_blocks,
>> +		struct ext4_map_blocks *map)
>>   {
>>   	u8 blkbits = inode->i_blkbits;
>>   
>> @@ -3921,6 +3936,9 @@ static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
>>   	map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>>   			   EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
>>   
>> +	if (get_blocks)
>> +		return get_blocks(inode, map);
>> +
>>   	return ext4_map_blocks(NULL, inode, map, 0);
>>   }
>>   
>> @@ -3938,7 +3956,7 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
>>   	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>>   		return -ERANGE;
>>   
>> -	ret = ext4_iomap_map_blocks(inode, offset, length, &map);
>> +	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
>>   	if (ret < 0)
>>   		return ret;
>>   
>> @@ -3946,6 +3964,147 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
>>   	return 0;
>>   }
>>   
>> +static int ext4_iomap_get_blocks(struct inode *inode,
>> +				 struct ext4_map_blocks *map)
>> +{
>> +	loff_t i_size = i_size_read(inode);
>> +	handle_t *handle;
>> +	int ret;
>> +
>> +	/*
>> +	 * Check if the blocks have already been allocated, this could
>> +	 * avoid initiating a new journal transaction and return the
>> +	 * mapping information directly.
>> +	 */
>> +	if ((map->m_lblk + map->m_len) <=
>> +	    round_up(i_size, i_blocksize(inode)) >> inode->i_blkbits) {
>> +		ret = ext4_map_blocks(NULL, inode, map, 0);
>> +		if (ret < 0)
>> +			return ret;
>> +		if (map->m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN |
>> +				    EXT4_MAP_DELAYED))
>> +			return 0;
>> +	}
>> +
>> +	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
>> +			ext4_chunk_trans_blocks(inode, map->m_len));
>> +	if (IS_ERR(handle))
>> +		return PTR_ERR(handle);
>> +
>> +	ret = ext4_map_blocks(handle, inode, map,
>> +			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
>> +	/*
>> +	 * Stop handle here following the lock ordering of the folio lock
>> +	 * and the transaction start.
>> +	 */
>> +	ext4_journal_stop(handle);
>> +
>> +	return ret;
>> +}
>> +
>> +static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap, bool delalloc)
>> +{
>> +	int ret, retries = 0;
>> +	struct ext4_map_blocks map;
>> +	ext4_get_blocks_t *get_blocks;
>> +
>> +	ret = ext4_emergency_state(inode->i_sb);
>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	/* Inline data and non-extent are not supported. */
>> +	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>> +		return -ERANGE;
>> +	if (WARN_ON_ONCE(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
>> +		return -EINVAL;
>> +	if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
>> +		return -EINVAL;
>> +
>> +	if (delalloc)
>> +		get_blocks = ext4_da_map_blocks;
>> +	else
>> +		get_blocks = ext4_iomap_get_blocks;
>> +retry:
>> +	ret = ext4_iomap_map_blocks(inode, offset, length, get_blocks, &map);
>> +	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
>> +		goto retry;
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
>> +	return 0;
>> +}
>> +
>> +static int ext4_iomap_buffered_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap)
>> +{
>> +	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
>> +						  iomap, srcmap, false);
>> +}
>> +
>> +static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap)
>> +{
>> +	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
>> +						  iomap, srcmap, true);
>> +}
>> +
>> +/*
>> + * On write failure, drop the stale delayed allocation range and release
>> + * its reserved space for both start and end blocks. Otherwise, we may
>> + * leave a range of delayed extents covered by a clean folio, which can
>> + * result in inaccurate space reservation accounting.
>> + */
>> +static void ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
>> +				     loff_t length, struct iomap *iomap)
>> +{
>> +	down_write(&EXT4_I(inode)->i_data_sem);
>> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
>> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
>> +	up_write(&EXT4_I(inode)->i_data_sem);
>> +}
>> +
>> +static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
>> +					    loff_t length, ssize_t written,
>> +					    unsigned int flags,
>> +					    struct iomap *iomap)
>> +{
>> +	loff_t start_byte, end_byte;
>> +
>> +	/* If we didn't reserve the blocks, we're not allowed to punch them. */
>> +	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
>> +		return 0;
>> +
>> +	/* Nothing to do if we've written the entire delalloc extent */
>> +	start_byte = iomap_last_written_block(inode, offset, written);
>> +	end_byte = round_up(offset + length, i_blocksize(inode));
>> +	if (start_byte >= end_byte)
>> +		return 0;
>> +
>> +	filemap_invalidate_lock(inode->i_mapping);
>> +	iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
>> +				     iomap, ext4_iomap_punch_delalloc);
>> +	filemap_invalidate_unlock(inode->i_mapping);
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Since we always allocate unwritten extents, there is no need for
>> + * iomap_end to clean up allocated blocks on a short write.
>> + */
>> +const struct iomap_ops ext4_iomap_buffered_write_ops = {
>> +	.iomap_begin = ext4_iomap_buffered_write_begin,
>> +};
>> +
>> +const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
>> +	.iomap_begin = ext4_iomap_buffered_da_write_begin,
>> +	.iomap_end = ext4_iomap_buffered_da_write_end,
>> +};
>> +
>>   const struct iomap_ops ext4_iomap_buffered_read_ops = {
>>   	.iomap_begin = ext4_iomap_buffered_read_begin,
>>   };
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 6a77db4d3124..9bc294b769db 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -104,9 +104,13 @@ static const struct fs_parameter_spec ext4_param_specs[];
>>    *   -> page lock -> i_data_sem (rw)
>>    *
>>    * buffered write path:
>> - * sb_start_write -> i_mutex -> mmap_lock
>> - * sb_start_write -> i_mutex -> transaction start -> page lock ->
>> - *   i_data_sem (rw)
>> + * sb_start_write -> i_rwsem (w) -> mmap_lock
>> + * - buffer_head path:
>> + *   sb_start_write -> i_rwsem (w) -> transaction start -> folio lock ->
>> + *     i_data_sem (rw)
>> + * - iomap path:
>> + *   sb_start_write -> i_rwsem (w) -> transaction start -> i_data_sem (rw)
>> + *   sb_start_write -> i_rwsem (w) -> folio lock (not under an active handle)
>>    *
>>    * truncate:
>>    * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
>> -- 
>> 2.52.0
>>


^ permalink raw reply

* [PATCH RFC v2 18/18] selftests/filesystems: add ustat() coverage
From: Christian Brauner @ 2026-06-16 14:08 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Alexander Viro, linux-block,
	linux-kernel, linux-fsdevel, Carlos Maiolino, linux-xfs,
	Chris Mason, David Sterba, linux-btrfs, Theodore Ts'o,
	linux-ext4, Gao Xiang, linux-erofs, Christian Brauner (Amutable)
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

user_get_super() is now backed by the global device-to-superblock table
instead of a walk of the super_blocks list. ustat(2) is its most direct
user-visible consumer but nothing in the tree exercises it.

Add a small regression test: the device number of a mounted tmpfs (an
anonymous device, registered in the table by sget_fc()) must resolve,
it must stop resolving after the unmount (the entry is dropped again in
kill_super_notify()), and bogus device numbers keep reporting EINVAL.

The test passes on kernels before the conversion: it pins down the
semantics the table-backed lookup must preserve.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 tools/testing/selftests/filesystems/.gitignore   |   1 +
 tools/testing/selftests/filesystems/Makefile     |   2 +-
 tools/testing/selftests/filesystems/ustat_test.c | 135 +++++++++++++++++++++++
 3 files changed, 137 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/filesystems/.gitignore b/tools/testing/selftests/filesystems/.gitignore
index 64ac0dfa46b7..1bd53d54553c 100644
--- a/tools/testing/selftests/filesystems/.gitignore
+++ b/tools/testing/selftests/filesystems/.gitignore
@@ -5,3 +5,4 @@ fclog
 file_stressor
 anon_inode_test
 kernfs_test
+ustat_test
diff --git a/tools/testing/selftests/filesystems/Makefile b/tools/testing/selftests/filesystems/Makefile
index 85427d7f19b9..bbdd40b167fa 100644
--- a/tools/testing/selftests/filesystems/Makefile
+++ b/tools/testing/selftests/filesystems/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 CFLAGS += $(KHDR_INCLUDES)
-TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog
+TEST_GEN_PROGS := devpts_pts file_stressor anon_inode_test kernfs_test fclog ustat_test
 TEST_GEN_PROGS_EXTENDED := dnotify_test
 
 include ../lib.mk
diff --git a/tools/testing/selftests/filesystems/ustat_test.c b/tools/testing/selftests/filesystems/ustat_test.c
new file mode 100644
index 000000000000..d429fd18d779
--- /dev/null
+++ b/tools/testing/selftests/filesystems/ustat_test.c
@@ -0,0 +1,135 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test ustat(2): looking up superblocks by device number.
+ *
+ * ustat() resolves a device number to a mounted superblock via
+ * user_get_super(). Check that the device number of a mounted tmpfs (an
+ * anonymous device) resolves, that it stops resolving once the filesystem
+ * is unmounted and that bogus device numbers report EINVAL.
+ */
+#define _GNU_SOURCE
+#include <errno.h>
+#include <fcntl.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#include "../kselftest_harness.h"
+
+/* struct ustat is not exported through UAPI, mirror include/linux/types.h. */
+struct ustat_buf {
+	int		f_tfree;
+	unsigned long	f_tinode;
+	char		f_fname[6];
+	char		f_fpack[6];
+	/* slack in case an architecture lays the struct out differently */
+	char		pad[64];
+};
+
+#ifdef __NR_ustat
+
+/*
+ * The kernel decodes @dev with new_decode_dev(), which matches the low 32
+ * bits of the st_dev encoding stat(2) returns for any major below 4096.
+ */
+static int sys_ustat(unsigned int dev, struct ustat_buf *buf)
+{
+	return syscall(__NR_ustat, dev, buf);
+}
+
+static int write_string(const char *path, const char *string)
+{
+	ssize_t len = strlen(string);
+	int fd;
+
+	fd = open(path, O_WRONLY);
+	if (fd < 0)
+		return -1;
+	if (write(fd, string, len) != len) {
+		close(fd);
+		return -1;
+	}
+	return close(fd);
+}
+
+/* Enter namespaces in which mounting a tmpfs instance is allowed. */
+static int setup_namespaces(void)
+{
+	uid_t uid = getuid();
+	gid_t gid = getgid();
+	char map[64];
+
+	if (unshare(CLONE_NEWNS | (uid ? CLONE_NEWUSER : 0)))
+		return -1;
+
+	if (uid) {
+		if (write_string("/proc/self/setgroups", "deny"))
+			return -1;
+		snprintf(map, sizeof(map), "0 %d 1", uid);
+		if (write_string("/proc/self/uid_map", map))
+			return -1;
+		snprintf(map, sizeof(map), "0 %d 1", gid);
+		if (write_string("/proc/self/gid_map", map))
+			return -1;
+	}
+
+	return mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
+}
+
+TEST(resolves_mounted_superblock)
+{
+	char dir[] = "/tmp/ustat_test.XXXXXX";
+	struct ustat_buf ub;
+	struct stat st;
+
+	ASSERT_NE(NULL, mkdtemp(dir));
+
+	if (setup_namespaces()) {
+		rmdir(dir);
+		SKIP(return, "cannot set up namespaces: %s", strerror(errno));
+	}
+
+	ASSERT_EQ(0, mount("ustat_test", dir, "tmpfs", 0, NULL));
+	ASSERT_EQ(0, stat(dir, &st));
+
+	memset(&ub, 0xff, sizeof(ub));
+	ASSERT_EQ(0, sys_ustat(st.st_dev, &ub))
+		TH_LOG("ustat(%u): %s", (unsigned int)st.st_dev,
+		       strerror(errno));
+
+	ASSERT_EQ(0, umount(dir));
+
+	/* The unmount removed the superblock, the device is gone. */
+	ASSERT_EQ(-1, sys_ustat(st.st_dev, &ub));
+	ASSERT_EQ(EINVAL, errno);
+
+	rmdir(dir);
+}
+
+TEST(bogus_device_numbers)
+{
+	struct ustat_buf ub;
+
+	ASSERT_EQ(-1, sys_ustat(0, &ub));
+	ASSERT_EQ(EINVAL, errno);
+
+	/* major 4095, minor 1048575: nothing plausible lives there */
+	ASSERT_EQ(-1, sys_ustat((0xfffu << 8) | 0xffu | (0xfff00u << 12), &ub));
+	ASSERT_EQ(EINVAL, errno);
+}
+
+#else /* !__NR_ustat */
+
+TEST(unsupported)
+{
+	SKIP(return, "ustat(2) is not available on this architecture");
+}
+
+#endif /* __NR_ustat */
+
+TEST_HARNESS_MAIN

-- 
2.47.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox