Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* Re: [PATCH v4 14/23] ext4: implement partial block zero range path using iomap
From: Ojaswin Mujoo @ 2026-05-27 13:13 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-15-yi.zhang@huaweicloud.com>

On Mon, May 11, 2026 at 03:23:34PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Introduce a new iomap_ops instance, ext4_iomap_zero_ops, along with
> ext4_iomap_block_zero_range() to implement block zeroing via the iomap
> infrastructure for ext4.
> 
> ext4_iomap_block_zero_range() calls iomap_zero_range() with
> ext4_iomap_zero_begin() as the callback. The callback locates and zeros
> out either a mapped partial block or a dirty, unwritten partial block.
> 
> Important constraints:
> 
> Zeroing out under an active journal handle can cause deadlock, because
> the order of acquiring the folio lock and starting a handle is
> inconsistent with the iomap writeback path.
> 
> Therefore, ext4_iomap_block_zero_range():
> - Must NOT be called under an active handle.
> - Cannot rely on data=ordered mode to ensure zeroed data persistence
>   before updating i_disksize (for the cases of post-EOF append write,
>   post-EOF fallocate, and truncate up). In subsequent patches, we will
>   address this by synchronizing commit I/O but not waiting for
>   completion, and updating i_disksize to i_size only after the zeroed
>   data has been written back.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good in itself. Feel free to add:

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Regards,
Ojaswin

> ---
>  fs/ext4/inode.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 92 insertions(+)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index c6fe42d012fc..e0dae2501292 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4101,6 +4101,51 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
>  	return 0;
>  }
>  
> +static int ext4_iomap_zero_begin(struct inode *inode,
> +		loff_t offset, loff_t length, unsigned int flags,
> +		struct iomap *iomap, struct iomap *srcmap)
> +{
> +	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
> +	struct ext4_map_blocks map;
> +	u8 blkbits = inode->i_blkbits;
> +	unsigned int iomap_flags = 0;
> +	int ret;
> +
> +	ret = ext4_emergency_state(inode->i_sb);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	if (WARN_ON_ONCE(!(flags & IOMAP_ZERO)))
> +		return -EINVAL;
> +
> +	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
> +	if (ret < 0)
> +		return ret;
> +
> +	/*
> +	 * Look up dirty folios for unwritten mappings within EOF. Providing
> +	 * this bypasses the flush iomap uses to trigger extent conversion
> +	 * when unwritten mappings have dirty pagecache in need of zeroing.
> +	 */
> +	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
> +		loff_t start = ((loff_t)map.m_lblk) << blkbits;
> +		loff_t end = ((loff_t)map.m_lblk + map.m_len) << blkbits;
> +
> +		iomap_fill_dirty_folios(iter, &start, end, &iomap_flags);
> +		if ((start >> blkbits) < map.m_lblk + map.m_len)
> +			map.m_len = (start >> blkbits) - map.m_lblk;
> +	}
> +
> +	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
> +	iomap->flags |= iomap_flags;
> +
> +	return 0;
> +}
> +
> +static const struct iomap_ops ext4_iomap_zero_ops = {
> +	.iomap_begin = ext4_iomap_zero_begin,
> +};
> +
>  /*
>   * Since we always allocate unwritten extents, there is no need for
>   * iomap_end to clean up allocated blocks on a short write.
> @@ -4616,6 +4661,47 @@ static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from,
>  	return err;
>  }
>  
> +static int ext4_block_iomap_zero_range(struct inode *inode, loff_t from,
> +				       loff_t length, bool *did_zero,
> +				       bool *zero_written)
> +{
> +	int ret;
> +
> +	/*
> +	 * Zeroing out under an active handle can cause deadlock since
> +	 * the order of acquiring the folio lock and starting a handle is
> +	 * inconsistent with the iomap writeback procedure.
> +	 */
> +	if (WARN_ON_ONCE(ext4_handle_valid(journal_current_handle())))
> +		return -EINVAL;
> +
> +	/* The zeroing scope should not extend across a block. */
> +	if (WARN_ON_ONCE((from >> inode->i_blkbits) !=
> +			 ((from + length - 1) >> inode->i_blkbits)))
> +		return -EINVAL;
> +
> +	if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS) &&
> +	    !(inode_state_read_once(inode) & (I_NEW | I_FREEING)))
> +		WARN_ON_ONCE(!inode_is_locked(inode) &&
> +			!rwsem_is_locked(&inode->i_mapping->invalidate_lock));
> +
> +	ret = iomap_zero_range(inode, from, length, did_zero,
> +			       &ext4_iomap_zero_ops, &ext4_iomap_write_ops,
> +			       NULL);
> +	if (ret)
> +		return ret;
> +
> +	/*
> +	 * TODO: The iomap does not distinguish between different types of
> +	 * zeroing and always sets zero_written if a zeroing operation is
> +	 * performed, which may result in unnecessary order operations.
> +	 */
> +	if (did_zero && zero_written)
> +		*zero_written = *did_zero;
> +
> +	return 0;
> +}
> +
>  /*
>   * Zeros out a mapping of length 'length' starting from file offset
>   * 'from'.  The range to be zero'd must be contained with in one block.
> @@ -4642,6 +4728,9 @@ static int ext4_block_zero_range(struct inode *inode,
>  	} else if (ext4_should_journal_data(inode)) {
>  		return ext4_block_journalled_zero_range(inode, from, length,
>  							did_zero);
> +	} else if (ext4_inode_buffered_iomap(inode)) {
> +		return ext4_block_iomap_zero_range(inode, from, length,
> +						   did_zero, zero_written);
>  	}
>  	return ext4_block_do_zero_range(inode, from, length, did_zero,
>  					zero_written);
> @@ -4682,6 +4771,9 @@ int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
>  	 * truncating up or performing an append write, because there might be
>  	 * exposing stale on-disk data which may caused by concurrent post-EOF
>  	 * mmap write during folio writeback.
> +	 *
> +	 * TODO: In the iomap path, handle this by updating i_disksize to
> +	 * i_size after the zeroed data has been written back.
>  	 */
>  	if (ext4_should_order_data(inode) &&
>  	    did_zero && zero_written && !IS_DAX(inode)) {
> -- 
> 2.52.0
> 

^ permalink raw reply

* Re: [PATCH v4 10/23] ext4: implement mmap path using iomap
From: Ojaswin Mujoo @ 2026-05-27 12:56 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-11-yi.zhang@huaweicloud.com>

On Mon, May 11, 2026 at 03:23:30PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Introduce ext4_iomap_page_mkwrite() to implement the mmap iomap path
> for ext4. The heavy lifting is delegated to iomap_page_mkwrite(), which
> only requires ext4_iomap_buffered_write_ops and
> ext4_iomap_buffered_da_write_ops to allocate and map blocks.
> 
> Note that the lock ordering between folio lock and transaction start in
> this path is reversed compared to the buffer_head buffered write path.
> The lock ordering documentation in super.c has been updated accordingly.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good, feel free to add:
Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Regards,
Ojaswin

> ---
>  fs/ext4/inode.c | 32 +++++++++++++++++++++++++++++++-
>  fs/ext4/super.c |  8 ++++++--
>  2 files changed, 37 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index a80195bd6f20..c6fe42d012fc 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4020,7 +4020,7 @@ static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
>  		return -ERANGE;
>  	if (WARN_ON_ONCE(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
>  		return -EINVAL;
> -	if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
> +	if (WARN_ON_ONCE(!(flags & (IOMAP_WRITE | IOMAP_FAULT))))
>  		return -EINVAL;
>  
>  	if (delalloc)
> @@ -4080,6 +4080,14 @@ static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
>  	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
>  		return 0;
>  
> +	/*
> +	 * iomap_page_mkwrite() will never fail in a way that requires delalloc
> +	 * extents that it allocated to be revoked.  Hence never try to release
> +	 * them here.
> +	 */
> +	if (flags & IOMAP_FAULT)
> +		return 0;
> +
>  	/* Nothing to do if we've written the entire delalloc extent */
>  	start_byte = iomap_last_written_block(inode, offset, written);
>  	end_byte = round_up(offset + length, i_blocksize(inode));
> @@ -7191,6 +7199,23 @@ static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio,
>  	return ret;
>  }
>  
> +static vm_fault_t ext4_iomap_page_mkwrite(struct vm_fault *vmf)
> +{
> +	struct inode *inode = file_inode(vmf->vma->vm_file);
> +	const struct iomap_ops *iomap_ops;
> +
> +	/*
> +	 * ext4_nonda_switch() could writeback this folio, so have to
> +	 * call it before lock folio.
> +	 */
> +	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
> +		iomap_ops = &ext4_iomap_buffered_da_write_ops;
> +	else
> +		iomap_ops = &ext4_iomap_buffered_write_ops;
> +
> +	return iomap_page_mkwrite(vmf, iomap_ops, NULL);
> +}
> +
>  vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> @@ -7213,6 +7238,11 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
>  
>  	filemap_invalidate_lock_shared(mapping);
>  
> +	if (ext4_inode_buffered_iomap(inode)) {
> +		ret = ext4_iomap_page_mkwrite(vmf);
> +		goto out;
> +	}
> +
>  	err = ext4_convert_inline_data(inode);
>  	if (err)
>  		goto out_ret;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 51d87db53543..62bfe05a64bc 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -100,8 +100,12 @@ static const struct fs_parameter_spec ext4_param_specs[];
>   * Lock ordering
>   *
>   * page fault path:
> - * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
> - *   -> page lock -> i_data_sem (rw)
> + * - buffer_head path:
> + *   mmap_lock -> sb_start_pagefault -> invalidate_lock (r) ->
> + *     transaction start -> folio lock -> i_data_sem (rw)
> + * - iomap path:
> + *   mmap_lock -> sb_start_pagefault -> invalidate_lock (r) ->
> + *     folio lock -> transaction start -> i_data_sem (rw)
>   *
>   * buffered write path:
>   * sb_start_write -> i_rwsem (w) -> mmap_lock
> -- 
> 2.52.0
> 

^ permalink raw reply

* Re: [PATCH v4 09/23] ext4: implement writeback path using iomap
From: Ojaswin Mujoo @ 2026-05-27 12:49 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260511072344.191271-10-yi.zhang@huaweicloud.com>

On Mon, May 11, 2026 at 03:23:29PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> Add the iomap writeback path for ext4 buffered I/O. This introduces:
> 
>  - ext4_iomap_writepages(): the main writeback entry point.
>  - ext4_writeback_ops: a new iomap_writeback_ops instance to handle
>    block mapping and I/O submission.
>  - A new end I/O worker for converting unwritten extents, updating file
>    size, and handling DATA_ERR_ABORT after I/O completion.
> 
> Core implementation details:
> 
>  - ->writeback_range() callback
>    Calls ext4_iomap_map_writeback_range() to query the longest range of
>    existing mapped extents. For performance, when a block range is not
>    yet allocated, it allocates based on the writeback length and delalloc
>    extent length, rather than allocating for a single folio at a time.
>    The folio is then added to an iomap_ioend instance.
> 
>  - ->writeback_submit() callback
>    Registers ext4_iomap_end_bio() as the end bio callback. This callback
>    schedules a worker to handle:
>    - Unwritten extent conversion.
>    - i_disksize update after data is written back.
>    - Journal abort on writeback I/O failure.

Hi Zhang, the changes look good. I have a few comments below:
> 
> Key changes and considerations:
> 
> - Append write and unwritten extents
>   Since data=ordered mode is not used to prevent stale data exposure
>   during append writebacks, new blocks are always allocated as unwritten
>   extents (i.e. always enable dioread_nolock), and i_disksize update is
>   postponed until I/O completion. 

Makes sense.

>   Additionally, the deadlock that the
>   reserve handle was expected to resolve does not occur anymore.

I guess this is because we don't use ordered data, so we can't block on
starting a txn in end io.

>   Therefore, the end I/O worker can start a normal journal handle
>   instead of a reserve handle when converting unwritten extents.
> 
> - Lock ordering
>   The ->writeback_range() callback runs under the folio lock, requiring
>   the journal handle to be started under that same lock. This reverses
>   the order compared to the buffer_head writeback path. The lock ordering
>   documentation in super.c has been updated accordingly.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/ext4/ext4.h        |   4 +
>  fs/ext4/inode.c       | 208 +++++++++++++++++++++++++++++++++++++++++-
>  fs/ext4/page-io.c     | 126 +++++++++++++++++++++++++
>  fs/ext4/super.c       |   7 +-
>  fs/iomap/ioend.c      |   3 +-
>  include/linux/iomap.h |   1 +
>  6 files changed, 346 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 4832e7f7db82..078feda47e36 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1173,6 +1173,8 @@ struct ext4_inode_info {
>  	 */
>  	struct list_head i_rsv_conversion_list;
>  	struct work_struct i_rsv_conversion_work;
> +	struct list_head i_iomap_ioend_list;
> +	struct work_struct i_iomap_ioend_work;
>  
>  	/*
>  	 * Transactions that contain inode's metadata needed to complete
> @@ -3870,6 +3872,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
>  		size_t len);
>  extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
>  extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
> +extern void ext4_iomap_end_io(struct work_struct *work);
> +extern void ext4_iomap_end_bio(struct bio *bio);
>  
>  /* mmp.c */
>  extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 1ae7d3f4a1c8..a80195bd6f20 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -44,6 +44,7 @@
>  #include <linux/iversion.h>
>  
>  #include "ext4_jbd2.h"
> +#include "ext4_extents.h"
>  #include "xattr.h"
>  #include "acl.h"
>  #include "truncate.h"
> @@ -4120,10 +4121,215 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
>  	iomap_bio_readahead(rac, &ext4_iomap_buffered_read_ops);
>  }
>  
> +static int ext4_iomap_map_one_extent(struct inode *inode,
> +				     struct ext4_map_blocks *map)
> +{
> +	struct extent_status es;
> +	handle_t *handle = NULL;
> +	int credits, map_flags;
> +	int retval;
> +
> +	credits = ext4_chunk_trans_blocks(inode, map->m_len);
> +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
> +	if (IS_ERR(handle))
> +		return PTR_ERR(handle);
> +
> +	map->m_flags = 0;
> +	/*
> +	 * It is necessary to look up extent and map blocks under i_data_sem
> +	 * in write mode, otherwise, the delalloc extent may become stale
> +	 * during concurrent truncate operations.
> +	 */
> +	ext4_fc_track_inode(handle, inode);
> +	down_write(&EXT4_I(inode)->i_data_sem);
> +	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
> +		retval = es.es_len - (map->m_lblk - es.es_lblk);
> +		map->m_len = min_t(unsigned int, retval, map->m_len);
> +
> +		if (ext4_es_is_delayed(&es)) {

I understand that it is okay for us to rely on extent status ==
delayed here because we never reclaim delayed es entries and hence we
are sure to not skip any delayed block allocations here.

> +			map->m_flags |= EXT4_MAP_DELAYED;
> +			trace_ext4_da_write_pages_extent(inode, map);
> +			/*
> +			 * Call ext4_map_create_blocks() to allocate any
> +			 * delayed allocation blocks. It is possible that
> +			 * we're going to need more metadata blocks, however
> +			 * we must not fail because we're in writeback and
> +			 * there is nothing we can do so it might result in
> +			 * data loss. So use reserved blocks to allocate
> +			 * metadata if possible.
> +			 */
> +			map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
> +				    EXT4_GET_BLOCKS_METADATA_NOFAIL |
> +				    EXT4_EX_NOCACHE;
> +
> +			retval = ext4_map_create_blocks(handle, inode, map,
> +							map_flags);
> +			if (retval > 0)
> +				ext4_fc_track_range(handle, inode, map->m_lblk,
> +						map->m_lblk + map->m_len - 1);
> +			goto out;
> +		} else if (unlikely(ext4_es_is_hole(&es)))

Now that you've fixed the partial invalidate in iomap (patch 12/23)
can we still hit this hole case? 

> +			goto out;
> +
> +		/* Found written or unwritten extent. */
> +		map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
> +		map->m_flags = ext4_es_is_written(&es) ?
> +			       EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
> +		goto out;
> +	}
> +
> +	retval = ext4_map_query_blocks(handle, inode, map, EXT4_EX_NOCACHE);
> +out:
> +	up_write(&EXT4_I(inode)->i_data_sem);
> +	ext4_journal_stop(handle);
> +	return retval < 0 ? retval : 0;
> +}
> +
> +static int ext4_iomap_map_writeback_range(struct iomap_writepage_ctx *wpc,
> +					  loff_t offset, unsigned int dirty_len)
> +{
> +	struct inode *inode = wpc->inode;
> +	struct super_block *sb = inode->i_sb;
> +	struct journal_s *journal = EXT4_SB(sb)->s_journal;
> +	struct ext4_map_blocks map;
> +	unsigned int blkbits = inode->i_blkbits;
> +	unsigned int index = offset >> blkbits;
> +	unsigned int blk_end, blk_len;
> +	int ret;
> +
> +	ret = ext4_emergency_state(sb);
> +	if (unlikely(ret))
> +		return ret;
> +
> +	/* Check validity of the cached writeback mapping. */
> +	if (offset >= wpc->iomap.offset &&
> +	    offset < wpc->iomap.offset + wpc->iomap.length &&
> +	    ext4_iomap_valid(inode, &wpc->iomap))
> +		return 0;
> +
> +	blk_len = dirty_len >> blkbits;
> +	blk_end = min_t(unsigned int, (wpc->wbc->range_end >> blkbits),
> +				      (UINT_MAX - 1));

This is an interesting idea. I'm just a bit worried about the case where
range_end == LLONG_MAX (bg flush): we will always be trying to allocate
MAX_WRITEPAGES, and in case of a slightly fragmented FS, we might keep
falling into slower mballoc criteria and might waste a lot of time
scanning the groups.

> +	if (blk_end > index + blk_len)
> +		blk_len = blk_end - index + 1;
> +
> +retry:
> +	map.m_lblk = index;
> +	map.m_len = min_t(unsigned int, MAX_WRITEPAGES_EXTENT_LEN, blk_len);
> +	ret = ext4_map_blocks(NULL, inode, &map,
> +			      EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE);

Do we really need the IO_SUBMIT flag here now that we are:
1. Not using ordered data
2. We anyways don't use it in ext4_iomap_map_one_extent().

I think we can drop it.

> +	if (ret < 0)
> +		return ret;
> +
> +	/*
> +	 * The map is not a delalloc extent, it must either be a hole
> +	 * or an extent which have already been allocated.
> +	 */
> +	if (!(map.m_flags & EXT4_MAP_DELAYED))
> +		goto out;
> +
> +	/* Map one delalloc extent. */
> +	ret = ext4_iomap_map_one_extent(inode, &map);
> +	if (ret < 0) {
> +		if (ext4_emergency_state(sb))
> +			return ret;
> +
> +		/*
> +		 * Retry transient ENOSPC errors, if
> +		 * ext4_count_free_blocks() is non-zero, a commit
> +		 * should free up blocks.
> +		 */
> +		if (ret == -ENOSPC && journal && ext4_count_free_clusters(sb)) {
> +			jbd2_journal_force_commit_nested(journal);
> +			goto retry;
> +		}
> +
> +		ext4_msg(sb, KERN_CRIT,
> +			 "Delayed block allocation failed for inode %llu at logical offset %llu with max blocks %u with error %d",
> +			 inode->i_ino, (unsigned long long)map.m_lblk,
> +			 (unsigned int)map.m_len, -ret);
> +		ext4_msg(sb, KERN_CRIT,
> +			 "This should not happen!! Data will be lost\n");
> +		if (ret == -ENOSPC)
> +			ext4_print_free_blocks(inode);
> +		return ret;
> +	}
> +out:
> +	ext4_set_iomap(inode, &wpc->iomap, &map, offset, dirty_len, 0);
> +	return 0;
> +}
> +

<snip>
> 

^ permalink raw reply

* Re: [PATCH 00/17] fs: replace __get_free_pages() call with kmalloc()
From: Christian Brauner @ 2026-05-27 12:05 UTC (permalink / raw)
  To: Jan Kara, Mark Fasheh, Joel Becker, Joseph Qi, Ryusuke Konishi,
	Viacheslav Dubeyko, Trond Myklebust, Anna Schumaker, Chuck Lever,
	Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Alexander Viro, Jan Kara, Dave Kleikamp, Theodore Ts'o,
	Miklos Szeredi, Andreas Hindborg, Breno Leitao, Kees Cook,
	Tigran A. Aivazian, Mike Rapoport (Microsoft)
  Cc: Christian Brauner, linux-kernel, linux-fsdevel, ocfs2-devel,
	linux-nilfs, linux-nfs, jfs-discussion, linux-ext4, linux-mm
In-Reply-To: <20260523-b4-fs-v1-0-275e36a83f0e@kernel.org>

On Sat, 23 May 2026 20:54:12 +0300, Mike Rapoport (Microsoft) wrote:
> This is a (small) part of larger work of replacing page allocator calls
> with kmalloc.
> 
> Also in git:
> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git gfp-to-kmalloc/fs
> 
> 
> [...]

Applied to the vfs-7.2.misc branch of the vfs/vfs.git tree.
Patches in the vfs-7.2.misc branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.2.misc

[01/17] quota: allocate dquot_hash with kmalloc()
        https://git.kernel.org/vfs/vfs/c/c94d1fa0af45
[02/17] proc: replace __get_free_page() with kmalloc()
        https://git.kernel.org/vfs/vfs/c/3c849e5fe1db
[03/17] ocfs2/dlm: replace __get_free_page() with kmalloc()
        https://git.kernel.org/vfs/vfs/c/40b7e5db6a25
[04/17] nilfs2: replace get_zeroed_page() with kzalloc()
        https://git.kernel.org/vfs/vfs/c/2abe95d9f56d
[05/17] NFS: replace __get_free_page() with kmalloc() in nfs_show_devname()
        https://git.kernel.org/vfs/vfs/c/75805c8f6d43
[06/17] NFS: remove unused page and page2 in nfs4_replace_transport()
        https://git.kernel.org/vfs/vfs/c/0d77bacd0eab
[07/17] NFSD: replace __get_free_page() with kmalloc() in nfsd_buffered_readdir()
        https://git.kernel.org/vfs/vfs/c/64f162f93a81
[08/17] libfs: simple_transaction_get(): replace get_zeroed_page() with kzalloc()
        https://git.kernel.org/vfs/vfs/c/5a3763a94e95
[09/17] jfs: replace __get_free_page() with kmalloc()
        https://git.kernel.org/vfs/vfs/c/d50250728dc1
[10/17] jbd2: replace __get_free_pages() with kmalloc()
        https://git.kernel.org/vfs/vfs/c/75c9377833a1
[11/17] isofs: replace __get_free_page() with kmalloc()
        https://git.kernel.org/vfs/vfs/c/95f2509040ac
[12/17] fuse: replace __get_free_page() with kmalloc()
        https://git.kernel.org/vfs/vfs/c/c78262429022
[13/17] fs/select: replace __get_free_page() with kmalloc()
        https://git.kernel.org/vfs/vfs/c/ac6aa4672cef
[14/17] fs/namespace: use __getname() to allocate mntpath buffer
        https://git.kernel.org/vfs/vfs/c/bd822134dcaf
[15/17] configfs: replace __get_free_pages() with kzalloc()
        https://git.kernel.org/vfs/vfs/c/32466534cba7
[16/17] binfmt_misc: replace __get_free_page() with kmalloc()
        https://git.kernel.org/vfs/vfs/c/df5f3ac3e999
[17/17] bfs: replace get_zeroed_page() with kzalloc()
        https://git.kernel.org/vfs/vfs/c/0a994e1ab090

^ permalink raw reply

* Re: [PATCH 0/8] super: retire sget(), convert iterators to RCU
From: Christian Brauner @ 2026-05-27 11:54 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro
In-Reply-To: <20260526-work-sget-v1-0-263f7025cedd@kernel.org>

On Tue, May 26, 2026 at 05:09:02PM +0200, Christian Brauner wrote:
> * retire sget(): CIFS plus the two ext4 KUnit tests (extents-test,
> 
> * Walk @super_blocks and @type->fs_supers under RCU, pinned by

Can't work as I originally envisioned.

^ permalink raw reply

* Re: [PATCH 5/8] super: drop sb_lock from setup_bdev_super() tuple publication
From: Christian Brauner @ 2026-05-27 11:53 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro
In-Reply-To: <20260526-work-sget-v1-5-263f7025cedd@kernel.org>

>  	}
> -	spin_lock(&sb_lock);

Yeah, I failed to consider that we need to protect against a concurrent
sget_fc() call with a custom callback so we cannot reasonably drop this
lock.

> -	spin_unlock(&sb_lock);
> +		WRITE_ONCE(sb->s_iflags, sb->s_iflags | SB_I_STABLE_WRITES);

^ permalink raw reply

* Re: [PATCH 17/34] jbd2: Convert jbd2_write_superblock() to bh_submit()
From: Jan Kara @ 2026-05-27 10:54 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Jan Kara, Christian Brauner, Christoph Hellwig, linux-fsdevel,
	linux-ext4
In-Reply-To: <20260525171931.4144395-18-willy@infradead.org>

On Mon 25-05-26 18:19:10, Matthew Wilcox (Oracle) wrote:
> Avoid an extra indirect function call by using bh_submit() instead of
> submit_bh().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: linux-ext4@vger.kernel.org

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/jbd2/journal.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 4f397fcdb13c..a6616380ce38 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -1821,8 +1821,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
>  	if (jbd2_journal_has_csum_v2or3(journal))
>  		sb->s_checksum = jbd2_superblock_csum(sb);
>  	get_bh(bh);
> -	bh->b_end_io = end_buffer_write_sync;
> -	submit_bh(REQ_OP_WRITE | write_flags, bh);
> +	bh_submit(bh, REQ_OP_WRITE | write_flags, bh_end_write);
>  	wait_on_buffer(bh);
>  	if (buffer_write_io_error(bh)) {
>  		clear_buffer_write_io_error(bh);
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 16/34] jbd2: Convert journal commit to bh_submit()
From: Jan Kara @ 2026-05-27 10:54 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Jan Kara, Christian Brauner, Christoph Hellwig, linux-fsdevel,
	linux-ext4
In-Reply-To: <20260525171931.4144395-17-willy@infradead.org>

On Mon 25-05-26 18:19:09, Matthew Wilcox (Oracle) wrote:
> Avoid an extra indirect function call by using bh_submit()
> instead of submit_bh() in journal_submit_commit_record()
> and jbd2_journal_commit_transaction().  These both use
> journal_end_buffer_io_sync(), so it's more straightforward to do them
> both at once.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: linux-ext4@vger.kernel.org

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

Another note for future work here: The BH_Shadow handling looks like dead
code. We hold buffer lock when writing out bh to the journal and we do
acquire the buffer lock in do_get_write_access() anyway (which is the only
place that checks for BH_Shadow) so the buffer_shadow() check should never
trigger. Needs checking, some more thought, and possibly slightly expanding
the area where buffer lock is held in do_get_write_access() but it should
be relatively low hanging fruit. Then we can completely remove BH_Shadow
and use generic IO completion function.

								Honza

> ---
>  fs/jbd2/commit.c | 13 +++++++------
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> index 8cf61e7185c4..38f318bb4279 100644
> --- a/fs/jbd2/commit.c
> +++ b/fs/jbd2/commit.c
> @@ -29,8 +29,10 @@
>  /*
>   * IO end handler for temporary buffer_heads handling writes to the journal.
>   */
> -static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
> +static void journal_end_buffer_io_sync(struct bio *bio)
>  {
> +	bool uptodate = bio->bi_status == BLK_STS_OK;
> +	struct buffer_head *bh = bio_endio_bh(bio);
>  	struct buffer_head *orig_bh = bh->b_private;
>  
>  	BUFFER_TRACE(bh, "");
> @@ -147,13 +149,12 @@ static int journal_submit_commit_record(journal_t *journal,
>  	lock_buffer(bh);
>  	clear_buffer_dirty(bh);
>  	set_buffer_uptodate(bh);
> -	bh->b_end_io = journal_end_buffer_io_sync;
>  
>  	if (journal->j_flags & JBD2_BARRIER &&
>  	    !jbd2_has_feature_async_commit(journal))
>  		write_flags |= REQ_PREFLUSH | REQ_FUA;
>  
> -	submit_bh(write_flags, bh);
> +	bh_submit(bh, write_flags, journal_end_buffer_io_sync);
>  	*cbh = bh;
>  	return 0;
>  }
> @@ -751,9 +752,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
>  				lock_buffer(bh);
>  				clear_buffer_dirty(bh);
>  				set_buffer_uptodate(bh);
> -				bh->b_end_io = journal_end_buffer_io_sync;
> -				submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
> -					  bh);
> +				bh_submit(bh,
> +					REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS,
> +					journal_end_buffer_io_sync);
>  			}
>  			cond_resched();
>  
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v2] ext2: Remove deprecated DAX support
From: Ashwin Gundarapu @ 2026-05-27 10:53 UTC (permalink / raw)
  To: Jan Kara; +Cc: jack, linux-ext4, linux-kernel
In-Reply-To: <fxaiddid432ivmkcsqmzeovemmnkyh37nfgwn4xurxb2wx5u5y@lhkjib3tmd7e>

Thanks for the review, Jan. All the style issues you mentioned have
been addressed in v3 and v4:

v3: https://lore.kernel.org/linux-ext4/19e595ac3d0.1a0dcfbe128078.1031782761444069401@zohomail.in/
v4: https://lore.kernel.org/linux-ext4/19e5aa07c9b.3a2e576d130187.5289857983023045470@zohomail.in/

v3 fixed the spaces-to-tabs indentation issues in inode.c, super.c,
and file.c. It also restored Opt_dax for a graceful mount error message.

v4 changed Opt_xip and Opt_dax from -EINVAL to break with a warning,
per Sashiko AI review, to avoid potential boot failures on systems
with these options in /etc/fstab.

The stray empty lines and tab indentation are all cleaned up in the
latest version (v4).

Thanks,
Ashwin


From: Jan Kara <jack@suse.cz>
To: "Ashwin Gundarapu"<linuxuser509@zohomail.in>
Cc: "jack"<jack@suse.com>, "linux-ext4"<linux-ext4@vger.kernel.org>, "linux-kernel"<linux-kernel@vger.kernel.org>
Date: Mon, 25 May 2026 22:00:40 +0530
Subject: Re: [PATCH v2] ext2: Remove deprecated DAX support

 > On Sun 24-05-26 11:08:53, Ashwin Gundarapu wrote: 
 > > 
 > > DAX support in ext2 was deprecated in commit d5a2693f93e4 
 > > ("ext2: Deprecate DAX") with a removal deadline of end of 2025. 
 > > Remove all DAX code from ext2 as scheduled. 
 > > 
 > > This removes the DAX mount option, IOMAP DAX support, DAX file 
 > > operations, DAX address_space_operations, and the DAX fault handler. 
 > > 
 > > Signed-off-by: Ashwin Gundarapu <linuxuser509@zohomail.in> 
 > > --- 
 > > v2: Removed unused sbi variable and fixed indentation as reported 
 > >     by kernel test robot. 
 >  
 > Thanks for the patch. Some style nits below. 
 >  
 > >  static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 
 > >  { 
 > > -#ifdef CONFIG_FS_DAX 
 > > -    if (IS_DAX(iocb->ki_filp->f_mapping->host)) 
 > > -        return ext2_dax_read_iter(iocb, to); 
 > > -#endif 
 > > + 
 >  
 > Stray empty line here. 
 >  
 > >      if (iocb->ki_flags & IOCB_DIRECT) 
 > >          return ext2_dio_read_iter(iocb, to); 
 > > 
 > > @@ -297,10 +188,7 @@ static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 
 > > 
 > >  static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 
 > >  { 
 > > -#ifdef CONFIG_FS_DAX 
 > > -    if (IS_DAX(iocb->ki_filp->f_mapping->host)) 
 > > -        return ext2_dax_write_iter(iocb, from); 
 > > -#endif 
 > > + 
 >  
 > ... and here. 
 >  
 > >      if (iocb->ki_flags & IOCB_DIRECT) 
 > >          return ext2_dio_write_iter(iocb, from); 
 > > 
 > > @@ -321,7 +209,7 @@ const struct file_operations ext2_file_operations = { 
 > >  #ifdef CONFIG_COMPAT 
 > >      .compat_ioctl    = ext2_compat_ioctl, 
 > >  #endif 
 > > -    .mmap_prepare    = ext2_file_mmap_prepare, 
 > > +    .mmap_prepare = generic_file_mmap_prepare, 
 >  
 > Please indent this with tab the same way as other methods. 
 >  
 > > @@ -841,10 +818,7 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, 
 > > 
 > >      iomap->flags = 0; 
 > >      iomap->offset = (u64)first_block << blkbits; 
 > > -    if (flags & IOMAP_DAX) 
 > > -        iomap->dax_dev = sbi->s_daxdev; 
 > > -    else 
 > > -        iomap->bdev = inode->i_sb->s_bdev; 
 > > +        iomap->bdev = inode->i_sb->s_bdev; 
 >  
 > Indented with spaces instead of tabs. 
 >  
 > > @@ -1290,12 +1248,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) 
 > > 
 > >      inode_dio_wait(inode); 
 > > 
 > > -    if (IS_DAX(inode)) 
 > > -        error = dax_truncate_page(inode, newsize, NULL, 
 > > -                      &ext2_iomap_ops); 
 > > -    else 
 > > -        error = block_truncate_page(inode->i_mapping, 
 > > -                newsize, ext2_get_block); 
 > > +        error = block_truncate_page(inode->i_mapping, 
 > > +                                newsize, ext2_get_block); 
 >  
 > Indented with spaces instead of tabs. 
 >  
 > >      if (error) 
 > >          return error; 
 > > 
 >  
 > ... 
 > > +        case Opt_xip: 
 > > +                ext2_msg_fc(fc, KERN_ERR, "DAX support has been removed. Please use ext4 instead."); 
 > > +                return -EINVAL; 
 >  
 > Indented with spaces instead of tabs. 
 >  
 > > @@ -992,16 +974,8 @@ static int ext2_fill_super(struct super_block *sb, struct fs_context *fc) 
 > >      } 
 > >      blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 
 > > 
 > > -    if (test_opt(sb, DAX)) { 
 > > -        if (!sbi->s_daxdev) { 
 > > -            ext2_msg(sb, KERN_ERR, 
 > > -                "DAX unsupported by block device. Turning off DAX."); 
 > > -            clear_opt(sbi->s_mount_opt, DAX); 
 > > -        } else if (blocksize != PAGE_SIZE) { 
 > > -            ext2_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); 
 > > -            clear_opt(sbi->s_mount_opt, DAX); 
 > > -        } 
 > > -    } 
 > > + 
 > > + 
 >  
 > Stray empty lines. 
 >  
 >                                 Honza 
 > -- 
 > Jan Kara <jack@suse.com> 
 > SUSE Labs, CR 
 >  
 > 


^ permalink raw reply

* Re: [PATCH 15/34] ext4: Convert ext4_commit_super() to bh_submit()
From: Jan Kara @ 2026-05-27 10:42 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Jan Kara, Christian Brauner, Christoph Hellwig, linux-fsdevel,
	linux-ext4
In-Reply-To: <20260525171931.4144395-16-willy@infradead.org>

On Mon 25-05-26 18:19:08, Matthew Wilcox (Oracle) wrote:
> Avoid an extra indirect function call by using bh_submit() instead of
> submit_bh().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: linux-ext4@vger.kernel.org

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/super.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index fbe175951e01..905d66cbe3f2 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -6320,9 +6320,8 @@ static int ext4_commit_super(struct super_block *sb)
>  	get_bh(sbh);
>  	/* Clear potential dirty bit if it was journalled update */
>  	clear_buffer_dirty(sbh);
> -	sbh->b_end_io = end_buffer_write_sync;
> -	submit_bh(REQ_OP_WRITE | REQ_SYNC |
> -		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh);
> +	bh_submit(sbh, REQ_OP_WRITE | REQ_SYNC |
> +		  (test_opt(sb, BARRIER) ? REQ_FUA : 0), bh_end_write);
>  	wait_on_buffer(sbh);
>  	if (buffer_write_io_error(sbh)) {
>  		ext4_msg(sb, KERN_ERR, "I/O error while writing "
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 14/34] ext4: Convert write_mmp_block_thawed() to bh_submit()
From: Jan Kara @ 2026-05-27 10:42 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Jan Kara, Christian Brauner, Christoph Hellwig, linux-fsdevel,
	linux-ext4
In-Reply-To: <20260525171931.4144395-15-willy@infradead.org>

On Mon 25-05-26 18:19:07, Matthew Wilcox (Oracle) wrote:
> Avoid an extra indirect function call by using bh_submit() instead of
> submit_bh().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: linux-ext4@vger.kernel.org

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/mmp.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
> index 6f57c181ff77..493528fbed75 100644
> --- a/fs/ext4/mmp.c
> +++ b/fs/ext4/mmp.c
> @@ -46,9 +46,9 @@ static int write_mmp_block_thawed(struct super_block *sb,
>  
>  	ext4_mmp_csum_set(sb, mmp);
>  	lock_buffer(bh);
> -	bh->b_end_io = end_buffer_write_sync;
>  	get_bh(bh);
> -	submit_bh(REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, bh);
> +	bh_submit(bh, REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
> +			bh_end_write);
>  	wait_on_buffer(bh);
>  	if (unlikely(!buffer_uptodate(bh)))
>  		return -EIO;
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 13/34] ext4: Convert ext4_fc_submit_bh() to bh_submit()
From: Jan Kara @ 2026-05-27 10:41 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Jan Kara, Christian Brauner, Christoph Hellwig, linux-fsdevel,
	linux-ext4
In-Reply-To: <20260525171931.4144395-14-willy@infradead.org>

On Mon 25-05-26 18:19:06, Matthew Wilcox (Oracle) wrote:
> Avoid an extra indirect function call by converting
> ext4_end_buffer_io_sync() from bh_end_io_t to bio_end_io_t and
> calling bh_submit().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: linux-ext4@vger.kernel.org

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/fast_commit.c | 8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index b3c22636251d..d52c64adf416 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -184,8 +184,11 @@
>  #include <trace/events/ext4.h>
>  static struct kmem_cache *ext4_fc_dentry_cachep;
>  
> -static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
> +static void ext4_end_buffer_io_sync(struct bio *bio)
>  {
> +	bool uptodate = bio->bi_status == BLK_STS_OK;
> +	struct buffer_head *bh = bio_endio_bh(bio);
> +
>  	BUFFER_TRACE(bh, "");
>  	if (uptodate) {
>  		ext4_debug("%s: Block %lld up-to-date",
> @@ -659,8 +662,7 @@ static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail)
>  	lock_buffer(bh);
>  	set_buffer_dirty(bh);
>  	set_buffer_uptodate(bh);
> -	bh->b_end_io = ext4_end_buffer_io_sync;
> -	submit_bh(REQ_OP_WRITE | write_flags, bh);
> +	bh_submit(bh, REQ_OP_WRITE | write_flags, ext4_end_buffer_io_sync);
>  	EXT4_SB(sb)->s_fc_bh = NULL;
>  }
>  
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH 12/34] ext4: Convert __ext4_read_bh() to bh_submit()
From: Jan Kara @ 2026-05-27 10:38 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Jan Kara, Christian Brauner, Christoph Hellwig, linux-fsdevel,
	linux-ext4
In-Reply-To: <20260525171931.4144395-13-willy@infradead.org>

On Mon 25-05-26 18:19:05, Matthew Wilcox (Oracle) wrote:
> Avoid an extra indirect function call by converting
> ext4_end_bitmap_read() from bh_end_io_t to bio_end_io_t and
> calling bh_submit().
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Cc: linux-ext4@vger.kernel.org

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/ext4.h   | 10 +++++-----
>  fs/ext4/ialloc.c |  5 ++++-
>  fs/ext4/super.c  | 11 ++++++-----
>  3 files changed, 15 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 94283a991e5c..6af11f0ff1c5 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2959,7 +2959,7 @@ extern unsigned long ext4_count_dirs(struct super_block *);
>  extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
>  extern int ext4_init_inode_table(struct super_block *sb,
>  				 ext4_group_t group, int barrier);
> -extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
> +void ext4_end_bitmap_read(struct bio *bio);
>  
>  /* fast_commit.c */
>  int ext4_fc_info_show(struct seq_file *seq, void *v);
> @@ -3184,10 +3184,10 @@ extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
>  						   sector_t block);
>  extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb,
>  						sector_t block);
> -extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
> -				bh_end_io_t *end_io, bool simu_fail);
> -extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
> -			bh_end_io_t *end_io, bool simu_fail);
> +void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
> +		bio_end_io_t end_io, bool simu_fail);
> +int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
> +		bio_end_io_t end_io, bool simu_fail);
>  extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
>  extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
>  extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index 3fd8f0099852..2db68b1bf855 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -66,8 +66,11 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
>  		memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3);
>  }
>  
> -void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
> +void ext4_end_bitmap_read(struct bio *bio)
>  {
> +	bool uptodate = bio->bi_status == BLK_STS_OK;
> +	struct buffer_head *bh = bio_endio_bh(bio);
> +
>  	if (uptodate) {
>  		set_buffer_uptodate(bh);
>  		set_bitmap_uptodate(bh);
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 6a77db4d3124..fbe175951e01 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -161,7 +161,7 @@ MODULE_ALIAS("ext3");
>  
>  
>  static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
> -				  bh_end_io_t *end_io, bool simu_fail)
> +				  bio_end_io_t end_io, bool simu_fail)
>  {
>  	if (simu_fail) {
>  		clear_buffer_uptodate(bh);
> @@ -176,13 +176,14 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
>  	 */
>  	clear_buffer_verified(bh);
>  
> -	bh->b_end_io = end_io ? end_io : end_buffer_read_sync;
> +	if (!end_io)
> +		end_io = bh_end_read;
>  	get_bh(bh);
> -	submit_bh(REQ_OP_READ | op_flags, bh);
> +	bh_submit(bh, REQ_OP_READ | op_flags, end_io);
>  }
>  
>  void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
> -			 bh_end_io_t *end_io, bool simu_fail)
> +			 bio_end_io_t end_io, bool simu_fail)
>  {
>  	BUG_ON(!buffer_locked(bh));
>  
> @@ -194,7 +195,7 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
>  }
>  
>  int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
> -		 bh_end_io_t *end_io, bool simu_fail)
> +		 bio_end_io_t end_io, bool simu_fail)
>  {
>  	BUG_ON(!buffer_locked(bh));
>  
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* [PATCH v2 2/2] ext4: get ext4_group_desc in ext4_mb_prefetch only when necessary
From: Bohdan Trach @ 2026-05-27  9:03 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi
  Cc: mchehab+huawei, bohdan.trach, lilith.oberhauser, Bohdan Trach,
	linux-ext4, linux-kernel
In-Reply-To: <20260527090329.2680170-1-bohdan.trach@huaweicloud.com>

Getting ext4_group_desc structure can contribute to the cost of
ext4_mb_prefetch() without any need, as most groups fail the
!EXT4_MB_GRP_TEST_AND_SET_READ check.

Optimize ext4_mb_prefetch by getting the group description only when
necessary.

The result is a further increase in performance of the fallocate() system
call path that triggers ext4_mb_prefetch() via a linear group scan.

Signed-off-by: Bohdan Trach <bohdan.trach@huaweicloud.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/mballoc.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 25e3d9204233..907a209eb1e8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2861,8 +2861,6 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
 
 	blk_start_plug(&plug);
 	while (nr-- > 0) {
-		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
-								  NULL);
 		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 
 		/*
@@ -2872,14 +2870,17 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
 		 * prefetch once, so we avoid getblk() call, which can
 		 * be expensive.
 		 */
-		if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
-		    EXT4_MB_GRP_NEED_INIT(grp) &&
-		    ext4_free_group_clusters(sb, gdp) > 0 ) {
-			bh = ext4_read_block_bitmap_nowait(sb, group, true);
-			if (!IS_ERR_OR_NULL(bh)) {
-				if (!buffer_uptodate(bh) && cnt)
-					(*cnt)++;
-				brelse(bh);
+		if (grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
+		    EXT4_MB_GRP_NEED_INIT(grp)) {
+			struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
+
+			if (gdp && ext4_free_group_clusters(sb, gdp) > 0) {
+				bh = ext4_read_block_bitmap_nowait(sb, group, true);
+				if (!IS_ERR_OR_NULL(bh)) {
+					if (!buffer_uptodate(bh) && cnt)
+						(*cnt)++;
+					brelse(bh);
+				}
 			}
 		}
 		if (++group >= ngroups)
-- 
2.43.0


^ permalink raw reply related

* [PATCH v2 1/2] ext4: avoid RMW atomic in EXT4_MB_GRP_TEST_AND_SET_READ
From: Bohdan Trach @ 2026-05-27  9:03 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi
  Cc: mchehab+huawei, bohdan.trach, lilith.oberhauser, Bohdan Trach,
	linux-ext4, linux-kernel
In-Reply-To: <20260527090329.2680170-1-bohdan.trach@huaweicloud.com>

EXT4_MB_GRP_TEST_AND_SET_READ uses test_and_set_bit function which
issues an atomic write. This can cause high overhead due to cache
contention when multiple threads iterate over groups in a tight loop,
as is the case for ext4_mb_prefetch(). We have seen this to be a
problem for Kunpeng 920b CPUs, which use a single ARM LSE instruction
for this purpose.

Avoid this unconditional atomic write by testing the bit first without
changing its value. This is OK for this use case as this bit is never
unset.

This change significantly reduces costs of fallocate() operations which
trigger linear group scans on large multicore machines where
test_and_set_bit issues an atomic write operation unconditionally.

Signed-off-by: Bohdan Trach <bohdan.trach@huaweicloud.com>
---
 fs/ext4/ext4.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 56b82d4a15d7..f8eacf1375f8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3551,7 +3551,13 @@ struct ext4_group_info {
 #define EXT4_MB_GRP_CLEAR_TRIMMED(grp)	\
 	(clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
 #define EXT4_MB_GRP_TEST_AND_SET_READ(grp)	\
-	(test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))
+	(ext4_mb_grp_test_and_set_read((grp)))
+
+static inline int ext4_mb_grp_test_and_set_read(struct ext4_group_info *grp)
+{
+	return (test_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state) ||
+		test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &grp->bb_state));
+}

 #define EXT4_MAX_CONTENTION		8
 #define EXT4_CONTENTION_THRESHOLD	2
-- 
2.43.0

^ permalink raw reply related

* [PATCH v2 0/2] ext4: optimize ext4_mb_prefetch
From: Bohdan Trach @ 2026-05-27  9:03 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi
  Cc: mchehab+huawei, bohdan.trach, lilith.oberhauser, Bohdan Trach,
	linux-ext4, linux-kernel

v2:
  Fix issues found by Jan Kara, added R-b for patch 2/2.
  Extend commit message of patch 1/2 a bit.
v1:
https://lore.kernel.org/linux-ext4/20260521125931.16474-1-bohdan.trach@huaweicloud.com/

Original cover letter below:

Dear Ted,

We have been profiling scalability of some rocksdb-related workloads on
ext4 file system and have found a case where significant time ends up
being spent in ext4_mb_prefetch() function. This happens because
ext4_mb_scan_groups_linear() path is triggered in ext4_mb_scan_groups().
We have noticed that on larger, filled disks, this function can take
lots of time.

We have added a test for this issue to our fork of will-it-scale [1],
which you can use to reproduce the issue (the actual workload does a few
writes after fallocate; they have been dropped to better illustrate the
issue).
[1] https://github.com/open-s4c/will-it-scale/blob/master/tests/fallocate3.c

In this series, we optimize this code path:
Patch 1: change EXT4_MB_GRP_TEST_AND_SET_READ() to reduce the rate of
         atomic RMW operation via test_and_set_bit, which has quite
         high cost on large multicore CPUs, especially under
         contention for the group's flag cache lines.
         As this bit is only ever set, but never unset, it should be
         possible to reduce the cost of this check by calling
         test_bit[_acquire]() first.
Patch 2: restructure the ext4_mb_prefetch loop operations such that
         ext4_group_desc is fetched only after the checks based on
         ext4_group_info succeed.

This series has been tested with
        kvm-xfstests -c ext4/all -g auto
and did not introduce any new issues.

Performance test: we have used the will-it-scale drop-in test we have
provided above, and used three machines for running it:
- Kunpeng 920 (arm64, 96 CPUs * 1 socket, 128G RAM, SAS HDD: Seagate
  Exos 10E2400 1.2TB)
- Kunpeng 920b (arm64, 80 CPUs * 2 sockets, 502G RAM, SATA SSD: Huawei
  ES3000 V6 0.96TB)
- AMD 9654 (x86_64, 96 CPUs * 2 sockets, 1.5T RAM, NVME SSD: Samsung SSD
  970 EVO Plus 1TB)
We have performed tests with existing file systems, as well as more limited
tests with fixed-size file systems.

Benchmark on an existing file system for Kunpeng 920 (842G FS, 31% space
used) with the patch based on kernel 7.0.6:
| thr. | base | patched |      improv. |
|      | perf |    perf |              |
|------|------|---------|--------------|
|    1 | 1286 |    1608 |  +25.0388802 |
|    2 | 1673 |    1680 |   +0.4184100 |
|    4 | 1698 |    1712 |   +0.8244994 |
|    8 | 1721 |    1730 |   +0.5229518 |
|   16 | 1739 |    2313 |  +33.0074756 |
|   32 | 1742 |    3571 | +104.9942595 |
|   64 | 1735 |    3427 |  +97.5216138 |
|   96 | 1688 |    1814 |   +7.4644550 |

Benchmark on an existing file system for Kunpeng 920b (802G ext4 FS, 68%
space used) with the patch based on kernel 6.6:
| thr. | base | patched |  improv. |
|      | perf |    perf |          |
|------|------|---------|----------|
|    1 | 1613 |   1625  |   +0.74% |
|    2 | 1620 |   2603  |  +60.67% |
|    4 | 1624 |   4894  | +201.35% |
|    8 | 2505 |   8328  | +232.45% |
|   16 | 4736 |  11632  | +145.60% |
|   32 | 7784 |  13124  |  +68.60% |
|   64 | 8094 |   8636  |   +6.69% |
|  128 | 6914 |   7890  |  +14.11% |

Benchmark on an existing file system for AMD 9654 (15T FS, 6% space
used), kernel 7.1-rc3. This shows the performance impact on a mostly
free file system.
| thr. |  base | patched |    improv. |
|      |  perf |    perf |            |
|------|-------|---------|------------|
|    1 | 30901 |   31191 | +0.9384810 |
|    2 | 50874 |   50504 | -0.7272870 |
|    4 | 66068 |   64108 | -2.9666404 |
|    8 | 63963 |   61927 | -3.1830902 |
|   16 | 47809 |   47044 | -1.6001171 |
|   32 | 42441 |   42326 | -0.2709644 |
|   64 | 39773 |   39929 | +0.3922259 |
|  128 | 37065 |   36413 | -1.7590719 |

We have also performed the test with kernel 6.6 on both Kunpeng 920b and
AMD 9654 with a much smaller FS image (133G) to have a more controlled
benchmarking environment, although this reduces the measured benefits as
well compared to a bigger FS with more groups to iterate over:

AMD 9654 performance:
| thr. |  base | patched |  improv. |
|      |  perf |    perf |          |
|------|----------------------------|
| 25% full file system:             |
|------|----------------------------|
|    1 |  5964 |    6778 |  +13.64% |
|    2 | 11811 |   13415 |  +13.58% |
|    4 | 20111 |   23570 |  +17.19% |
|    8 | 30083 |   36296 |  +20.65% |
|   16 | 27781 |   38302 |  +37.87% |
|   32 | 28325 |   36930 |  +30.37% |
|   64 | 26044 |   29952 |  +15.00% |
|  128 | 19969 |   20882 |   +4.57% |
|------|----------------------------|
| 50% full file system:             |
|------|----------------------------|
|    1 |  4093 |    7380 |  +80.30% |
|    2 | 13168 |   13906 |   +5.60% |
|    4 | 21440 |   22623 |   +5.51% |
|    8 | 30523 |   32360 |   +6.01% |
|   16 | 27502 |   34017 |  +23.68% |
|   32 | 27189 |   32480 |  +19.46% |
|   64 | 24146 |   26463 |   +9.59% |
|  128 | 18386 |   18631 |   +1.33% |
|------|----------------------------|
| 75% full file system:             |
|------|----------------------------|
|    1 |  5738 |    7208 |  +25.61% |
|    2 | 13869 |   15309 |  +10.38% |
|    4 | 21803 |   23447 |   +7.54% |
|    8 | 29004 |   30766 |   +6.07% |
|   16 | 25542 |   30584 |  +19.74% |
|   32 | 24242 |   28631 |  +18.10% |
|   64 | 20631 |   22833 |  +10.67% |
|  128 | 14603 |   15086 |   +3.30% |

Kunpeng 920b performance:
| thr. |  base | patched | improv. |
|      |  perf |    perf |         |
|------|---------------------------|
| 25% full file system:            |
|------|---------------------------|
|    1 |  5398 |    7025 | +30.14% |
|    2 |  7451 |   12299 | +65.06% |
|    4 | 12574 |   20899 | +66.20% |
|    8 | 18645 |   27694 | +48.53% |
|   16 | 25088 |   31739 | +26.51% |
|   32 | 26699 |   27632 |  +3.49% |
|   64 | 14943 |   19547 | +30.81% |
|  128 | 13047 |   14544 | +11.47% |
|------|---------------------------|
| 50% full file system:            |
|------|---------------------------|
|    1 |  4881 |    6618 | +35.58% |
|    2 |  6544 |   11660 | +78.17% |
|    4 | 11156 |   19506 | +74.84% |
|    8 | 16842 |   25835 | +53.39% |
|   16 | 23305 |   29260 | +25.55% |
|   32 | 24622 |   25303 |  +2.76% |
|   64 | 13814 |   17707 | +28.18% |
|  128 | 12061 |   13180 |  +9.27% |
|------|---------------------------|
| 75% full file system:            |
|------|---------------------------|
|    1 |  7037 |   10580 | +50.34% |
|    2 |  9216 |    9075 |  -1.52% |
|    4 | 14534 |   22076 | +51.89% |
|    8 | 19341 |   25936 | +34.09% |
|   16 | 23592 |   27409 | +16.17% |
|   32 | 23680 |   23078 |  -2.54% |
|   64 | 12836 |   15902 | +23.88% |
|  128 |  9614 |   10341 |  +7.56% |

Thanks,
Bohdan.

Bohdan Trach (2):
  ext4: avoid RMW atomic in EXT4_MB_GRP_TEST_AND_SET_READ
  ext4: get ext4_group_desc in ext4_mb_prefetch only when necessary

 fs/ext4/ext4.h    |  8 +++++++-
 fs/ext4/mballoc.c | 21 +++++++++++----------
 2 files changed, 18 insertions(+), 11 deletions(-)

-- 
2.43.0


^ permalink raw reply

* [PATCH 4/4] select: make select() and poll() waits freezable
From: Dai Junbing @ 2026-05-27  6:49 UTC (permalink / raw)
  To: linux-fsdevel, viro, brauner, tytso, jack, linux-ext4
  Cc: jack, linux-kernel, Dai Junbing
In-Reply-To: <20260527064912.1038-1-daijunbing@vivo.com>

Tasks blocked in select() or poll() may be woken during suspend and
resume due to freezer state transitions. This can cause avoidable
activity in the suspend/resume path and add unnecessary overhead.

Mark the waits in do_select() and do_poll() as freezable so these tasks
are not unnecessarily woken by the freezer.

Both functions are only used from their respective system call paths,
where the task sleeps without holding locks that would make freezing
unsafe.

Signed-off-by: Dai Junbing <daijunbing@vivo.com>
---
 fs/select.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index bf71c9838dfe..b0b279748355 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -600,7 +600,7 @@ static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec
 			to = &expire;
 		}
 
-		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
+		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE|TASK_FREEZABLE,
 					   to, slack))
 			timed_out = 1;
 	}
@@ -962,7 +962,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
 			to = &expire;
 		}
 
-		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
+		if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE, to, slack))
 			timed_out = 1;
 	}
 	return count;
-- 
2.25.1


^ permalink raw reply related

* [PATCH 3/4] pipe: mark blocking pipe read and FIFO open sleeps as freezable
From: Dai Junbing @ 2026-05-27  6:49 UTC (permalink / raw)
  To: linux-fsdevel, viro, brauner, tytso, jack, linux-ext4
  Cc: jack, linux-kernel, Dai Junbing
In-Reply-To: <20260527064912.1038-1-daijunbing@vivo.com>

Tasks blocked in pipe read or FIFO open may be woken during suspend and
resume due to freezer state transitions. This can cause avoidable
activity in the suspend/resume path and add unnecessary overhead.

Mark these sleeps as freezable so they are not unnecessarily woken by
the freezer.

Signed-off-by: Dai Junbing <daijunbing@vivo.com>
---
 fs/pipe.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/pipe.c b/fs/pipe.c
index 9841648c9cf3..594726a7e542 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -385,7 +385,7 @@ anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		 * since we've done any required wakeups and there's no need
 		 * to mark anything accessed. And we've dropped the lock.
 		 */
-		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
+		if (wait_event_freezable_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
 			return -ERESTARTSYS;
 
 		wake_next_reader = true;
@@ -1102,7 +1102,7 @@ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
 	int cur = *cnt;
 
 	while (cur == *cnt) {
-		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
+		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 		pipe_unlock(pipe);
 		schedule();
 		finish_wait(&pipe->rd_wait, &rdwait);
-- 
2.25.1


^ permalink raw reply related

* [PATCH 2/4] jbd2: make kjournald2 commit wait freezable
From: Dai Junbing @ 2026-05-27  6:49 UTC (permalink / raw)
  To: linux-fsdevel, viro, brauner, tytso, jack, linux-ext4
  Cc: jack, linux-kernel, Dai Junbing
In-Reply-To: <20260527064912.1038-1-daijunbing@vivo.com>

While waiting for commit work, kjournald2 may be woken during suspend
and resume due to freezer state transitions. This causes avoidable CPU
activity in the suspend/resume path and adds unnecessary power overhead.

Make the commit wait freezable so the thread is not unnecessarily woken
by the freezer during suspend/resume.

Signed-off-by: Dai Junbing <daijunbing@vivo.com>
---
 fs/jbd2/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 4f397fcdb13c..d7ffe60c8793 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -222,7 +222,7 @@ static int kjournald2(void *arg)
 		DEFINE_WAIT(wait);
 
 		prepare_to_wait(&journal->j_wait_commit, &wait,
-				TASK_INTERRUPTIBLE);
+				TASK_INTERRUPTIBLE|TASK_FREEZABLE);
 		transaction = journal->j_running_transaction;
 		if (transaction == NULL ||
 		    time_before(jiffies, transaction->t_expires)) {
-- 
2.25.1


^ permalink raw reply related

* [PATCH 1/4] eventpoll: mark ep_poll() sleep as freezable
From: Dai Junbing @ 2026-05-27  6:49 UTC (permalink / raw)
  To: linux-fsdevel, viro, brauner, tytso, jack, linux-ext4
  Cc: jack, linux-kernel, Dai Junbing
In-Reply-To: <20260527064912.1038-1-daijunbing@vivo.com>

Tasks blocked in epoll_wait() may be woken during suspend and resume due
to freezer state transitions. This can cause avoidable activity in the
suspend/resume path and add unnecessary power overhead.

Mark the sleep in ep_poll() as freezable so tasks waiting in
epoll_wait()-related paths are not unnecessarily woken by the freezer.

ep_poll() is only used from epoll_wait()-related system call paths, and
the task does not sleep while holding locks that would make freezing
unsafe. This makes the wait state safe to mark as freezable.

Signed-off-by: Dai Junbing <daijunbing@vivo.com>
---
 fs/eventpoll.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index a3090b446af1..64987b64d72b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -2010,7 +2010,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		 * the same lock on wakeup ep_poll_callback() side, so it
 		 * is safe to avoid an explicit barrier.
 		 */
-		__set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);

 		/*
 		 * Do the final check under the lock. ep_start/done_scan()
-- 
2.25.1

^ permalink raw reply related

* [PATCH 0/4] fs: mark selected blocking waits as freezable
From: Dai Junbing @ 2026-05-27  6:49 UTC (permalink / raw)
  To: linux-fsdevel, viro, brauner, tytso, jack, linux-ext4
  Cc: jack, linux-kernel, Dai Junbing

Hi,

During suspend and resume, tasks blocked in some interruptible wait
paths may be unnecessarily woken due to freezer state transitions. This
can introduce avoidable activity in the suspend/resume path.

This series marks a small set of blocking waits as freezable in places
where the task sleeps without holding locks that would make freezing
unsafe. The goal is to avoid unnecessary wakeups during suspend/resume
while preserving the normal wakeup conditions of these paths.

The effect is more noticeable on systems with frequent suspend/resume
cycles, such as mobile devices.

This series currently covers:
  - epoll_wait()-related waits
  - select()/poll() waits
  - blocking pipe read and FIFO open waits
  - kjournald2 commit wait

Comments are welcome.

Dai Junbing (4):
  eventpoll: mark ep_poll() sleep as freezable
  jbd2: make kjournald2 commit wait freezable
  pipe: mark blocking pipe read and FIFO open sleeps as freezable
  select: make select() and poll() waits freezable

 fs/eventpoll.c    | 2 +-
 fs/jbd2/journal.c | 2 +-
 fs/pipe.c         | 4 ++--
 fs/select.c       | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

-- 
2.25.1

^ permalink raw reply

* Re: [PATCH v5 03/10] fstests: add test for inotify isolation on cloned devices
From: Anand Jain @ 2026-05-27  6:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Anand Jain, fstests, linux-btrfs, linux-ext4, linux-xfs, amir73il,
	zlang
In-Reply-To: <ahaOPOQYaE1wrbA6@infradead.org>



On 27/5/26 14:25, Christoph Hellwig wrote:
> On Tue, May 26, 2026 at 11:19:30PM +0800, Anand Jain wrote:
> >>> fanotify and inotify use exactly the same backends, so I'm not sure
>>> why testing both matters. 
>>
>> I noticed that back then I decided to keep both.
>> Since fsnotifywait -F is only supported from kernel 5.1x.
>> inotifywait serves the legacy LTS kernels where required.
> 
> < 5.1x is pretty old these days, I would not add a test just for that.
> 
>>> Not that I care very strongly, I'm just a
>>> bit confused.
>>
>> I am happy dropping inotifywait if unnecessary and confusing.
> 
> Not a major issue, but it felt weird.
> 

Okay, dropping the inotifywait test case in the reroll.

Thanks.

^ permalink raw reply

* Re: [PATCH v5 03/10] fstests: add test for inotify isolation on cloned devices
From: Christoph Hellwig @ 2026-05-27  6:25 UTC (permalink / raw)
  To: Anand Jain
  Cc: Christoph Hellwig, Anand Jain, fstests, linux-btrfs, linux-ext4,
	linux-xfs, amir73il, zlang
In-Reply-To: <e8d0caf9-df8b-415e-9f89-bc71e73a0e32@gmail.com>

On Tue, May 26, 2026 at 11:19:30PM +0800, Anand Jain wrote:
> > fanotify and inotify use exactly the same backends, so I'm not sure
> > why testing both matters. 
> 
> I noticed that back then I decided to keep both.
> Since fsnotifywait -F is only supported from kernel 5.1x.
> inotifywait serves the legacy LTS kernels where required.

< 5.1x is pretty old these days, I would not add a test just for that.

> > Not that I care very strongly, I'm just a
> > bit confused.
> 
> I am happy dropping inotifywait if unnecessary and confusing.

Not a major issue, but it felt weird.


^ permalink raw reply

* Re: [PATCH 2/8] ext4: convert mballoc KUnit test to sget_fc()
From: Theodore Tso @ 2026-05-27  0:47 UTC (permalink / raw)
  To: Christian Brauner
  Cc: linux-fsdevel, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro
In-Reply-To: <20260526-work-sget-v1-2-263f7025cedd@kernel.org>

On Tue, May 26, 2026 at 05:09:04PM +0200, Christian Brauner wrote:
> Add a no-op mbt_init_fs_context() so fs_context_for_mount() has
> something to call on the fake fs_type....

I was trying to figure out what needed to be in an init_fs_context()
function, and I came across this in
Documentation/filesystems/mount_api.rst:

       const struct fs_context_operations *ops

     These are operations that can be done on a filesystem context (see
     below).  This must be set by the ->init_fs_context() file_system_type
     operation.    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

So is it safe to just have an init_fs_context() function which doesn't
do this?

> +static int mbt_init_fs_context(struct fs_context *fc)
> +{
> +	return 0;
> +}
> +

I see in fs/fs_context.c that in some places the code protects against
a NULL ops pointer:

        if (fc->need_free && fc->ops && fc->ops->free)
		fc->ops->free(fc);

But in other places, it doesn't, and we'll end up dereferencing a null
pointer:

        if (fc->ops->parse_param) {
		ret = fc->ops->parse_param(fc, param);

	....

So it's unclear to me --- when is it safe (and not safe) to not bother
to fill in the ops pointer?

						- Ted

^ permalink raw reply

* [PATCH] ext4: add ext4_dir_entry_is_tail()
From: Artem Blagodarenko @ 2026-05-26 23:38 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger.kernel, Artem Blagodarenko

From: Artem Blagodarenko <artem.blagodarenko@gmail.com>

Replace open-coded checks for directory tail entries with a call
to ext4_dir_entry_is_tail(). This helper will also be used by
upcoming changes.

Signed-off-by: Artem Blagodarenko <artem.blagodarenko@gmail.com>
---
 fs/ext4/ext4.h  | 16 ++++++++++++++++
 fs/ext4/namei.c |  7 +------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 94283a991e5c..01b1222b1454 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3917,6 +3917,22 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
 		io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
 }
 
+/*
+ * ext4_dir_entry_is_tail() - Check if a directory entry is a tail entry.
+ * @de: directory entry to check
+ *
+ * Returns true if @de is a directory block tail entry (checksum record).
+ */
+static inline bool ext4_dir_entry_is_tail(struct ext4_dir_entry_2 *de)
+{
+	struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de;
+
+	return !t->det_reserved_zero1 &&
+	       le16_to_cpu(t->det_rec_len) == sizeof(*t) &&
+	       !t->det_reserved_zero2 &&
+	       t->det_reserved_ft == EXT4_FT_DIR_CSUM;
+}
+
 extern const struct iomap_ops ext4_iomap_ops;
 extern const struct iomap_ops ext4_iomap_report_ops;
 
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 4a47fbd8dd30..accf63fbbc79 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -314,7 +314,6 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
 						   struct buffer_head *bh)
 {
 	struct ext4_dir_entry_tail *t;
-	int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
 
 #ifdef PARANOID
 	struct ext4_dir_entry *d, *top;
@@ -334,11 +333,7 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
 	t = EXT4_DIRENT_TAIL(bh->b_data, EXT4_BLOCK_SIZE(inode->i_sb));
 #endif
 
-	if (t->det_reserved_zero1 ||
-	    (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
-	     sizeof(struct ext4_dir_entry_tail)) ||
-	    t->det_reserved_zero2 ||
-	    t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+	if (!ext4_dir_entry_is_tail((struct ext4_dir_entry_2 *)t))
 		return NULL;
 
 	return t;
-- 
2.43.7


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox