* [PATCH v2 01/10] ext4: add did_zero output parameter to ext4_block_zero_page_range()
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 02/10] ext4: ext4_block_truncate_page() returns zeroed length on success Zhang Yi
` (9 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Add a bool *did_zero output parameter to ext4_block_zero_page_range()
and __ext4_block_zero_page_range(). The parameter reports whether a
partial block was zeroed out, which is needed for the upcoming iomap
buffered I/O conversion.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/inode.c | 23 ++++++++++++++---------
1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index af6d1759c8de..1c9474d5d11d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4003,7 +4003,8 @@ void ext4_set_aops(struct inode *inode)
* racing writeback can come later and flush the stale pagecache to disk.
*/
static int __ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length)
+ struct address_space *mapping, loff_t from, loff_t length,
+ bool *did_zero)
{
unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
@@ -4091,6 +4092,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
err = ext4_jbd2_inode_add_write(handle, inode, from,
length);
}
+ if (!err && did_zero)
+ *did_zero = true;
unlock:
folio_unlock(folio);
@@ -4106,7 +4109,8 @@ static int __ext4_block_zero_page_range(handle_t *handle,
* that corresponds to 'from'
*/
static int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length)
+ struct address_space *mapping, loff_t from, loff_t length,
+ bool *did_zero)
{
struct inode *inode = mapping->host;
unsigned blocksize = inode->i_sb->s_blocksize;
@@ -4120,10 +4124,11 @@ static int ext4_block_zero_page_range(handle_t *handle,
length = max;
if (IS_DAX(inode)) {
- return dax_zero_range(inode, from, length, NULL,
+ return dax_zero_range(inode, from, length, did_zero,
&ext4_iomap_ops);
}
- return __ext4_block_zero_page_range(handle, mapping, from, length);
+ return __ext4_block_zero_page_range(handle, mapping, from, length,
+ did_zero);
}
/*
@@ -4146,7 +4151,7 @@ static int ext4_block_truncate_page(handle_t *handle,
blocksize = i_blocksize(inode);
length = blocksize - (from & (blocksize - 1));
- return ext4_block_zero_page_range(handle, mapping, from, length);
+ return ext4_block_zero_page_range(handle, mapping, from, length, NULL);
}
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
@@ -4169,13 +4174,13 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
if (start == end &&
(partial_start || (partial_end != sb->s_blocksize - 1))) {
err = ext4_block_zero_page_range(handle, mapping,
- lstart, length);
+ lstart, length, NULL);
return err;
}
/* Handle partial zero out on the start of the range */
if (partial_start) {
- err = ext4_block_zero_page_range(handle, mapping,
- lstart, sb->s_blocksize);
+ err = ext4_block_zero_page_range(handle, mapping, lstart,
+ sb->s_blocksize, NULL);
if (err)
return err;
}
@@ -4183,7 +4188,7 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
if (partial_end != sb->s_blocksize - 1)
err = ext4_block_zero_page_range(handle, mapping,
byte_end - partial_end,
- partial_end + 1);
+ partial_end + 1, NULL);
return err;
}
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 02/10] ext4: ext4_block_truncate_page() returns zeroed length on success
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
2026-03-25 7:28 ` [PATCH v2 01/10] ext4: add did_zero output parameter to ext4_block_zero_page_range() Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 03/10] ext4: rename and extend ext4_block_truncate_page() Zhang Yi
` (8 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Return the actual zeroed length instead of 0 on success. This prepares
for the upcoming iomap buffered I/O conversion by exposing zeroed length
information to callers.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/inode.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1c9474d5d11d..f21be26b4642 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4136,6 +4136,7 @@ static int ext4_block_zero_page_range(handle_t *handle,
* up to the end of the block which corresponds to `from'.
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
+ * Return the zeroed length on success.
*/
static int ext4_block_truncate_page(handle_t *handle,
struct address_space *mapping, loff_t from)
@@ -4143,6 +4144,8 @@ static int ext4_block_truncate_page(handle_t *handle,
unsigned length;
unsigned blocksize;
struct inode *inode = mapping->host;
+ bool did_zero = false;
+ int err;
/* If we are processing an encrypted inode during orphan list handling */
if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
@@ -4151,7 +4154,12 @@ static int ext4_block_truncate_page(handle_t *handle,
blocksize = i_blocksize(inode);
length = blocksize - (from & (blocksize - 1));
- return ext4_block_zero_page_range(handle, mapping, from, length, NULL);
+ err = ext4_block_zero_page_range(handle, mapping, from, length,
+ &did_zero);
+ if (err)
+ return err;
+
+ return did_zero ? length : 0;
}
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 03/10] ext4: rename and extend ext4_block_truncate_page()
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
2026-03-25 7:28 ` [PATCH v2 01/10] ext4: add did_zero output parameter to ext4_block_zero_page_range() Zhang Yi
2026-03-25 7:28 ` [PATCH v2 02/10] ext4: ext4_block_truncate_page() returns zeroed length on success Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 04/10] ext4: factor out journalled block zeroing range Zhang Yi
` (7 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Rename ext4_block_truncate_page() to ext4_block_zero_eof() and extend
its signature to accept an explicit 'end' offset instead of calculating
the block boundary. This helper function can now be used in all cases
that require zeroing of the partial EOF block, including the append
buffered write paths in ext4_*_write_end().
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/ext4.h | 2 ++
fs/ext4/extents.c | 4 ++--
fs/ext4/inode.c | 43 +++++++++++++++++++++++--------------------
3 files changed, 27 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 293f698b7042..c62459ef9796 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3099,6 +3099,8 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks);
extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
+extern int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
+ loff_t from, loff_t end);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ae3804f36535..a265070c1b79 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4625,8 +4625,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
inode_get_ctime(inode));
if (epos > old_size) {
pagecache_isize_extended(inode, old_size, epos);
- ext4_zero_partial_blocks(handle, inode,
- old_size, epos - old_size);
+ ext4_block_zero_eof(handle, inode, old_size,
+ epos);
}
}
ret2 = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f21be26b4642..a7635bbac1a0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1458,7 +1458,7 @@ static int ext4_write_end(const struct kiocb *iocb,
if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
- ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ ext4_block_zero_eof(handle, inode, old_size, pos);
}
/*
* Don't mark the inode dirty under folio lock. First, it unnecessarily
@@ -1576,7 +1576,7 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
- ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ ext4_block_zero_eof(handle, inode, old_size, pos);
}
if (size_changed) {
@@ -3252,7 +3252,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
if (IS_ERR(handle))
return PTR_ERR(handle);
if (zero_len)
- ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+ ext4_block_zero_eof(handle, inode, old_size, pos);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -4132,29 +4132,32 @@ static int ext4_block_zero_page_range(handle_t *handle,
}
/*
- * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
- * up to the end of the block which corresponds to `from'.
- * This required during truncate. We need to physically zero the tail end
- * of that block so it doesn't yield old data if the file is later grown.
- * Return the zeroed length on success.
+ * Zero out a mapping from file offset 'from' up to the end of the block
+ * which corresponds to 'from' or to the given 'end' inside this block.
+ * This required during truncate up and performing append writes. We need
+ * to physically zero the tail end of that block so it doesn't yield old
+ * data if the file is grown. Return the zeroed length on success.
*/
-static int ext4_block_truncate_page(handle_t *handle,
- struct address_space *mapping, loff_t from)
+int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
+ loff_t from, loff_t end)
{
- unsigned length;
- unsigned blocksize;
- struct inode *inode = mapping->host;
+ unsigned int blocksize = i_blocksize(inode);
+ unsigned int offset;
+ loff_t length = end - from;
bool did_zero = false;
int err;
+ offset = from & (blocksize - 1);
+ if (!offset || from >= end)
+ return 0;
/* If we are processing an encrypted inode during orphan list handling */
if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
return 0;
- blocksize = i_blocksize(inode);
- length = blocksize - (from & (blocksize - 1));
+ if (length > blocksize - offset)
+ length = blocksize - offset;
- err = ext4_block_zero_page_range(handle, mapping, from, length,
+ err = ext4_block_zero_page_range(handle, inode->i_mapping, from, length,
&did_zero);
if (err)
return err;
@@ -4508,7 +4511,6 @@ int ext4_truncate(struct inode *inode)
unsigned int credits;
int err = 0, err2;
handle_t *handle;
- struct address_space *mapping = inode->i_mapping;
/*
* There is a possibility that we're either freeing the inode
@@ -4551,8 +4553,9 @@ int ext4_truncate(struct inode *inode)
goto out_trace;
}
+ /* Zero to the end of the block containing i_size */
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
- ext4_block_truncate_page(handle, mapping, inode->i_size);
+ ext4_block_zero_eof(handle, inode, inode->i_size, LLONG_MAX);
/*
* We add the inode to the orphan list, so that if this
@@ -5929,8 +5932,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
if (oldsize & (inode->i_sb->s_blocksize - 1))
- ext4_block_truncate_page(handle,
- inode->i_mapping, oldsize);
+ ext4_block_zero_eof(handle, inode,
+ oldsize, LLONG_MAX);
}
if (shrink)
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 04/10] ext4: factor out journalled block zeroing range
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
` (2 preceding siblings ...)
2026-03-25 7:28 ` [PATCH v2 03/10] ext4: rename and extend ext4_block_truncate_page() Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 05/10] ext4: rename ext4_block_zero_page_range() to ext4_block_zero_range() Zhang Yi
` (6 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Refactor __ext4_block_zero_page_range() by separating the block zeroing
operations for ordered data mode and journal data mode into two distinct
functions:
- ext4_block_do_zero_range(): handles non-journal data mode with
ordered data support
- ext4_block_journalled_zero_range(): handles journal data mode
Also extract a common helper, ext4_load_tail_bh(), to handle buffer head
and folio retrieval, along with the associated error handling. This
prepares for converting the partial block zero range to the iomap
infrastructure.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/inode.c | 98 ++++++++++++++++++++++++++++++++++---------------
1 file changed, 69 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a7635bbac1a0..3ccba708895d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4002,13 +4002,11 @@ void ext4_set_aops(struct inode *inode)
* ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
* racing writeback can come later and flush the stale pagecache to disk.
*/
-static int __ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length,
- bool *did_zero)
+static struct buffer_head *ext4_load_tail_bh(struct inode *inode, loff_t from)
{
unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
- struct inode *inode = mapping->host;
+ struct address_space *mapping = inode->i_mapping;
struct buffer_head *bh;
struct folio *folio;
int err = 0;
@@ -4017,7 +4015,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
mapping_gfp_constraint(mapping, ~__GFP_FS));
if (IS_ERR(folio))
- return PTR_ERR(folio);
+ return ERR_CAST(folio);
blocksize = inode->i_sb->s_blocksize;
@@ -4069,33 +4067,73 @@ static int __ext4_block_zero_page_range(handle_t *handle,
}
}
}
- if (ext4_should_journal_data(inode)) {
- BUFFER_TRACE(bh, "get write access");
- err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
- EXT4_JTR_NONE);
- if (err)
- goto unlock;
- }
- folio_zero_range(folio, offset, length);
+ return bh;
+
+unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return err ? ERR_PTR(err) : NULL;
+}
+
+static int ext4_block_do_zero_range(handle_t *handle, struct inode *inode,
+ loff_t from, loff_t length, bool *did_zero)
+{
+ struct buffer_head *bh;
+ struct folio *folio;
+ int err = 0;
+
+ bh = ext4_load_tail_bh(inode, from);
+ if (IS_ERR_OR_NULL(bh))
+ return PTR_ERR_OR_ZERO(bh);
+
+ folio = bh->b_folio;
+ folio_zero_range(folio, offset_in_folio(folio, from), length);
BUFFER_TRACE(bh, "zeroed end of block");
- if (ext4_should_journal_data(inode)) {
- err = ext4_dirty_journalled_data(handle, bh);
- } else {
- mark_buffer_dirty(bh);
- /*
- * Only the written block requires ordered data to prevent
- * exposing stale data.
- */
- if (!buffer_unwritten(bh) && !buffer_delay(bh) &&
- ext4_should_order_data(inode))
- err = ext4_jbd2_inode_add_write(handle, inode, from,
- length);
- }
+ mark_buffer_dirty(bh);
+ /*
+ * Only the written block requires ordered data to prevent exposing
+ * stale data.
+ */
+ if (ext4_should_order_data(inode) &&
+ !buffer_unwritten(bh) && !buffer_delay(bh))
+ err = ext4_jbd2_inode_add_write(handle, inode, from, length);
if (!err && did_zero)
*did_zero = true;
-unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ return err;
+}
+
+static int ext4_block_journalled_zero_range(handle_t *handle,
+ struct inode *inode, loff_t from, loff_t length, bool *did_zero)
+{
+ struct buffer_head *bh;
+ struct folio *folio;
+ int err;
+
+ bh = ext4_load_tail_bh(inode, from);
+ if (IS_ERR_OR_NULL(bh))
+ return PTR_ERR_OR_ZERO(bh);
+ folio = bh->b_folio;
+
+ BUFFER_TRACE(bh, "get write access");
+ err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
+ EXT4_JTR_NONE);
+ if (err)
+ goto out;
+
+ folio_zero_range(folio, offset_in_folio(folio, from), length);
+ BUFFER_TRACE(bh, "zeroed end of block");
+
+ err = ext4_dirty_journalled_data(handle, bh);
+ if (err)
+ goto out;
+
+ if (did_zero)
+ *did_zero = true;
+out:
folio_unlock(folio);
folio_put(folio);
return err;
@@ -4126,9 +4164,11 @@ static int ext4_block_zero_page_range(handle_t *handle,
if (IS_DAX(inode)) {
return dax_zero_range(inode, from, length, did_zero,
&ext4_iomap_ops);
+ } else if (ext4_should_journal_data(inode)) {
+ return ext4_block_journalled_zero_range(handle, inode, from,
+ length, did_zero);
}
- return __ext4_block_zero_page_range(handle, mapping, from, length,
- did_zero);
+ return ext4_block_do_zero_range(handle, inode, from, length, did_zero);
}
/*
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 05/10] ext4: rename ext4_block_zero_page_range() to ext4_block_zero_range()
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
` (3 preceding siblings ...)
2026-03-25 7:28 ` [PATCH v2 04/10] ext4: factor out journalled block zeroing range Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 06/10] ext4: move ordered data handling out of ext4_block_do_zero_range() Zhang Yi
` (5 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Rename ext4_block_zero_page_range() to ext4_block_zero_range() since the
"page" naming is no longer appropriate in the current context. Also change
its signature to take an inode pointer instead of an address_space. This
aligns with its callers, ext4_block_zero_eof() and
ext4_zero_partial_blocks().
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/inode.c | 24 ++++++++++--------------
1 file changed, 10 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3ccba708895d..3c3c07fd00ba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4146,11 +4146,9 @@ static int ext4_block_journalled_zero_range(handle_t *handle,
* the end of the block it will be shortened to end of the block
* that corresponds to 'from'
*/
-static int ext4_block_zero_page_range(handle_t *handle,
- struct address_space *mapping, loff_t from, loff_t length,
- bool *did_zero)
+static int ext4_block_zero_range(handle_t *handle, struct inode *inode,
+ loff_t from, loff_t length, bool *did_zero)
{
- struct inode *inode = mapping->host;
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned int max = blocksize - (from & (blocksize - 1));
@@ -4197,8 +4195,7 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
if (length > blocksize - offset)
length = blocksize - offset;
- err = ext4_block_zero_page_range(handle, inode->i_mapping, from, length,
- &did_zero);
+ err = ext4_block_zero_range(handle, inode, from, length, &did_zero);
if (err)
return err;
@@ -4209,7 +4206,6 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t length)
{
struct super_block *sb = inode->i_sb;
- struct address_space *mapping = inode->i_mapping;
unsigned partial_start, partial_end;
ext4_fsblk_t start, end;
loff_t byte_end = (lstart + length - 1);
@@ -4224,22 +4220,22 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
/* Handle partial zero within the single block */
if (start == end &&
(partial_start || (partial_end != sb->s_blocksize - 1))) {
- err = ext4_block_zero_page_range(handle, mapping,
- lstart, length, NULL);
+ err = ext4_block_zero_range(handle, inode, lstart,
+ length, NULL);
return err;
}
/* Handle partial zero out on the start of the range */
if (partial_start) {
- err = ext4_block_zero_page_range(handle, mapping, lstart,
- sb->s_blocksize, NULL);
+ err = ext4_block_zero_range(handle, inode, lstart,
+ sb->s_blocksize, NULL);
if (err)
return err;
}
/* Handle partial zero out on the end of the range */
if (partial_end != sb->s_blocksize - 1)
- err = ext4_block_zero_page_range(handle, mapping,
- byte_end - partial_end,
- partial_end + 1, NULL);
+ err = ext4_block_zero_range(handle, inode,
+ byte_end - partial_end,
+ partial_end + 1, NULL);
return err;
}
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 06/10] ext4: move ordered data handling out of ext4_block_do_zero_range()
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
` (4 preceding siblings ...)
2026-03-25 7:28 ` [PATCH v2 05/10] ext4: rename ext4_block_zero_page_range() to ext4_block_zero_range() Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 07/10] ext4: remove handle parameters from zero partial block functions Zhang Yi
` (4 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Remove the handle parameter from ext4_block_do_zero_range() and move the
ordered data handling to ext4_block_zero_eof().
This is necessary for truncate up and append writes across a range
extending beyond EOF. The ordered data must be committed before updating
i_disksize to prevent exposing stale on-disk data from concurrent
post-EOF mmap writes during previous folio writeback or in case of
system crash during append writes.
This is unnecessary for partial block hole punching because the entire
punch operation does not provide atomicity guarantees and can already
expose intermediate results in case of crash.
Hole punching can only ever expose data that was there before the punch,
but missed zeroing during append / truncate could expose data that was
not visible in the file before the operation.
Since ordered data handling is no longer performed inside
ext4_zero_partial_blocks(), ext4_punch_hole() no longer needs to attach
jinode.
This prepares for the conversion to the iomap infrastructure, which
does not use ordered data mode while zeroing post-EOF partial blocks.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/inode.c | 58 ++++++++++++++++++++++++-------------------------
1 file changed, 29 insertions(+), 29 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3c3c07fd00ba..84dd3140793d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4075,12 +4075,12 @@ static struct buffer_head *ext4_load_tail_bh(struct inode *inode, loff_t from)
return err ? ERR_PTR(err) : NULL;
}
-static int ext4_block_do_zero_range(handle_t *handle, struct inode *inode,
- loff_t from, loff_t length, bool *did_zero)
+static int ext4_block_do_zero_range(struct inode *inode, loff_t from,
+ loff_t length, bool *did_zero,
+ bool *zero_written)
{
struct buffer_head *bh;
struct folio *folio;
- int err = 0;
bh = ext4_load_tail_bh(inode, from);
if (IS_ERR_OR_NULL(bh))
@@ -4091,19 +4091,14 @@ static int ext4_block_do_zero_range(handle_t *handle, struct inode *inode,
BUFFER_TRACE(bh, "zeroed end of block");
mark_buffer_dirty(bh);
- /*
- * Only the written block requires ordered data to prevent exposing
- * stale data.
- */
- if (ext4_should_order_data(inode) &&
- !buffer_unwritten(bh) && !buffer_delay(bh))
- err = ext4_jbd2_inode_add_write(handle, inode, from, length);
- if (!err && did_zero)
+ if (did_zero)
*did_zero = true;
+ if (zero_written && !buffer_unwritten(bh) && !buffer_delay(bh))
+ *zero_written = true;
folio_unlock(folio);
folio_put(folio);
- return err;
+ return 0;
}
static int ext4_block_journalled_zero_range(handle_t *handle,
@@ -4147,7 +4142,8 @@ static int ext4_block_journalled_zero_range(handle_t *handle,
* that corresponds to 'from'
*/
static int ext4_block_zero_range(handle_t *handle, struct inode *inode,
- loff_t from, loff_t length, bool *did_zero)
+ loff_t from, loff_t length, bool *did_zero,
+ bool *zero_written)
{
unsigned blocksize = inode->i_sb->s_blocksize;
unsigned int max = blocksize - (from & (blocksize - 1));
@@ -4166,7 +4162,8 @@ static int ext4_block_zero_range(handle_t *handle, struct inode *inode,
return ext4_block_journalled_zero_range(handle, inode, from,
length, did_zero);
}
- return ext4_block_do_zero_range(handle, inode, from, length, did_zero);
+ return ext4_block_do_zero_range(inode, from, length, did_zero,
+ zero_written);
}
/*
@@ -4183,6 +4180,7 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
unsigned int offset;
loff_t length = end - from;
bool did_zero = false;
+ bool zero_written = false;
int err;
offset = from & (blocksize - 1);
@@ -4195,9 +4193,22 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
if (length > blocksize - offset)
length = blocksize - offset;
- err = ext4_block_zero_range(handle, inode, from, length, &did_zero);
+ err = ext4_block_zero_range(handle, inode, from, length,
+ &did_zero, &zero_written);
if (err)
return err;
+ /*
+ * It's necessary to order zeroed data before update i_disksize when
+ * truncating up or performing an append write, because there might be
+ * exposing stale on-disk data which may caused by concurrent post-EOF
+ * mmap write during folio writeback.
+ */
+ if (ext4_should_order_data(inode) &&
+ did_zero && zero_written && !IS_DAX(inode)) {
+ err = ext4_jbd2_inode_add_write(handle, inode, from, length);
+ if (err)
+ return err;
+ }
return did_zero ? length : 0;
}
@@ -4221,13 +4232,13 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
if (start == end &&
(partial_start || (partial_end != sb->s_blocksize - 1))) {
err = ext4_block_zero_range(handle, inode, lstart,
- length, NULL);
+ length, NULL, NULL);
return err;
}
/* Handle partial zero out on the start of the range */
if (partial_start) {
err = ext4_block_zero_range(handle, inode, lstart,
- sb->s_blocksize, NULL);
+ sb->s_blocksize, NULL, NULL);
if (err)
return err;
}
@@ -4235,7 +4246,7 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
if (partial_end != sb->s_blocksize - 1)
err = ext4_block_zero_range(handle, inode,
byte_end - partial_end,
- partial_end + 1, NULL);
+ partial_end + 1, NULL, NULL);
return err;
}
@@ -4410,17 +4421,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
end = max_end;
length = end - offset;
- /*
- * Attach jinode to inode for jbd2 if we do any zeroing of partial
- * block.
- */
- if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
- ret = ext4_inode_attach_jinode(inode);
- if (ret < 0)
- return ret;
- }
-
-
ret = ext4_update_disksize_before_punch(inode, offset, length);
if (ret)
return ret;
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 07/10] ext4: remove handle parameters from zero partial block functions
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
` (5 preceding siblings ...)
2026-03-25 7:28 ` [PATCH v2 06/10] ext4: move ordered data handling out of ext4_block_do_zero_range() Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 08/10] ext4: pass allocate range as loff_t to ext4_alloc_file_blocks() Zhang Yi
` (3 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Only journal data mode requires an active journal handle when zeroing
partial blocks. Stop passing handle_t *handle to
ext4_zero_partial_blocks() and related functions, and make
ext4_block_journalled_zero_range() start a handle independently.
This change has no practical impact now because all callers invoke these
functions within the context of an active handle. It prepares for moving
ext4_block_zero_eof() out of an active handle in the next patch, which
is a prerequisite for converting block zero range operations to iomap
infrastructure.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/ext4.h | 7 +++---
fs/ext4/extents.c | 5 ++--
fs/ext4/inode.c | 62 ++++++++++++++++++++++++++++-------------------
3 files changed, 42 insertions(+), 32 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index c62459ef9796..20545a9523e9 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3099,10 +3099,9 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks);
extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
-extern int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
- loff_t from, loff_t end);
-extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
- loff_t lstart, loff_t lend);
+extern int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end);
+extern int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart,
+ loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a265070c1b79..753a0f3418a4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4625,8 +4625,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
inode_get_ctime(inode));
if (epos > old_size) {
pagecache_isize_extended(inode, old_size, epos);
- ext4_block_zero_eof(handle, inode, old_size,
- epos);
+ ext4_block_zero_eof(inode, old_size, epos);
}
}
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -4744,7 +4743,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
/* Zero out partial block at the edges of the range */
- ret = ext4_zero_partial_blocks(handle, inode, offset, len);
+ ret = ext4_zero_partial_blocks(inode, offset, len);
if (ret)
goto out_handle;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 84dd3140793d..f68b2afdcfcb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1458,7 +1458,7 @@ static int ext4_write_end(const struct kiocb *iocb,
if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
- ext4_block_zero_eof(handle, inode, old_size, pos);
+ ext4_block_zero_eof(inode, old_size, pos);
}
/*
* Don't mark the inode dirty under folio lock. First, it unnecessarily
@@ -1576,7 +1576,7 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
- ext4_block_zero_eof(handle, inode, old_size, pos);
+ ext4_block_zero_eof(inode, old_size, pos);
}
if (size_changed) {
@@ -3252,7 +3252,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
if (IS_ERR(handle))
return PTR_ERR(handle);
if (zero_len)
- ext4_block_zero_eof(handle, inode, old_size, pos);
+ ext4_block_zero_eof(inode, old_size, pos);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
@@ -4101,16 +4101,23 @@ static int ext4_block_do_zero_range(struct inode *inode, loff_t from,
return 0;
}
-static int ext4_block_journalled_zero_range(handle_t *handle,
- struct inode *inode, loff_t from, loff_t length, bool *did_zero)
+static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from,
+ loff_t length, bool *did_zero)
{
struct buffer_head *bh;
struct folio *folio;
+ handle_t *handle;
int err;
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
bh = ext4_load_tail_bh(inode, from);
- if (IS_ERR_OR_NULL(bh))
- return PTR_ERR_OR_ZERO(bh);
+ if (IS_ERR_OR_NULL(bh)) {
+ err = PTR_ERR_OR_ZERO(bh);
+ goto out_handle;
+ }
folio = bh->b_folio;
BUFFER_TRACE(bh, "get write access");
@@ -4131,6 +4138,8 @@ static int ext4_block_journalled_zero_range(handle_t *handle,
out:
folio_unlock(folio);
folio_put(folio);
+out_handle:
+ ext4_journal_stop(handle);
return err;
}
@@ -4141,7 +4150,7 @@ static int ext4_block_journalled_zero_range(handle_t *handle,
* the end of the block it will be shortened to end of the block
* that corresponds to 'from'
*/
-static int ext4_block_zero_range(handle_t *handle, struct inode *inode,
+static int ext4_block_zero_range(struct inode *inode,
loff_t from, loff_t length, bool *did_zero,
bool *zero_written)
{
@@ -4159,8 +4168,8 @@ static int ext4_block_zero_range(handle_t *handle, struct inode *inode,
return dax_zero_range(inode, from, length, did_zero,
&ext4_iomap_ops);
} else if (ext4_should_journal_data(inode)) {
- return ext4_block_journalled_zero_range(handle, inode, from,
- length, did_zero);
+ return ext4_block_journalled_zero_range(inode, from, length,
+ did_zero);
}
return ext4_block_do_zero_range(inode, from, length, did_zero,
zero_written);
@@ -4173,8 +4182,7 @@ static int ext4_block_zero_range(handle_t *handle, struct inode *inode,
* to physically zero the tail end of that block so it doesn't yield old
* data if the file is grown. Return the zeroed length on success.
*/
-int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
- loff_t from, loff_t end)
+int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int offset;
@@ -4193,7 +4201,7 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
if (length > blocksize - offset)
length = blocksize - offset;
- err = ext4_block_zero_range(handle, inode, from, length,
+ err = ext4_block_zero_range(inode, from, length,
&did_zero, &zero_written);
if (err)
return err;
@@ -4205,7 +4213,14 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
*/
if (ext4_should_order_data(inode) &&
did_zero && zero_written && !IS_DAX(inode)) {
+ handle_t *handle;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
err = ext4_jbd2_inode_add_write(handle, inode, from, length);
+ ext4_journal_stop(handle);
if (err)
return err;
}
@@ -4213,8 +4228,7 @@ int ext4_block_zero_eof(handle_t *handle, struct inode *inode,
return did_zero ? length : 0;
}
-int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
- loff_t lstart, loff_t length)
+int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length)
{
struct super_block *sb = inode->i_sb;
unsigned partial_start, partial_end;
@@ -4231,21 +4245,19 @@ int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
/* Handle partial zero within the single block */
if (start == end &&
(partial_start || (partial_end != sb->s_blocksize - 1))) {
- err = ext4_block_zero_range(handle, inode, lstart,
- length, NULL, NULL);
+ err = ext4_block_zero_range(inode, lstart, length, NULL, NULL);
return err;
}
/* Handle partial zero out on the start of the range */
if (partial_start) {
- err = ext4_block_zero_range(handle, inode, lstart,
- sb->s_blocksize, NULL, NULL);
+ err = ext4_block_zero_range(inode, lstart, sb->s_blocksize,
+ NULL, NULL);
if (err)
return err;
}
/* Handle partial zero out on the end of the range */
if (partial_end != sb->s_blocksize - 1)
- err = ext4_block_zero_range(handle, inode,
- byte_end - partial_end,
+ err = ext4_block_zero_range(inode, byte_end - partial_end,
partial_end + 1, NULL, NULL);
return err;
}
@@ -4441,7 +4453,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
return ret;
}
- ret = ext4_zero_partial_blocks(handle, inode, offset, length);
+ ret = ext4_zero_partial_blocks(inode, offset, length);
if (ret)
goto out_handle;
@@ -4591,7 +4603,7 @@ int ext4_truncate(struct inode *inode)
/* Zero to the end of the block containing i_size */
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
- ext4_block_zero_eof(handle, inode, inode->i_size, LLONG_MAX);
+ ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX);
/*
* We add the inode to the orphan list, so that if this
@@ -5968,8 +5980,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
if (oldsize & (inode->i_sb->s_blocksize - 1))
- ext4_block_zero_eof(handle, inode,
- oldsize, LLONG_MAX);
+ ext4_block_zero_eof(inode, oldsize,
+ LLONG_MAX);
}
if (shrink)
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 08/10] ext4: pass allocate range as loff_t to ext4_alloc_file_blocks()
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
` (6 preceding siblings ...)
2026-03-25 7:28 ` [PATCH v2 07/10] ext4: remove handle parameters from zero partial block functions Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 09/10] ext4: move zero partial block range functions out of active handle Zhang Yi
` (2 subsequent siblings)
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Change ext4_alloc_file_blocks() to accept offset and len in byte
granularity instead of block granularity. This allows callers to pass
byte offsets and lengths directly, and prepares for moving the
ext4_block_zero_eof() call out of the while (len) loop for unaligned
append writes, where it only needs to be invoked once before doing block
allocation.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/extents.c | 53 ++++++++++++++++++++---------------------------
1 file changed, 22 insertions(+), 31 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 753a0f3418a4..57a686b600d9 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4542,15 +4542,15 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
return err;
}
-static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
- ext4_lblk_t len, loff_t new_size,
- int flags)
+static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
+ loff_t new_size, int flags)
{
struct inode *inode = file_inode(file);
handle_t *handle;
int ret = 0, ret2 = 0, ret3 = 0;
int retries = 0;
int depth = 0;
+ ext4_lblk_t len_lblk;
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos, old_size = i_size_read(inode);
@@ -4558,14 +4558,14 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
bool alloc_zero = false;
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
- map.m_lblk = offset;
- map.m_len = len;
+ map.m_lblk = offset >> blkbits;
+ map.m_len = len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
/*
* Don't normalize the request if it can fit in one extent so
* that it doesn't get unnecessarily split into multiple
* extents.
*/
- if (len <= EXT_UNWRITTEN_MAX_LEN)
+ if (len_lblk <= EXT_UNWRITTEN_MAX_LEN)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
/*
@@ -4582,16 +4582,16 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
/*
* credits to insert 1 extent into extent tree
*/
- credits = ext4_chunk_trans_blocks(inode, len);
+ credits = ext4_chunk_trans_blocks(inode, len_lblk);
depth = ext_depth(inode);
retry:
- while (len) {
+ while (len_lblk) {
/*
* Recalculate credits when extent tree depth changes.
*/
if (depth != ext_depth(inode)) {
- credits = ext4_chunk_trans_blocks(inode, len);
+ credits = ext4_chunk_trans_blocks(inode, len_lblk);
depth = ext_depth(inode);
}
@@ -4648,7 +4648,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
}
map.m_lblk += ret;
- map.m_len = len = len - ret;
+ map.m_len = len_lblk = len_lblk - ret;
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -4665,11 +4665,9 @@ static long ext4_zero_range(struct file *file, loff_t offset,
{
struct inode *inode = file_inode(file);
handle_t *handle = NULL;
- loff_t new_size = 0;
+ loff_t align_start, align_end, new_size = 0;
loff_t end = offset + len;
- ext4_lblk_t start_lblk, end_lblk;
unsigned int blocksize = i_blocksize(inode);
- unsigned int blkbits = inode->i_blkbits;
int ret, flags, credits;
trace_ext4_zero_range(inode, offset, len, mode);
@@ -4690,11 +4688,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT;
/* Preallocate the range including the unaligned edges */
if (!IS_ALIGNED(offset | end, blocksize)) {
- ext4_lblk_t alloc_lblk = offset >> blkbits;
- ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits);
-
- ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk,
- new_size, flags);
+ ret = ext4_alloc_file_blocks(file, offset, len, new_size,
+ flags);
if (ret)
return ret;
}
@@ -4709,18 +4704,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
return ret;
/* Zero range excluding the unaligned edges */
- start_lblk = EXT4_B_TO_LBLK(inode, offset);
- end_lblk = end >> blkbits;
- if (end_lblk > start_lblk) {
- ext4_lblk_t zero_blks = end_lblk - start_lblk;
-
+ align_start = round_up(offset, blocksize);
+ align_end = round_down(end, blocksize);
+ if (align_end > align_start) {
if (mode & FALLOC_FL_WRITE_ZEROES)
flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
else
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);
- ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
- new_size, flags);
+ ret = ext4_alloc_file_blocks(file, align_start,
+ align_end - align_start, new_size,
+ flags);
if (ret)
return ret;
}
@@ -4768,15 +4762,11 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
struct inode *inode = file_inode(file);
loff_t end = offset + len;
loff_t new_size = 0;
- ext4_lblk_t start_lblk, len_lblk;
int ret;
trace_ext4_fallocate_enter(inode, offset, len, mode);
WARN_ON_ONCE(!inode_is_locked(inode));
- start_lblk = offset >> inode->i_blkbits;
- len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits);
-
/* We only support preallocation for extent-based files only. */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
ret = -EOPNOTSUPP;
@@ -4791,7 +4781,7 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
goto out;
}
- ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size,
+ ret = ext4_alloc_file_blocks(file, offset, len, new_size,
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
if (ret)
goto out;
@@ -4801,7 +4791,8 @@ static long ext4_do_fallocate(struct file *file, loff_t offset,
EXT4_I(inode)->i_sync_tid);
}
out:
- trace_ext4_fallocate_exit(inode, offset, len_lblk, ret);
+ trace_ext4_fallocate_exit(inode, offset,
+ EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits), ret);
return ret;
}
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 09/10] ext4: move zero partial block range functions out of active handle
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
` (7 preceding siblings ...)
2026-03-25 7:28 ` [PATCH v2 08/10] ext4: pass allocate range as loff_t to ext4_alloc_file_blocks() Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-25 7:28 ` [PATCH v2 10/10] ext4: zero post-EOF partial block before appending write Zhang Yi
2026-03-26 8:53 ` [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
Move ext4_block_zero_eof() and ext4_zero_partial_blocks() calls out of
the active handle context, making them independent operations. This is
safe because data is still updated before metadata in both data=ordered
and data=journal modes: we still zero and order the data before
modifying the metadata.
This change is required for iomap infrastructure conversion because the
iomap buffered I/O path does not use the same journal infrastructure for
partial block zeroing. The lock ordering used by iomap is
"folio lock -> transaction start", which is the opposite of the ordering
used by the current path. Therefore, zeroing partial blocks cannot be
performed under the active handle.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/extents.c | 29 ++++++++++++-----------------
fs/ext4/inode.c | 36 ++++++++++++++++++------------------
2 files changed, 30 insertions(+), 35 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 57a686b600d9..81b9d5b4ad71 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4585,6 +4585,10 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
credits = ext4_chunk_trans_blocks(inode, len_lblk);
depth = ext_depth(inode);
+ /* Zero to the end of the block containing i_size */
+ if (new_size && offset > old_size)
+ ext4_block_zero_eof(inode, old_size, LLONG_MAX);
+
retry:
while (len_lblk) {
/*
@@ -4623,10 +4627,8 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
if (ext4_update_inode_size(inode, epos) & 0x1)
inode_set_mtime_to_ts(inode,
inode_get_ctime(inode));
- if (epos > old_size) {
+ if (epos > old_size)
pagecache_isize_extended(inode, old_size, epos);
- ext4_block_zero_eof(inode, old_size, epos);
- }
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
@@ -4668,7 +4670,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t align_start, align_end, new_size = 0;
loff_t end = offset + len;
unsigned int blocksize = i_blocksize(inode);
- int ret, flags, credits;
+ int ret, flags;
trace_ext4_zero_range(inode, offset, len, mode);
WARN_ON_ONCE(!inode_is_locked(inode));
@@ -4722,25 +4724,18 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (IS_ALIGNED(offset | end, blocksize))
return ret;
- /*
- * In worst case we have to writeout two nonadjacent unwritten
- * blocks and update the inode
- */
- credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
- if (ext4_should_journal_data(inode))
- credits += 2;
- handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
+ /* Zero out partial block at the edges of the range */
+ ret = ext4_zero_partial_blocks(inode, offset, len);
+ if (ret)
+ return ret;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
return ret;
}
- /* Zero out partial block at the edges of the range */
- ret = ext4_zero_partial_blocks(inode, offset, len);
- if (ret)
- goto out_handle;
-
if (new_size)
ext4_update_inode_size(inode, new_size);
ret = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f68b2afdcfcb..530197a53208 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4442,8 +4442,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (ret)
return ret;
+ ret = ext4_zero_partial_blocks(inode, offset, length);
+ if (ret)
+ return ret;
+
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- credits = ext4_chunk_trans_extent(inode, 2);
+ credits = ext4_chunk_trans_extent(inode, 0);
else
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
@@ -4453,10 +4457,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
return ret;
}
- ret = ext4_zero_partial_blocks(inode, offset, length);
- if (ret)
- goto out_handle;
-
/* If there are blocks to remove, do it */
start_lblk = EXT4_B_TO_LBLK(inode, offset);
end_lblk = end >> inode->i_blkbits;
@@ -4588,6 +4588,9 @@ int ext4_truncate(struct inode *inode)
err = ext4_inode_attach_jinode(inode);
if (err)
goto out_trace;
+
+ /* Zero to the end of the block containing i_size */
+ ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX);
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4601,10 +4604,6 @@ int ext4_truncate(struct inode *inode)
goto out_trace;
}
- /* Zero to the end of the block containing i_size */
- if (inode->i_size & (inode->i_sb->s_blocksize - 1))
- ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX);
-
/*
* We add the inode to the orphan list, so that if this
* truncate spans multiple transactions, and we crash, we will
@@ -5962,15 +5961,6 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto out_mmap_sem;
}
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
- if (IS_ERR(handle)) {
- error = PTR_ERR(handle);
- goto out_mmap_sem;
- }
- if (ext4_handle_valid(handle) && shrink) {
- error = ext4_orphan_add(handle, inode);
- orphan = 1;
- }
/*
* Update c/mtime and tail zero the EOF folio on
* truncate up. ext4_truncate() handles the shrink case
@@ -5984,6 +5974,16 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
LLONG_MAX);
}
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
+ if (IS_ERR(handle)) {
+ error = PTR_ERR(handle);
+ goto out_mmap_sem;
+ }
+ if (ext4_handle_valid(handle) && shrink) {
+ error = ext4_orphan_add(handle, inode);
+ orphan = 1;
+ }
+
if (shrink)
ext4_fc_track_range(handle, inode,
(attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* [PATCH v2 10/10] ext4: zero post-EOF partial block before appending write
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
` (8 preceding siblings ...)
2026-03-25 7:28 ` [PATCH v2 09/10] ext4: move zero partial block range functions out of active handle Zhang Yi
@ 2026-03-25 7:28 ` Zhang Yi
2026-03-26 8:53 ` [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-25 7:28 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yi.zhang, yizhang089, yangerkun,
yukuai
From: Zhang Yi <yi.zhang@huawei.com>
For an appending write beyond EOF, ext4_block_zero_eof() is called
within ext4_*_write_end() to zero out the partial block beyond EOF.
This prevents exposing stale data that might have been written through
mmap.
However, supporting only the regular buffered write path is
insufficient. It is also necessary to support the DAX path as well as
the upcoming iomap buffered write path. Therefore, move this operation
to ext4_write_checks().
In addition, this may introduce a race window in which a post-EOF
buffered write can race with an mmap write after the old EOF block has
been zeroed. As a result, the data in this block written by the
buffered write and the data written by the mmap write may be mixed.
However, this is safe because users should not rely on the outcome of
such a race.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Jan Kara <jack@suse.cz>
---
fs/ext4/file.c | 14 ++++++++++++++
fs/ext4/inode.c | 21 +++++++--------------
2 files changed, 21 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f1dc5ce791a7..b2e44601ab6a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -271,6 +271,8 @@ static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ loff_t old_size = i_size_read(inode);
ssize_t ret, count;
count = ext4_generic_write_checks(iocb, from);
@@ -280,6 +282,18 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
ret = file_modified(iocb->ki_filp);
if (ret)
return ret;
+
+ /*
+ * If the position is beyond the EOF, it is necessary to zero out the
+ * partial block that lies beyond the existing EOF, as it may contain
+ * stale data written through mmap.
+ */
+ if (iocb->ki_pos > old_size && !ext4_verity_in_progress(inode)) {
+ ret = ext4_block_zero_eof(inode, old_size, iocb->ki_pos);
+ if (ret < 0)
+ return ret;
+ }
+
return count;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 530197a53208..1479ff56f7d0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1456,10 +1456,9 @@ static int ext4_write_end(const struct kiocb *iocb,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity) {
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
- ext4_block_zero_eof(inode, old_size, pos);
- }
+
/*
* Don't mark the inode dirty under folio lock. First, it unnecessarily
* makes the holding time of folio lock longer. Second, it forces lock
@@ -1574,10 +1573,8 @@ static int ext4_journalled_write_end(const struct kiocb *iocb,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity) {
+ if (old_size < pos && !verity)
pagecache_isize_extended(inode, old_size, pos);
- ext4_block_zero_eof(inode, old_size, pos);
- }
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -3196,7 +3193,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool disksize_changed = false;
- loff_t new_i_size, zero_len = 0;
+ loff_t new_i_size;
handle_t *handle;
if (unlikely(!folio_buffers(folio))) {
@@ -3240,19 +3237,15 @@ static int ext4_da_do_write_end(struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- if (pos > old_size) {
+ if (pos > old_size)
pagecache_isize_extended(inode, old_size, pos);
- zero_len = pos - old_size;
- }
- if (!disksize_changed && !zero_len)
+ if (!disksize_changed)
return copied;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
- if (zero_len)
- ext4_block_zero_eof(inode, old_size, pos);
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
--
2.52.0
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion
2026-03-25 7:28 [PATCH v2 00/10] ext4: refactor partial block zero-out for iomap conversion Zhang Yi
` (9 preceding siblings ...)
2026-03-25 7:28 ` [PATCH v2 10/10] ext4: zero post-EOF partial block before appending write Zhang Yi
@ 2026-03-26 8:53 ` Zhang Yi
10 siblings, 0 replies; 12+ messages in thread
From: Zhang Yi @ 2026-03-26 8:53 UTC (permalink / raw)
To: linux-ext4
Cc: linux-fsdevel, linux-kernel, tytso, adilger.kernel, jack, ojaswin,
ritesh.list, libaokun, yi.zhang, yizhang089, yangerkun, yukuai
On 3/25/2026 3:28 PM, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
Sashiko found some real issues in patches 09 and 10; I will send v3 to fix them.
https://sashiko.dev/#/patchset/20260325072850.3997161-1-yi.zhang%40huaweicloud.com
Best Regards,
Yi.
>
> Changes since v1:
> - In patch 04, rename ext4_block_get_zero_range() to
> ext4_load_tail_bh() and drop the unused 'length' parameter as Jan
> suggested.
> - In patch 06, modify the commit message to add another reason to drop
> data=ordered mode when zeroing partial blocks in ext4_punch_hole()
> and ext4_zero_range(), as Jan pointed out.
> - In patch 10, modify the commit message to explain the race condition
> between the buffered write and the mmap write that was pointed out by Jan.
> - Collect reviewed tags from Jan.
>
> v1: https://lore.kernel.org/linux-ext4/20260310014101.4140698-1-yi.zhang@huaweicloud.com/
>
> Original cover letter:
>
> This patch series is extracted from my iomap conversion v2 series[1]. It
> refactors the ext4 zero partial block code path in preparation for
> converting buffered I/O to the iomap infrastructure. The main changes
> are:
>
> [1] https://lore.kernel.org/linux-ext4/20260203062523.3869120-1-yi.zhang@huawei.com/
>
> 1. Introduce ext4_block_zero_eof(): Extend and rename
> ext4_block_truncate_page() to handle post-EOF partial block zeroing
> for both append writes and truncate operations.
> 2. Separate ordered data handling: Move data=ordered mode handling from
> __ext4_block_zero_page_range to ext4_block_zero_eof(). Only truncate
> and post-EOF append write/fallocate paths need ordered data mode,
> hole punching and zero range paths don't need ordered data handling.
> 3. Split journal mode handling: Extract
> ext4_block_journalled_zero_range() from
> __ext4_block_zero_page_range() for data=journal mode, leaving
> ext4_block_do_zero_range() for data=ordered/writeback modes.
> 4. Refactor ext4_alloc_file_blocks(): Change parameters to loff_t byte
> granularity to simplify callers and prepare for removing the zero call
> from the allocation loop for unaligned append writes.
> 5. Remove handle parameters: Stop passing handle_t * to zero functions.
> Make ext4_block_journalled_zero_range() start its own handle, and
> move zero operations outside active handles. This is required because
> iomap uses "folio lock -> transaction start" lock ordering, opposite
> to the current lock ordering.
> 6. Centralize zeroing in ext4_write_checks(): Move all post-EOF partial
> block zeroing to ext4_write_checks() so it applies to both regular
> buffered writes and the upcoming iomap path.
>
> Thanks
> Yi.
>
> Zhang Yi (10):
> ext4: add did_zero output parameter to ext4_block_zero_page_range()
> ext4: ext4_block_truncate_page() returns zeroed length on success
> ext4: rename and extend ext4_block_truncate_page()
> ext4: factor out journalled block zeroing range
> ext4: rename ext4_block_zero_page_range() to ext4_block_zero_range()
> ext4: move ordered data handling out of ext4_block_do_zero_range()
> ext4: remove handle parameters from zero partial block functions
> ext4: pass allocate range as loff_t to ext4_alloc_file_blocks()
> ext4: move zero partial block range functions out of active handle
> ext4: zero post-EOF partial block before appending write
>
> fs/ext4/ext4.h | 5 +-
> fs/ext4/extents.c | 83 +++++++--------
> fs/ext4/file.c | 14 +++
> fs/ext4/inode.c | 255 ++++++++++++++++++++++++++++------------------
> 4 files changed, 207 insertions(+), 150 deletions(-)
>
^ permalink raw reply [flat|nested] 12+ messages in thread