From: Zhang Yi <yi.zhang@huaweicloud.com>
To: linux-fsdevel@vger.kernel.org, linux-ext4@vger.kernel.org,
linux-block@vger.kernel.org, dm-devel@lists.linux.dev,
linux-nvme@lists.infradead.org, linux-scsi@vger.kernel.org
Cc: linux-xfs@vger.kernel.org, linux-kernel@vger.kernel.org,
hch@lst.de, tytso@mit.edu, djwong@kernel.org,
john.g.garry@oracle.com, bmarzins@redhat.com,
chaitanyak@nvidia.com, shinichiro.kawasaki@wdc.com,
brauner@kernel.org, martin.petersen@oracle.com,
yi.zhang@huawei.com, yi.zhang@huaweicloud.com,
chengzhihao1@huawei.com, yukuai3@huawei.com,
yangerkun@huawei.com
Subject: [PATCH v2 9/9] ext4: add FALLOC_FL_WRITE_ZEROES support
Date: Thu, 19 Jun 2025 19:18:06 +0800 [thread overview]
Message-ID: <20250619111806.3546162-10-yi.zhang@huaweicloud.com> (raw)
In-Reply-To: <20250619111806.3546162-1-yi.zhang@huaweicloud.com>
From: Zhang Yi <yi.zhang@huawei.com>
Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enable
the unmap write zeroes operation. This first allocates blocks as
unwritten, then issues a zero command outside of the running journal
handle, and finally converts them to a written state.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
---
fs/ext4/extents.c | 66 ++++++++++++++++++++++++++++++-------
include/trace/events/ext4.h | 3 +-
2 files changed, 57 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index b543a46fc809..b43aa82c1b39 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4501,6 +4501,8 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos, old_size = i_size_read(inode);
+ unsigned int blkbits = inode->i_blkbits;
+ bool alloc_zero = false;
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4513,6 +4515,17 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
if (len <= EXT_UNWRITTEN_MAX_LEN)
flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
+ /*
+ * Do the actual write zero during a running journal transaction
+ * costs a lot. First allocate an unwritten extent and then
+ * convert it to written after zeroing it out.
+ */
+ if (flags & EXT4_GET_BLOCKS_ZERO) {
+ flags &= ~EXT4_GET_BLOCKS_ZERO;
+ flags |= EXT4_GET_BLOCKS_UNWRIT_EXT;
+ alloc_zero = true;
+ }
+
/*
* credits to insert 1 extent into extent tree
*/
@@ -4549,9 +4562,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
* allow a full retry cycle for any remaining allocations
*/
retries = 0;
- map.m_lblk += ret;
- map.m_len = len = len - ret;
- epos = (loff_t)map.m_lblk << inode->i_blkbits;
+ epos = (loff_t)(map.m_lblk + ret) << blkbits;
inode_set_ctime_current(inode);
if (new_size) {
if (epos > new_size)
@@ -4571,6 +4582,21 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
ret2 = ret3 ? ret3 : ret2;
if (unlikely(ret2))
break;
+
+ if (alloc_zero &&
+ (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
+ ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+ map.m_len);
+ if (likely(!ret2))
+ ret2 = ext4_convert_unwritten_extents(NULL,
+ inode, (loff_t)map.m_lblk << blkbits,
+ (loff_t)map.m_len << blkbits);
+ if (ret2)
+ break;
+ }
+
+ map.m_lblk += ret;
+ map.m_len = len = len - ret;
}
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
@@ -4636,7 +4662,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (end_lblk > start_lblk) {
ext4_lblk_t zero_blks = end_lblk - start_lblk;
- flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE);
+ if (mode & FALLOC_FL_WRITE_ZEROES)
+ flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE;
+ else
+ flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
+ EXT4_EX_NOCACHE);
ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks,
new_size, flags);
if (ret)
@@ -4745,11 +4775,18 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (IS_ENCRYPTED(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
+ /*
+ * Don't allow writing zeroes if the underlying device does not
+ * enable the unmap write zeroes operation.
+ */
+ if ((mode & FALLOC_FL_WRITE_ZEROES) &&
+ !bdev_write_zeroes_unmap_sectors(inode->i_sb->s_bdev))
+ return -EOPNOTSUPP;
/* Return error if mode is not supported */
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
- FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
- FALLOC_FL_INSERT_RANGE))
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_COLLAPSE_RANGE |
+ FALLOC_FL_INSERT_RANGE | FALLOC_FL_WRITE_ZEROES))
return -EOPNOTSUPP;
inode_lock(inode);
@@ -4780,16 +4817,23 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
goto out_invalidate_lock;
- if (mode & FALLOC_FL_PUNCH_HOLE)
+ switch (mode & FALLOC_FL_MODE_MASK) {
+ case FALLOC_FL_PUNCH_HOLE:
ret = ext4_punch_hole(file, offset, len);
- else if (mode & FALLOC_FL_COLLAPSE_RANGE)
+ break;
+ case FALLOC_FL_COLLAPSE_RANGE:
ret = ext4_collapse_range(file, offset, len);
- else if (mode & FALLOC_FL_INSERT_RANGE)
+ break;
+ case FALLOC_FL_INSERT_RANGE:
ret = ext4_insert_range(file, offset, len);
- else if (mode & FALLOC_FL_ZERO_RANGE)
+ break;
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_WRITE_ZEROES:
ret = ext4_zero_range(file, offset, len, mode);
- else
+ break;
+ default:
ret = -EOPNOTSUPP;
+ }
out_invalidate_lock:
filemap_invalidate_unlock(mapping);
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index 156908641e68..6f9cf2811733 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -92,7 +92,8 @@ TRACE_DEFINE_ENUM(ES_REFERENCED_B);
{ FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \
{ FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \
{ FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \
- { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"})
+ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}, \
+ { FALLOC_FL_WRITE_ZEROES, "WRITE_ZEROES"})
TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
--
2.46.1
next prev parent reply other threads:[~2025-06-19 11:31 UTC|newest]
Thread overview: 20+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-06-19 11:17 [PATCH v2 0/9] fallocate: introduce FALLOC_FL_WRITE_ZEROES flag Zhang Yi
2025-06-19 11:17 ` [PATCH v2 1/9] block: introduce max_{hw|user}_wzeroes_unmap_sectors to queue limits Zhang Yi
2025-06-23 5:39 ` Christoph Hellwig
2025-08-21 12:55 ` John Garry
2025-08-23 4:37 ` Zhang Yi
2025-06-19 11:17 ` [PATCH v2 2/9] nvme: set max_hw_wzeroes_unmap_sectors if device supports DEAC bit Zhang Yi
2025-06-23 5:40 ` Christoph Hellwig
2025-06-19 11:18 ` [PATCH v2 3/9] nvmet: set WZDS and DRB if device enables unmap write zeroes operation Zhang Yi
2025-06-23 5:40 ` Christoph Hellwig
2025-06-19 11:18 ` [PATCH v2 4/9] scsi: sd: set max_hw_wzeroes_unmap_sectors if device supports SD_ZERO_*_UNMAP Zhang Yi
2025-06-19 11:18 ` [PATCH v2 5/9] dm: clear unmap write zeroes limits when disabling write zeroes Zhang Yi
2025-06-19 11:18 ` [PATCH v2 6/9] fs: introduce FALLOC_FL_WRITE_ZEROES to fallocate Zhang Yi
2025-06-19 11:18 ` [PATCH v2 7/9] block: factor out common part in blkdev_fallocate() Zhang Yi
2025-06-19 11:18 ` [PATCH v2 8/9] block: add FALLOC_FL_WRITE_ZEROES support Zhang Yi
2025-06-19 11:18 ` Zhang Yi [this message]
2025-06-23 10:46 ` [PATCH v2 0/9] fallocate: introduce FALLOC_FL_WRITE_ZEROES flag Christian Brauner
2025-07-03 3:35 ` Zhang Yi
2025-07-04 8:39 ` Christian Brauner
2025-06-23 15:08 ` Martin K. Petersen
2025-06-26 13:57 ` Zhang Yi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250619111806.3546162-10-yi.zhang@huaweicloud.com \
--to=yi.zhang@huaweicloud.com \
--cc=bmarzins@redhat.com \
--cc=brauner@kernel.org \
--cc=chaitanyak@nvidia.com \
--cc=chengzhihao1@huawei.com \
--cc=djwong@kernel.org \
--cc=dm-devel@lists.linux.dev \
--cc=hch@lst.de \
--cc=john.g.garry@oracle.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-nvme@lists.infradead.org \
--cc=linux-scsi@vger.kernel.org \
--cc=linux-xfs@vger.kernel.org \
--cc=martin.petersen@oracle.com \
--cc=shinichiro.kawasaki@wdc.com \
--cc=tytso@mit.edu \
--cc=yangerkun@huawei.com \
--cc=yi.zhang@huawei.com \
--cc=yukuai3@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).