* [PATCH v4 1/2] xfs: add xfs_bmap_alloc_or_convert_range function
2026-05-01 11:20 [PATCH v4 0/2] add FALLOC_FL_WRITE_ZEROES support to xfs Pankaj Raghav
@ 2026-05-01 11:20 ` Pankaj Raghav
2026-05-08 9:01 ` Christoph Hellwig
2026-05-01 11:20 ` [PATCH v4 2/2] xfs: add support for FALLOC_FL_WRITE_ZEROES Pankaj Raghav
1 sibling, 1 reply; 4+ messages in thread
From: Pankaj Raghav @ 2026-05-01 11:20 UTC (permalink / raw)
To: linux-xfs
Cc: bfoster, lukas, Darrick J . Wong, p.raghav, dgc, gost.dev,
pankaj.raghav, andres, kundan.kumar, cem, hch
Add xfs_bmap_alloc_or_convert_range() that can either allocate a range
and/or convert unwritten extents to written extents.
This function is based on xfs_iomap_write_unwritten() but we add an
extra flag parameter. Only XFS_BMAPI_CONVERT and/or XFS_BMAPI_ZERO are
accepted as flags to this function. This function additionally
accounts, when starting the transaction, for the blocks that might
be created because of XFS_BMAPI_ZERO.
This is done as a preparation to add FALLOC_FL_WRITE_ZEROES flag.
The xfs_iomap_write_unwritten() function will now just call
xfs_bmap_alloc_or_convert_range() with the flag set to XFS_BMAPI_CONVERT.
Suggested-by: Dave Chinner <dgc@kernel.org>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
I feel like xfs_bmap_alloc_or_convert_range() has become a bit big with
repeated logic; should we consider going back to the original solution? [1]
[1] https://lore.kernel.org/linux-xfs/20260310194245.848034-2-lukas@herbolt.com/
fs/xfs/xfs_iomap.c | 215 ++++++++++++++++++++++++++++++++-------------
fs/xfs/xfs_iomap.h | 2 +
2 files changed, 155 insertions(+), 62 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f20a02f49ed9..40715589814a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -609,12 +609,30 @@ xfs_iomap_prealloc_size(
return alloc_blocks;
}
+/*
+ * This function is used to allocate written extents over holes and/or convert
+ * unwritten extents to written extents based on the @flags passed to it.
+ *
+ * If @flags is zero, it will allocate written extents for holes and delalloc
+ * extents across the range.
+ *
+ * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do conversion
+ * of unwritten extents in the range to written extents.
+ *
+ * If XFS_BMAPI_ZERO is specified in @flags, then both newly allocated extents
+ * and converted unwritten extents will be initialised to contain zeroes.
+ *
+ * If @update_isize is true, then if the range we are operating on extends
+ * beyond the current EOF, extend i_size to offset+len incrementally as extents
+ * in the range are allocated/converted.
+ */
int
-xfs_iomap_write_unwritten(
- xfs_inode_t *ip,
- xfs_off_t offset,
- xfs_off_t count,
- bool update_isize)
+xfs_bmap_alloc_or_convert_range(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count,
+ uint32_t flags,
+ bool update_isize)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
@@ -625,98 +643,159 @@ xfs_iomap_write_unwritten(
xfs_bmbt_irec_t imap;
struct inode *inode = VFS_I(ip);
xfs_fsize_t i_size;
- uint resblks;
int error;
+ int rt = XFS_IS_REALTIME_INODE(ip);
+ xfs_extlen_t extsz, temp;
- trace_xfs_unwritten_convert(ip, offset, count);
+ ASSERT((flags & ~(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT)) == 0);
+ extsz = xfs_get_extsz_hint(ip);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
- /*
- * Reserve enough blocks in this transaction for two complete extent
- * btree splits. We may be converting the middle part of an unwritten
- * extent and in this case we will insert two new extents in the btree
- * each of which could cause a full split.
- *
- * This reservation amount will be used in the first call to
- * xfs_bmbt_split() to select an AG with enough space to satisfy the
- * rest of the operation.
- */
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
-
/* Attach dquots so that bmbt splits are accounted correctly. */
error = xfs_qm_dqattach(ip);
if (error)
return error;
do {
+
+ uint resblks, dblocks, bmapi_total, rblocks;
+ int bmapi_error;
+
+ if (flags == XFS_BMAPI_CONVERT) {
+ /*
+ * Reserve enough blocks in this transaction for two
+ * complete extent btree splits. We may be converting
+ * the middle part of an unwritten extent and in this
+ * case we will insert two new extents in the btree each
+ * of which could cause a full split.
+ *
+ * This reservation amount will be used in the first call
+ * to xfs_bmbt_split() to select an AG with enough space
+ * to satisfy the rest of the operation.
+ */
+ resblks = 0;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks) << 1;
+ rblocks = 0;
+ bmapi_total = dblocks;
+ } else {
+ xfs_fileoff_t s, e;
+ if (unlikely(extsz)) {
+ s = offset_fsb;
+ do_div(s, extsz);
+ s *= extsz;
+ e = offset_fsb + count_fsb;
+ div_u64_rem(offset_fsb, extsz, &temp);
+ if (temp)
+ e += temp;
+ div_u64_rem(e, extsz, &temp);
+ if (temp)
+ e += extsz - temp;
+ } else {
+ s = 0;
+ e = count_fsb;
+ }
+
+ /*
+ * We might allocate data blocks (needs resblks + 1 split)
+ * or convert an unwritten extent (needs 0 data blocks +
+ * 2 splits). Ensure we have enough block reservation for
+ * the worst case.
+ */
+ resblks = XFS_FILBLKS_MIN((e - s), XFS_MAX_BMBT_EXTLEN);
+ bmapi_total = 0;
+ if (unlikely(rt)) {
+ rblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ } else {
+ rblocks = 0;
+ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+ dblocks = max(dblocks,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1);
+ }
+ }
+
/*
- * Set up a transaction to convert the range of extents
- * from unwritten to real. Do allocations in a loop until
- * we have covered the range passed in.
+ * Set up a transaction to convert the range of extents based on
+ * the flags. Do allocations in a loop until we have covered the
+ * range passed in.
*
* Note that we can't risk to recursing back into the filesystem
* here as we might be asked to write out the same inode that we
* complete here and might deadlock on the iolock.
*/
- error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
- 0, true, &tp);
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+ rblocks, true, &tp);
if (error)
return error;
- error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
- XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ if (flags & XFS_BMAPI_CONVERT)
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_WRITE_UNWRITTEN_CNT);
+ else
+ error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+ XFS_IEXT_ADD_NOSPLIT_CNT);
+
if (error)
goto error_on_bmapi_transaction;
- /*
- * Modify the unwritten extent state of the buffer.
- */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_CONVERT, resblks, &imap,
- &nimaps);
- if (error)
- goto error_on_bmapi_transaction;
+ flags, bmapi_total, &imap, &nimaps);
+ bmapi_error = error;
+
+ if (error) {
+ if (error != -ENOSR)
+ goto error_on_bmapi_transaction;
+ /*
+ * Keep searching until we get one contiguous
+ * extent if we get ENOSR
+ */
+ error = 0;
+ } else {
+ /*
+ * Log the updated inode size as we go. We have to be
+ * careful to only log it up to the actual write offset
+ * if it is halfway into a block.
+ */
+ i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+ if (i_size > offset + count)
+ i_size = offset + count;
+ if (update_isize && i_size > i_size_read(inode))
+ i_size_write(inode, i_size);
+ i_size = xfs_new_eof(ip, i_size);
+ if (i_size) {
+ ip->i_disk_size = i_size;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ }
- /*
- * Log the updated inode size as we go. We have to be careful
- * to only log it up to the actual write offset if it is
- * halfway into a block.
- */
- i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
- if (i_size > offset + count)
- i_size = offset + count;
- if (update_isize && i_size > i_size_read(inode))
- i_size_write(inode, i_size);
- i_size = xfs_new_eof(ip, i_size);
- if (i_size) {
- ip->i_disk_size = i_size;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
- if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
- xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
- return xfs_alert_fsblock_zero(ip, &imap);
+ if (!bmapi_error) {
+ if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
+ xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
+ return xfs_alert_fsblock_zero(ip, &imap);
+ }
+ if ((numblks_fsb = imap.br_blockcount) == 0) {
+ /*
+ * The numblks_fsb value should always get smaller,
+ * otherwise the loop is stuck.
+ */
+ ASSERT(imap.br_blockcount);
+ break;
+ }
+ offset_fsb += numblks_fsb;
+ count_fsb -= numblks_fsb;
}
- if ((numblks_fsb = imap.br_blockcount) == 0) {
- /*
- * The numblks_fsb value should always get
- * smaller, otherwise the loop is stuck.
- */
- ASSERT(imap.br_blockcount);
- break;
- }
- offset_fsb += numblks_fsb;
- count_fsb -= numblks_fsb;
+ if (error)
+ return error;
+
} while (count_fsb > 0);
return 0;
@@ -727,6 +806,18 @@ xfs_iomap_write_unwritten(
return error;
}
+int
+xfs_iomap_write_unwritten(
+ xfs_inode_t *ip,
+ xfs_off_t offset,
+ xfs_off_t count,
+ bool update_isize)
+{
+ trace_xfs_unwritten_convert(ip, offset, count);
+ return xfs_bmap_alloc_or_convert_range(ip, offset, count,
+ XFS_BMAPI_CONVERT, update_isize);
+}
+
static inline bool
imap_needs_alloc(
struct inode *inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ebcce7d49446..ba7b06074539 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -16,6 +16,8 @@ int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
struct xfs_bmbt_irec *imap, u64 *sequence);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+int xfs_bmap_alloc_or_convert_range(struct xfs_inode *, xfs_off_t, xfs_off_t,
+ uint32_t, bool);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
--
2.51.2
^ permalink raw reply related [flat|nested] 4+ messages in thread* [PATCH v4 2/2] xfs: add support for FALLOC_FL_WRITE_ZEROES
2026-05-01 11:20 [PATCH v4 0/2] add FALLOC_FL_WRITE_ZEROES support to xfs Pankaj Raghav
2026-05-01 11:20 ` [PATCH v4 1/2] xfs: add xfs_bmap_alloc_or_convert_range function Pankaj Raghav
@ 2026-05-01 11:20 ` Pankaj Raghav
1 sibling, 0 replies; 4+ messages in thread
From: Pankaj Raghav @ 2026-05-01 11:20 UTC (permalink / raw)
To: linux-xfs
Cc: bfoster, lukas, Darrick J . Wong, p.raghav, dgc, gost.dev,
pankaj.raghav, andres, kundan.kumar, cem, hch
If the underlying block device supports the unmap write zeroes
operation, this flag allows users to quickly preallocate a file with
written extents that contain zeroes. This is beneficial for subsequent
overwrites as it prevents the need for unwritten-to-written extent
conversions, thereby significantly reducing metadata updates and journal
I/O overhead, improving overwrite performance.
Co-developed-by: Lukas Herbolt <lukas@herbolt.com>
Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
fs/xfs/xfs_file.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 55 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 845a97c9b063..99a02982154a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1368,6 +1368,57 @@ xfs_falloc_force_zero(
return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE);
}
+static int
+xfs_falloc_write_zeroes(
+ struct file *file,
+ int mode,
+ loff_t offset,
+ loff_t len,
+ struct xfs_zone_alloc_ctx *ac)
+{
+ struct inode *inode = file_inode(file);
+ struct xfs_inode *ip = XFS_I(inode);
+ loff_t new_size = 0;
+ loff_t old_size = XFS_ISIZE(ip);
+ int error;
+ unsigned int blksize = i_blocksize(inode);
+ loff_t offset_aligned = round_down(offset, blksize);
+ bool did_zero;
+
+ if (xfs_is_always_cow_inode(ip) ||
+ !bdev_write_zeroes_unmap_sectors(
+ xfs_inode_buftarg(XFS_I(inode))->bt_bdev))
+ return -EOPNOTSUPP;
+
+ error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+ if (error)
+ return error;
+
+ error = xfs_free_file_space(ip, offset, len, ac);
+ if (error)
+ return error;
+
+ /*
+ * Zero the tail of the old EOF block and any space up to the new
+ * offset.
+ * In the usual truncate path, xfs_falloc_setsize takes care of
+ * zeroing those blocks.
+ */
+ if (offset_aligned > old_size)
+ error = xfs_zero_range(ip, old_size, offset_aligned - old_size,
+ NULL, &did_zero);
+ if (error)
+ return error;
+
+ error = xfs_bmap_alloc_or_convert_range(ip, offset, len,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO,
+ new_size ? true : false);
+ if (error)
+ return error;
+
+ return error;
+}
+
/*
* Punch a hole and prealloc the range. We use a hole punch rather than
* unwritten extent conversion for two reasons:
@@ -1470,7 +1521,7 @@ xfs_falloc_allocate_range(
(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
- FALLOC_FL_UNSHARE_RANGE)
+ FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
STATIC long
__xfs_file_fallocate(
@@ -1522,6 +1573,9 @@ __xfs_file_fallocate(
case FALLOC_FL_ALLOCATE_RANGE:
error = xfs_falloc_allocate_range(file, mode, offset, len);
break;
+ case FALLOC_FL_WRITE_ZEROES:
+ error = xfs_falloc_write_zeroes(file, mode, offset, len, ac);
+ break;
default:
error = -EOPNOTSUPP;
break;
--
2.51.2
^ permalink raw reply related [flat|nested] 4+ messages in thread