* [PATCH v2 1/2] xfs: add xfs_bmap_alloc_or_convert_range function
2026-04-13 13:32 [PATCH v2 0/2] add FALLOC_FL_WRITE_ZEROES support to xfs Pankaj Raghav
@ 2026-04-13 13:32 ` Pankaj Raghav
2026-04-13 13:32 ` [PATCH v2 2/2] xfs: add support for FALLOC_FL_WRITE_ZEROES Pankaj Raghav
2026-04-13 15:07 ` [PATCH v2 0/2] add FALLOC_FL_WRITE_ZEROES support to xfs Lukas Herbolt
2 siblings, 0 replies; 7+ messages in thread
From: Pankaj Raghav @ 2026-04-13 13:32 UTC (permalink / raw)
To: linux-xfs
Cc: bfoster, lukas, Darrick J . Wong, p.raghav, dgc, gost.dev,
pankaj.raghav, kundan.kumar, cem, hch
Add xfs_bmap_alloc_or_convert_range() that can either allocate a range
and/or convert unwritten extents to written extents.
This function is based on xfs_iomap_write_unwritten(), with an extra
flags parameter. Only XFS_BMAPI_CONVERT and/or XFS_BMAPI_ZERO are
accepted as flags. When starting the transaction, the function also
reserves the blocks that might be allocated because of
XFS_BMAPI_ZERO.
This is done in preparation for adding the FALLOC_FL_WRITE_ZEROES flag.
xfs_iomap_write_unwritten() function will now just call
xfs_bmap_alloc_or_convert_range() with flag set to XFS_BMAPI_CONVERT.
Suggested-by: Dave Chinner <dgc@kernel.org>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
fs/xfs/xfs_bmap_util.c | 165 +++++++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_bmap_util.h | 3 +
fs/xfs/xfs_iomap.c | 113 +---------------------------
fs/xfs/xfs_iomap.h | 1 +
4 files changed, 172 insertions(+), 110 deletions(-)
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 0ab00615f1ad..5a6abca12982 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -20,6 +20,7 @@
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
+#include "xfs_health.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_quota.h"
@@ -642,6 +643,170 @@ xfs_free_eofblocks(
return error;
}
+/*
+ * Allocate written extents over holes and/or convert unwritten extents
+ * to written extents in the byte range [@offset, @offset + @count).
+ * Only XFS_BMAPI_CONVERT and XFS_BMAPI_ZERO may be set in @flags.
+ *
+ * If @flags is zero, it will allocate written extents for holes and
+ * delalloc extents across the range.
+ *
+ * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do
+ * conversion of unwritten extents in the range to written extents.
+ *
+ * If XFS_BMAPI_ZERO is specified in @flags, then both newly
+ * allocated extents and converted unwritten extents will be
+ * initialised to contain zeroes.
+ *
+ * If @update_isize is true and the range extends beyond the current
+ * in-core i_size, i_size is raised to the end of the range (at most
+ * @offset + @count) once a mapping has been allocated/converted.
+ *
+ * Returns 0 on success or a negative errno on failure.
+ */
+int
+xfs_bmap_alloc_or_convert_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count,
+	uint32_t		flags,
+	bool			update_isize)
+{
+	xfs_mount_t		*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb;
+	xfs_filblks_t		count_fsb;
+	xfs_filblks_t		numblks_fsb;
+	int			nimaps;
+	xfs_trans_t		*tp;
+	xfs_bmbt_irec_t		imap;
+	struct inode		*inode = VFS_I(ip);
+	xfs_fsize_t		i_size;
+	int			error;
+
+	ASSERT((flags & ~(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT)) == 0);
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+	count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
+
+	/* Attach dquots so that bmbt splits are accounted correctly. */
+	error = xfs_qm_dqattach(ip);
+	if (error)
+		return error;
+
+	do {
+		uint	resblks, dblocks;
+		uint	bmapi_total = 0;
+		int	bmapi_error;
+
+		if (flags == XFS_BMAPI_CONVERT) {
+			/*
+			 * Pure unwritten conversion. No data blocks needed.
+			 * Reserve enough for two full b-tree splits.
+			 */
+			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+			bmapi_total = dblocks;
+		} else {
+			/*
+			 * We might allocate data blocks (needs resblks + 1
+			 * split) or convert an unwritten extent (needs 0 data
+			 * blocks + 2 splits). Ensure we have enough block
+			 * reservation for the worst case.
+			 */
+			resblks = XFS_FILBLKS_MIN(count_fsb, XFS_MAX_BMBT_EXTLEN);
+			dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
+			dblocks += (XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1);
+		}
+
+		/*
+		 * Set up a transaction to convert the range of extents
+		 * based on the flags. Do allocations in a loop until
+		 * we have covered the range passed in.
+		 *
+		 * Note that we can't risk recursing back into the filesystem
+		 * here as we might be asked to write out the same inode that we
+		 * complete here and might deadlock on the iolock.
+		 */
+		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
+				0, true, &tp);
+		if (error)
+			return error;
+
+		if (flags & XFS_BMAPI_CONVERT)
+			error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+					XFS_IEXT_WRITE_UNWRITTEN_CNT);
+		else
+			error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
+					XFS_IEXT_ADD_NOSPLIT_CNT);
+		if (error)
+			goto error_on_bmapi_transaction;
+
+		nimaps = 1;
+		error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+				flags, bmapi_total, &imap, &nimaps);
+		bmapi_error = error;
+		if (error) {
+			if (error != -ENOSR)
+				goto error_on_bmapi_transaction;
+			/*
+			 * Keep searching until we get one contiguous
+			 * extent if we get ENOSR
+			 */
+			error = 0;
+		} else {
+			/*
+			 * Log the updated inode size as we go. We have to be
+			 * careful to only log it up to the actual write offset
+			 * if it is halfway into a block.
+			 */
+			i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
+			if (i_size > offset + count)
+				i_size = offset + count;
+			if (update_isize && i_size > i_size_read(inode))
+				i_size_write(inode, i_size);
+			i_size = xfs_new_eof(ip, i_size);
+			if (i_size) {
+				ip->i_disk_size = i_size;
+				xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+			}
+		}
+
+		error = xfs_trans_commit(tp);
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		/* A failed commit must not be masked by the imap checks below. */
+		if (error)
+			return error;
+
+		/* Only -ENOSR gets here; skip the imap checks and retry. */
+		if (bmapi_error)
+			continue;
+
+		if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
+			xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
+			return xfs_alert_fsblock_zero(ip, &imap);
+		}
+
+		numblks_fsb = imap.br_blockcount;
+		if (numblks_fsb == 0) {
+			/*
+			 * The numblks_fsb value should always get
+			 * smaller, otherwise the loop is stuck.
+			 */
+			ASSERT(imap.br_blockcount);
+			break;
+		}
+		offset_fsb += numblks_fsb;
+		count_fsb -= numblks_fsb;
+	} while (count_fsb > 0);
+
+	return 0;
+
+error_on_bmapi_transaction:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
int
xfs_alloc_file_space(
struct xfs_inode *ip,
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index c477b3361630..0bf84130f49d 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -57,7 +57,10 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
/* preallocation and hole punch interface */
int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len);
+/* allocate written extents and/or convert unwritten extents in a range */
+int	xfs_bmap_alloc_or_convert_range(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count, uint32_t flags, bool update_isize);
int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f20a02f49ed9..4524907f91e3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -36,7 +36,7 @@
#define XFS_ALLOC_ALIGN(mp, off) \
(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
-static int
+int
xfs_alert_fsblock_zero(
xfs_inode_t *ip,
xfs_bmbt_irec_t *imap)
@@ -616,115 +616,8 @@ xfs_iomap_write_unwritten(
xfs_off_t count,
bool update_isize)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_fileoff_t offset_fsb;
- xfs_filblks_t count_fsb;
- xfs_filblks_t numblks_fsb;
- int nimaps;
- xfs_trans_t *tp;
- xfs_bmbt_irec_t imap;
- struct inode *inode = VFS_I(ip);
- xfs_fsize_t i_size;
- uint resblks;
- int error;
-
- trace_xfs_unwritten_convert(ip, offset, count);
-
- offset_fsb = XFS_B_TO_FSBT(mp, offset);
- count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
- count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
-
- /*
- * Reserve enough blocks in this transaction for two complete extent
- * btree splits. We may be converting the middle part of an unwritten
- * extent and in this case we will insert two new extents in the btree
- * each of which could cause a full split.
- *
- * This reservation amount will be used in the first call to
- * xfs_bmbt_split() to select an AG with enough space to satisfy the
- * rest of the operation.
- */
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
-
- /* Attach dquots so that bmbt splits are accounted correctly. */
- error = xfs_qm_dqattach(ip);
- if (error)
- return error;
-
- do {
- /*
- * Set up a transaction to convert the range of extents
- * from unwritten to real. Do allocations in a loop until
- * we have covered the range passed in.
- *
- * Note that we can't risk to recursing back into the filesystem
- * here as we might be asked to write out the same inode that we
- * complete here and might deadlock on the iolock.
- */
- error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks,
- 0, true, &tp);
- if (error)
- return error;
-
- error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
- XFS_IEXT_WRITE_UNWRITTEN_CNT);
- if (error)
- goto error_on_bmapi_transaction;
-
- /*
- * Modify the unwritten extent state of the buffer.
- */
- nimaps = 1;
- error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_CONVERT, resblks, &imap,
- &nimaps);
- if (error)
- goto error_on_bmapi_transaction;
-
- /*
- * Log the updated inode size as we go. We have to be careful
- * to only log it up to the actual write offset if it is
- * halfway into a block.
- */
- i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
- if (i_size > offset + count)
- i_size = offset + count;
- if (update_isize && i_size > i_size_read(inode))
- i_size_write(inode, i_size);
- i_size = xfs_new_eof(ip, i_size);
- if (i_size) {
- ip->i_disk_size = i_size;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
-
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
- if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
- xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
- return xfs_alert_fsblock_zero(ip, &imap);
- }
-
- if ((numblks_fsb = imap.br_blockcount) == 0) {
- /*
- * The numblks_fsb value should always get
- * smaller, otherwise the loop is stuck.
- */
- ASSERT(imap.br_blockcount);
- break;
- }
- offset_fsb += numblks_fsb;
- count_fsb -= numblks_fsb;
- } while (count_fsb > 0);
-
- return 0;
-
-error_on_bmapi_transaction:
- xfs_trans_cancel(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- return error;
+ return xfs_bmap_alloc_or_convert_range(ip, offset, count,
+ XFS_BMAPI_CONVERT, update_isize);
}
static inline bool
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index ebcce7d49446..d59428277376 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -16,6 +16,7 @@ int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
xfs_fileoff_t count_fsb, unsigned int flags,
struct xfs_bmbt_irec *imap, u64 *sequence);
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+int xfs_alert_fsblock_zero(struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
xfs_fileoff_t end_fsb);
--
2.51.2
^ permalink raw reply related [flat|nested] 7+ messages in thread* [PATCH v2 2/2] xfs: add support for FALLOC_FL_WRITE_ZEROES
2026-04-13 13:32 [PATCH v2 0/2] add FALLOC_FL_WRITE_ZEROES support to xfs Pankaj Raghav
2026-04-13 13:32 ` [PATCH v2 1/2] xfs: add xfs_bmap_alloc_or_convert_range function Pankaj Raghav
@ 2026-04-13 13:32 ` Pankaj Raghav
2026-04-13 15:05 ` Lukas Herbolt
2026-04-13 15:07 ` [PATCH v2 0/2] add FALLOC_FL_WRITE_ZEROES support to xfs Lukas Herbolt
2 siblings, 1 reply; 7+ messages in thread
From: Pankaj Raghav @ 2026-04-13 13:32 UTC (permalink / raw)
To: linux-xfs
Cc: bfoster, lukas, Darrick J . Wong, p.raghav, dgc, gost.dev,
pankaj.raghav, kundan.kumar, cem, hch
If the underlying block device supports the unmap write zeroes
operation, FALLOC_FL_WRITE_ZEROES allows users to quickly preallocate a
file with written extents that contain zeroes. This is beneficial for
subsequent overwrites as it avoids unwritten-to-written extent
conversions, thereby significantly reducing metadata updates and journal
I/O overhead and improving overwrite performance.
Co-developed-by: Lukas Herbolt <lukas@herbolt.com>
Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
---
fs/xfs/xfs_file.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 55 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 845a97c9b063..99a02982154a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1368,6 +1368,57 @@ xfs_falloc_force_zero(
return XFS_TEST_ERROR(ip->i_mount, XFS_ERRTAG_FORCE_ZERO_RANGE);
}
+/*
+ * Handle FALLOC_FL_WRITE_ZEROES: free the range, then reallocate it as
+ * written extents via XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO. Returns
+ * -EOPNOTSUPP on always-COW inodes or without unmap write-zeroes support.
+ */
+static int
+xfs_falloc_write_zeroes(
+	struct file		*file,
+	int			mode,
+	loff_t			offset,
+	loff_t			len,
+	struct xfs_zone_alloc_ctx *ac)
+{
+	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
+	loff_t			new_size = 0;
+	loff_t			old_size = XFS_ISIZE(ip);
+	int			error;
+	unsigned int		blksize = i_blocksize(inode);
+	loff_t			offset_aligned = round_down(offset, blksize);
+	bool			did_zero;
+
+	if (xfs_is_always_cow_inode(ip) ||
+	    !bdev_write_zeroes_unmap_sectors(xfs_inode_buftarg(ip)->bt_bdev))
+		return -EOPNOTSUPP;
+
+	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
+	if (error)
+		return error;
+
+	error = xfs_free_file_space(ip, offset, len, ac);
+	if (error)
+		return error;
+
+	/*
+	 * Zero the tail of the old EOF block and any space up to the new
+	 * offset.
+	 * In the usual truncate path, xfs_falloc_setsize takes care of
+	 * zeroing those blocks.
+	 */
+	if (offset_aligned > old_size) {
+		error = xfs_zero_range(ip, old_size, offset_aligned - old_size,
+				NULL, &did_zero);
+		if (error)
+			return error;
+	}
+
+	return xfs_bmap_alloc_or_convert_range(ip, offset, len,
+			XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, new_size != 0);
+}
+
/*
* Punch a hole and prealloc the range. We use a hole punch rather than
* unwritten extent conversion for two reasons:
@@ -1470,7 +1521,7 @@ xfs_falloc_allocate_range(
(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
- FALLOC_FL_UNSHARE_RANGE)
+ FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
STATIC long
__xfs_file_fallocate(
@@ -1522,6 +1573,9 @@ __xfs_file_fallocate(
case FALLOC_FL_ALLOCATE_RANGE:
error = xfs_falloc_allocate_range(file, mode, offset, len);
break;
+ case FALLOC_FL_WRITE_ZEROES:
+ error = xfs_falloc_write_zeroes(file, mode, offset, len, ac);
+ break;
default:
error = -EOPNOTSUPP;
break;
--
2.51.2
^ permalink raw reply related [flat|nested] 7+ messages in thread