* [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
@ 2026-03-09 18:12 Lukas Herbolt
2026-03-10 0:44 ` Darrick J. Wong
2026-03-11 0:12 ` Dave Chinner
0 siblings, 2 replies; 13+ messages in thread
From: Lukas Herbolt @ 2026-03-09 18:12 UTC (permalink / raw)
To: linux-xfs; +Cc: cem, hch, djwong, p.raghav, Lukas Herbolt
Add support for FALLOC_FL_WRITE_ZEROES if the underlying device
enable the unmap write zeroes operation.
Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
---
v11 changes:
- split into 2 patches separating the bmapi_flags addition
- 2 step allocation, to avoid zeroing beyond EOF
fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------
1 file changed, 29 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index fd049a1fc9c6..f8c1611e3267 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1293,29 +1293,45 @@ xfs_falloc_zero_range(
unsigned int blksize = i_blocksize(inode);
loff_t new_size = 0;
int error;
+ bool need_convert = false;
trace_xfs_zero_file_space(ip);
+ if (mode & FALLOC_FL_WRITE_ZEROES) {
+ if (xfs_is_always_cow_inode(ip) ||
+ !bdev_write_zeroes_unmap_sectors(
+ xfs_inode_buftarg(ip)->bt_bdev))
+ return -EOPNOTSUPP;
+ need_convert = true;
+ }
+
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
if (error)
return error;
if (xfs_falloc_force_zero(ip, ac)) {
error = xfs_zero_range(ip, offset, len, ac, NULL);
- } else {
- error = xfs_free_file_space(ip, offset, len, ac);
- if (error)
- return error;
-
- len = round_up(offset + len, blksize) -
- round_down(offset, blksize);
- offset = round_down(offset, blksize);
- error = xfs_alloc_file_space(ip, offset, len,
- XFS_BMAPI_PREALLOC);
+ goto set_filesize;
}
+ error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;
- return xfs_falloc_setsize(file, new_size);
+
+ len = round_up(offset + len, blksize) - round_down(offset, blksize);
+ offset = round_down(offset, blksize);
+ error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
+
+set_filesize:
+ if (error)
+ return error;
+
+ error = xfs_falloc_setsize(file, new_size);
+ if (error)
+ return error;
+ if (need_convert)
+ error = xfs_alloc_file_space(ip, offset, len,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
+ return error;
}
static int
@@ -1377,7 +1393,7 @@ xfs_falloc_allocate_range(
(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
- FALLOC_FL_UNSHARE_RANGE)
+ FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
STATIC long
__xfs_file_fallocate(
@@ -1420,6 +1436,7 @@ __xfs_file_fallocate(
case FALLOC_FL_INSERT_RANGE:
error = xfs_falloc_insert_range(file, offset, len);
break;
+ case FALLOC_FL_WRITE_ZEROES:
case FALLOC_FL_ZERO_RANGE:
error = xfs_falloc_zero_range(file, mode, offset, len, ac);
break;
--
2.53.0
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-09 18:12 [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt
@ 2026-03-10 0:44 ` Darrick J. Wong
2026-03-10 10:10 ` Pankaj Raghav (Samsung)
2026-03-10 10:20 ` Lukas Herbolt
2026-03-11 0:12 ` Dave Chinner
1 sibling, 2 replies; 13+ messages in thread
From: Darrick J. Wong @ 2026-03-10 0:44 UTC (permalink / raw)
To: Lukas Herbolt; +Cc: linux-xfs, cem, hch, p.raghav
On Mon, Mar 09, 2026 at 07:12:36PM +0100, Lukas Herbolt wrote:
> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device
> enable the unmap write zeroes operation.
>
> Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
>
> ---
> v11 changes:
> - split into 2 patches separating the bmapi_flags addition
> - 2 step allocation, to avoid zeroing beyond EOF
>
> fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------
> 1 file changed, 29 insertions(+), 12 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index fd049a1fc9c6..f8c1611e3267 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range(
> unsigned int blksize = i_blocksize(inode);
> loff_t new_size = 0;
> int error;
> + bool need_convert = false;
>
> trace_xfs_zero_file_space(ip);
>
> + if (mode & FALLOC_FL_WRITE_ZEROES) {
> + if (xfs_is_always_cow_inode(ip) ||
> + !bdev_write_zeroes_unmap_sectors(
> + xfs_inode_buftarg(ip)->bt_bdev))
> + return -EOPNOTSUPP;
> + need_convert = true;
> + }
> +
> error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
> if (error)
> return error;
>
> if (xfs_falloc_force_zero(ip, ac)) {
> error = xfs_zero_range(ip, offset, len, ac, NULL);
> - } else {
> - error = xfs_free_file_space(ip, offset, len, ac);
> - if (error)
> - return error;
> -
> - len = round_up(offset + len, blksize) -
> - round_down(offset, blksize);
> - offset = round_down(offset, blksize);
> - error = xfs_alloc_file_space(ip, offset, len,
> - XFS_BMAPI_PREALLOC);
> + goto set_filesize;
> }
> + error = xfs_free_file_space(ip, offset, len, ac);
> if (error)
> return error;
> - return xfs_falloc_setsize(file, new_size);
> +
> + len = round_up(offset + len, blksize) - round_down(offset, blksize);
> + offset = round_down(offset, blksize);
> + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
> +
> +set_filesize:
> + if (error)
> + return error;
> +
> + error = xfs_falloc_setsize(file, new_size);
> + if (error)
> + return error;
> + if (need_convert)
> + error = xfs_alloc_file_space(ip, offset, len,
> + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> + return error;
> }
I can't help but think this would be cleaner as:
static int
xfs_falloc_write_zero_range(
struct file *file,
int mode,
loff_t offset,
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
unsigned int blksize = i_blocksize(inode);
loff_t new_size = 0;
int error;
trace_xfs_zero_file_space(ip);
if (xfs_is_always_cow_inode(ip) ||
!bdev_write_zeroes_unmap_sectors(
xfs_inode_buftarg(ip)->bt_bdev))
return -EOPNOTSUPP;
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
if (error)
return error;
if (xfs_falloc_force_zero(ip, ac)) {
error = xfs_zero_range(ip, offset, len, ac, NULL);
if (error)
return error;
return xfs_falloc_setsize(file, new_size);
}
error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;
len = round_up(offset + len, blksize) -
round_down(offset, blksize);
offset = round_down(offset, blksize);
error = xfs_alloc_file_space(ip, offset, len);
if (error)
return error;
error = xfs_falloc_setsize(file, new_size);
if (error)
return error;
return xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
}
...assuming there's even a point to WRITE_ZEROES on a zoned file?
--D
>>
> static int
> @@ -1377,7 +1393,7 @@ xfs_falloc_allocate_range(
> (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \
> FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \
> FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \
> - FALLOC_FL_UNSHARE_RANGE)
> + FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
>
> STATIC long
> __xfs_file_fallocate(
> @@ -1420,6 +1436,7 @@ __xfs_file_fallocate(
> case FALLOC_FL_INSERT_RANGE:
> error = xfs_falloc_insert_range(file, offset, len);
> break;
> + case FALLOC_FL_WRITE_ZEROES:
> case FALLOC_FL_ZERO_RANGE:
> error = xfs_falloc_zero_range(file, mode, offset, len, ac);
> break;
> --
> 2.53.0
>
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-10 0:44 ` Darrick J. Wong
@ 2026-03-10 10:10 ` Pankaj Raghav (Samsung)
2026-03-10 11:22 ` Lukas Herbolt
2026-03-10 10:20 ` Lukas Herbolt
1 sibling, 1 reply; 13+ messages in thread
From: Pankaj Raghav (Samsung) @ 2026-03-10 10:10 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Lukas Herbolt, linux-xfs, cem, hch, p.raghav, pankaj.raghav
>
> I can't help but think this would be cleaner as:
I agree. I think it looks much cleaner as a standalone function instead
of merging it with existing zero range function.
>
> static int
> xfs_falloc_write_zero_range(
> struct file *file,
> int mode,
> loff_t offset,
> loff_t len,
> struct xfs_zone_alloc_ctx *ac)
> {
> struct inode *inode = file_inode(file);
> struct xfs_inode *ip = XFS_I(inode);
> unsigned int blksize = i_blocksize(inode);
> loff_t new_size = 0;
> int error;
>
> trace_xfs_zero_file_space(ip);
>
> if (xfs_is_always_cow_inode(ip) ||
Why should we check for this here but we don't do this check in xfs_falloc_zero_range()?
There is a comment in xfs_falloc_allocate_range():
/*
* If always_cow mode we can't use preallocations and thus should not
* create them.
*/
if (xfs_is_always_cow_inode(XFS_I(inode)))
return -EOPNOTSUPP;
Don't we also use preallocations in xfs_falloc_zero_range()?
> !bdev_write_zeroes_unmap_sectors(
> xfs_inode_buftarg(ip)->bt_bdev))
> return -EOPNOTSUPP;
>
> error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
> if (error)
> return error;
>
> if (xfs_falloc_force_zero(ip, ac)) {
> error = xfs_zero_range(ip, offset, len, ac, NULL);
> if (error)
> return error;
>
> return xfs_falloc_setsize(file, new_size);
> }
>
> error = xfs_free_file_space(ip, offset, len, ac);
> if (error)
> return error;
>
> len = round_up(offset + len, blksize) -
> round_down(offset, blksize);
> offset = round_down(offset, blksize);
> error = xfs_alloc_file_space(ip, offset, len);
missed passing XFS_BMAPI_PREALLOC here.
> if (error)
> return error;
>
> error = xfs_falloc_setsize(file, new_size);
> if (error)
> return error;
>
> return xfs_alloc_file_space(ip, offset, len,
> XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> }
>
> ...assuming there's even a point to WRITE_ZEROES on a zoned file?
Don't we return EOPNOTSUPP if it is a zoned file as
xfs_is_always_cow_inode() checks for it anyway?
--
Pankaj
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-10 0:44 ` Darrick J. Wong
2026-03-10 10:10 ` Pankaj Raghav (Samsung)
@ 2026-03-10 10:20 ` Lukas Herbolt
2026-03-10 14:57 ` Darrick J. Wong
1 sibling, 1 reply; 13+ messages in thread
From: Lukas Herbolt @ 2026-03-10 10:20 UTC (permalink / raw)
To: Darrick J. Wong; +Cc: linux-xfs, cem, hch, p.raghav
On 2026-03-10 01:44, Darrick J. Wong wrote:
> On Mon, Mar 09, 2026 at 07:12:36PM +0100, Lukas Herbolt wrote:
>> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device
>> enable the unmap write zeroes operation.
>>
>> Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
>> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
>> Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
>>
>> ---
>> v11 changes:
>> - split into 2 patches separating the bmapi_flags addition
>> - 2 step allocation, to avoid zeroing beyond EOF
>>
>> fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------
>> 1 file changed, 29 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index fd049a1fc9c6..f8c1611e3267 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range(
>> unsigned int blksize = i_blocksize(inode);
>> loff_t new_size = 0;
>> int error;
>> + bool need_convert = false;
>>
>> trace_xfs_zero_file_space(ip);
>>
>> + if (mode & FALLOC_FL_WRITE_ZEROES) {
>> + if (xfs_is_always_cow_inode(ip) ||
>> + !bdev_write_zeroes_unmap_sectors(
>> + xfs_inode_buftarg(ip)->bt_bdev))
>> + return -EOPNOTSUPP;
>> + need_convert = true;
>> + }
>> +
>> error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
>> if (error)
>> return error;
>>
>> if (xfs_falloc_force_zero(ip, ac)) {
>> error = xfs_zero_range(ip, offset, len, ac, NULL);
>> - } else {
>> - error = xfs_free_file_space(ip, offset, len, ac);
>> - if (error)
>> - return error;
>> -
>> - len = round_up(offset + len, blksize) -
>> - round_down(offset, blksize);
>> - offset = round_down(offset, blksize);
>> - error = xfs_alloc_file_space(ip, offset, len,
>> - XFS_BMAPI_PREALLOC);
>> + goto set_filesize;
>> }
>> + error = xfs_free_file_space(ip, offset, len, ac);
>> if (error)
>> return error;
>> - return xfs_falloc_setsize(file, new_size);
>> +
>> + len = round_up(offset + len, blksize) - round_down(offset, blksize);
>> + offset = round_down(offset, blksize);
>> + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
>> +
>> +set_filesize:
>> + if (error)
>> + return error;
>> +
>> + error = xfs_falloc_setsize(file, new_size);
>> + if (error)
>> + return error;
>> + if (need_convert)
>> + error = xfs_alloc_file_space(ip, offset, len,
>> + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
>> + return error;
>> }
>
> I can't help but think this would be cleaner as:
>
> static int
> xfs_falloc_write_zero_range(
> struct file *file,
> int mode,
> loff_t offset,
> loff_t len,
> struct xfs_zone_alloc_ctx *ac)
> {
> struct inode *inode = file_inode(file);
> struct xfs_inode *ip = XFS_I(inode);
> unsigned int blksize = i_blocksize(inode);
> loff_t new_size = 0;
> int error;
>
> trace_xfs_zero_file_space(ip);
>
> if (xfs_is_always_cow_inode(ip) ||
> !bdev_write_zeroes_unmap_sectors(
> xfs_inode_buftarg(ip)->bt_bdev))
> return -EOPNOTSUPP;
>
> error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
> if (error)
> return error;
>
> if (xfs_falloc_force_zero(ip, ac)) {
> error = xfs_zero_range(ip, offset, len, ac, NULL);
> if (error)
> return error;
>
> return xfs_falloc_setsize(file, new_size);
> }
>
> error = xfs_free_file_space(ip, offset, len, ac);
> if (error)
> return error;
>
> len = round_up(offset + len, blksize) -
> round_down(offset, blksize);
> offset = round_down(offset, blksize);
> error = xfs_alloc_file_space(ip, offset, len);
> if (error)
> return error;
>
> error = xfs_falloc_setsize(file, new_size);
> if (error)
> return error;
>
> return xfs_alloc_file_space(ip, offset, len,
> XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> }
I didn't want to duplicate most of the xfs_falloc_zero_range, but if
that's
fine we can go this way.
> ...assuming there's even a point to WRITE_ZEROES on a zoned file?
I think this is covered in:
if (xfs_is_always_cow_inode(ip) ||
!bdev_write_zeroes_unmap_sectors(
xfs_inode_buftarg(ip)->bt_bdev))
return -EOPNOTSUPP;
--
-lhe
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-10 10:10 ` Pankaj Raghav (Samsung)
@ 2026-03-10 11:22 ` Lukas Herbolt
2026-03-10 15:02 ` Darrick J. Wong
0 siblings, 1 reply; 13+ messages in thread
From: Lukas Herbolt @ 2026-03-10 11:22 UTC (permalink / raw)
To: Pankaj Raghav (Samsung); +Cc: Darrick J. Wong, linux-xfs, cem, hch, p.raghav
On 2026-03-10 11:10, Pankaj Raghav (Samsung) wrote:
>> if (xfs_is_always_cow_inode(ip) ||
>
> Why should we check for this here but we don't do this check in
> xfs_falloc_zero_range()?
>
> There is a comment in xfs_falloc_allocate_range():
>
> /*
> * If always_cow mode we can't use preallocations and thus should not
> * create them.
> */
> if (xfs_is_always_cow_inode(XFS_I(inode)))
> return -EOPNOTSUPP;
>
> Don't we also use preallocations in xfs_falloc_zero_range()?
>
For the xfs_falloc_zero_range we return 0 on COW from
xfs_alloc_file_space and that
results in creating a hole in the file instead of having preallocated
extents as shown
bellow.
But for the xfs_falloc_write_zero_range(), we want fail as soon as
possible as possible.
Feel free to correct me if I am wrong.
int
xfs_alloc_file_space(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len)
{
xfs_mount_t *mp = ip->i_mount;
xfs_off_t count;
xfs_filblks_t allocatesize_fsb;
xfs_extlen_t extsz, temp;
xfs_fileoff_t startoffset_fsb;
xfs_fileoff_t endoffset_fsb;
int rt;
xfs_trans_t *tp;
xfs_bmbt_irec_t imaps[1], *imapp;
int error;
if (xfs_is_always_cow_inode(ip))
return 0;
root@build-00:/mnt# dd if=/dev/urandom of=/mnt/test.bin bs=10M
oflag=direct count=1
1+0 records in
1+0 records out
10485760 bytes (10 MB, 10 MiB) copied, 0.167452 s, 62.6 MB/s
root@build-00:/mnt# xfs_bmap -vp test.bin
test.bin:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS
0: [0..20479]: 192..20671 0 (192..20671) 20480 000000
root@build-00:/mnt# strace -e fallocate fallocate -zl 2M test.bin
fallocate(3, FALLOC_FL_ZERO_RANGE, 0, 2097152) = 0
+++ exited with 0 +++
root@build-00:/mnt# xfs_bmap -vp test.bin
test.bin:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS
0: [0..4095]: hole 4096
1: [4096..20479]: 4288..20671 0 (4288..20671) 16384 000000
root@build-00:/mnt# dd if=/dev/urandom of=/mnt/test.bin bs=1M
oflag=direct count=1 conv=notrunc
1+0 records in
1+0 records out
1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.0172151 s, 60.9 MB/s
root@build-00:/mnt# xfs_bmap -vp test.bin
test.bin:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS
0: [0..2047]: 192..2239 0 (192..2239) 2048 000000
1: [2048..4095]: hole 2048
2: [4096..20479]: 4288..20671 0 (4288..20671) 16384 000000
root@build-00:/mnt# cat /sys/fs/xfs/debug/
always_cow bload_node_slack larp mount_delay
bload_leaf_slack bug_on_assert log_recovery_delay
pwork_threads
root@build-00:/mnt# cat /sys/fs/xfs/debug/always_cow
1
--
-lhe
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-10 10:20 ` Lukas Herbolt
@ 2026-03-10 14:57 ` Darrick J. Wong
0 siblings, 0 replies; 13+ messages in thread
From: Darrick J. Wong @ 2026-03-10 14:57 UTC (permalink / raw)
To: Lukas Herbolt; +Cc: linux-xfs, cem, hch, p.raghav
On Tue, Mar 10, 2026 at 11:20:54AM +0100, Lukas Herbolt wrote:
> On 2026-03-10 01:44, Darrick J. Wong wrote:
> > On Mon, Mar 09, 2026 at 07:12:36PM +0100, Lukas Herbolt wrote:
> > > Add support for FALLOC_FL_WRITE_ZEROES if the underlying device
> > > enable the unmap write zeroes operation.
> > >
> > > Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
> > > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> > > Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
> > >
> > > ---
> > > v11 changes:
> > > - split into 2 patches separating the bmapi_flags addition
> > > - 2 step allocation, to avoid zeroing beyond EOF
> > >
> > > fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------
> > > 1 file changed, 29 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> > > index fd049a1fc9c6..f8c1611e3267 100644
> > > --- a/fs/xfs/xfs_file.c
> > > +++ b/fs/xfs/xfs_file.c
> > > @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range(
> > > unsigned int blksize = i_blocksize(inode);
> > > loff_t new_size = 0;
> > > int error;
> > > + bool need_convert = false;
> > >
> > > trace_xfs_zero_file_space(ip);
> > >
> > > + if (mode & FALLOC_FL_WRITE_ZEROES) {
> > > + if (xfs_is_always_cow_inode(ip) ||
> > > + !bdev_write_zeroes_unmap_sectors(
> > > + xfs_inode_buftarg(ip)->bt_bdev))
> > > + return -EOPNOTSUPP;
> > > + need_convert = true;
> > > + }
> > > +
> > > error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
> > > if (error)
> > > return error;
> > >
> > > if (xfs_falloc_force_zero(ip, ac)) {
> > > error = xfs_zero_range(ip, offset, len, ac, NULL);
> > > - } else {
> > > - error = xfs_free_file_space(ip, offset, len, ac);
> > > - if (error)
> > > - return error;
> > > -
> > > - len = round_up(offset + len, blksize) -
> > > - round_down(offset, blksize);
> > > - offset = round_down(offset, blksize);
> > > - error = xfs_alloc_file_space(ip, offset, len,
> > > - XFS_BMAPI_PREALLOC);
> > > + goto set_filesize;
> > > }
> > > + error = xfs_free_file_space(ip, offset, len, ac);
> > > if (error)
> > > return error;
> > > - return xfs_falloc_setsize(file, new_size);
> > > +
> > > + len = round_up(offset + len, blksize) - round_down(offset, blksize);
> > > + offset = round_down(offset, blksize);
> > > + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
> > > +
> > > +set_filesize:
> > > + if (error)
> > > + return error;
> > > +
> > > + error = xfs_falloc_setsize(file, new_size);
> > > + if (error)
> > > + return error;
> > > + if (need_convert)
> > > + error = xfs_alloc_file_space(ip, offset, len,
> > > + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> > > + return error;
> > > }
> >
> > I can't help but think this would be cleaner as:
> >
> > static int
> > xfs_falloc_write_zero_range(
> > struct file *file,
> > int mode,
> > loff_t offset,
> > loff_t len,
> > struct xfs_zone_alloc_ctx *ac)
> > {
> > struct inode *inode = file_inode(file);
> > struct xfs_inode *ip = XFS_I(inode);
> > unsigned int blksize = i_blocksize(inode);
> > loff_t new_size = 0;
> > int error;
> >
> > trace_xfs_zero_file_space(ip);
> >
> > if (xfs_is_always_cow_inode(ip) ||
> > !bdev_write_zeroes_unmap_sectors(
> > xfs_inode_buftarg(ip)->bt_bdev))
> > return -EOPNOTSUPP;
> >
> > error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
> > if (error)
> > return error;
> >
> > if (xfs_falloc_force_zero(ip, ac)) {
> > error = xfs_zero_range(ip, offset, len, ac, NULL);
> > if (error)
> > return error;
> >
> > return xfs_falloc_setsize(file, new_size);
> > }
> >
> > error = xfs_free_file_space(ip, offset, len, ac);
> > if (error)
> > return error;
> >
> > len = round_up(offset + len, blksize) -
> > round_down(offset, blksize);
> > offset = round_down(offset, blksize);
> > error = xfs_alloc_file_space(ip, offset, len);
> > if (error)
> > return error;
> >
> > error = xfs_falloc_setsize(file, new_size);
> > if (error)
> > return error;
> >
> > return xfs_alloc_file_space(ip, offset, len,
> > XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> > }
> I didn't want to duplicate most of the xfs_falloc_zero_range, but if that's
> fine we can go this way.
I /much/ prefer that each fallocate mode have its own small cohesive
function. We used to have one giant function with conditionals
everywhere and it was very hard to understand.
> > ...assuming there's even a point to WRITE_ZEROES on a zoned file?
> I think this is covered in:
>
> if (xfs_is_always_cow_inode(ip) ||
> !bdev_write_zeroes_unmap_sectors(
> xfs_inode_buftarg(ip)->bt_bdev))
> return -EOPNOTSUPP;
Yeah, it is. I missed that, so this becomes shorter:
static int
xfs_falloc_write_zero_range(
struct file *file,
int mode,
loff_t offset,
loff_t len,
struct xfs_zone_alloc_ctx *ac)
{
struct inode *inode = file_inode(file);
struct xfs_inode *ip = XFS_I(inode);
unsigned int blksize = i_blocksize(inode);
loff_t new_size = 0;
int error;
if (xfs_is_always_cow_inode(ip) ||
!bdev_write_zeroes_unmap_sectors(
xfs_inode_buftarg(ip)->bt_bdev))
return -EOPNOTSUPP;
trace_xfs_write_zero_range(ip);
error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
if (error)
return error;
error = xfs_free_file_space(ip, offset, len, ac);
if (error)
return error;
len = round_up(offset + len, blksize) -
round_down(offset, blksize);
offset = round_down(offset, blksize);
error = xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_PREALLOC);
if (error)
return error;
error = xfs_falloc_setsize(file, new_size);
if (error)
return error;
return xfs_alloc_file_space(ip, offset, len,
XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
}
--D
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-10 11:22 ` Lukas Herbolt
@ 2026-03-10 15:02 ` Darrick J. Wong
0 siblings, 0 replies; 13+ messages in thread
From: Darrick J. Wong @ 2026-03-10 15:02 UTC (permalink / raw)
To: Lukas Herbolt; +Cc: Pankaj Raghav (Samsung), linux-xfs, cem, hch, p.raghav
On Tue, Mar 10, 2026 at 12:22:54PM +0100, Lukas Herbolt wrote:
> On 2026-03-10 11:10, Pankaj Raghav (Samsung) wrote:
> > > if (xfs_is_always_cow_inode(ip) ||
> >
> > Why should we check for this here but we don't do this check in
> > xfs_falloc_zero_range()?
> >
> > There is a comment in xfs_falloc_allocate_range():
> >
> > /*
> > * If always_cow mode we can't use preallocations and thus should not
> > * create them.
> > */
> > if (xfs_is_always_cow_inode(XFS_I(inode)))
> > return -EOPNOTSUPP;
> >
> > Don't we also use preallocations in xfs_falloc_zero_range()?
> >
>
> For the xfs_falloc_zero_range we return 0 on COW from xfs_alloc_file_space
> and that
> results in creating a hole in the file instead of having preallocated
> extents as shown
> bellow.
>
> But for the xfs_falloc_write_zero_range(), we want fail as soon as possible
> as possible.
>
> Feel free to correct me if I am wrong.
You're correct, write-zero-range wants to actually write zeroes to
storage to avoid remapping overhead during a write. However, zoned /
alwayscow files always remap so this mode is not supported for them.
--D
> int
> xfs_alloc_file_space(
> struct xfs_inode *ip,
> xfs_off_t offset,
> xfs_off_t len)
> {
> xfs_mount_t *mp = ip->i_mount;
> xfs_off_t count;
> xfs_filblks_t allocatesize_fsb;
> xfs_extlen_t extsz, temp;
> xfs_fileoff_t startoffset_fsb;
> xfs_fileoff_t endoffset_fsb;
> int rt;
> xfs_trans_t *tp;
> xfs_bmbt_irec_t imaps[1], *imapp;
> int error;
>
> if (xfs_is_always_cow_inode(ip))
> return 0;
>
>
> root@build-00:/mnt# dd if=/dev/urandom of=/mnt/test.bin bs=10M oflag=direct
> count=1
> 1+0 records in
> 1+0 records out
> 10485760 bytes (10 MB, 10 MiB) copied, 0.167452 s, 62.6 MB/s
> root@build-00:/mnt# xfs_bmap -vp test.bin
> test.bin:
> EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS
> 0: [0..20479]: 192..20671 0 (192..20671) 20480 000000
> root@build-00:/mnt# strace -e fallocate fallocate -zl 2M test.bin
> fallocate(3, FALLOC_FL_ZERO_RANGE, 0, 2097152) = 0
> +++ exited with 0 +++
> root@build-00:/mnt# xfs_bmap -vp test.bin
> test.bin:
> EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS
> 0: [0..4095]: hole 4096
> 1: [4096..20479]: 4288..20671 0 (4288..20671) 16384 000000
> root@build-00:/mnt# dd if=/dev/urandom of=/mnt/test.bin bs=1M oflag=direct
> count=1 conv=notrunc
> 1+0 records in
> 1+0 records out
> 1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.0172151 s, 60.9 MB/s
> root@build-00:/mnt# xfs_bmap -vp test.bin
> test.bin:
> EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS
> 0: [0..2047]: 192..2239 0 (192..2239) 2048 000000
> 1: [2048..4095]: hole 2048
> 2: [4096..20479]: 4288..20671 0 (4288..20671) 16384 000000
> root@build-00:/mnt# cat /sys/fs/xfs/debug/
> always_cow bload_node_slack larp mount_delay
> bload_leaf_slack bug_on_assert log_recovery_delay pwork_threads
> root@build-00:/mnt# cat /sys/fs/xfs/debug/always_cow
> 1
>
>
> --
> -lhe
>
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-09 18:12 [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt
2026-03-10 0:44 ` Darrick J. Wong
@ 2026-03-11 0:12 ` Dave Chinner
2026-03-12 21:36 ` Pankaj Raghav (Samsung)
2026-03-16 5:03 ` Lukas Herbolt
1 sibling, 2 replies; 13+ messages in thread
From: Dave Chinner @ 2026-03-11 0:12 UTC (permalink / raw)
To: Lukas Herbolt; +Cc: linux-xfs, cem, hch, djwong, p.raghav
On Mon, Mar 09, 2026 at 07:12:36PM +0100, Lukas Herbolt wrote:
> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device
> enable the unmap write zeroes operation.
>
> Co-developed-by: Pankaj Raghav <p.raghav@samsung.com>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
>
> ---
> v11 changes:
> - split into 2 patches separating the bmapi_flags addition
> - 2 step allocation, to avoid zeroing beyond EOF
>
> fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------
> 1 file changed, 29 insertions(+), 12 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index fd049a1fc9c6..f8c1611e3267 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range(
> unsigned int blksize = i_blocksize(inode);
> loff_t new_size = 0;
> int error;
> + bool need_convert = false;
>
> trace_xfs_zero_file_space(ip);
>
> + if (mode & FALLOC_FL_WRITE_ZEROES) {
> + if (xfs_is_always_cow_inode(ip) ||
> + !bdev_write_zeroes_unmap_sectors(
> + xfs_inode_buftarg(ip)->bt_bdev))
> + return -EOPNOTSUPP;
> + need_convert = true;
> + }
> +
> error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
> if (error)
> return error;
>
> if (xfs_falloc_force_zero(ip, ac)) {
> error = xfs_zero_range(ip, offset, len, ac, NULL);
> - } else {
> - error = xfs_free_file_space(ip, offset, len, ac);
> - if (error)
> - return error;
> -
> - len = round_up(offset + len, blksize) -
> - round_down(offset, blksize);
> - offset = round_down(offset, blksize);
> - error = xfs_alloc_file_space(ip, offset, len,
> - XFS_BMAPI_PREALLOC);
> + goto set_filesize;
> }
> + error = xfs_free_file_space(ip, offset, len, ac);
> if (error)
> return error;
> - return xfs_falloc_setsize(file, new_size);
> +
> + len = round_up(offset + len, blksize) - round_down(offset, blksize);
> + offset = round_down(offset, blksize);
> + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
xfs_alloc_file_space() already rounds the offset down and the range
end upwards to block size boundaries (XFS_B_TO_FSBT() rounds down,
XFS_B_TO_FSB rounds up). There is no need to do it here.
> +
> +set_filesize:
> + if (error)
> + return error;
> +
> + error = xfs_falloc_setsize(file, new_size);
> + if (error)
> + return error;
> + if (need_convert)
> + error = xfs_alloc_file_space(ip, offset, len,
> + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> + return error;
Honestly, this "prealloc, set file size, then convert and zero" thing
seems a bit ... clunky.
I mean, this is trying to work around an issue with
xfs_falloc_setsize() operating on written extents beyond EOF to set
EOF. But why are we even using a generic truncate operation to set
the EOF when we have already done most of the complex truncate work
(exclusion, page cache invalidation, extent removal, etc) already?
My logic:
xfs_bmapi_write(off, len, XFS_BMAPI_ZERO) will allocate all the
holes and delalloc extents over the given range as written extents
that are initialised to contain zeroes.
xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT) will convert all the
unwritten extents over a given range to written extents, but will
not touch the data in those extents.
xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO) will
allocate holes and delalloc extents and convert unwritten extents
all to written extents, and it will write zeros to all of those
extents.
This latter case is exactly what we want, and if gives us zeroes on
disk to the end of the extent beyond the new EOF. i.e. we've
completed all the data IO, and so now all we need
to do is update the file size transactionally, just like we do in
IO completion.
Indeed, xfs_iomap_write_unwritten() integrates this size update
into post-IO unwritten extent conversion. i.e we already have code
that does exactly what we need, except for tweaking the
xfs_bmapi_write() flags for the zeroing behaviour we need.
So, a helper function in fs/xfs/xfs_bmap_util.c such as this:
/*
* This function is used to allocate written extents over holes
* and/or convert unwritten extents to written extents based on the
* @flags passed to it.
*
* If @flags is zero, if will allocate written extents for holes and
* delalloc extents across the range.
*
* If XFS_BMAPI_CONVERT is specified in @flags, then it will also do
* conversion of unwritten extents in the range to written extents.
*
* If XFS_BMAPI_ZERO is specified in @flags, then both newly
* allocated extents and converted unwritten extents will be
* initialised to contain zeroes.
*
* If @update_isize is true, then if the range we are operating on
* extends beyond the current EOF, extend i_size to offset+len
* incrementally as extents in the range are allocated/converted.
*/
int
xfs_bmap_alloc_or_convert_range(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t len,
uint32_t flags,
bool update_isize)
{
< guts of xfs_iomap_write_unwritten >
}
Then xfs_iomap_write_unwritten() can be factored to a one-liner
(or replaced altogether), and this new fallocate op pretty much
becomes:
error = xfs_bmap_alloc_or_convert_range(ip, offset, len,
XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO,
new_size ? true : false);
The name of the function sucks but I can't think of anything better,
so if you've got a better idea then change it. :)
-Dave.
--
Dave Chinner
dgc@kernel.org
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-11 0:12 ` Dave Chinner
@ 2026-03-12 21:36 ` Pankaj Raghav (Samsung)
2026-03-15 23:49 ` Dave Chinner
2026-03-16 5:03 ` Lukas Herbolt
1 sibling, 1 reply; 13+ messages in thread
From: Pankaj Raghav (Samsung) @ 2026-03-12 21:36 UTC (permalink / raw)
To: Dave Chinner; +Cc: Lukas Herbolt, linux-xfs, cem, hch, djwong, p.raghav
> > +
> > + len = round_up(offset + len, blksize) - round_down(offset, blksize);
> > + offset = round_down(offset, blksize);
> > + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
>
> xfs_alloc_file_space() already rounds the offset down and the range
> end upwards to block size boundaries (XFS_B_TO_FSBT() rounds down,
> XFS_B_TO_FSB rounds up). There is no need to do it here.
>
> > +
> > +set_filesize:
> > + if (error)
> > + return error;
> > +
> > + error = xfs_falloc_setsize(file, new_size);
> > + if (error)
> > + return error;
> > + if (need_convert)
> > + error = xfs_alloc_file_space(ip, offset, len,
> > + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> > + return error;
>
> Honestly, this "prealloc, set file size, then convert and zero" thing
> seems a bit ... clunky.
>
> I mean, this is trying to work around an issue with
> xfs_falloc_setsize() operating on written extents beyond EOF to set
> EOF. But why are we even using a generic truncate operation to set
> the EOF when we have already done most of the complex truncate work
> (exclusion, page cache invalidation, extent removal, etc) already?
I agree.
>
> My logic:
>
> xfs_bmapi_write(off, len, XFS_BMAPI_ZERO) will allocate all the
> holes and delalloc extents over the given range as written extents
> that are initialised to contain zeroes.
>
> xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT) will convert all the
> unwritten extents over a given range to written extents, but will
> not touch the data in those extents.
>
> xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO) will
> allocate holes and delalloc extents and convert unwritten extents
> all to written extents, and it will write zeros to all of those
> extents.
>
> This latter case is exactly what we want, and if gives us zeroes on
> disk to the end of the extent beyond the new EOF. i.e. we've
> completed all the data IO, and so now all we need
> to do is update the file size transactionally, just like we do in
> IO completion.
>
> Indeed, xfs_iomap_write_unwritten() integrates this size update
> into post-IO unwritten extent conversion. i.e we already have code
> that does exactly what we need, except for tweaking the
> xfs_bmapi_write() flags for the zeroing behaviour we need.
This makes sense.
>
> So, a helper function in fs/xfs/xfs_bmap_util.c such as this:
>
> /*
> * This function is used to allocate written extents over holes
> * and/or convert unwritten extents to written extents based on the
> * @flags passed to it.
> *
> * If @flags is zero, if will allocate written extents for holes and
s/zero/XFS_BMAPI_ZERO ?
> * delalloc extents across the range.
> *
> * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do
> * conversion of unwritten extents in the range to written extents.
> *
> * If XFS_BMAPI_ZERO is specified in @flags, then both newly
s/XFS_BMAPI_ZERO/(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT) ?
> * allocated extents and converted unwritten extents will be
> * initialised to contain zeroes.
> *
> * If @update_isize is true, then if the range we are operating on
> * extends beyond the current EOF, extend i_size to offset+len
> * incrementally as extents in the range are allocated/converted.
> */
> int
> xfs_bmap_alloc_or_convert_range(
> struct xfs_inode *ip,
> xfs_off_t offset,
> xfs_off_t len,
> uint32_t flags,
> bool update_isize)
> {
> < guts of xfs_iomap_write_unwritten >
> }
>
> Then xfs_iomap_write_unwritten() can be factored to a one-liner
> (or replaced altogether), and this new fallocate op pretty much
> becomes:
>
> error = xfs_bmap_alloc_or_convert_range(ip, offset, len,
> XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO,
> new_size ? true : false);
>
> The name of the function sucks but I can't think of anything better,
> so if you've got a better idea then change it. :)
Thanks for the explanation Dave! I can take a shot at your suggestion
unless @Lukas already started working on it?
--
Pankaj
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-12 21:36 ` Pankaj Raghav (Samsung)
@ 2026-03-15 23:49 ` Dave Chinner
2026-03-16 7:23 ` Pankaj Raghav
0 siblings, 1 reply; 13+ messages in thread
From: Dave Chinner @ 2026-03-15 23:49 UTC (permalink / raw)
To: Pankaj Raghav (Samsung)
Cc: Lukas Herbolt, linux-xfs, cem, hch, djwong, p.raghav
On Thu, Mar 12, 2026 at 09:36:50PM +0000, Pankaj Raghav (Samsung) wrote:
> > > +
> > > + len = round_up(offset + len, blksize) - round_down(offset, blksize);
> > > + offset = round_down(offset, blksize);
> > > + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC);
> >
> > xfs_alloc_file_space() already rounds the offset down and the range
> > end upwards to block size boundaries (XFS_B_TO_FSBT() rounds down,
> > XFS_B_TO_FSB rounds up). There is no need to do it here.
> >
> > > +
> > > +set_filesize:
> > > + if (error)
> > > + return error;
> > > +
> > > + error = xfs_falloc_setsize(file, new_size);
> > > + if (error)
> > > + return error;
> > > + if (need_convert)
> > > + error = xfs_alloc_file_space(ip, offset, len,
> > > + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO);
> > > + return error;
> >
> > Honestly, this "prealloc, set file size, then convert and zero" thing
> > seems a bit ... clunky.
> >
> > I mean, this is trying to work around an issue with
> > xfs_falloc_setsize() operating on written extents beyond EOF to set
> > EOF. But why are we even using a generic truncate operation to set
> > the EOF when we have already done most of the complex truncate work
> > (exclusion, page cache invalidation, extent removal, etc) already?
>
> I agree.
>
> >
> > My logic:
> >
> > xfs_bmapi_write(off, len, XFS_BMAPI_ZERO) will allocate all the
> > holes and delalloc extents over the given range as written extents
> > that are initialised to contain zeroes.
> >
> > xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT) will convert all the
> > unwritten extents over a given range to written extents, but will
> > not touch the data in those extents.
> >
> > xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO) will
> > allocate holes and delalloc extents and convert unwritten extents
> > all to written extents, and it will write zeros to all of those
> > extents.
> >
> > This latter case is exactly what we want, and if gives us zeroes on
> > disk to the end of the extent beyond the new EOF. i.e. we've
> > completed all the data IO, and so now all we need
> > to do is update the file size transactionally, just like we do in
> > IO completion.
> >
> > Indeed, xfs_iomap_write_unwritten() integrates this size update
> > into post-IO unwritten extent conversion. i.e we already have code
> > that does exactly what we need, except for tweaking the
> > xfs_bmapi_write() flags for the zeroing behaviour we need.
>
> This makes sense.
>
> >
> > So, a helper function in fs/xfs/xfs_bmap_util.c such as this:
> >
> > /*
> > * This function is used to allocate written extents over holes
> > * and/or convert unwritten extents to written extents based on the
> > * @flags passed to it.
> > *
> > * If @flags is zero, if will allocate written extents for holes and
>
> s/zero/XFS_BMAPI_ZERO ?
No, I explicitly meant flags = 0.
> > * delalloc extents across the range.
> > *
> > * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do
> > * conversion of unwritten extents in the range to written extents.
> > *
> > * If XFS_BMAPI_ZERO is specified in @flags, then both newly
>
> s/XFS_BMAPI_ZERO/(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT) ?
No I explictly describing what happens when you do "flags |=
XFS_BMAPI_ZERO;" to any other xfs_bmapi_write() operation
flag specification.
i.e. XFS_BMAPI_ZERO is an optional behavioural modifier for
allocation and/or unwritten extent conversion operations. i.e all it
does is add initialisation of the new/modified extents to contain
real zeroes.
e.g. flags = 0 means "allocate holes/delalloc as uninitialised
written extents". flags = 0 | XFS_BMAPI_ZERO
means "allocate holes/delalloc and initialise them to contain
zeros". Unwritten extents will not be modified by either operation.
Cheers,
Dave.
--
Dave Chinner
dgc@kernel.org
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-11 0:12 ` Dave Chinner
2026-03-12 21:36 ` Pankaj Raghav (Samsung)
@ 2026-03-16 5:03 ` Lukas Herbolt
2026-03-17 12:20 ` Pankaj Raghav (Samsung)
1 sibling, 1 reply; 13+ messages in thread
From: Lukas Herbolt @ 2026-03-16 5:03 UTC (permalink / raw)
To: Dave Chinner; +Cc: linux-xfs, cem, hch, djwong, p.raghav
> Honestly, this "prealloc, set file size, then convert and zero" thing
> seems a bit ... clunky.
>
> I mean, this is trying to work around an issue with
> xfs_falloc_setsize() operating on written extents beyond EOF to set
> EOF. But why are we even using a generic truncate operation to set
> the EOF when we have already done most of the complex truncate work
> (exclusion, page cache invalidation, extent removal, etc) already?
>
> My logic:
>
> xfs_bmapi_write(off, len, XFS_BMAPI_ZERO) will allocate all the
> holes and delalloc extents over the given range as written extents
> that are initialised to contain zeroes.
>
> xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT) will convert all the
> unwritten extents over a given range to written extents, but will
> not touch the data in those extents.
>
> xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO) will
> allocate holes and delalloc extents and convert unwritten extents
> all to written extents, and it will write zeros to all of those
> extents.
>
> This latter case is exactly what we want, and if gives us zeroes on
> disk to the end of the extent beyond the new EOF. i.e. we've
> completed all the data IO, and so now all we need
> to do is update the file size transactionally, just like we do in
> IO completion.
>
> Indeed, xfs_iomap_write_unwritten() integrates this size update
> into post-IO unwritten extent conversion. i.e we already have code
> that does exactly what we need, except for tweaking the
> xfs_bmapi_write() flags for the zeroing behaviour we need.
>
> So, a helper function in fs/xfs/xfs_bmap_util.c such as this:
>
> /*
> * This function is used to allocate written extents over holes
> * and/or convert unwritten extents to written extents based on the
> * @flags passed to it.
> *
> * If @flags is zero, if will allocate written extents for holes and
> * delalloc extents across the range.
> *
> * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do
> * conversion of unwritten extents in the range to written extents.
> *
> * If XFS_BMAPI_ZERO is specified in @flags, then both newly
> * allocated extents and converted unwritten extents will be
> * initialised to contain zeroes.
> *
> * If @update_isize is true, then if the range we are operating on
> * extends beyond the current EOF, extend i_size to offset+len
> * incrementally as extents in the range are allocated/converted.
> */
> int
> xfs_bmap_alloc_or_convert_range(
> struct xfs_inode *ip,
> xfs_off_t offset,
> xfs_off_t len,
> uint32_t flags,
> bool update_isize)
> {
> < guts of xfs_iomap_write_unwritten >
> }
>
> Then xfs_iomap_write_unwritten() can be factored to a one-liner
> (or replaced altogether), and this new fallocate op pretty much
> becomes:
>
> error = xfs_bmap_alloc_or_convert_range(ip, offset, len,
> XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO,
> new_size ? true : false);
>
> The name of the function sucks but I can't think of anything better,
> so if you've got a better idea then change it. :)
>
> -Dave.
When I first start looking into it the xfs_iomap_write_unwritten() was
first place I was exploring, but did not think of the the above. I
started working on it but I will be travelling this week so after that
you may
expect v13.
--
-lhe
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-15 23:49 ` Dave Chinner
@ 2026-03-16 7:23 ` Pankaj Raghav
0 siblings, 0 replies; 13+ messages in thread
From: Pankaj Raghav @ 2026-03-16 7:23 UTC (permalink / raw)
To: Dave Chinner; +Cc: Lukas Herbolt, linux-xfs, cem, hch, djwong, p.raghav
>>> So, a helper function in fs/xfs/xfs_bmap_util.c such as this:
>>>
>>> /*
>>> * This function is used to allocate written extents over holes
>>> * and/or convert unwritten extents to written extents based on the
>>> * @flags passed to it.
>>> *
>>> * If @flags is zero, if will allocate written extents for holes and
>>
>> s/zero/XFS_BMAPI_ZERO ?
>
> No, I explicitly meant flags = 0.
>
>>> * delalloc extents across the range.
>>> *
>>> * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do
>>> * conversion of unwritten extents in the range to written extents.
>>> *
>>> * If XFS_BMAPI_ZERO is specified in @flags, then both newly
>>
>> s/XFS_BMAPI_ZERO/(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT) ?
>
> No I explictly describing what happens when you do "flags |=
> XFS_BMAPI_ZERO;" to any other xfs_bmapi_write() operation
> flag specification.
>
> i.e. XFS_BMAPI_ZERO is an optional behavioural modifier for
> allocation and/or unwritten extent conversion operations. i.e all it
> does is add initialisation of the new/modified extents to contain
> real zeroes.
>
> e.g. flags = 0 means "allocate holes/delalloc as uninitialised
> written extents". flags = 0 | XFS_BMAPI_ZERO
> means "allocate holes/delalloc and initialise them to contain
> zeros". Unwritten extents will not be modified by either operation.
>
Ok, Thanks for the clarification. I was about to send the patches.
I will make this change and send the patches.
--
Pankaj
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
2026-03-16 5:03 ` Lukas Herbolt
@ 2026-03-17 12:20 ` Pankaj Raghav (Samsung)
0 siblings, 0 replies; 13+ messages in thread
From: Pankaj Raghav (Samsung) @ 2026-03-17 12:20 UTC (permalink / raw)
To: Lukas Herbolt; +Cc: Dave Chinner, linux-xfs, cem, hch, djwong, p.raghav
> >
> > Then xfs_iomap_write_unwritten() can be factored to a one-liner
> > (or replaced altogether), and this new fallocate op pretty much
> > becomes:
> >
> > error = xfs_bmap_alloc_or_convert_range(ip, offset, len,
> > XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO,
> > new_size ? true : false);
> >
> > The name of the function sucks but I can't think of anything better,
> > so if you've got a better idea then change it. :)
> >
> > -Dave.
>
> When I first start looking into it the xfs_iomap_write_unwritten() was first
> place I was exploring, but did not think of the the above. I started working
> on it but I will be travelling this week so after that you may
> expect v13.
@Lukas and @Dave:
I took a shot at this in the meantime. Simple tests seems to be passing
with this approach but I noticed I am getting a panic when I run
generic/127 with my xfstests patches to ltp [1]. I will keep digging on
the side. Let me know if something looks obviously wrong in the
implementation.
This is what I have for xfs_bmap_alloc_or_convert_range() [2]:
/*
* This function is used to allocate written extents over holes
* and/or convert unwritten extents to written extents based on the
* @flags passed to it.
*
* If @flags is zero, if will allocate written extents for holes and
* delalloc extents across the range.
*
* If XFS_BMAPI_CONVERT is specified in @flags, then it will also do
* conversion of unwritten extents in the range to written extents.
*
* If XFS_BMAPI_ZERO is specified in @flags, then both newly
* allocated extents and converted unwritten extents will be
* initialised to contain zeroes.
*
* If @update_isize is true, then if the range we are operating on
* extends beyond the current EOF, extend i_size to offset+len
* incrementally as extents in the range are allocated/converted.
*/
int
xfs_bmap_alloc_or_convert_range(
struct xfs_inode *ip,
xfs_off_t offset,
xfs_off_t count,
uint32_t flags,
bool update_isize)
{
xfs_mount_t *mp = ip->i_mount;
xfs_fileoff_t offset_fsb;
xfs_filblks_t count_fsb;
xfs_filblks_t numblks_fsb;
int nimaps;
xfs_trans_t *tp;
xfs_bmbt_irec_t imap;
struct inode *inode = VFS_I(ip);
xfs_fsize_t i_size;
int error;
ASSERT((flags & ~(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT)) == 0);
offset_fsb = XFS_B_TO_FSBT(mp, offset);
count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb);
/* Attach dquots so that bmbt splits are accounted correctly. */
error = xfs_qm_dqattach(ip);
if (error)
return error;
do {
uint resblks, dblocks;
/*
* The transaction reservation is limited to a 32-bit block
* count, hence we need to limit the number of blocks we are
* trying to reserve to avoid an overflow.
*/
resblks = XFS_FILBLKS_MIN(count_fsb, XFS_MAX_BMBT_EXTLEN);
/*
* The blocks either need to be allocated or converted from
* unwritten to written. Consider the allocation case to
* cover both scenarios.
*/
dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
/*
* Set up a transaction to convert the range of extents
* based on the flags. Do allocations in a loop until
* we have covered the range passed in.
*
* Note that we can't risk to recursing back into the filesystem
* here as we might be asked to write out the same inode that we
* complete here and might deadlock on the iolock.
*/
error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks,
0, true, &tp);
if (error)
return error;
error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_WRITE_UNWRITTEN_CNT);
if (error)
goto error_on_bmapi_transaction;
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
flags, 0, &imap, &nimaps);
/*
* TODO: should we try if we get ENOSR?
*/
if (error)
goto error_on_bmapi_transaction;
/*
* Log the updated inode size as we go. We have to be careful
* to only log it up to the actual write offset if it is
* halfway into a block.
*/
i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb);
if (i_size > offset + count)
i_size = offset + count;
if (update_isize && i_size > i_size_read(inode))
i_size_write(inode, i_size);
i_size = xfs_new_eof(ip, i_size);
if (i_size) {
ip->i_disk_size = i_size;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
if (error)
return error;
if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
return xfs_alert_fsblock_zero(ip, &imap);
}
if ((numblks_fsb = imap.br_blockcount) == 0) {
/*
* The numblks_fsb value should always get
* smaller, otherwise the loop is stuck.
*/
ASSERT(imap.br_blockcount);
break;
}
offset_fsb += numblks_fsb;
count_fsb -= numblks_fsb;
} while (count_fsb > 0);
return 0;
error_on_bmapi_transaction:
xfs_trans_cancel(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
[1] https://lore.kernel.org/linux-xfs/20260312195308.738189-1-p.raghav@samsung.com/
[2] https://github.com/Panky-codes/linux/tree/xfs-falloc-write-zeroes-dave
--
Pankaj
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2026-03-17 12:20 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-09 18:12 [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt
2026-03-10 0:44 ` Darrick J. Wong
2026-03-10 10:10 ` Pankaj Raghav (Samsung)
2026-03-10 11:22 ` Lukas Herbolt
2026-03-10 15:02 ` Darrick J. Wong
2026-03-10 10:20 ` Lukas Herbolt
2026-03-10 14:57 ` Darrick J. Wong
2026-03-11 0:12 ` Dave Chinner
2026-03-12 21:36 ` Pankaj Raghav (Samsung)
2026-03-15 23:49 ` Dave Chinner
2026-03-16 7:23 ` Pankaj Raghav
2026-03-16 5:03 ` Lukas Herbolt
2026-03-17 12:20 ` Pankaj Raghav (Samsung)
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox