* [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base @ 2026-03-09 18:12 Lukas Herbolt 2026-03-10 0:44 ` Darrick J. Wong 2026-03-11 0:12 ` Dave Chinner 0 siblings, 2 replies; 13+ messages in thread From: Lukas Herbolt @ 2026-03-09 18:12 UTC (permalink / raw) To: linux-xfs; +Cc: cem, hch, djwong, p.raghav, Lukas Herbolt Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enable the unmap write zeroes operation. Co-developed-by: Pankaj Raghav <p.raghav@samsung.com> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> Signed-off-by: Lukas Herbolt <lukas@herbolt.com> --- v11 changes: - split into 2 patches separating the bmapi_flags addition - 2 step allocation, to avoid zeroing beyond EOF fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index fd049a1fc9c6..f8c1611e3267 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range( unsigned int blksize = i_blocksize(inode); loff_t new_size = 0; int error; + bool need_convert = false; trace_xfs_zero_file_space(ip); + if (mode & FALLOC_FL_WRITE_ZEROES) { + if (xfs_is_always_cow_inode(ip) || + !bdev_write_zeroes_unmap_sectors( + xfs_inode_buftarg(ip)->bt_bdev)) + return -EOPNOTSUPP; + need_convert = true; + } + error = xfs_falloc_newsize(file, mode, offset, len, &new_size); if (error) return error; if (xfs_falloc_force_zero(ip, ac)) { error = xfs_zero_range(ip, offset, len, ac, NULL); - } else { - error = xfs_free_file_space(ip, offset, len, ac); - if (error) - return error; - - len = round_up(offset + len, blksize) - - round_down(offset, blksize); - offset = round_down(offset, blksize); - error = xfs_alloc_file_space(ip, offset, len, - XFS_BMAPI_PREALLOC); + goto set_filesize; } + error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; - return xfs_falloc_setsize(file, new_size); + + len = round_up(offset + len, blksize) - round_down(offset, blksize); + offset = round_down(offset, blksize); + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); + +set_filesize: + if (error) + return error; + + error = xfs_falloc_setsize(file, new_size); + if (error) + return error; + if (need_convert) + error = xfs_alloc_file_space(ip, offset, len, + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); + return error; } static int @@ -1377,7 +1393,7 @@ xfs_falloc_allocate_range( (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ - FALLOC_FL_UNSHARE_RANGE) + FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES) STATIC long __xfs_file_fallocate( @@ -1420,6 +1436,7 @@ __xfs_file_fallocate( case FALLOC_FL_INSERT_RANGE: error = xfs_falloc_insert_range(file, offset, len); break; + case FALLOC_FL_WRITE_ZEROES: case FALLOC_FL_ZERO_RANGE: error = xfs_falloc_zero_range(file, mode, offset, len, ac); break; -- 2.53.0 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-09 18:12 [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt @ 2026-03-10 0:44 ` Darrick J. Wong 2026-03-10 10:10 ` Pankaj Raghav (Samsung) 2026-03-10 10:20 ` Lukas Herbolt 2026-03-11 0:12 ` Dave Chinner 1 sibling, 2 replies; 13+ messages in thread From: Darrick J. Wong @ 2026-03-10 0:44 UTC (permalink / raw) To: Lukas Herbolt; +Cc: linux-xfs, cem, hch, p.raghav On Mon, Mar 09, 2026 at 07:12:36PM +0100, Lukas Herbolt wrote: > Add support for FALLOC_FL_WRITE_ZEROES if the underlying device > enable the unmap write zeroes operation. > > Co-developed-by: Pankaj Raghav <p.raghav@samsung.com> > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> > Signed-off-by: Lukas Herbolt <lukas@herbolt.com> > > --- > v11 changes: > - split into 2 patches separating the bmapi_flags addition > - 2 step allocation, to avoid zeroing beyond EOF > > fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------ > 1 file changed, 29 insertions(+), 12 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index fd049a1fc9c6..f8c1611e3267 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range( > unsigned int blksize = i_blocksize(inode); > loff_t new_size = 0; > int error; > + bool need_convert = false; > > trace_xfs_zero_file_space(ip); > > + if (mode & FALLOC_FL_WRITE_ZEROES) { > + if (xfs_is_always_cow_inode(ip) || > + !bdev_write_zeroes_unmap_sectors( > + xfs_inode_buftarg(ip)->bt_bdev)) > + return -EOPNOTSUPP; > + need_convert = true; > + } > + > error = xfs_falloc_newsize(file, mode, offset, len, &new_size); > if (error) > return error; > > if (xfs_falloc_force_zero(ip, ac)) { > error = xfs_zero_range(ip, offset, len, ac, NULL); > - } else { > - error = xfs_free_file_space(ip, offset, len, ac); > - if (error) > - return error; > - > - len = round_up(offset + len, blksize) - > - round_down(offset, blksize); > - offset = round_down(offset, blksize); > - error = xfs_alloc_file_space(ip, offset, len, > - XFS_BMAPI_PREALLOC); > + goto set_filesize; > } > + error = xfs_free_file_space(ip, offset, len, ac); > if (error) > return error; > - return xfs_falloc_setsize(file, new_size); > + > + len = round_up(offset + len, blksize) - round_down(offset, blksize); > + offset = round_down(offset, blksize); > + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); > + > +set_filesize: > + if (error) > + return error; > + > + error = xfs_falloc_setsize(file, new_size); > + if (error) > + return error; > + if (need_convert) > + error = xfs_alloc_file_space(ip, offset, len, > + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); > + return error; > } I can't help but think this would be cleaner as: static int xfs_falloc_write_zero_range( struct file *file, int mode, loff_t offset, loff_t len, struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); unsigned int blksize = i_blocksize(inode); loff_t new_size = 0; int error; trace_xfs_zero_file_space(ip); if (xfs_is_always_cow_inode(ip) || !bdev_write_zeroes_unmap_sectors( xfs_inode_buftarg(ip)->bt_bdev)) return -EOPNOTSUPP; error = xfs_falloc_newsize(file, mode, offset, len, &new_size); if (error) return error; if (xfs_falloc_force_zero(ip, ac)) { error = xfs_zero_range(ip, offset, len, ac, NULL); if (error) return error; return xfs_falloc_setsize(file, new_size); } error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; len = round_up(offset + len, blksize) - round_down(offset, blksize); offset = round_down(offset, blksize); error = xfs_alloc_file_space(ip, offset, len); if (error) return error; error = xfs_falloc_setsize(file, new_size); if (error) return error; return xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); } ...assuming there's even a point to WRITE_ZEROES on a zoned file? --D >> > static int > @@ -1377,7 +1393,7 @@ xfs_falloc_allocate_range( > (FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE | \ > FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE | \ > FALLOC_FL_ZERO_RANGE | FALLOC_FL_INSERT_RANGE | \ > - FALLOC_FL_UNSHARE_RANGE) > + FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES) > > STATIC long > __xfs_file_fallocate( > @@ -1420,6 +1436,7 @@ __xfs_file_fallocate( > case FALLOC_FL_INSERT_RANGE: > error = xfs_falloc_insert_range(file, offset, len); > break; > + case FALLOC_FL_WRITE_ZEROES: > case FALLOC_FL_ZERO_RANGE: > error = xfs_falloc_zero_range(file, mode, offset, len, ac); > break; > -- > 2.53.0 > > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-10 0:44 ` Darrick J. Wong @ 2026-03-10 10:10 ` Pankaj Raghav (Samsung) 2026-03-10 11:22 ` Lukas Herbolt 2026-03-10 10:20 ` Lukas Herbolt 1 sibling, 1 reply; 13+ messages in thread From: Pankaj Raghav (Samsung) @ 2026-03-10 10:10 UTC (permalink / raw) To: Darrick J. Wong Cc: Lukas Herbolt, linux-xfs, cem, hch, p.raghav, pankaj.raghav > > I can't help but think this would be cleaner as: I agree. I think it looks much cleaner as a standalone function instead of merging it with existing zero range function. > > static int > xfs_falloc_write_zero_range( > struct file *file, > int mode, > loff_t offset, > loff_t len, > struct xfs_zone_alloc_ctx *ac) > { > struct inode *inode = file_inode(file); > struct xfs_inode *ip = XFS_I(inode); > unsigned int blksize = i_blocksize(inode); > loff_t new_size = 0; > int error; > > trace_xfs_zero_file_space(ip); > > if (xfs_is_always_cow_inode(ip) || Why should we check for this here but we don't do this check in xfs_falloc_zero_range()? There is a comment in xfs_falloc_allocate_range(): /* * If always_cow mode we can't use preallocations and thus should not * create them. */ if (xfs_is_always_cow_inode(XFS_I(inode))) return -EOPNOTSUPP; Don't we also use preallocations in xfs_falloc_zero_range()? > !bdev_write_zeroes_unmap_sectors( > xfs_inode_buftarg(ip)->bt_bdev)) > return -EOPNOTSUPP; > > error = xfs_falloc_newsize(file, mode, offset, len, &new_size); > if (error) > return error; > > if (xfs_falloc_force_zero(ip, ac)) { > error = xfs_zero_range(ip, offset, len, ac, NULL); > if (error) > return error; > > return xfs_falloc_setsize(file, new_size); > } > > error = xfs_free_file_space(ip, offset, len, ac); > if (error) > return error; > > len = round_up(offset + len, blksize) - > round_down(offset, blksize); > offset = round_down(offset, blksize); > error = xfs_alloc_file_space(ip, offset, len); missed passing XFS_BMAPI_PREALLOC here. > if (error) > return error; > > error = xfs_falloc_setsize(file, new_size); > if (error) > return error; > > return xfs_alloc_file_space(ip, offset, len, > XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); > } > > ...assuming there's even a point to WRITE_ZEROES on a zoned file? Don't we return EOPNOTSUPP if it is a zoned file as xfs_is_always_cow_inode() checks for it anyway? -- Pankaj ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-10 10:10 ` Pankaj Raghav (Samsung) @ 2026-03-10 11:22 ` Lukas Herbolt 2026-03-10 15:02 ` Darrick J. Wong 0 siblings, 1 reply; 13+ messages in thread From: Lukas Herbolt @ 2026-03-10 11:22 UTC (permalink / raw) To: Pankaj Raghav (Samsung); +Cc: Darrick J. Wong, linux-xfs, cem, hch, p.raghav On 2026-03-10 11:10, Pankaj Raghav (Samsung) wrote: >> if (xfs_is_always_cow_inode(ip) || > > Why should we check for this here but we don't do this check in > xfs_falloc_zero_range()? > > There is a comment in xfs_falloc_allocate_range(): > > /* > * If always_cow mode we can't use preallocations and thus should not > * create them. > */ > if (xfs_is_always_cow_inode(XFS_I(inode))) > return -EOPNOTSUPP; > > Don't we also use preallocations in xfs_falloc_zero_range()? > For the xfs_falloc_zero_range we return 0 on COW from xfs_alloc_file_space and that results in creating a hole in the file instead of having preallocated extents as shown bellow. But for the xfs_falloc_write_zero_range(), we want fail as soon as possible as possible. Feel free to correct me if I am wrong. int xfs_alloc_file_space( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len) { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; int error; if (xfs_is_always_cow_inode(ip)) return 0; root@build-00:/mnt# dd if=/dev/urandom of=/mnt/test.bin bs=10M oflag=direct count=1 1+0 records in 1+0 records out 10485760 bytes (10 MB, 10 MiB) copied, 0.167452 s, 62.6 MB/s root@build-00:/mnt# xfs_bmap -vp test.bin test.bin: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..20479]: 192..20671 0 (192..20671) 20480 000000 root@build-00:/mnt# strace -e fallocate fallocate -zl 2M test.bin fallocate(3, FALLOC_FL_ZERO_RANGE, 0, 2097152) = 0 +++ exited with 0 +++ root@build-00:/mnt# xfs_bmap -vp test.bin test.bin: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..4095]: hole 4096 1: [4096..20479]: 4288..20671 0 (4288..20671) 16384 000000 root@build-00:/mnt# dd if=/dev/urandom of=/mnt/test.bin bs=1M oflag=direct count=1 conv=notrunc 1+0 records in 1+0 records out 1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.0172151 s, 60.9 MB/s root@build-00:/mnt# xfs_bmap -vp test.bin test.bin: EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS 0: [0..2047]: 192..2239 0 (192..2239) 2048 000000 1: [2048..4095]: hole 2048 2: [4096..20479]: 4288..20671 0 (4288..20671) 16384 000000 root@build-00:/mnt# cat /sys/fs/xfs/debug/ always_cow bload_node_slack larp mount_delay bload_leaf_slack bug_on_assert log_recovery_delay pwork_threads root@build-00:/mnt# cat /sys/fs/xfs/debug/always_cow 1 -- -lhe ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-10 11:22 ` Lukas Herbolt @ 2026-03-10 15:02 ` Darrick J. Wong 0 siblings, 0 replies; 13+ messages in thread From: Darrick J. Wong @ 2026-03-10 15:02 UTC (permalink / raw) To: Lukas Herbolt; +Cc: Pankaj Raghav (Samsung), linux-xfs, cem, hch, p.raghav On Tue, Mar 10, 2026 at 12:22:54PM +0100, Lukas Herbolt wrote: > On 2026-03-10 11:10, Pankaj Raghav (Samsung) wrote: > > > if (xfs_is_always_cow_inode(ip) || > > > > Why should we check for this here but we don't do this check in > > xfs_falloc_zero_range()? > > > > There is a comment in xfs_falloc_allocate_range(): > > > > /* > > * If always_cow mode we can't use preallocations and thus should not > > * create them. > > */ > > if (xfs_is_always_cow_inode(XFS_I(inode))) > > return -EOPNOTSUPP; > > > > Don't we also use preallocations in xfs_falloc_zero_range()? > > > > For the xfs_falloc_zero_range we return 0 on COW from xfs_alloc_file_space > and that > results in creating a hole in the file instead of having preallocated > extents as shown > bellow. > > But for the xfs_falloc_write_zero_range(), we want fail as soon as possible > as possible. > > Feel free to correct me if I am wrong. You're correct, write-zero-range wants to actually write zeroes to storage to avoid remapping overhead during a write. However, zoned / alwayscow files always remap so this mode is not supported for them. --D > int > xfs_alloc_file_space( > struct xfs_inode *ip, > xfs_off_t offset, > xfs_off_t len) > { > xfs_mount_t *mp = ip->i_mount; > xfs_off_t count; > xfs_filblks_t allocatesize_fsb; > xfs_extlen_t extsz, temp; > xfs_fileoff_t startoffset_fsb; > xfs_fileoff_t endoffset_fsb; > int rt; > xfs_trans_t *tp; > xfs_bmbt_irec_t imaps[1], *imapp; > int error; > > if (xfs_is_always_cow_inode(ip)) > return 0; > > > root@build-00:/mnt# dd if=/dev/urandom of=/mnt/test.bin bs=10M oflag=direct > count=1 > 1+0 records in > 1+0 records out > 10485760 bytes (10 MB, 10 MiB) copied, 0.167452 s, 62.6 MB/s > root@build-00:/mnt# xfs_bmap -vp test.bin > test.bin: > EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS > 0: [0..20479]: 192..20671 0 (192..20671) 20480 000000 > root@build-00:/mnt# strace -e fallocate fallocate -zl 2M test.bin > fallocate(3, FALLOC_FL_ZERO_RANGE, 0, 2097152) = 0 > +++ exited with 0 +++ > root@build-00:/mnt# xfs_bmap -vp test.bin > test.bin: > EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS > 0: [0..4095]: hole 4096 > 1: [4096..20479]: 4288..20671 0 (4288..20671) 16384 000000 > root@build-00:/mnt# dd if=/dev/urandom of=/mnt/test.bin bs=1M oflag=direct > count=1 conv=notrunc > 1+0 records in > 1+0 records out > 1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.0172151 s, 60.9 MB/s > root@build-00:/mnt# xfs_bmap -vp test.bin > test.bin: > EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS > 0: [0..2047]: 192..2239 0 (192..2239) 2048 000000 > 1: [2048..4095]: hole 2048 > 2: [4096..20479]: 4288..20671 0 (4288..20671) 16384 000000 > root@build-00:/mnt# cat /sys/fs/xfs/debug/ > always_cow bload_node_slack larp mount_delay > bload_leaf_slack bug_on_assert log_recovery_delay pwork_threads > root@build-00:/mnt# cat /sys/fs/xfs/debug/always_cow > 1 > > > -- > -lhe > ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-10 0:44 ` Darrick J. Wong 2026-03-10 10:10 ` Pankaj Raghav (Samsung) @ 2026-03-10 10:20 ` Lukas Herbolt 2026-03-10 14:57 ` Darrick J. Wong 1 sibling, 1 reply; 13+ messages in thread From: Lukas Herbolt @ 2026-03-10 10:20 UTC (permalink / raw) To: Darrick J. Wong; +Cc: linux-xfs, cem, hch, p.raghav On 2026-03-10 01:44, Darrick J. Wong wrote: > On Mon, Mar 09, 2026 at 07:12:36PM +0100, Lukas Herbolt wrote: >> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device >> enable the unmap write zeroes operation. >> >> Co-developed-by: Pankaj Raghav <p.raghav@samsung.com> >> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> >> Signed-off-by: Lukas Herbolt <lukas@herbolt.com> >> >> --- >> v11 changes: >> - split into 2 patches separating the bmapi_flags addition >> - 2 step allocation, to avoid zeroing beyond EOF >> >> fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------ >> 1 file changed, 29 insertions(+), 12 deletions(-) >> >> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c >> index fd049a1fc9c6..f8c1611e3267 100644 >> --- a/fs/xfs/xfs_file.c >> +++ b/fs/xfs/xfs_file.c >> @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range( >> unsigned int blksize = i_blocksize(inode); >> loff_t new_size = 0; >> int error; >> + bool need_convert = false; >> >> trace_xfs_zero_file_space(ip); >> >> + if (mode & FALLOC_FL_WRITE_ZEROES) { >> + if (xfs_is_always_cow_inode(ip) || >> + !bdev_write_zeroes_unmap_sectors( >> + xfs_inode_buftarg(ip)->bt_bdev)) >> + return -EOPNOTSUPP; >> + need_convert = true; >> + } >> + >> error = xfs_falloc_newsize(file, mode, offset, len, &new_size); >> if (error) >> return error; >> >> if (xfs_falloc_force_zero(ip, ac)) { >> error = xfs_zero_range(ip, offset, len, ac, NULL); >> - } else { >> - error = xfs_free_file_space(ip, offset, len, ac); >> - if (error) >> - return error; >> - >> - len = round_up(offset + len, blksize) - >> - round_down(offset, blksize); >> - offset = round_down(offset, blksize); >> - error = xfs_alloc_file_space(ip, offset, len, >> - XFS_BMAPI_PREALLOC); >> + goto set_filesize; >> } >> + error = xfs_free_file_space(ip, offset, len, ac); >> if (error) >> return error; >> - return xfs_falloc_setsize(file, new_size); >> + >> + len = round_up(offset + len, blksize) - round_down(offset, blksize); >> + offset = round_down(offset, blksize); >> + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); >> + >> +set_filesize: >> + if (error) >> + return error; >> + >> + error = xfs_falloc_setsize(file, new_size); >> + if (error) >> + return error; >> + if (need_convert) >> + error = xfs_alloc_file_space(ip, offset, len, >> + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); >> + return error; >> } > > I can't help but think this would be cleaner as: > > static int > xfs_falloc_write_zero_range( > struct file *file, > int mode, > loff_t offset, > loff_t len, > struct xfs_zone_alloc_ctx *ac) > { > struct inode *inode = file_inode(file); > struct xfs_inode *ip = XFS_I(inode); > unsigned int blksize = i_blocksize(inode); > loff_t new_size = 0; > int error; > > trace_xfs_zero_file_space(ip); > > if (xfs_is_always_cow_inode(ip) || > !bdev_write_zeroes_unmap_sectors( > xfs_inode_buftarg(ip)->bt_bdev)) > return -EOPNOTSUPP; > > error = xfs_falloc_newsize(file, mode, offset, len, &new_size); > if (error) > return error; > > if (xfs_falloc_force_zero(ip, ac)) { > error = xfs_zero_range(ip, offset, len, ac, NULL); > if (error) > return error; > > return xfs_falloc_setsize(file, new_size); > } > > error = xfs_free_file_space(ip, offset, len, ac); > if (error) > return error; > > len = round_up(offset + len, blksize) - > round_down(offset, blksize); > offset = round_down(offset, blksize); > error = xfs_alloc_file_space(ip, offset, len); > if (error) > return error; > > error = xfs_falloc_setsize(file, new_size); > if (error) > return error; > > return xfs_alloc_file_space(ip, offset, len, > XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); > } I didn't want to duplicate most of the xfs_falloc_zero_range, but if that's fine we can go this way. > ...assuming there's even a point to WRITE_ZEROES on a zoned file? I think this is covered in: if (xfs_is_always_cow_inode(ip) || !bdev_write_zeroes_unmap_sectors( xfs_inode_buftarg(ip)->bt_bdev)) return -EOPNOTSUPP; -- -lhe ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-10 10:20 ` Lukas Herbolt @ 2026-03-10 14:57 ` Darrick J. Wong 0 siblings, 0 replies; 13+ messages in thread From: Darrick J. Wong @ 2026-03-10 14:57 UTC (permalink / raw) To: Lukas Herbolt; +Cc: linux-xfs, cem, hch, p.raghav On Tue, Mar 10, 2026 at 11:20:54AM +0100, Lukas Herbolt wrote: > On 2026-03-10 01:44, Darrick J. Wong wrote: > > On Mon, Mar 09, 2026 at 07:12:36PM +0100, Lukas Herbolt wrote: > > > Add support for FALLOC_FL_WRITE_ZEROES if the underlying device > > > enable the unmap write zeroes operation. > > > > > > Co-developed-by: Pankaj Raghav <p.raghav@samsung.com> > > > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> > > > Signed-off-by: Lukas Herbolt <lukas@herbolt.com> > > > > > > --- > > > v11 changes: > > > - split into 2 patches separating the bmapi_flags addition > > > - 2 step allocation, to avoid zeroing beyond EOF > > > > > > fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------ > > > 1 file changed, 29 insertions(+), 12 deletions(-) > > > > > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > > > index fd049a1fc9c6..f8c1611e3267 100644 > > > --- a/fs/xfs/xfs_file.c > > > +++ b/fs/xfs/xfs_file.c > > > @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range( > > > unsigned int blksize = i_blocksize(inode); > > > loff_t new_size = 0; > > > int error; > > > + bool need_convert = false; > > > > > > trace_xfs_zero_file_space(ip); > > > > > > + if (mode & FALLOC_FL_WRITE_ZEROES) { > > > + if (xfs_is_always_cow_inode(ip) || > > > + !bdev_write_zeroes_unmap_sectors( > > > + xfs_inode_buftarg(ip)->bt_bdev)) > > > + return -EOPNOTSUPP; > > > + need_convert = true; > > > + } > > > + > > > error = xfs_falloc_newsize(file, mode, offset, len, &new_size); > > > if (error) > > > return error; > > > > > > if (xfs_falloc_force_zero(ip, ac)) { > > > error = xfs_zero_range(ip, offset, len, ac, NULL); > > > - } else { > > > - error = xfs_free_file_space(ip, offset, len, ac); > > > - if (error) > > > - return error; > > > - > > > - len = round_up(offset + len, blksize) - > > > - round_down(offset, blksize); > > > - offset = round_down(offset, blksize); > > > - error = xfs_alloc_file_space(ip, offset, len, > > > - XFS_BMAPI_PREALLOC); > > > + goto set_filesize; > > > } > > > + error = xfs_free_file_space(ip, offset, len, ac); > > > if (error) > > > return error; > > > - return xfs_falloc_setsize(file, new_size); > > > + > > > + len = round_up(offset + len, blksize) - round_down(offset, blksize); > > > + offset = round_down(offset, blksize); > > > + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); > > > + > > > +set_filesize: > > > + if (error) > > > + return error; > > > + > > > + error = xfs_falloc_setsize(file, new_size); > > > + if (error) > > > + return error; > > > + if (need_convert) > > > + error = xfs_alloc_file_space(ip, offset, len, > > > + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); > > > + return error; > > > } > > > > I can't help but think this would be cleaner as: > > > > static int > > xfs_falloc_write_zero_range( > > struct file *file, > > int mode, > > loff_t offset, > > loff_t len, > > struct xfs_zone_alloc_ctx *ac) > > { > > struct inode *inode = file_inode(file); > > struct xfs_inode *ip = XFS_I(inode); > > unsigned int blksize = i_blocksize(inode); > > loff_t new_size = 0; > > int error; > > > > trace_xfs_zero_file_space(ip); > > > > if (xfs_is_always_cow_inode(ip) || > > !bdev_write_zeroes_unmap_sectors( > > xfs_inode_buftarg(ip)->bt_bdev)) > > return -EOPNOTSUPP; > > > > error = xfs_falloc_newsize(file, mode, offset, len, &new_size); > > if (error) > > return error; > > > > if (xfs_falloc_force_zero(ip, ac)) { > > error = xfs_zero_range(ip, offset, len, ac, NULL); > > if (error) > > return error; > > > > return xfs_falloc_setsize(file, new_size); > > } > > > > error = xfs_free_file_space(ip, offset, len, ac); > > if (error) > > return error; > > > > len = round_up(offset + len, blksize) - > > round_down(offset, blksize); > > offset = round_down(offset, blksize); > > error = xfs_alloc_file_space(ip, offset, len); > > if (error) > > return error; > > > > error = xfs_falloc_setsize(file, new_size); > > if (error) > > return error; > > > > return xfs_alloc_file_space(ip, offset, len, > > XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); > > } > I didn't want to duplicate most of the xfs_falloc_zero_range, but if that's > fine we can go this way. I /much/ prefer that each fallocate mode have its own small cohesive function. We used to have one giant function with conditionals everywhere and it was very hard to understand. > > ...assuming there's even a point to WRITE_ZEROES on a zoned file? > I think this is covered in: > > if (xfs_is_always_cow_inode(ip) || > !bdev_write_zeroes_unmap_sectors( > xfs_inode_buftarg(ip)->bt_bdev)) > return -EOPNOTSUPP; Yeah, it is. I missed that, so this becomes shorter: static int xfs_falloc_write_zero_range( struct file *file, int mode, loff_t offset, loff_t len, struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); unsigned int blksize = i_blocksize(inode); loff_t new_size = 0; int error; if (xfs_is_always_cow_inode(ip) || !bdev_write_zeroes_unmap_sectors( xfs_inode_buftarg(ip)->bt_bdev)) return -EOPNOTSUPP; trace_xfs_write_zero_range(ip); error = xfs_falloc_newsize(file, mode, offset, len, &new_size); if (error) return error; error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; len = round_up(offset + len, blksize) - round_down(offset, blksize); offset = round_down(offset, blksize); error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); if (error) return error; error = xfs_falloc_setsize(file, new_size); if (error) return error; return xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); } --D ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-09 18:12 [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt 2026-03-10 0:44 ` Darrick J. Wong @ 2026-03-11 0:12 ` Dave Chinner 2026-03-12 21:36 ` Pankaj Raghav (Samsung) 2026-03-16 5:03 ` Lukas Herbolt 1 sibling, 2 replies; 13+ messages in thread From: Dave Chinner @ 2026-03-11 0:12 UTC (permalink / raw) To: Lukas Herbolt; +Cc: linux-xfs, cem, hch, djwong, p.raghav On Mon, Mar 09, 2026 at 07:12:36PM +0100, Lukas Herbolt wrote: > Add support for FALLOC_FL_WRITE_ZEROES if the underlying device > enable the unmap write zeroes operation. > > Co-developed-by: Pankaj Raghav <p.raghav@samsung.com> > Signed-off-by: Pankaj Raghav <p.raghav@samsung.com> > Signed-off-by: Lukas Herbolt <lukas@herbolt.com> > > --- > v11 changes: > - split into 2 patches separating the bmapi_flags addition > - 2 step allocation, to avoid zeroing beyond EOF > > fs/xfs/xfs_file.c | 41 +++++++++++++++++++++++++++++------------ > 1 file changed, 29 insertions(+), 12 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index fd049a1fc9c6..f8c1611e3267 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -1293,29 +1293,45 @@ xfs_falloc_zero_range( > unsigned int blksize = i_blocksize(inode); > loff_t new_size = 0; > int error; > + bool need_convert = false; > > trace_xfs_zero_file_space(ip); > > + if (mode & FALLOC_FL_WRITE_ZEROES) { > + if (xfs_is_always_cow_inode(ip) || > + !bdev_write_zeroes_unmap_sectors( > + xfs_inode_buftarg(ip)->bt_bdev)) > + return -EOPNOTSUPP; > + need_convert = true; > + } > + > error = xfs_falloc_newsize(file, mode, offset, len, &new_size); > if (error) > return error; > > if (xfs_falloc_force_zero(ip, ac)) { > error = xfs_zero_range(ip, offset, len, ac, NULL); > - } else { > - error = xfs_free_file_space(ip, offset, len, ac); > - if (error) > - return error; > - > - len = round_up(offset + len, blksize) - > - round_down(offset, blksize); > - offset = round_down(offset, blksize); > - error = xfs_alloc_file_space(ip, offset, len, > - XFS_BMAPI_PREALLOC); > + goto set_filesize; > } > + error = xfs_free_file_space(ip, offset, len, ac); > if (error) > return error; > - return xfs_falloc_setsize(file, new_size); > + > + len = round_up(offset + len, blksize) - round_down(offset, blksize); > + offset = round_down(offset, blksize); > + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); xfs_alloc_file_space() already rounds the offset down and the range end upwards to block size boundaries (XFS_B_TO_FSBT() rounds down, XFS_B_TO_FSB rounds up). There is no need to do it here. > + > +set_filesize: > + if (error) > + return error; > + > + error = xfs_falloc_setsize(file, new_size); > + if (error) > + return error; > + if (need_convert) > + error = xfs_alloc_file_space(ip, offset, len, > + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); > + return error; Honestly, this "prealloc, set file size, then convert and zero" thing seems a bit ... clunky. I mean, this is trying to work around an issue with xfs_falloc_setsize() operating on written extents beyond EOF to set EOF. But why are we even using a generic truncate operation to set the EOF when we have already done most of the complex truncate work (exclusion, page cache invalidation, extent removal, etc) already? My logic: xfs_bmapi_write(off, len, XFS_BMAPI_ZERO) will allocate all the holes and delalloc extents over the given range as written extents that are initialised to contain zeroes. xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT) will convert all the unwritten extents over a given range to written extents, but will not touch the data in those extents. xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO) will allocate holes and delalloc extents and convert unwritten extents all to written extents, and it will write zeros to all of those extents. This latter case is exactly what we want, and if gives us zeroes on disk to the end of the extent beyond the new EOF. i.e. we've completed all the data IO, and so now all we need to do is update the file size transactionally, just like we do in IO completion. Indeed, xfs_iomap_write_unwritten() integrates this size update into post-IO unwritten extent conversion. i.e we already have code that does exactly what we need, except for tweaking the xfs_bmapi_write() flags for the zeroing behaviour we need. So, a helper function in fs/xfs/xfs_bmap_util.c such as this: /* * This function is used to allocate written extents over holes * and/or convert unwritten extents to written extents based on the * @flags passed to it. * * If @flags is zero, if will allocate written extents for holes and * delalloc extents across the range. * * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do * conversion of unwritten extents in the range to written extents. * * If XFS_BMAPI_ZERO is specified in @flags, then both newly * allocated extents and converted unwritten extents will be * initialised to contain zeroes. * * If @update_isize is true, then if the range we are operating on * extends beyond the current EOF, extend i_size to offset+len * incrementally as extents in the range are allocated/converted. */ int xfs_bmap_alloc_or_convert_range( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len, uint32_t flags, bool update_isize) { < guts of xfs_iomap_write_unwritten > } Then xfs_iomap_write_unwritten() can be factored to a one-liner (or replaced altogether), and this new fallocate op pretty much becomes: error = xfs_bmap_alloc_or_convert_range(ip, offset, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, new_size ? true : false); The name of the function sucks but I can't think of anything better, so if you've got a better idea then change it. :) -Dave. -- Dave Chinner dgc@kernel.org ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-11 0:12 ` Dave Chinner @ 2026-03-12 21:36 ` Pankaj Raghav (Samsung) 2026-03-15 23:49 ` Dave Chinner 2026-03-16 5:03 ` Lukas Herbolt 1 sibling, 1 reply; 13+ messages in thread From: Pankaj Raghav (Samsung) @ 2026-03-12 21:36 UTC (permalink / raw) To: Dave Chinner; +Cc: Lukas Herbolt, linux-xfs, cem, hch, djwong, p.raghav > > + > > + len = round_up(offset + len, blksize) - round_down(offset, blksize); > > + offset = round_down(offset, blksize); > > + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); > > xfs_alloc_file_space() already rounds the offset down and the range > end upwards to block size boundaries (XFS_B_TO_FSBT() rounds down, > XFS_B_TO_FSB rounds up). There is no need to do it here. > > > + > > +set_filesize: > > + if (error) > > + return error; > > + > > + error = xfs_falloc_setsize(file, new_size); > > + if (error) > > + return error; > > + if (need_convert) > > + error = xfs_alloc_file_space(ip, offset, len, > > + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); > > + return error; > > Honestly, this "prealloc, set file size, then convert and zero" thing > seems a bit ... clunky. > > I mean, this is trying to work around an issue with > xfs_falloc_setsize() operating on written extents beyond EOF to set > EOF. But why are we even using a generic truncate operation to set > the EOF when we have already done most of the complex truncate work > (exclusion, page cache invalidation, extent removal, etc) already? I agree. > > My logic: > > xfs_bmapi_write(off, len, XFS_BMAPI_ZERO) will allocate all the > holes and delalloc extents over the given range as written extents > that are initialised to contain zeroes. > > xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT) will convert all the > unwritten extents over a given range to written extents, but will > not touch the data in those extents. > > xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO) will > allocate holes and delalloc extents and convert unwritten extents > all to written extents, and it will write zeros to all of those > extents. > > This latter case is exactly what we want, and if gives us zeroes on > disk to the end of the extent beyond the new EOF. i.e. we've > completed all the data IO, and so now all we need > to do is update the file size transactionally, just like we do in > IO completion. > > Indeed, xfs_iomap_write_unwritten() integrates this size update > into post-IO unwritten extent conversion. i.e we already have code > that does exactly what we need, except for tweaking the > xfs_bmapi_write() flags for the zeroing behaviour we need. This makes sense. > > So, a helper function in fs/xfs/xfs_bmap_util.c such as this: > > /* > * This function is used to allocate written extents over holes > * and/or convert unwritten extents to written extents based on the > * @flags passed to it. > * > * If @flags is zero, if will allocate written extents for holes and s/zero/XFS_BMAPI_ZERO ? > * delalloc extents across the range. > * > * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do > * conversion of unwritten extents in the range to written extents. > * > * If XFS_BMAPI_ZERO is specified in @flags, then both newly s/XFS_BMAPI_ZERO/(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT) ? > * allocated extents and converted unwritten extents will be > * initialised to contain zeroes. > * > * If @update_isize is true, then if the range we are operating on > * extends beyond the current EOF, extend i_size to offset+len > * incrementally as extents in the range are allocated/converted. > */ > int > xfs_bmap_alloc_or_convert_range( > struct xfs_inode *ip, > xfs_off_t offset, > xfs_off_t len, > uint32_t flags, > bool update_isize) > { > < guts of xfs_iomap_write_unwritten > > } > > Then xfs_iomap_write_unwritten() can be factored to a one-liner > (or replaced altogether), and this new fallocate op pretty much > becomes: > > error = xfs_bmap_alloc_or_convert_range(ip, offset, len, > XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, > new_size ? true : false); > > The name of the function sucks but I can't think of anything better, > so if you've got a better idea then change it. :) Thanks for the explanation Dave! I can take a shot at your suggestion unless @Lukas already started working on it? -- Pankaj ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-12 21:36 ` Pankaj Raghav (Samsung) @ 2026-03-15 23:49 ` Dave Chinner 2026-03-16 7:23 ` Pankaj Raghav 0 siblings, 1 reply; 13+ messages in thread From: Dave Chinner @ 2026-03-15 23:49 UTC (permalink / raw) To: Pankaj Raghav (Samsung) Cc: Lukas Herbolt, linux-xfs, cem, hch, djwong, p.raghav On Thu, Mar 12, 2026 at 09:36:50PM +0000, Pankaj Raghav (Samsung) wrote: > > > + > > > + len = round_up(offset + len, blksize) - round_down(offset, blksize); > > > + offset = round_down(offset, blksize); > > > + error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_PREALLOC); > > > > xfs_alloc_file_space() already rounds the offset down and the range > > end upwards to block size boundaries (XFS_B_TO_FSBT() rounds down, > > XFS_B_TO_FSB rounds up). There is no need to do it here. > > > > > + > > > +set_filesize: > > > + if (error) > > > + return error; > > > + > > > + error = xfs_falloc_setsize(file, new_size); > > > + if (error) > > > + return error; > > > + if (need_convert) > > > + error = xfs_alloc_file_space(ip, offset, len, > > > + XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO); > > > + return error; > > > > Honestly, this "prealloc, set file size, then convert and zero" thing > > seems a bit ... clunky. > > > > I mean, this is trying to work around an issue with > > xfs_falloc_setsize() operating on written extents beyond EOF to set > > EOF. But why are we even using a generic truncate operation to set > > the EOF when we have already done most of the complex truncate work > > (exclusion, page cache invalidation, extent removal, etc) already? > > I agree. > > > > > My logic: > > > > xfs_bmapi_write(off, len, XFS_BMAPI_ZERO) will allocate all the > > holes and delalloc extents over the given range as written extents > > that are initialised to contain zeroes. > > > > xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT) will convert all the > > unwritten extents over a given range to written extents, but will > > not touch the data in those extents. > > > > xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO) will > > allocate holes and delalloc extents and convert unwritten extents > > all to written extents, and it will write zeros to all of those > > extents. > > > > This latter case is exactly what we want, and if gives us zeroes on > > disk to the end of the extent beyond the new EOF. i.e. we've > > completed all the data IO, and so now all we need > > to do is update the file size transactionally, just like we do in > > IO completion. > > > > Indeed, xfs_iomap_write_unwritten() integrates this size update > > into post-IO unwritten extent conversion. i.e we already have code > > that does exactly what we need, except for tweaking the > > xfs_bmapi_write() flags for the zeroing behaviour we need. > > This makes sense. > > > > > So, a helper function in fs/xfs/xfs_bmap_util.c such as this: > > > > /* > > * This function is used to allocate written extents over holes > > * and/or convert unwritten extents to written extents based on the > > * @flags passed to it. > > * > > * If @flags is zero, if will allocate written extents for holes and > > s/zero/XFS_BMAPI_ZERO ? No, I explicitly meant flags = 0. > > * delalloc extents across the range. > > * > > * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do > > * conversion of unwritten extents in the range to written extents. > > * > > * If XFS_BMAPI_ZERO is specified in @flags, then both newly > > s/XFS_BMAPI_ZERO/(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT) ? No I explictly describing what happens when you do "flags |= XFS_BMAPI_ZERO;" to any other xfs_bmapi_write() operation flag specification. i.e. XFS_BMAPI_ZERO is an optional behavioural modifier for allocation and/or unwritten extent conversion operations. i.e all it does is add initialisation of the new/modified extents to contain real zeroes. e.g. flags = 0 means "allocate holes/delalloc as uninitialised written extents". flags = 0 | XFS_BMAPI_ZERO means "allocate holes/delalloc and initialise them to contain zeros". Unwritten extents will not be modified by either operation. Cheers, Dave. -- Dave Chinner dgc@kernel.org ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-15 23:49 ` Dave Chinner @ 2026-03-16 7:23 ` Pankaj Raghav 0 siblings, 0 replies; 13+ messages in thread From: Pankaj Raghav @ 2026-03-16 7:23 UTC (permalink / raw) To: Dave Chinner; +Cc: Lukas Herbolt, linux-xfs, cem, hch, djwong, p.raghav >>> So, a helper function in fs/xfs/xfs_bmap_util.c such as this: >>> >>> /* >>> * This function is used to allocate written extents over holes >>> * and/or convert unwritten extents to written extents based on the >>> * @flags passed to it. >>> * >>> * If @flags is zero, if will allocate written extents for holes and >> >> s/zero/XFS_BMAPI_ZERO ? > > No, I explicitly meant flags = 0. > >>> * delalloc extents across the range. >>> * >>> * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do >>> * conversion of unwritten extents in the range to written extents. >>> * >>> * If XFS_BMAPI_ZERO is specified in @flags, then both newly >> >> s/XFS_BMAPI_ZERO/(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT) ? > > No I explictly describing what happens when you do "flags |= > XFS_BMAPI_ZERO;" to any other xfs_bmapi_write() operation > flag specification. > > i.e. XFS_BMAPI_ZERO is an optional behavioural modifier for > allocation and/or unwritten extent conversion operations. i.e all it > does is add initialisation of the new/modified extents to contain > real zeroes. > > e.g. flags = 0 means "allocate holes/delalloc as uninitialised > written extents". flags = 0 | XFS_BMAPI_ZERO > means "allocate holes/delalloc and initialise them to contain > zeros". Unwritten extents will not be modified by either operation. > Ok, Thanks for the clarification. I was about to send the patches. I will make this change and send the patches. -- Pankaj ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-11 0:12 ` Dave Chinner 2026-03-12 21:36 ` Pankaj Raghav (Samsung) @ 2026-03-16 5:03 ` Lukas Herbolt 2026-03-17 12:20 ` Pankaj Raghav (Samsung) 1 sibling, 1 reply; 13+ messages in thread From: Lukas Herbolt @ 2026-03-16 5:03 UTC (permalink / raw) To: Dave Chinner; +Cc: linux-xfs, cem, hch, djwong, p.raghav > Honestly, this "prealloc, set file size, then convert and zero" thing > seems a bit ... clunky. > > I mean, this is trying to work around an issue with > xfs_falloc_setsize() operating on written extents beyond EOF to set > EOF. But why are we even using a generic truncate operation to set > the EOF when we have already done most of the complex truncate work > (exclusion, page cache invalidation, extent removal, etc) already? > > My logic: > > xfs_bmapi_write(off, len, XFS_BMAPI_ZERO) will allocate all the > holes and delalloc extents over the given range as written extents > that are initialised to contain zeroes. > > xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT) will convert all the > unwritten extents over a given range to written extents, but will > not touch the data in those extents. > > xfs_bmapi_write(off, len, XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO) will > allocate holes and delalloc extents and convert unwritten extents > all to written extents, and it will write zeros to all of those > extents. > > This latter case is exactly what we want, and if gives us zeroes on > disk to the end of the extent beyond the new EOF. i.e. we've > completed all the data IO, and so now all we need > to do is update the file size transactionally, just like we do in > IO completion. > > Indeed, xfs_iomap_write_unwritten() integrates this size update > into post-IO unwritten extent conversion. i.e we already have code > that does exactly what we need, except for tweaking the > xfs_bmapi_write() flags for the zeroing behaviour we need. > > So, a helper function in fs/xfs/xfs_bmap_util.c such as this: > > /* > * This function is used to allocate written extents over holes > * and/or convert unwritten extents to written extents based on the > * @flags passed to it. > * > * If @flags is zero, if will allocate written extents for holes and > * delalloc extents across the range. > * > * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do > * conversion of unwritten extents in the range to written extents. > * > * If XFS_BMAPI_ZERO is specified in @flags, then both newly > * allocated extents and converted unwritten extents will be > * initialised to contain zeroes. > * > * If @update_isize is true, then if the range we are operating on > * extends beyond the current EOF, extend i_size to offset+len > * incrementally as extents in the range are allocated/converted. > */ > int > xfs_bmap_alloc_or_convert_range( > struct xfs_inode *ip, > xfs_off_t offset, > xfs_off_t len, > uint32_t flags, > bool update_isize) > { > < guts of xfs_iomap_write_unwritten > > } > > Then xfs_iomap_write_unwritten() can be factored to a one-liner > (or replaced altogether), and this new fallocate op pretty much > becomes: > > error = xfs_bmap_alloc_or_convert_range(ip, offset, len, > XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, > new_size ? true : false); > > The name of the function sucks but I can't think of anything better, > so if you've got a better idea then change it. :) > > -Dave. When I first start looking into it the xfs_iomap_write_unwritten() was first place I was exploring, but did not think of the the above. I started working on it but I will be travelling this week so after that you may expect v13. -- -lhe ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base 2026-03-16 5:03 ` Lukas Herbolt @ 2026-03-17 12:20 ` Pankaj Raghav (Samsung) 0 siblings, 0 replies; 13+ messages in thread From: Pankaj Raghav (Samsung) @ 2026-03-17 12:20 UTC (permalink / raw) To: Lukas Herbolt; +Cc: Dave Chinner, linux-xfs, cem, hch, djwong, p.raghav > > > > Then xfs_iomap_write_unwritten() can be factored to a one-liner > > (or replaced altogether), and this new fallocate op pretty much > > becomes: > > > > error = xfs_bmap_alloc_or_convert_range(ip, offset, len, > > XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO, > > new_size ? true : false); > > > > The name of the function sucks but I can't think of anything better, > > so if you've got a better idea then change it. :) > > > > -Dave. > > When I first start looking into it the xfs_iomap_write_unwritten() was first > place I was exploring, but did not think of the the above. I started working > on it but I will be travelling this week so after that you may > expect v13. @Lukas and @Dave: I took a shot at this in the meantime. Simple tests seems to be passing with this approach but I noticed I am getting a panic when I run generic/127 with my xfstests patches to ltp [1]. I will keep digging on the side. Let me know if something looks obviously wrong in the implementation. This is what I have for xfs_bmap_alloc_or_convert_range() [2]: /* * This function is used to allocate written extents over holes * and/or convert unwritten extents to written extents based on the * @flags passed to it. * * If @flags is zero, if will allocate written extents for holes and * delalloc extents across the range. * * If XFS_BMAPI_CONVERT is specified in @flags, then it will also do * conversion of unwritten extents in the range to written extents. * * If XFS_BMAPI_ZERO is specified in @flags, then both newly * allocated extents and converted unwritten extents will be * initialised to contain zeroes. * * If @update_isize is true, then if the range we are operating on * extends beyond the current EOF, extend i_size to offset+len * incrementally as extents in the range are allocated/converted. */ int xfs_bmap_alloc_or_convert_range( struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, uint32_t flags, bool update_isize) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; xfs_filblks_t count_fsb; xfs_filblks_t numblks_fsb; int nimaps; xfs_trans_t *tp; xfs_bmbt_irec_t imap; struct inode *inode = VFS_I(ip); xfs_fsize_t i_size; int error; ASSERT((flags & ~(XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT)) == 0); offset_fsb = XFS_B_TO_FSBT(mp, offset); count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); /* Attach dquots so that bmbt splits are accounted correctly. */ error = xfs_qm_dqattach(ip); if (error) return error; do { uint resblks, dblocks; /* * The transaction reservation is limited to a 32-bit block * count, hence we need to limit the number of blocks we are * trying to reserve to avoid an overflow. */ resblks = XFS_FILBLKS_MIN(count_fsb, XFS_MAX_BMBT_EXTLEN); /* * The blocks either need to be allocated or converted from * unwritten to written. Consider the allocation case to * cover both scenarios. */ dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); /* * Set up a transaction to convert the range of extents * based on the flags. Do allocations in a loop until * we have covered the range passed in. * * Note that we can't risk to recursing back into the filesystem * here as we might be asked to write out the same inode that we * complete here and might deadlock on the iolock. */ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, 0, true, &tp); if (error) return error; error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, flags, 0, &imap, &nimaps); /* * TODO: should we try if we get ENOSR? */ if (error) goto error_on_bmapi_transaction; /* * Log the updated inode size as we go. We have to be careful * to only log it up to the actual write offset if it is * halfway into a block. */ i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); if (i_size > offset + count) i_size = offset + count; if (update_isize && i_size > i_size_read(inode)) i_size_write(inode, i_size); i_size = xfs_new_eof(ip, i_size); if (i_size) { ip->i_disk_size = i_size; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return xfs_alert_fsblock_zero(ip, &imap); } if ((numblks_fsb = imap.br_blockcount) == 0) { /* * The numblks_fsb value should always get * smaller, otherwise the loop is stuck. */ ASSERT(imap.br_blockcount); break; } offset_fsb += numblks_fsb; count_fsb -= numblks_fsb; } while (count_fsb > 0); return 0; error_on_bmapi_transaction: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } [1] https://lore.kernel.org/linux-xfs/20260312195308.738189-1-p.raghav@samsung.com/ [2] https://github.com/Panky-codes/linux/tree/xfs-falloc-write-zeroes-dave -- Pankaj ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2026-03-17 12:20 UTC | newest] Thread overview: 13+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-03-09 18:12 [PATCH v11 2/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt 2026-03-10 0:44 ` Darrick J. Wong 2026-03-10 10:10 ` Pankaj Raghav (Samsung) 2026-03-10 11:22 ` Lukas Herbolt 2026-03-10 15:02 ` Darrick J. Wong 2026-03-10 10:20 ` Lukas Herbolt 2026-03-10 14:57 ` Darrick J. Wong 2026-03-11 0:12 ` Dave Chinner 2026-03-12 21:36 ` Pankaj Raghav (Samsung) 2026-03-15 23:49 ` Dave Chinner 2026-03-16 7:23 ` Pankaj Raghav 2026-03-16 5:03 ` Lukas Herbolt 2026-03-17 12:20 ` Pankaj Raghav (Samsung)
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox