public inbox for linux-xfs@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] Add FL_WRITE_ZEROES to XFS, fix krealloc on xfs_uuid_table
@ 2025-10-21 14:17 Lukas Herbolt
  2025-10-21 14:17 ` [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt
  2025-10-21 14:17 ` [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE Lukas Herbolt
  0 siblings, 2 replies; 25+ messages in thread
From: Lukas Herbolt @ 2025-10-21 14:17 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs, Lukas Herbolt

[PATCH 1/2] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables
the unmap write zeroes operation.

Inspired by the Ext4 implementation of the FALLOC_FL_WRITE_ZEROES. It
can speed up some patterns on specific hardware.

time ( ./fallocate -l 360M /mnt/test.file; dd if=/dev/zero of=/mnt/test \
bs=1M count=360 conv=notrunc,nocreat oflag=direct,dsync)

360+0 records in
360+0 records out
377487360 bytes (377 MB, 360 MiB) copied, 22.0027 s, 17.2 MB/s

real    0m22.114s
user    0m0.006s
sys     0m3.085s

time (./fallocate -wl 360M /mnt/test.file; dd if=/dev/zero of=/mnt/test \
bs=1M count=360 conv=notrunc,nocreat oflag=direct,dsync );
360+0 records in
360+0 records out
377487360 bytes (377 MB, 360 MiB) copied, 2.02512 s, 186 MB/s

real    0m6.384s
user    0m0.002s
sys     0m5.823s

v2 changes:
use xfs_inode_buftarg to determine if the underlying device supports unmap 
write zeroes
v1 patch: 
https://lore.kernel.org/linux-xfs/20251002122823.1875398-2-lukas@herbolt.com/

[PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE.
Currently, using krealloc prints a warning if the allocation is bigger than
2x PAGE_SIZE; on x86_64 this is triggered when we mount 511 XFS filesystems.
Use kvrealloc instead.

Lukas Herbolt (2):
  xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE.

 fs/xfs/xfs_bmap_util.c |  6 +++---
 fs/xfs/xfs_bmap_util.h |  4 ++--
 fs/xfs/xfs_file.c      | 25 ++++++++++++++++++-------
 fs/xfs/xfs_mount.c     |  2 +-
 4 files changed, 24 insertions(+), 13 deletions(-)

-- 
2.51.0


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-21 14:17 [PATCH 0/2] Add FL_WRITE_ZEROES to XFS, fix krealloc on xfs_uuid_table Lukas Herbolt
@ 2025-10-21 14:17 ` Lukas Herbolt
  2025-10-21 15:55   ` Darrick J. Wong
  2025-10-22  5:00   ` Christoph Hellwig
  2025-10-21 14:17 ` [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE Lukas Herbolt
  1 sibling, 2 replies; 25+ messages in thread
From: Lukas Herbolt @ 2025-10-21 14:17 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs, Lukas Herbolt

Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables
the unmap write zeroes operation.

Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
---
 fs/xfs/xfs_bmap_util.c |  6 +++---
 fs/xfs/xfs_bmap_util.h |  4 ++--
 fs/xfs/xfs_file.c      | 25 ++++++++++++++++++-------
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e430..fd43c9db79a8d 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -645,6 +645,7 @@ xfs_free_eofblocks(
 int
 xfs_alloc_file_space(
 	struct xfs_inode	*ip,
+	uint32_t		flags,		/* XFS_BMAPI_... */
 	xfs_off_t		offset,
 	xfs_off_t		len)
 {
@@ -747,9 +748,8 @@ xfs_alloc_file_space(
 		 * startoffset_fsb so that one of the following allocations
 		 * will eventually reach the requested range.
 		 */
-		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
-				&nimaps);
+		error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb,
+				flags, 0, imapp, &nimaps);
 		if (error) {
 			if (error != -ENOSR)
 				goto error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index c477b33616304..67770830eb245 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -55,8 +55,8 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 			     int *is_empty);
 
 /* preallocation and hole punch interface */
-int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
-		xfs_off_t len);
+int	xfs_alloc_file_space(struct xfs_inode *ip, uint32_t flags,
+		xfs_off_t offset, xfs_off_t len);
 int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f96fbf5c54c99..b7e8cda62bb73 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1255,29 +1255,37 @@ xfs_falloc_insert_range(
 static int
 xfs_falloc_zero_range(
 	struct file		*file,
-	int			mode,
+	int				mode,
 	loff_t			offset,
 	loff_t			len,
 	struct xfs_zone_alloc_ctx *ac)
 {
 	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
 	unsigned int		blksize = i_blocksize(inode);
 	loff_t			new_size = 0;
 	int			error;
 
-	trace_xfs_zero_file_space(XFS_I(inode));
+	trace_xfs_zero_file_space(ip);
 
 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
 	if (error)
 		return error;
 
-	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
+	error = xfs_free_file_space(ip, offset, len, ac);
 	if (error)
 		return error;
 
 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
 	offset = round_down(offset, blksize);
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	if (mode & FALLOC_FL_WRITE_ZEROES) {
+		if (!bdev_write_zeroes_unmap_sectors(xfs_inode_buftarg(ip)->bt_bdev))
+			return -EOPNOTSUPP;
+		xfs_alloc_file_space(ip, XFS_BMAPI_ZERO, offset, len);
+	} else {
+		error = xfs_alloc_file_space(ip, XFS_BMAPI_PREALLOC,
+				offset, len);
+	}
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1302,7 +1310,8 @@ xfs_falloc_unshare_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), XFS_BMAPI_PREALLOC,
+			offset, len);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1330,7 +1339,8 @@ xfs_falloc_allocate_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), XFS_BMAPI_PREALLOC,
+			offset, len);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1340,7 +1350,7 @@ xfs_falloc_allocate_range(
 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
-		 FALLOC_FL_UNSHARE_RANGE)
+		 FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
 
 STATIC long
 __xfs_file_fallocate(
@@ -1383,6 +1393,7 @@ __xfs_file_fallocate(
 	case FALLOC_FL_INSERT_RANGE:
 		error = xfs_falloc_insert_range(file, offset, len);
 		break;
+	case FALLOC_FL_WRITE_ZEROES:
 	case FALLOC_FL_ZERO_RANGE:
 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
 		break;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE.
  2025-10-21 14:17 [PATCH 0/2] Add FL_WRITE_ZEROES to XFS, fix krealloc on xfs_uuid_table Lukas Herbolt
  2025-10-21 14:17 ` [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt
@ 2025-10-21 14:17 ` Lukas Herbolt
  2025-10-21 15:56   ` Darrick J. Wong
                     ` (2 more replies)
  1 sibling, 3 replies; 25+ messages in thread
From: Lukas Herbolt @ 2025-10-21 14:17 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs, Lukas Herbolt

krealloc prints out a warning if the allocation is bigger than 2x PAGE_SIZE;
let's use kvrealloc for the memory allocation.

Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
---
 fs/xfs/xfs_mount.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index dc32c5e34d817..e728e61c9325a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -89,7 +89,7 @@ xfs_uuid_mount(
 	}
 
 	if (hole < 0) {
-		xfs_uuid_table = krealloc(xfs_uuid_table,
+		xfs_uuid_table = kvrealloc(xfs_uuid_table,
 			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
 			GFP_KERNEL | __GFP_NOFAIL);
 		hole = xfs_uuid_table_size++;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-21 14:17 ` [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt
@ 2025-10-21 15:55   ` Darrick J. Wong
  2025-10-22  5:00   ` Christoph Hellwig
  1 sibling, 0 replies; 25+ messages in thread
From: Darrick J. Wong @ 2025-10-21 15:55 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: linux-xfs

On Tue, Oct 21, 2025 at 04:17:44PM +0200, Lukas Herbolt wrote:
> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enable
> the unmap write zeroes operation.
> 
> Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
> ---
>  fs/xfs/xfs_bmap_util.c |  6 +++---
>  fs/xfs/xfs_bmap_util.h |  4 ++--
>  fs/xfs/xfs_file.c      | 25 ++++++++++++++++++-------
>  3 files changed, 23 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 06ca11731e430..fd43c9db79a8d 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -645,6 +645,7 @@ xfs_free_eofblocks(
>  int
>  xfs_alloc_file_space(
>  	struct xfs_inode	*ip,
> +	uint32_t		flags,		/* XFS_BMAPI_... */
>  	xfs_off_t		offset,
>  	xfs_off_t		len)
>  {
> @@ -747,9 +748,8 @@ xfs_alloc_file_space(
>  		 * startoffset_fsb so that one of the following allocations
>  		 * will eventually reach the requested range.
>  		 */
> -		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
> -				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
> -				&nimaps);
> +		error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb,
> +				flags, 0, imapp, &nimaps);
>  		if (error) {
>  			if (error != -ENOSR)
>  				goto error;
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index c477b33616304..67770830eb245 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -55,8 +55,8 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
>  			     int *is_empty);
>  
>  /* preallocation and hole punch interface */
> -int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
> -		xfs_off_t len);
> +int	xfs_alloc_file_space(struct xfs_inode *ip, uint32_t flags,
> +		xfs_off_t offset, xfs_off_t len);
>  int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
>  int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index f96fbf5c54c99..b7e8cda62bb73 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1255,29 +1255,37 @@ xfs_falloc_insert_range(
>  static int
>  xfs_falloc_zero_range(
>  	struct file		*file,
> -	int			mode,
> +	int				mode,
>  	loff_t			offset,
>  	loff_t			len,
>  	struct xfs_zone_alloc_ctx *ac)
>  {
>  	struct inode		*inode = file_inode(file);
> +	struct xfs_inode	*ip = XFS_I(inode);
>  	unsigned int		blksize = i_blocksize(inode);
>  	loff_t			new_size = 0;
>  	int			error;
>  
> -	trace_xfs_zero_file_space(XFS_I(inode));
> +	trace_xfs_zero_file_space(ip);
>  
>  	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
>  	if (error)
>  		return error;
>  
> -	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
> +	error = xfs_free_file_space(ip, offset, len, ac);
>  	if (error)
>  		return error;
>  
>  	len = round_up(offset + len, blksize) - round_down(offset, blksize);
>  	offset = round_down(offset, blksize);
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	if (mode & FALLOC_FL_WRITE_ZEROES) {
> +		if (!bdev_write_zeroes_unmap_sectors(xfs_inode_buftarg(ip)->bt_bdev))
> +			return -EOPNOTSUPP;
> +		xfs_alloc_file_space(ip, XFS_BMAPI_ZERO, offset, len);

Don't we need to check the return value of xfs_alloc_file_space?

--D

> +	} else {
> +		error = xfs_alloc_file_space(ip, XFS_BMAPI_PREALLOC,
> +				offset, len);
> +	}
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1302,7 +1310,8 @@ xfs_falloc_unshare_range(
>  	if (error)
>  		return error;
>  
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	error = xfs_alloc_file_space(XFS_I(inode), XFS_BMAPI_PREALLOC,
> +			offset, len);
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1330,7 +1339,8 @@ xfs_falloc_allocate_range(
>  	if (error)
>  		return error;
>  
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	error = xfs_alloc_file_space(XFS_I(inode), XFS_BMAPI_PREALLOC,
> +			offset, len);
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1340,7 +1350,7 @@ xfs_falloc_allocate_range(
>  		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
>  		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
>  		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
> -		 FALLOC_FL_UNSHARE_RANGE)
> +		 FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
>  
>  STATIC long
>  __xfs_file_fallocate(
> @@ -1383,6 +1393,7 @@ __xfs_file_fallocate(
>  	case FALLOC_FL_INSERT_RANGE:
>  		error = xfs_falloc_insert_range(file, offset, len);
>  		break;
> +	case FALLOC_FL_WRITE_ZEROES:
>  	case FALLOC_FL_ZERO_RANGE:
>  		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
>  		break;
> -- 
> 2.51.0
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE.
  2025-10-21 14:17 ` [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE Lukas Herbolt
@ 2025-10-21 15:56   ` Darrick J. Wong
  2025-10-21 22:02   ` Dave Chinner
  2025-10-22  4:53   ` Christoph Hellwig
  2 siblings, 0 replies; 25+ messages in thread
From: Darrick J. Wong @ 2025-10-21 15:56 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: linux-xfs

On Tue, Oct 21, 2025 at 04:17:45PM +0200, Lukas Herbolt wrote:
> The krealloc prints out warning if allocation is bigger than 2x PAGE_SIZE,
> lets use kvrealloc for the memory allocation.
> 
> Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
> ---
>  fs/xfs/xfs_mount.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index dc32c5e34d817..e728e61c9325a 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -89,7 +89,7 @@ xfs_uuid_mount(
>  	}
>  
>  	if (hole < 0) {
> -		xfs_uuid_table = krealloc(xfs_uuid_table,
> +		xfs_uuid_table = kvrealloc(xfs_uuid_table,

Doesn't the table need to be kvfree'd if you make this change?

--D

>  			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
>  			GFP_KERNEL | __GFP_NOFAIL);
>  		hole = xfs_uuid_table_size++;
> -- 
> 2.51.0
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE.
  2025-10-21 14:17 ` [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE Lukas Herbolt
  2025-10-21 15:56   ` Darrick J. Wong
@ 2025-10-21 22:02   ` Dave Chinner
  2025-10-26 17:49     ` lukas
  2025-10-22  4:53   ` Christoph Hellwig
  2 siblings, 1 reply; 25+ messages in thread
From: Dave Chinner @ 2025-10-21 22:02 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: djwong, linux-xfs

On Tue, Oct 21, 2025 at 04:17:45PM +0200, Lukas Herbolt wrote:
> The krealloc prints out warning if allocation is bigger than 2x PAGE_SIZE,
> lets use kvrealloc for the memory allocation.

What warning is that?  i.e. it helps to quote the error in the
commit message so that, in future, we know exactly what warning
we fixed here.

If the warning is from the  __GFP_NOFAIL directive for this
allocation, then shouldn't we convert this code not to rely on
__GFP_NOFAIL?  xfs_uuid_mount() can already return an error, so if
we get an -ENOMEM for this allocation the error should already be
handled correctly and the -ENOMEM get propagated back out to
userspace correctly....

-Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE.
  2025-10-21 14:17 ` [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE Lukas Herbolt
  2025-10-21 15:56   ` Darrick J. Wong
  2025-10-21 22:02   ` Dave Chinner
@ 2025-10-22  4:53   ` Christoph Hellwig
  2 siblings, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2025-10-22  4:53 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: djwong, linux-xfs

This doesn't seem to relate to the other patch, so bundling it into a
single series is a bit unusual, not to say slightly confusing.

On Tue, Oct 21, 2025 at 04:17:45PM +0200, Lukas Herbolt wrote:
> The krealloc prints out warning if allocation is bigger than 2x PAGE_SIZE,
> lets use kvrealloc for the memory allocation.
> 
> Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
> ---
>  fs/xfs/xfs_mount.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index dc32c5e34d817..e728e61c9325a 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -89,7 +89,7 @@ xfs_uuid_mount(
>  	}
>  
>  	if (hole < 0) {
> -		xfs_uuid_table = krealloc(xfs_uuid_table,
> +		xfs_uuid_table = kvrealloc(xfs_uuid_table,
>  			(xfs_uuid_table_size + 1) * sizeof(*xfs_uuid_table),
>  			GFP_KERNEL | __GFP_NOFAIL);
>  		hole = xfs_uuid_table_size++;

I agree with the low-level comments from Darrick and Dave.  But once
we start bikeshedding here, maybe do the grand bikeshedding and stop
using this stupid array and replace it with an xarray using XA_FLAGS_ALLOC
which takes care of all the memory allocations and even nicely frees them
on unmount?

I suspect we'd even end up with slightly less code that way, and
certainly better memory usage.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-21 14:17 ` [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt
  2025-10-21 15:55   ` Darrick J. Wong
@ 2025-10-22  5:00   ` Christoph Hellwig
  2025-10-22  7:13     ` Zhang Yi
  1 sibling, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2025-10-22  5:00 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: djwong, linux-xfs, Zhang Yi

On Tue, Oct 21, 2025 at 04:17:44PM +0200, Lukas Herbolt wrote:
> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enable
> -		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
> -				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
> -				&nimaps);
> +		error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb,
> +				flags, 0, imapp, &nimaps);

Please drop the reformatting that introduces an overly long line.

> -int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
> -		xfs_off_t len);
> +int	xfs_alloc_file_space(struct xfs_inode *ip, uint32_t flags,
> +		xfs_off_t offset, xfs_off_t len);

Also normal argument order in XFS would keep the flags last, I think
it's best to stick to that.

> -	int			mode,
> +	int				mode,

Spurious whitespace changes here.

>  	len = round_up(offset + len, blksize) - round_down(offset, blksize);
>  	offset = round_down(offset, blksize);
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	if (mode & FALLOC_FL_WRITE_ZEROES) {
> +		if (!bdev_write_zeroes_unmap_sectors(xfs_inode_buftarg(ip)->bt_bdev))
> +			return -EOPNOTSUPP;

Overly long line.

> +		xfs_alloc_file_space(ip, XFS_BMAPI_ZERO, offset, len);

As already mentioned, missing error return.


Also how is the interaction of FALLOC_FL_WRITE_ZEROES and
FALLOC_FL_KEEP_SIZE defined?


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-22  5:00   ` Christoph Hellwig
@ 2025-10-22  7:13     ` Zhang Yi
  2025-10-22  7:15       ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Zhang Yi @ 2025-10-22  7:13 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: Christoph Hellwig, djwong, linux-xfs, Zhang Yi

On 10/22/2025 1:00 PM, Christoph Hellwig wrote:
> On Tue, Oct 21, 2025 at 04:17:44PM +0200, Lukas Herbolt wrote:
>> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enable
>> -		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
>> -				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
>> -				&nimaps);
>> +		error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb,
>> +				flags, 0, imapp, &nimaps);
> 
> Please drop the reformatting that introduces an overly long line.
> 
>> -int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
>> -		xfs_off_t len);
>> +int	xfs_alloc_file_space(struct xfs_inode *ip, uint32_t flags,
>> +		xfs_off_t offset, xfs_off_t len);
> 
> Also normal argument order in XFS would keep the flags last, I think
> it's best to stick to that.
> 
>> -	int			mode,
>> +	int				mode,
> 
> Spurious whitespace changes here.
> 
>>  	len = round_up(offset + len, blksize) - round_down(offset, blksize);
>>  	offset = round_down(offset, blksize);
>> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
>> +	if (mode & FALLOC_FL_WRITE_ZEROES) {
>> +		if (!bdev_write_zeroes_unmap_sectors(xfs_inode_buftarg(ip)->bt_bdev))
>> +			return -EOPNOTSUPP;
> 
> Overly long line.
> 
>> +		xfs_alloc_file_space(ip, XFS_BMAPI_ZERO, offset, len);
> 
> As already mentioned, missing error return.
> 
> 
> Also how is the interaction of FALLOC_FL_WRITE_ZEROES and
> FALLOC_FL_KEEP_SIZE defined?
> 

This situation will be intercepted in vfs_fallocate().

Besides, it seems that the comments for the xfs_falloc_zero_range() also
need to be updated. Specifically, for inodes that are always COW, there
is no difference between FALLOC_FL_WRITE_ZEROES and FALLOC_FL_ZERO_RANGE
because it does not create zeroed extents.

Best Regards,
Yi.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-22  7:13     ` Zhang Yi
@ 2025-10-22  7:15       ` Christoph Hellwig
  2025-10-22  7:27         ` Zhang Yi
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2025-10-22  7:15 UTC (permalink / raw)
  To: Zhang Yi; +Cc: Lukas Herbolt, Christoph Hellwig, djwong, linux-xfs, Zhang Yi

On Wed, Oct 22, 2025 at 03:13:38PM +0800, Zhang Yi wrote:
> This situation will be intercepted in vfs_fallcoate().

Ah, perfect.

> Besides, it seems that the comments for the xfs_falloc_zero_range() also
> need to be updated. Specifically, for inodes that are always COW, there
> is no difference between FALLOC_FL_WRITE_ZEROES and FALLOC_FL_ZERO_RANGE
> because it does not create zeroed extents.

In fact we should not offer FALLOC_FL_WRITE_ZEROES for always COW
inodes.  Yes, you can physically write zeroes if the hardware supports
it, but given that any overwrite will cause an allocation anyway it
will just increase the write amplification for no gain.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-22  7:15       ` Christoph Hellwig
@ 2025-10-22  7:27         ` Zhang Yi
  2025-10-29 17:53           ` [PATCH v3] " Lukas Herbolt
  0 siblings, 1 reply; 25+ messages in thread
From: Zhang Yi @ 2025-10-22  7:27 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Lukas Herbolt, djwong, linux-xfs, Zhang Yi

On 10/22/2025 3:15 PM, Christoph Hellwig wrote:
> On Wed, Oct 22, 2025 at 03:13:38PM +0800, Zhang Yi wrote:
>> This situation will be intercepted in vfs_fallcoate().
> 
> Ah, perfect.
> 
>> Besides, it seems that the comments for the xfs_falloc_zero_range() also
>> need to be updated. Specifically, for inodes that are always COW, there
>> is no difference between FALLOC_FL_WRITE_ZEROES and FALLOC_FL_ZERO_RANGE
>> because it does not create zeroed extents.
> 
> In fact we should not offer FALLOC_FL_WRITE_ZEROES for always COW
> inodes.  Yes, you can physically write zeroes if the hardware supports
> it, but given that any overwrite will cause and allocation anyway it
> will just increase the write amplification for no gain.

Yes, indeed! We can directly return -EOPNOTSUPP for always COW inodes.

Best Regards,
Yi.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE.
  2025-10-21 22:02   ` Dave Chinner
@ 2025-10-26 17:49     ` lukas
  0 siblings, 0 replies; 25+ messages in thread
From: lukas @ 2025-10-26 17:49 UTC (permalink / raw)
  To: Dave Chinner; +Cc: djwong, linux-xfs

On 2025-10-22 00:02, Dave Chinner wrote:
> On Tue, Oct 21, 2025 at 04:17:45PM +0200, Lukas Herbolt wrote:
>> The krealloc prints out warning if allocation is bigger than 2x 
>> PAGE_SIZE,
>> lets use kvrealloc for the memory allocation.
> 
> What warning is that?  i.e. it helps to quote the error in the
> commit message so that, in future, we know exactly what the warning
> we fixed here.

My bad, it was an old RHEL kernel and I did not check whether upstream still
behaves the same. I just saw that xfs_uuid_table still uses krealloc()
and expected the same issue without double-checking it.

The patch below actually removed the warning, and it landed in 6.12.
Still, do we need to use krealloc for the xfs_uuid_table?

903edea6c53

[  124.615797] XFS (loop509): Mounting V5 Filesystem 
e21c5391-f7d0-4ca3-9d91-7cfce8ec3cdd
[  124.620767] XFS (loop509): Ending clean mount
[  124.687216] loop510: detected capacity change from 0 to 614400
[  124.713986] ------------[ cut here ]------------
[  124.715034] WARNING: CPU: 32 PID: 11848 at mm/page_alloc.c:3040 
rmqueue+0x44/0x10d0
[  124.722884] Modules linked in: rfkill vfat fat intel_rapl_msr 
intel_rapl_common intel_uncore_frequency_common skx_edac_common nfit 
libnvdimm snd_hda_codec_generic kvm_intel snd_hda_intel iTCO_wdt 
snd_intel_dspcfg iTCO_vendor_support snd_intel_sdw_acpi kvm 
snd_hda_codec snd_hda_core rapl snd_hwdep snd_pcm snd_timer snd i2c_i801 
pcspkr lpc_ich soundcore virtio_gpu i2c_smbus virtio_balloon 
virtio_dma_buf sg loop nfnetlink vsock_loopback 
vmw_vsock_virtio_transport_common vmw_vsock_vmci_transport vsock 
vmw_vmci xfs sd_mod ahci libahci crct10dif_pclmul crc32_pclmul 
crc32c_intel virtio_net libata ghash_clmulni_intel virtio_scsi 
virtio_console net_failover virtio_blk failover serio_raw sunrpc 
dm_mirror dm_region_hash dm_log dm_mod fuse
[  124.741904] CPU: 32 UID: 0 PID: 11848 Comm: mount Kdump: loaded Not 
tainted 6.11.0-0.test.f42.x86_64 #1
[  124.744967] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
edk2-20241117-5.fc40 11/17/2024
[  124.747429] RIP: 0010:rmqueue+0x44/0x10d0
[  124.748647] Code: 89 44 24 60 44 89 4c 24 44 65 48 8b 04 25 28 00 00 
00 48 89 84 24 90 00 00 00 31 c0 80 e5 80 74 0b 83 fa 01 0f 86 52 0c 00 
00 <0f> 0b 83 fd 03 0f 96 c2 83 fd 09 0f 94 c0 08 c2 88 54 24 73 0f 85
[  124.753035] RSP: 0018:ffffaa1621a67a38 EFLAGS: 00010202
[  124.754617] RAX: 0000000000000000 RBX: ffff8e25bffd5d80 RCX: 
00000000000480c0
[  124.756251] RDX: 0000000000000002 RSI: ffff8e25bffd5d80 RDI: 
ffff8e25bffd5d80
[  124.758018] RBP: 0000000000000002 R08: 0000000000000901 R09: 
0000000000000000
[  124.759910] R10: 0000000000000000 R11: ffffe76b8b367d08 R12: 
0000000000000002
[  124.761545] R13: ffff8e25bffd5d80 R14: ffff8e25bffd71c0 R15: 
0000000000000901
[  124.763508] FS:  00007fd8c7427800(0000) GS:ffff8e242fa00000(0000) 
knlGS:0000000000000000
[  124.765484] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  124.767042] CR2: 00007fd8c72d8000 CR3: 0000000234bf8002 CR4: 
0000000000770ef0
[  124.769197] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 
0000000000000000
[  124.770916] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 
0000000000000400
[  124.772989] PKRU: 55555554
[  124.773899] Call Trace:
[  124.774762]  <TASK>
[  124.775628]  ? show_trace_log_lvl+0x1b0/0x2f0
[  124.776969]  ? show_trace_log_lvl+0x1b0/0x2f0
[  124.778131]  ? get_page_from_freelist+0x129/0x770
[  124.779342]  ? rmqueue+0x44/0x10d0
[  124.780331]  ? __warn.cold+0x93/0xed
[  124.781391]  ? rmqueue+0x44/0x10d0
[  124.782383]  ? report_bug+0xff/0x140
[  124.783641]  ? handle_bug+0x3a/0x70
[  124.784953]  ? exc_invalid_op+0x17/0x70
[  124.786128]  ? asm_exc_invalid_op+0x1a/0x20
[  124.787235]  ? rmqueue+0x44/0x10d0
[  124.788335]  ? selinux_kernfs_init_security+0x79/0x230
[  124.789601]  ? down_write+0x12/0x60
[  124.790584]  ? kernfs_activate+0x82/0xd0
[  124.791651]  ? kernfs_add_one+0x141/0x150
[  124.793876]  get_page_from_freelist+0x129/0x770
[  124.795441]  __alloc_pages_noprof+0x188/0x350
[  124.796967]  ___kmalloc_large_node+0x69/0x100
[  124.798569]  __kmalloc_large_node_noprof+0x1d/0xa0
[  124.800220]  __kmalloc_node_track_caller_noprof+0x34c/0x440
[  124.801981]  ? xfs_uuid_mount+0x155/0x180 [xfs]
[  124.803883]  ? krealloc_noprof+0x68/0xe0
[  124.805241]  krealloc_noprof+0x68/0xe0
[  124.806562]  xfs_uuid_mount+0x155/0x180 [xfs]
[  124.808648]  xfs_mountfs+0x2ce/0x950 [xfs]
[  124.810332]  xfs_fs_fill_super+0x53e/0x910 [xfs]
[  124.812119]  ? __pfx_xfs_fs_fill_super+0x10/0x10 [xfs]
[  124.814002]  get_tree_bdev+0x124/0x1c0
[  124.815272]  vfs_get_tree+0x26/0xd0
[  124.816457]  vfs_cmd_create+0x59/0xe0
[  124.817700]  __do_sys_fsconfig+0x4e9/0x6b0
[  124.819000]  do_syscall_64+0x7d/0x160
[  124.820196]  ? syscall_exit_work+0xf3/0x120
[  124.821455]  ? syscall_exit_to_user_mode+0x10/0x1f0
[  124.822909]  ? do_syscall_64+0x89/0x160
[  124.823907]  ? clear_bhb_loop+0x25/0x80
[  124.824895]  ? clear_bhb_loop+0x25/0x80
[  124.826085]  ? clear_bhb_loop+0x25/0x80
[  124.826980]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  124.828167] RIP: 0033:0x7fd8c760edbe
[  124.829061] Code: 73 01 c3 48 8b 0d 52 80 0c 00 f7 d8 64 89 01 48 83 
c8 ff c3 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 49 89 ca b8 af 01 00 00 0f 
05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 22 80 0c 00 f7 d8 64 89 01 48
[  124.833439] RSP: 002b:00007ffca3bfd0b8 EFLAGS: 00000246 ORIG_RAX: 
00000000000001af
[  124.834983] RAX: ffffffffffffffda RBX: 000056408060ec90 RCX: 
00007fd8c760edbe
[  124.836809] RDX: 0000000000000000 RSI: 0000000000000006 RDI: 
0000000000000003
[  124.838281] RBP: 0000000000000000 R08: 0000000000000000 R09: 
0000000000000040
[  124.839768] R10: 0000000000000000 R11: 0000000000000246 R12: 
00007fd8c7760b00
[  124.841234] R13: 000056408065ed00 R14: 00007fd8c7754561 R15: 
000056408060edd8
[  124.842725]  </TASK>
[  124.843566] ---[ end trace 0000000000000000 ]---

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v3] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-22  7:27         ` Zhang Yi
@ 2025-10-29 17:53           ` Lukas Herbolt
  2025-10-29 18:22             ` Darrick J. Wong
  2025-10-30  7:29             ` [PATCH v3] " Christoph Hellwig
  0 siblings, 2 replies; 25+ messages in thread
From: Lukas Herbolt @ 2025-10-29 17:53 UTC (permalink / raw)
  To: yi.zhang; +Cc: linux-xfs, Lukas Herbolt

Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables
the unmap write zeroes operation.

v3 changes:
 - fix formatting
 - check the return value of xfs_alloc_file_space
 - add a check for always-COW inodes and return -EOPNOTSUPP

Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
---
 fs/xfs/xfs_bmap_util.c |  6 +++---
 fs/xfs/xfs_bmap_util.h |  2 +-
 fs/xfs/xfs_file.c      | 24 ++++++++++++++++++------
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e430..ddbcf4b0cea17 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -646,7 +646,8 @@ int
 xfs_alloc_file_space(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
-	xfs_off_t		len)
+	xfs_off_t		len,
+	uint32_t		flags)	/* XFS_BMAPI_... */
 {
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_off_t		count;
@@ -748,8 +749,7 @@ xfs_alloc_file_space(
 		 * will eventually reach the requested range.
 		 */
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
-				&nimaps);
+				allocatesize_fsb, flags, 0, imapp, &nimaps);
 		if (error) {
 			if (error != -ENOSR)
 				goto error;
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index c477b33616304..1fd4844d4ec64 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -56,7 +56,7 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 
 /* preallocation and hole punch interface */
 int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
-		xfs_off_t len);
+		xfs_off_t len, uint32_t flags);
 int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f96fbf5c54c99..38de47ffb8d39 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1261,23 +1261,32 @@ xfs_falloc_zero_range(
 	struct xfs_zone_alloc_ctx *ac)
 {
 	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
 	unsigned int		blksize = i_blocksize(inode);
 	loff_t			new_size = 0;
 	int			error;
 
-	trace_xfs_zero_file_space(XFS_I(inode));
+	trace_xfs_zero_file_space(ip);
 
 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
 	if (error)
 		return error;
 
-	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
+	error = xfs_free_file_space(ip, offset, len, ac);
 	if (error)
 		return error;
 
 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
 	offset = round_down(offset, blksize);
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	if (mode & FALLOC_FL_WRITE_ZEROES) {
+		if (xfs_is_cow_inode(ip) || !bdev_write_zeroes_unmap_sectors(
+				xfs_inode_buftarg(ip)->bt_bdev))
+			return -EOPNOTSUPP;
+		error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_ZERO);
+	} else {
+		error = xfs_alloc_file_space(ip, offset, len,
+				XFS_BMAPI_PREALLOC);
+	}
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1302,7 +1311,8 @@ xfs_falloc_unshare_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode),	offset, len,
+			XFS_BMAPI_PREALLOC);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1330,7 +1340,8 @@ xfs_falloc_allocate_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
+			XFS_BMAPI_PREALLOC);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1340,7 +1351,7 @@ xfs_falloc_allocate_range(
 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
-		 FALLOC_FL_UNSHARE_RANGE)
+		 FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
 
 STATIC long
 __xfs_file_fallocate(
@@ -1383,6 +1394,7 @@ __xfs_file_fallocate(
 	case FALLOC_FL_INSERT_RANGE:
 		error = xfs_falloc_insert_range(file, offset, len);
 		break;
+	case FALLOC_FL_WRITE_ZEROES:
 	case FALLOC_FL_ZERO_RANGE:
 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
 		break;
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-29 17:53           ` [PATCH v3] " Lukas Herbolt
@ 2025-10-29 18:22             ` Darrick J. Wong
  2025-10-30  7:27               ` Christoph Hellwig
  2025-10-30  7:29             ` [PATCH v3] " Christoph Hellwig
  1 sibling, 1 reply; 25+ messages in thread
From: Darrick J. Wong @ 2025-10-29 18:22 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: yi.zhang, linux-xfs

On Wed, Oct 29, 2025 at 06:53:14PM +0100, Lukas Herbolt wrote:
> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enable
> the unmap write zeroes operation.
> 
> v3 changes:
>  - fix formating
>  - fix check  on the return value of xfs_alloc_file_space
>  - add check if inode COW and return -EOPNOTSUPP
> 
> Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
> ---
>  fs/xfs/xfs_bmap_util.c |  6 +++---
>  fs/xfs/xfs_bmap_util.h |  2 +-
>  fs/xfs/xfs_file.c      | 24 ++++++++++++++++++------
>  3 files changed, 22 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 06ca11731e430..ddbcf4b0cea17 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -646,7 +646,8 @@ int
>  xfs_alloc_file_space(
>  	struct xfs_inode	*ip,
>  	xfs_off_t		offset,
> -	xfs_off_t		len)
> +	xfs_off_t		len,
> +	uint32_t		flags)	/* XFS_BMAPI_... */

Call the parameter bmapi_flags.

>  {
>  	xfs_mount_t		*mp = ip->i_mount;
>  	xfs_off_t		count;
> @@ -748,8 +749,7 @@ xfs_alloc_file_space(
>  		 * will eventually reach the requested range.
>  		 */
>  		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
> -				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
> -				&nimaps);
> +				allocatesize_fsb, flags, 0, imapp, &nimaps);
>  		if (error) {
>  			if (error != -ENOSR)
>  				goto error;
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index c477b33616304..1fd4844d4ec64 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -56,7 +56,7 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
>  
>  /* preallocation and hole punch interface */
>  int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
> -		xfs_off_t len);
> +		xfs_off_t len, uint32_t flags);
>  int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
>  int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index f96fbf5c54c99..38de47ffb8d39 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1261,23 +1261,32 @@ xfs_falloc_zero_range(
>  	struct xfs_zone_alloc_ctx *ac)
>  {
>  	struct inode		*inode = file_inode(file);
> +	struct xfs_inode	*ip = XFS_I(inode);
>  	unsigned int		blksize = i_blocksize(inode);
>  	loff_t			new_size = 0;
>  	int			error;
>  
> -	trace_xfs_zero_file_space(XFS_I(inode));
> +	trace_xfs_zero_file_space(ip);
>  
>  	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
>  	if (error)
>  		return error;
>  
> -	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
> +	error = xfs_free_file_space(ip, offset, len, ac);
>  	if (error)
>  		return error;
>  
>  	len = round_up(offset + len, blksize) - round_down(offset, blksize);
>  	offset = round_down(offset, blksize);
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	if (mode & FALLOC_FL_WRITE_ZEROES) {
> +		if (xfs_is_cow_inode(ip) || !bdev_write_zeroes_unmap_sectors(

xfs_is_cow_inode() only tells us if the inode is capable of doing out of
place writes.  Why would a regular reflinked inode be ineligible for
WRITE_ZEROES?  The whole point of that fallocate mode is to avoid ioend
overhead during fsync, so I could see why you wouldn't want to allow
this for files that always do writes out of place.  But not for files
that happen to have been reflinked in the past but otherwise support
pure overwrites.

I don't understand why this bdev_write_zeroes_unmap_sectors check is
here and not in xfs_alloc_file_space.  Shouldn't other callers of
xfs_alloc_file_space be restricted from passing in XFS_BMAPI_ZERO if the
block device doesn't support unmap_sectors?

> +				xfs_inode_buftarg(ip)->bt_bdev))
> +			return -EOPNOTSUPP;
> +		error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_ZERO);
> +	} else {
> +		error = xfs_alloc_file_space(ip, offset, len,
> +				XFS_BMAPI_PREALLOC);
> +	}
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1302,7 +1311,8 @@ xfs_falloc_unshare_range(
>  	if (error)
>  		return error;
>  
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	error = xfs_alloc_file_space(XFS_I(inode),	offset, len,

Whitespace damage                                 ^^^^^

--D

> +			XFS_BMAPI_PREALLOC);
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1330,7 +1340,8 @@ xfs_falloc_allocate_range(
>  	if (error)
>  		return error;
>  
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
> +			XFS_BMAPI_PREALLOC);
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1340,7 +1351,7 @@ xfs_falloc_allocate_range(
>  		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
>  		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
>  		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
> -		 FALLOC_FL_UNSHARE_RANGE)
> +		 FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
>  
>  STATIC long
>  __xfs_file_fallocate(
> @@ -1383,6 +1394,7 @@ __xfs_file_fallocate(
>  	case FALLOC_FL_INSERT_RANGE:
>  		error = xfs_falloc_insert_range(file, offset, len);
>  		break;
> +	case FALLOC_FL_WRITE_ZEROES:
>  	case FALLOC_FL_ZERO_RANGE:
>  		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
>  		break;
> -- 
> 2.51.0
> 
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-29 18:22             ` Darrick J. Wong
@ 2025-10-30  7:27               ` Christoph Hellwig
  2025-11-12 21:02                 ` [PATCH v4] " Lukas Herbolt
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2025-10-30  7:27 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Lukas Herbolt, yi.zhang, linux-xfs

On Wed, Oct 29, 2025 at 11:22:55AM -0700, Darrick J. Wong wrote:
> > +	if (mode & FALLOC_FL_WRITE_ZEROES) {
> > +		if (xfs_is_cow_inode(ip) || !bdev_write_zeroes_unmap_sectors(
> 
> xfs_is_cow_inode() only tells us if the inode is capable of doing out of
> place writes.  Why would a regular reflinked inode be ineligible for
> WRITE_ZEROES?

Yes, this should be xfs_is_always_cow_inode.

> I don't understand why this bdev_write_zeroes_unmap_sectors check is
> here and not in xfs_alloc_file_space.  Shouldn't other callers of
> xfs_alloc_file_space be restricted from passing in XFS_BMAPI_ZERO if the
> block device doesn't support unmap_sectors?

Other callers are fine with the software fallback for the block zeroing
helpers.  But this is a good question that should probably be documented
in a comment.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-29 17:53           ` [PATCH v3] " Lukas Herbolt
  2025-10-29 18:22             ` Darrick J. Wong
@ 2025-10-30  7:29             ` Christoph Hellwig
  1 sibling, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2025-10-30  7:29 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: yi.zhang, linux-xfs

On Wed, Oct 29, 2025 at 06:53:14PM +0100, Lukas Herbolt wrote:
> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enable
> the unmap write zeroes operation.
> 
> v3 changes:
>  - fix formating
>  - fix check  on the return value of xfs_alloc_file_space
>  - add check if inode COW and return -EOPNOTSUPP

The changes from previous versions go below the ---.

Alternatively add a cover letter even for single patches, which can
be easier to maintain.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v4] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-10-30  7:27               ` Christoph Hellwig
@ 2025-11-12 21:02                 ` Lukas Herbolt
  2025-11-13  6:59                   ` Christoph Hellwig
  0 siblings, 1 reply; 25+ messages in thread
From: Lukas Herbolt @ 2025-11-12 21:02 UTC (permalink / raw)
  To: hch, djwong; +Cc: linux-xfs, Lukas Herbolt

Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables
the unmap write zeroes operation.

Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
---
v4 changes:
	check if xfs_is_always_cow_inode
	rename flags -> bmapi_flags
	note about XFS_BMAPI_ZERO can cause software zeroing 

 fs/xfs/xfs_bmap_util.c | 10 ++++++++--
 fs/xfs/xfs_bmap_util.h |  2 +-
 fs/xfs/xfs_file.c      | 25 +++++++++++++++++++------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e430..ced34ce4597b4 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -642,11 +642,17 @@ xfs_free_eofblocks(
 	return error;
 }
 
+/*
+ * Callers can specify bmapi_flags, if XFS_BMAPI_ZERO is used
+ * there are no further checks whether the hardware supports it
+ * and it can fallback to software zeroing.
+ */
 int
 xfs_alloc_file_space(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
-	xfs_off_t		len)
+	xfs_off_t		len,
+	uint32_t		bmapi_flags)
 {
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_off_t		count;
@@ -748,7 +754,7 @@ xfs_alloc_file_space(
 		 * will eventually reach the requested range.
 		 */
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+				allocatesize_fsb, bmapi_flags, 0, imapp,
 				&nimaps);
 		if (error) {
 			if (error != -ENOSR)
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index c477b33616304..2895cc97a5728 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -56,7 +56,7 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 
 /* preallocation and hole punch interface */
 int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
-		xfs_off_t len);
+		xfs_off_t len, uint32_t bmapi_flags);
 int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f96fbf5c54c99..d52db0d7af8ff 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1261,23 +1261,33 @@ xfs_falloc_zero_range(
 	struct xfs_zone_alloc_ctx *ac)
 {
 	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
 	unsigned int		blksize = i_blocksize(inode);
 	loff_t			new_size = 0;
 	int			error;
 
-	trace_xfs_zero_file_space(XFS_I(inode));
+	trace_xfs_zero_file_space(ip);
 
 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
 	if (error)
 		return error;
 
-	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
+	error = xfs_free_file_space(ip, offset, len, ac);
 	if (error)
 		return error;
 
 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
 	offset = round_down(offset, blksize);
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	if (mode & FALLOC_FL_WRITE_ZEROES) {
+		if (xfs_is_always_cow_inode(ip) ||
+				!bdev_write_zeroes_unmap_sectors(
+				xfs_inode_buftarg(ip)->bt_bdev))
+			return -EOPNOTSUPP;
+		error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_ZERO);
+	} else {
+		error = xfs_alloc_file_space(ip, offset, len,
+				XFS_BMAPI_PREALLOC);
+	}
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1302,7 +1312,8 @@ xfs_falloc_unshare_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
+			XFS_BMAPI_PREALLOC);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1330,7 +1341,8 @@ xfs_falloc_allocate_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
+			XFS_BMAPI_PREALLOC);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1340,7 +1352,7 @@ xfs_falloc_allocate_range(
 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
-		 FALLOC_FL_UNSHARE_RANGE)
+		 FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
 
 STATIC long
 __xfs_file_fallocate(
@@ -1383,6 +1395,7 @@ __xfs_file_fallocate(
 	case FALLOC_FL_INSERT_RANGE:
 		error = xfs_falloc_insert_range(file, offset, len);
 		break;
+	case FALLOC_FL_WRITE_ZEROES:
 	case FALLOC_FL_ZERO_RANGE:
 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
 		break;
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v4] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-11-12 21:02                 ` [PATCH v4] " Lukas Herbolt
@ 2025-11-13  6:59                   ` Christoph Hellwig
  2025-11-14  8:55                     ` [PATCH v5] " Lukas Herbolt
  0 siblings, 1 reply; 25+ messages in thread
From: Christoph Hellwig @ 2025-11-13  6:59 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: hch, djwong, linux-xfs

> +/*
> + * Callers can specify bmapi_flags, if XFS_BMAPI_ZERO is used
> + * there are no further checks whether the hardware supports it
> + * and it can fallback to software zeroing.
> + */

Sorry for nitpicking more, but please try to use up the available 80
characters for block comments.

> +	if (mode & FALLOC_FL_WRITE_ZEROES) {
> +		if (xfs_is_always_cow_inode(ip) ||
> +				!bdev_write_zeroes_unmap_sectors(
> +				xfs_inode_buftarg(ip)->bt_bdev))
> +			return -EOPNOTSUPP;

The indentation is also still a bit odd here, although for this kind
of things there are no hard rules.  But I'd go for

		if (xfs_is_always_cow_inode(ip) ||
		    !bdev_write_zeroes_unmap_sectors(
				xfs_inode_buftarg(ip)->bt_bdev))

Functionally this looks good now:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v5] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-11-13  6:59                   ` Christoph Hellwig
@ 2025-11-14  8:55                     ` Lukas Herbolt
  2025-11-14  8:57                       ` Christoph Hellwig
  2025-11-14 16:44                       ` Darrick J. Wong
  0 siblings, 2 replies; 25+ messages in thread
From: Lukas Herbolt @ 2025-11-14  8:55 UTC (permalink / raw)
  To: hch; +Cc: linux-xfs, Lukas Herbolt

Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables
the unmap write zeroes operation.

Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
---
v5 changes:
	formatting

 fs/xfs/xfs_bmap_util.c | 10 ++++++++--
 fs/xfs/xfs_bmap_util.h |  2 +-
 fs/xfs/xfs_file.c      | 25 +++++++++++++++++++------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e430..ee5765bf52944 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -642,11 +642,17 @@ xfs_free_eofblocks(
 	return error;
 }
 
+/*
+ * Callers can specify bmapi_flags, if XFS_BMAPI_ZERO is used there are no
+ * further checks whether the hardware supports it and it can fall back to
+ * software zeroing.
+ */
 int
 xfs_alloc_file_space(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
-	xfs_off_t		len)
+	xfs_off_t		len,
+	uint32_t		bmapi_flags)
 {
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_off_t		count;
@@ -748,7 +754,7 @@ xfs_alloc_file_space(
 		 * will eventually reach the requested range.
 		 */
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+				allocatesize_fsb, bmapi_flags, 0, imapp,
 				&nimaps);
 		if (error) {
 			if (error != -ENOSR)
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index c477b33616304..2895cc97a5728 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -56,7 +56,7 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 
 /* preallocation and hole punch interface */
 int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
-		xfs_off_t len);
+		xfs_off_t len, uint32_t bmapi_flags);
 int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f96fbf5c54c99..3ed11b1028563 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1261,23 +1261,33 @@ xfs_falloc_zero_range(
 	struct xfs_zone_alloc_ctx *ac)
 {
 	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
 	unsigned int		blksize = i_blocksize(inode);
 	loff_t			new_size = 0;
 	int			error;
 
-	trace_xfs_zero_file_space(XFS_I(inode));
+	trace_xfs_zero_file_space(ip);
 
 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
 	if (error)
 		return error;
 
-	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
+	error = xfs_free_file_space(ip, offset, len, ac);
 	if (error)
 		return error;
 
 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
 	offset = round_down(offset, blksize);
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	if (mode & FALLOC_FL_WRITE_ZEROES) {
+		if (xfs_is_always_cow_inode(ip) ||
+				!bdev_write_zeroes_unmap_sectors(
+					xfs_inode_buftarg(ip)->bt_bdev))
+			return -EOPNOTSUPP;
+		error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_ZERO);
+	} else {
+		error = xfs_alloc_file_space(ip, offset, len,
+				XFS_BMAPI_PREALLOC);
+	}
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1302,7 +1312,8 @@ xfs_falloc_unshare_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
+			XFS_BMAPI_PREALLOC);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1330,7 +1341,8 @@ xfs_falloc_allocate_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
+			XFS_BMAPI_PREALLOC);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1340,7 +1352,7 @@ xfs_falloc_allocate_range(
 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
-		 FALLOC_FL_UNSHARE_RANGE)
+		 FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
 
 STATIC long
 __xfs_file_fallocate(
@@ -1383,6 +1395,7 @@ __xfs_file_fallocate(
 	case FALLOC_FL_INSERT_RANGE:
 		error = xfs_falloc_insert_range(file, offset, len);
 		break;
+	case FALLOC_FL_WRITE_ZEROES:
 	case FALLOC_FL_ZERO_RANGE:
 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
 		break;
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v5] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-11-14  8:55                     ` [PATCH v5] " Lukas Herbolt
@ 2025-11-14  8:57                       ` Christoph Hellwig
  2025-11-14 16:44                       ` Darrick J. Wong
  1 sibling, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2025-11-14  8:57 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: hch, linux-xfs

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v5] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-11-14  8:55                     ` [PATCH v5] " Lukas Herbolt
  2025-11-14  8:57                       ` Christoph Hellwig
@ 2025-11-14 16:44                       ` Darrick J. Wong
  2025-11-14 16:45                         ` Christoph Hellwig
  1 sibling, 1 reply; 25+ messages in thread
From: Darrick J. Wong @ 2025-11-14 16:44 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: hch, linux-xfs

On Fri, Nov 14, 2025 at 09:55:26AM +0100, Lukas Herbolt wrote:
> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enable
> the unmap write zeroes operation.
> 
> Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
> ---
> v5 changes:
> 	formating
> 
>  fs/xfs/xfs_bmap_util.c | 10 ++++++++--
>  fs/xfs/xfs_bmap_util.h |  2 +-
>  fs/xfs/xfs_file.c      | 25 +++++++++++++++++++------
>  3 files changed, 28 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 06ca11731e430..ee5765bf52944 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -642,11 +642,17 @@ xfs_free_eofblocks(
>  	return error;
>  }
>  
> +/*
> + * Callers can specify bmapi_flags, if XFS_BMAPI_ZERO is used there are no
> + * further checks whether the hard ware supports and it can fallback to
> + * software zeroing.
> + */
>  int
>  xfs_alloc_file_space(
>  	struct xfs_inode	*ip,
>  	xfs_off_t		offset,
> -	xfs_off_t		len)
> +	xfs_off_t		len,
> +	uint32_t		bmapi_flags)
>  {
>  	xfs_mount_t		*mp = ip->i_mount;
>  	xfs_off_t		count;
> @@ -748,7 +754,7 @@ xfs_alloc_file_space(
>  		 * will eventually reach the requested range.
>  		 */
>  		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
> -				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
> +				allocatesize_fsb, bmapi_flags, 0, imapp,
>  				&nimaps);
>  		if (error) {
>  			if (error != -ENOSR)
> diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
> index c477b33616304..2895cc97a5728 100644
> --- a/fs/xfs/xfs_bmap_util.h
> +++ b/fs/xfs/xfs_bmap_util.h
> @@ -56,7 +56,7 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
>  
>  /* preallocation and hole punch interface */
>  int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
> -		xfs_off_t len);
> +		xfs_off_t len, uint32_t bmapi_flags);
>  int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
>  		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
>  int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index f96fbf5c54c99..3ed11b1028563 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1261,23 +1261,33 @@ xfs_falloc_zero_range(
>  	struct xfs_zone_alloc_ctx *ac)
>  {
>  	struct inode		*inode = file_inode(file);
> +	struct xfs_inode	*ip = XFS_I(inode);
>  	unsigned int		blksize = i_blocksize(inode);
>  	loff_t			new_size = 0;
>  	int			error;
>  
> -	trace_xfs_zero_file_space(XFS_I(inode));
> +	trace_xfs_zero_file_space(ip);
>  
>  	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
>  	if (error)
>  		return error;
>  
> -	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
> +	error = xfs_free_file_space(ip, offset, len, ac);
>  	if (error)
>  		return error;
>  
>  	len = round_up(offset + len, blksize) - round_down(offset, blksize);
>  	offset = round_down(offset, blksize);
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	if (mode & FALLOC_FL_WRITE_ZEROES) {
> +		if (xfs_is_always_cow_inode(ip) ||
> +				!bdev_write_zeroes_unmap_sectors(
> +					xfs_inode_buftarg(ip)->bt_bdev))

I think hch was asking for this indentation:

		if (xfs_is_always_cow_inode(ip) ||
		    !bdev_write_zeroes_unmap_sectors(
				xfs_inode_buftarg(ip)->bt_bdev))
			return -EOPNOTSUPP;

(otherwise the code looks correct to me)

--D

> +		error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_ZERO);
> +	} else {
> +		error = xfs_alloc_file_space(ip, offset, len,
> +				XFS_BMAPI_PREALLOC);
> +	}
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1302,7 +1312,8 @@ xfs_falloc_unshare_range(
>  	if (error)
>  		return error;
>  
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
> +			XFS_BMAPI_PREALLOC);
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1330,7 +1341,8 @@ xfs_falloc_allocate_range(
>  	if (error)
>  		return error;
>  
> -	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
> +	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
> +			XFS_BMAPI_PREALLOC);
>  	if (error)
>  		return error;
>  	return xfs_falloc_setsize(file, new_size);
> @@ -1340,7 +1352,7 @@ xfs_falloc_allocate_range(
>  		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
>  		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
>  		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
> -		 FALLOC_FL_UNSHARE_RANGE)
> +		 FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
>  
>  STATIC long
>  __xfs_file_fallocate(
> @@ -1383,6 +1395,7 @@ __xfs_file_fallocate(
>  	case FALLOC_FL_INSERT_RANGE:
>  		error = xfs_falloc_insert_range(file, offset, len);
>  		break;
> +	case FALLOC_FL_WRITE_ZEROES:
>  	case FALLOC_FL_ZERO_RANGE:
>  		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
>  		break;
> -- 
> 2.51.1
> 
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v5] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-11-14 16:44                       ` Darrick J. Wong
@ 2025-11-14 16:45                         ` Christoph Hellwig
  2025-11-18  9:05                           ` lukas
  2025-12-15 11:48                           ` [PATCH v6] " Lukas Herbolt
  0 siblings, 2 replies; 25+ messages in thread
From: Christoph Hellwig @ 2025-11-14 16:45 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Lukas Herbolt, hch, linux-xfs

On Fri, Nov 14, 2025 at 08:44:36AM -0800, Darrick J. Wong wrote:
> I think hch was asking for this indentation:
> 
> 		if (xfs_is_always_cow_inode(ip) ||
> 		    !bdev_write_zeroes_unmap_sectors(
> 				xfs_inode_buftarg(ip)->bt_bdev))
> 			return -EOPNOTSUPP;

That would have been my first preference.  But the current version is
readable enough, so I'm fine.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v5] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-11-14 16:45                         ` Christoph Hellwig
@ 2025-11-18  9:05                           ` lukas
  2025-12-15 11:48                           ` [PATCH v6] " Lukas Herbolt
  1 sibling, 0 replies; 25+ messages in thread
From: lukas @ 2025-11-18  9:05 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Darrick J. Wong, linux-xfs

On 2025-11-14 17:45, Christoph Hellwig wrote:
> On Fri, Nov 14, 2025 at 08:44:36AM -0800, Darrick J. Wong wrote:
>> I think hch was asking for this indentation:
>> 
>> 		if (xfs_is_always_cow_inode(ip) ||
>> 		    !bdev_write_zeroes_unmap_sectors(
>> 				xfs_inode_buftarg(ip)->bt_bdev))
>> 			return -EOPNOTSUPP;
> 
> That would have been my first preference.  But the current version is
> readable enough, so I'm fine.


I see it now, sorry! Obviously roundcube, despite being set to plain text,
mangled the whitespace. Let me correct it!

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH v6] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-11-14 16:45                         ` Christoph Hellwig
  2025-11-18  9:05                           ` lukas
@ 2025-12-15 11:48                           ` Lukas Herbolt
  2025-12-15 14:28                             ` Christoph Hellwig
  1 sibling, 1 reply; 25+ messages in thread
From: Lukas Herbolt @ 2025-12-15 11:48 UTC (permalink / raw)
  To: djwong; +Cc: linux-xfs, Lukas Herbolt

Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables
the unmap write zeroes operation.

Signed-off-by: Lukas Herbolt <lukas@herbolt.com>
---
v6 changes:
    formatting to follow recommendation

 fs/xfs/xfs_bmap_util.c | 10 ++++++++--
 fs/xfs/xfs_bmap_util.h |  2 +-
 fs/xfs/xfs_file.c      | 25 +++++++++++++++++++------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06ca11731e430..ee5765bf52944 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -642,11 +642,17 @@ xfs_free_eofblocks(
 	return error;
 }
 
+/*
+ * Callers can specify bmapi_flags. If XFS_BMAPI_ZERO is used, there are no
+ * further checks whether the hardware supports it, and it can fall back to
+ * software zeroing.
+ */
 int
 xfs_alloc_file_space(
 	struct xfs_inode	*ip,
 	xfs_off_t		offset,
-	xfs_off_t		len)
+	xfs_off_t		len,
+	uint32_t		bmapi_flags)
 {
 	xfs_mount_t		*mp = ip->i_mount;
 	xfs_off_t		count;
@@ -748,7 +754,7 @@ xfs_alloc_file_space(
 		 * will eventually reach the requested range.
 		 */
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
-				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
+				allocatesize_fsb, bmapi_flags, 0, imapp,
 				&nimaps);
 		if (error) {
 			if (error != -ENOSR)
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index c477b33616304..2895cc97a5728 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -56,7 +56,7 @@ int	xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
 
 /* preallocation and hole punch interface */
 int	xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset,
-		xfs_off_t len);
+		xfs_off_t len, uint32_t bmapi_flags);
 int	xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset,
 		xfs_off_t len, struct xfs_zone_alloc_ctx *ac);
 int	xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f96fbf5c54c99..040e4407a8a07 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1261,23 +1261,33 @@ xfs_falloc_zero_range(
 	struct xfs_zone_alloc_ctx *ac)
 {
 	struct inode		*inode = file_inode(file);
+	struct xfs_inode	*ip = XFS_I(inode);
 	unsigned int		blksize = i_blocksize(inode);
 	loff_t			new_size = 0;
 	int			error;
 
-	trace_xfs_zero_file_space(XFS_I(inode));
+	trace_xfs_zero_file_space(ip);
 
 	error = xfs_falloc_newsize(file, mode, offset, len, &new_size);
 	if (error)
 		return error;
 
-	error = xfs_free_file_space(XFS_I(inode), offset, len, ac);
+	error = xfs_free_file_space(ip, offset, len, ac);
 	if (error)
 		return error;
 
 	len = round_up(offset + len, blksize) - round_down(offset, blksize);
 	offset = round_down(offset, blksize);
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	if (mode & FALLOC_FL_WRITE_ZEROES) {
+		if (xfs_is_always_cow_inode(ip) ||
+		    !bdev_write_zeroes_unmap_sectors(
+				xfs_inode_buftarg(ip)->bt_bdev))
+			return -EOPNOTSUPP;
+		error = xfs_alloc_file_space(ip, offset, len, XFS_BMAPI_ZERO);
+	} else {
+		error = xfs_alloc_file_space(ip, offset, len,
+				XFS_BMAPI_PREALLOC);
+	}
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1302,7 +1312,8 @@ xfs_falloc_unshare_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
+			XFS_BMAPI_PREALLOC);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1330,7 +1341,8 @@ xfs_falloc_allocate_range(
 	if (error)
 		return error;
 
-	error = xfs_alloc_file_space(XFS_I(inode), offset, len);
+	error = xfs_alloc_file_space(XFS_I(inode), offset, len,
+			XFS_BMAPI_PREALLOC);
 	if (error)
 		return error;
 	return xfs_falloc_setsize(file, new_size);
@@ -1340,7 +1352,7 @@ xfs_falloc_allocate_range(
 		(FALLOC_FL_ALLOCATE_RANGE | FALLOC_FL_KEEP_SIZE |	\
 		 FALLOC_FL_PUNCH_HOLE |	FALLOC_FL_COLLAPSE_RANGE |	\
 		 FALLOC_FL_ZERO_RANGE |	FALLOC_FL_INSERT_RANGE |	\
-		 FALLOC_FL_UNSHARE_RANGE)
+		 FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_WRITE_ZEROES)
 
 STATIC long
 __xfs_file_fallocate(
@@ -1383,6 +1395,7 @@ __xfs_file_fallocate(
 	case FALLOC_FL_INSERT_RANGE:
 		error = xfs_falloc_insert_range(file, offset, len);
 		break;
+	case FALLOC_FL_WRITE_ZEROES:
 	case FALLOC_FL_ZERO_RANGE:
 		error = xfs_falloc_zero_range(file, mode, offset, len, ac);
 		break;
-- 
2.51.1


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v6] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base
  2025-12-15 11:48                           ` [PATCH v6] " Lukas Herbolt
@ 2025-12-15 14:28                             ` Christoph Hellwig
  0 siblings, 0 replies; 25+ messages in thread
From: Christoph Hellwig @ 2025-12-15 14:28 UTC (permalink / raw)
  To: Lukas Herbolt; +Cc: djwong, linux-xfs

On Mon, Dec 15, 2025 at 12:48:12PM +0100, Lukas Herbolt wrote:
> Add support for FALLOC_FL_WRITE_ZEROES if the underlying device enables
> the unmap write zeroes operation.

Oh, I thought this got merged already.

Still looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>


^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2025-12-15 14:28 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-10-21 14:17 [PATCH 0/2] Add FL_WRITE_ZEROES to XFS, fix krealloc on xfs_uuid_table Lukas Herbolt
2025-10-21 14:17 ` [PATCH] xfs: add FALLOC_FL_WRITE_ZEROES to XFS code base Lukas Herbolt
2025-10-21 15:55   ` Darrick J. Wong
2025-10-22  5:00   ` Christoph Hellwig
2025-10-22  7:13     ` Zhang Yi
2025-10-22  7:15       ` Christoph Hellwig
2025-10-22  7:27         ` Zhang Yi
2025-10-29 17:53           ` [PATCH v3] " Lukas Herbolt
2025-10-29 18:22             ` Darrick J. Wong
2025-10-30  7:27               ` Christoph Hellwig
2025-11-12 21:02                 ` [PATCH v4] " Lukas Herbolt
2025-11-13  6:59                   ` Christoph Hellwig
2025-11-14  8:55                     ` [PATCH v5] " Lukas Herbolt
2025-11-14  8:57                       ` Christoph Hellwig
2025-11-14 16:44                       ` Darrick J. Wong
2025-11-14 16:45                         ` Christoph Hellwig
2025-11-18  9:05                           ` lukas
2025-12-15 11:48                           ` [PATCH v6] " Lukas Herbolt
2025-12-15 14:28                             ` Christoph Hellwig
2025-10-30  7:29             ` [PATCH v3] " Christoph Hellwig
2025-10-21 14:17 ` [PATCH 2/2] xfs: Remove WARN_ONCE if xfs_uuid_table grows over 2x PAGE_SIZE Lukas Herbolt
2025-10-21 15:56   ` Darrick J. Wong
2025-10-21 22:02   ` Dave Chinner
2025-10-26 17:49     ` lukas
2025-10-22  4:53   ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox