* [PATCH v6 01/12] fs: add atomic write unit max opt to statx
From: John Garry @ 2025-04-08 10:41 UTC
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
XFS will be able to support large atomic writes (atomic write > 1x block)
in the future. This will be achieved by using different operating methods,
depending on the size of the write.

Specifically, a new method of operation based on FS atomic extent remapping
will be supported in addition to the current HW offload-based method.

The FS method will generally perform appreciably slower than the HW-offload
method. However, the FS method will typically support a larger atomic write
unit max limit.

XFS will support a hybrid mode, where the HW offload method is used when
possible, i.e. when the length of the write is supported by the HW, and
FS-based atomic writes are used otherwise.

As such, there is an atomic write length above which the user may
experience appreciably slower performance.

Advertise this limit in a new statx field, stx_atomic_write_unit_max_opt.
When zero, it means that there is no such performance boundary.

Masks STATX{_ATTR}_WRITE_ATOMIC can be used to get this new field. This is
ok for older kernels which don't support this new field, as they would
already report 0 in this field (from the zeroing in cp_statx()).
Furthermore, those older kernels don't support large atomic writes - apart
from block fops, but there the performance is consistent for atomic writes
in the range [unit min, unit max].
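
For illustration, here is a minimal userspace sketch for querying the new
field (not part of this patch; it assumes a glibc with the statx() wrapper
and uapi headers which define STATX_WRITE_ATOMIC and the new field):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/stat.h>

	int main(int argc, char **argv)
	{
		struct statx stx;

		if (argc < 2)
			return 1;
		if (statx(AT_FDCWD, argv[1], 0, STATX_WRITE_ATOMIC, &stx))
			return 1;

		/* The new field reads as 0 on kernels without this patch */
		if (stx.stx_mask & STATX_WRITE_ATOMIC)
			printf("unit min %u max %u max_opt %u\n",
			       stx.stx_atomic_write_unit_min,
			       stx.stx_atomic_write_unit_max,
			       stx.stx_atomic_write_unit_max_opt);
		return 0;
	}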
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
block/bdev.c | 3 ++-
fs/ext4/inode.c | 2 +-
fs/stat.c | 6 +++++-
fs/xfs/xfs_iops.c | 2 +-
include/linux/fs.h | 3 ++-
include/linux/stat.h | 1 +
include/uapi/linux/stat.h | 8 ++++++--
7 files changed, 18 insertions(+), 7 deletions(-)
diff --git a/block/bdev.c b/block/bdev.c
index 4844d1e27b6f..b4afc1763e8e 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1301,7 +1301,8 @@ void bdev_statx(struct path *path, struct kstat *stat,
generic_fill_statx_atomic_writes(stat,
queue_atomic_write_unit_min_bytes(bd_queue),
- queue_atomic_write_unit_max_bytes(bd_queue));
+ queue_atomic_write_unit_max_bytes(bd_queue),
+ 0);
}
stat->blksize = bdev_io_min(bdev);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1dc09ed5d403..51a45699112c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5663,7 +5663,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
awu_max = sbi->s_awu_max;
}
- generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
+ generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0);
}
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
diff --git a/fs/stat.c b/fs/stat.c
index f13308bfdc98..c41855f62d22 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -136,13 +136,15 @@ EXPORT_SYMBOL(generic_fill_statx_attr);
* @stat: Where to fill in the attribute flags
* @unit_min: Minimum supported atomic write length in bytes
* @unit_max: Maximum supported atomic write length in bytes
+ * @unit_max_opt: Optimised maximum supported atomic write length in bytes
*
* Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from
* atomic write unit_min and unit_max values.
*/
void generic_fill_statx_atomic_writes(struct kstat *stat,
unsigned int unit_min,
- unsigned int unit_max)
+ unsigned int unit_max,
+ unsigned int unit_max_opt)
{
/* Confirm that the request type is known */
stat->result_mask |= STATX_WRITE_ATOMIC;
@@ -153,6 +155,7 @@ void generic_fill_statx_atomic_writes(struct kstat *stat,
if (unit_min) {
stat->atomic_write_unit_min = unit_min;
stat->atomic_write_unit_max = unit_max;
+ stat->atomic_write_unit_max_opt = unit_max_opt;
/* Initially only allow 1x segment */
stat->atomic_write_segments_max = 1;
@@ -732,6 +735,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max;
+ tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt;
return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 756bd3ca8e00..f0e5d83195df 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -610,7 +610,7 @@ xfs_report_atomic_write(
if (xfs_inode_can_atomicwrite(ip))
unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
- generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
+ generic_fill_statx_atomic_writes(stat, unit_min, unit_max, 0);
}
STATIC int
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 016b0fe1536e..7b19d8f99aff 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3475,7 +3475,8 @@ void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
void generic_fill_statx_atomic_writes(struct kstat *stat,
unsigned int unit_min,
- unsigned int unit_max);
+ unsigned int unit_max,
+ unsigned int unit_max_opt);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
void __inode_add_bytes(struct inode *inode, loff_t bytes);
diff --git a/include/linux/stat.h b/include/linux/stat.h
index be7496a6a0dd..e3d00e7bb26d 100644
--- a/include/linux/stat.h
+++ b/include/linux/stat.h
@@ -57,6 +57,7 @@ struct kstat {
u32 dio_read_offset_align;
u32 atomic_write_unit_min;
u32 atomic_write_unit_max;
+ u32 atomic_write_unit_max_opt;
u32 atomic_write_segments_max;
};
diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
index f78ee3670dd5..1686861aae20 100644
--- a/include/uapi/linux/stat.h
+++ b/include/uapi/linux/stat.h
@@ -182,8 +182,12 @@ struct statx {
/* File offset alignment for direct I/O reads */
__u32 stx_dio_read_offset_align;
- /* 0xb8 */
- __u64 __spare3[9]; /* Spare space for future expansion */
+ /* Optimised max atomic write unit in bytes */
+ __u32 stx_atomic_write_unit_max_opt;
+ __u32 __spare2[1];
+
+ /* 0xc0 */
+ __u64 __spare3[8]; /* Spare space for future expansion */
/* 0x100 */
};
--
2.31.1
* Re: [PATCH v6 01/12] fs: add atomic write unit max opt to statx
From: Darrick J. Wong @ 2025-04-09 2:23 UTC
To: John Garry
Cc: brauner, hch, viro, jack, cem, linux-fsdevel, dchinner, linux-xfs,
linux-kernel, ojaswin, ritesh.list, martin.petersen, linux-ext4,
linux-block, catherine.hoang
This probably should have cc'd linux-api...
On Tue, Apr 08, 2025 at 10:41:58AM +0000, John Garry wrote:
> XFS will be able to support large atomic writes (atomic write > 1x block)
> in the future. This will be achieved by using different operating methods,
> depending on the size of the write.
>
> Specifically, a new method of operation based on FS atomic extent remapping
> will be supported in addition to the current HW offload-based method.
>
> The FS method will generally perform appreciably slower than the HW-offload
> method. However, the FS method will typically support a larger atomic write
> unit max limit.
>
> XFS will support a hybrid mode, where the HW offload method is used when
> possible, i.e. when the length of the write is supported by the HW, and
> FS-based atomic writes are used otherwise.
>
> As such, there is an atomic write length above which the user may
> experience appreciably slower performance.
>
> Advertise this limit in a new statx field, stx_atomic_write_unit_max_opt.
> When zero, it means that there is no such performance boundary.
>
> Masks STATX{_ATTR}_WRITE_ATOMIC can be used to get this new field. This is
> ok for older kernels which don't support this new field, as they would
> already report 0 in this field (from the zeroing in cp_statx()).
> Furthermore, those older kernels don't support large atomic writes - apart
> from block fops, but there the performance is consistent for atomic writes
> in the range [unit min, unit max].
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
Seems fine to me, but I imagine others have stronger opinions.
Acked-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> block/bdev.c | 3 ++-
> fs/ext4/inode.c | 2 +-
> fs/stat.c | 6 +++++-
> fs/xfs/xfs_iops.c | 2 +-
> include/linux/fs.h | 3 ++-
> include/linux/stat.h | 1 +
> include/uapi/linux/stat.h | 8 ++++++--
> 7 files changed, 18 insertions(+), 7 deletions(-)
>
> diff --git a/block/bdev.c b/block/bdev.c
> index 4844d1e27b6f..b4afc1763e8e 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -1301,7 +1301,8 @@ void bdev_statx(struct path *path, struct kstat *stat,
>
> generic_fill_statx_atomic_writes(stat,
> queue_atomic_write_unit_min_bytes(bd_queue),
> - queue_atomic_write_unit_max_bytes(bd_queue));
> + queue_atomic_write_unit_max_bytes(bd_queue),
> + 0);
> }
>
> stat->blksize = bdev_io_min(bdev);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 1dc09ed5d403..51a45699112c 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5663,7 +5663,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
> awu_max = sbi->s_awu_max;
> }
>
> - generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
> + generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0);
> }
>
> flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
> diff --git a/fs/stat.c b/fs/stat.c
> index f13308bfdc98..c41855f62d22 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -136,13 +136,15 @@ EXPORT_SYMBOL(generic_fill_statx_attr);
> * @stat: Where to fill in the attribute flags
> * @unit_min: Minimum supported atomic write length in bytes
> * @unit_max: Maximum supported atomic write length in bytes
> + * @unit_max_opt: Optimised maximum supported atomic write length in bytes
> *
> * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from
> * atomic write unit_min and unit_max values.
> */
> void generic_fill_statx_atomic_writes(struct kstat *stat,
> unsigned int unit_min,
> - unsigned int unit_max)
> + unsigned int unit_max,
> + unsigned int unit_max_opt)
> {
> /* Confirm that the request type is known */
> stat->result_mask |= STATX_WRITE_ATOMIC;
> @@ -153,6 +155,7 @@ void generic_fill_statx_atomic_writes(struct kstat *stat,
> if (unit_min) {
> stat->atomic_write_unit_min = unit_min;
> stat->atomic_write_unit_max = unit_max;
> + stat->atomic_write_unit_max_opt = unit_max_opt;
> /* Initially only allow 1x segment */
> stat->atomic_write_segments_max = 1;
>
> @@ -732,6 +735,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer)
> tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
> tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
> tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max;
> + tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt;
>
> return copy_to_user(buffer, &tmp, sizeof(tmp)) ? -EFAULT : 0;
> }
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index 756bd3ca8e00..f0e5d83195df 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -610,7 +610,7 @@ xfs_report_atomic_write(
>
> if (xfs_inode_can_atomicwrite(ip))
> unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
> - generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
> + generic_fill_statx_atomic_writes(stat, unit_min, unit_max, 0);
> }
>
> STATIC int
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 016b0fe1536e..7b19d8f99aff 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3475,7 +3475,8 @@ void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
> void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
> void generic_fill_statx_atomic_writes(struct kstat *stat,
> unsigned int unit_min,
> - unsigned int unit_max);
> + unsigned int unit_max,
> + unsigned int unit_max_opt);
> extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
> extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
> void __inode_add_bytes(struct inode *inode, loff_t bytes);
> diff --git a/include/linux/stat.h b/include/linux/stat.h
> index be7496a6a0dd..e3d00e7bb26d 100644
> --- a/include/linux/stat.h
> +++ b/include/linux/stat.h
> @@ -57,6 +57,7 @@ struct kstat {
> u32 dio_read_offset_align;
> u32 atomic_write_unit_min;
> u32 atomic_write_unit_max;
> + u32 atomic_write_unit_max_opt;
> u32 atomic_write_segments_max;
> };
>
> diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h
> index f78ee3670dd5..1686861aae20 100644
> --- a/include/uapi/linux/stat.h
> +++ b/include/uapi/linux/stat.h
> @@ -182,8 +182,12 @@ struct statx {
> /* File offset alignment for direct I/O reads */
> __u32 stx_dio_read_offset_align;
>
> - /* 0xb8 */
> - __u64 __spare3[9]; /* Spare space for future expansion */
> + /* Optimised max atomic write unit in bytes */
> + __u32 stx_atomic_write_unit_max_opt;
> + __u32 __spare2[1];
> +
> + /* 0xc0 */
> + __u64 __spare3[8]; /* Spare space for future expansion */
>
> /* 0x100 */
> };
> --
> 2.31.1
>
>
* Re: [PATCH v6 01/12] fs: add atomic write unit max opt to statx
From: Christoph Hellwig @ 2025-04-09 10:45 UTC
To: John Garry
Cc: brauner, djwong, hch, viro, jack, cem, linux-fsdevel, dchinner,
linux-xfs, linux-kernel, ojaswin, ritesh.list, martin.petersen,
linux-ext4, linux-block, catherine.hoang
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
* [PATCH v6 02/12] xfs: add helpers to compute log item overhead
From: John Garry @ 2025-04-08 10:41 UTC
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
From: "Darrick J. Wong" <djwong@kernel.org>
Add selected helpers to estimate the transaction reservation required to
write various log intent and buffer items to the log. These helpers
will be used by the online repair code for more precise estimations of
how much work can be done in a single transaction.
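
As a hypothetical illustration (not part of this patch), a future caller
could combine these helpers to bound the log space consumed by freeing nr
extents as one EFI/EFD pair:

	/*
	 * Hypothetical sketch only: worst-case log space for one EFI and
	 * its matching EFD covering 'nr' extents.
	 */
	static unsigned int
	xfs_free_extents_log_space(unsigned int nr)
	{
		return xfs_efi_item_overhead(nr) + xfs_efd_item_overhead(nr);
	}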
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/libxfs/xfs_trans_resv.c | 6 +++---
fs/xfs/libxfs/xfs_trans_resv.h | 4 ++++
fs/xfs/xfs_bmap_item.c | 10 ++++++++++
fs/xfs/xfs_bmap_item.h | 3 +++
fs/xfs/xfs_buf_item.c | 19 +++++++++++++++++++
fs/xfs/xfs_buf_item.h | 3 +++
fs/xfs/xfs_extfree_item.c | 10 ++++++++++
fs/xfs/xfs_extfree_item.h | 3 +++
fs/xfs/xfs_log_cil.c | 4 +---
fs/xfs/xfs_log_priv.h | 13 +++++++++++++
fs/xfs/xfs_refcount_item.c | 10 ++++++++++
fs/xfs/xfs_refcount_item.h | 3 +++
fs/xfs/xfs_rmap_item.c | 10 ++++++++++
fs/xfs/xfs_rmap_item.h | 3 +++
14 files changed, 95 insertions(+), 6 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 13d00c7166e1..ce1393bd3561 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -47,7 +47,7 @@ xfs_buf_log_overhead(void)
* will be changed in a transaction. size is used to tell how many
* bytes should be reserved per item.
*/
-STATIC uint
+uint
xfs_calc_buf_res(
uint nbufs,
uint size)
@@ -84,7 +84,7 @@ xfs_allocfree_block_count(
* in the same transaction as an allocation or a free, so we compute them
* separately.
*/
-static unsigned int
+unsigned int
xfs_refcountbt_block_count(
struct xfs_mount *mp,
unsigned int num_ops)
@@ -129,7 +129,7 @@ xfs_rtrefcountbt_block_count(
* additional to the records and pointers that fit inside the inode
* forks.
*/
-STATIC uint
+uint
xfs_calc_inode_res(
struct xfs_mount *mp,
uint ninodes)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0554b9d775d2..e76052028cc9 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -97,6 +97,10 @@ struct xfs_trans_resv {
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+unsigned int xfs_refcountbt_block_count(struct xfs_mount *mp,
+ unsigned int num_ops);
+uint xfs_calc_buf_res(uint nbufs, uint size);
+uint xfs_calc_inode_res(struct xfs_mount *mp, uint ninodes);
unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 3d52e9d7ad57..c62b9c1dd448 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -77,6 +77,11 @@ xfs_bui_item_size(
*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
}
+unsigned int xfs_bui_item_overhead(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bui log item. We use only 1 iovec, and we point that
@@ -168,6 +173,11 @@ xfs_bud_item_size(
*nbytes += sizeof(struct xfs_bud_log_format);
}
+unsigned int xfs_bud_item_overhead(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 6fee6a508343..655b30bc1736 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -72,4 +72,7 @@ struct xfs_bmap_intent;
void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+unsigned int xfs_bui_item_overhead(unsigned int nr);
+unsigned int xfs_bud_item_overhead(void);
+
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 19eb0b7a3e58..f89fb81517c9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -103,6 +103,25 @@ xfs_buf_item_size_segment(
return;
}
+/*
+ * Compute the worst case log item overhead for an invalidated buffer with the
+ * given map count and block size.
+ */
+unsigned int
+xfs_buf_inval_item_overhead(
+ unsigned int map_count,
+ unsigned int blocksize)
+{
+ unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
+ unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
+ unsigned int ret =
+ offsetof(struct xfs_buf_log_format, blf_data_map) +
+ (bitmap_size * sizeof_field(struct xfs_buf_log_format,
+ blf_data_map[0]));
+
+ return ret * map_count;
+}
+
/*
* Return the number of log iovecs and space needed to log the given buf log
* item.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 8cde85259a58..a273f45b558d 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -64,6 +64,9 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
+unsigned int xfs_buf_inval_item_overhead(unsigned int map_count,
+ unsigned int blocksize);
+
extern struct kmem_cache *xfs_buf_item_cache;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 777438b853da..3454eb643627 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -83,6 +83,11 @@ xfs_efi_item_size(
*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
+unsigned int xfs_efi_item_overhead(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efi log item. We use only 1 iovec, and we point that
@@ -254,6 +259,11 @@ xfs_efd_item_size(
*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
+unsigned int xfs_efd_item_overhead(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efd log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 41b7c4306079..ebb237a4ae87 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
struct xfs_extent_free_item *xefi,
struct xfs_defer_pending **dfpp);
+unsigned int xfs_efi_item_overhead(unsigned int nr);
+unsigned int xfs_efd_item_overhead(unsigned int nr);
+
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 1ca406ec1b40..f66d2d430e4f 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs(
* Then round nbytes up to 64-bit alignment so that the initial
* buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs *
- (sizeof(uint64_t) + sizeof(struct xlog_op_header));
- nbytes = round_up(nbytes, sizeof(uint64_t));
+ nbytes = xlog_item_space(niovecs, nbytes);
/*
* The data buffer needs to start 64-bit aligned, so round up
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f3d78869e5e5..39a102cc1b43 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -698,4 +698,17 @@ xlog_kvmalloc(
return p;
}
+/*
+ * Given a count of iovecs and space for a log item, compute the space we need
+ * in the log to store that data plus the log headers.
+ */
+static inline unsigned int
+xlog_item_space(
+ unsigned int niovecs,
+ unsigned int nbytes)
+{
+ nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
+ return round_up(nbytes, sizeof(uint64_t));
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fe2d7aab8554..02defb711641 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -78,6 +78,11 @@ xfs_cui_item_size(
*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
}
+unsigned int xfs_cui_item_overhead(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_cui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given cui log item. We use only 1 iovec, and we point that
@@ -179,6 +184,11 @@ xfs_cud_item_size(
*nbytes += sizeof(struct xfs_cud_log_format);
}
+unsigned int xfs_cud_item_overhead(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_cud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given cud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index bfee8f30c63c..5976cf0a04a6 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -76,4 +76,7 @@ struct xfs_refcount_intent;
void xfs_refcount_defer_add(struct xfs_trans *tp,
struct xfs_refcount_intent *ri);
+unsigned int xfs_cui_item_overhead(unsigned int nr);
+unsigned int xfs_cud_item_overhead(void);
+
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 89decffe76c8..452300725641 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -77,6 +77,11 @@ xfs_rui_item_size(
*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
}
+unsigned int xfs_rui_item_overhead(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_rui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given rui log item. We use only 1 iovec, and we point that
@@ -180,6 +185,11 @@ xfs_rud_item_size(
*nbytes += sizeof(struct xfs_rud_log_format);
}
+unsigned int xfs_rud_item_overhead(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_rud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given rud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 40d331555675..0dac2cfe4567 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -75,4 +75,7 @@ struct xfs_rmap_intent;
void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
+unsigned int xfs_rui_item_overhead(unsigned int nr);
+unsigned int xfs_rud_item_overhead(void);
+
#endif /* __XFS_RMAP_ITEM_H__ */
--
2.31.1
* Re: [PATCH v6 02/12] xfs: add helpers to compute log item overhead
From: Dave Chinner @ 2025-04-08 22:50 UTC
To: John Garry
Cc: brauner, djwong, hch, viro, jack, cem, linux-fsdevel, dchinner,
linux-xfs, linux-kernel, ojaswin, ritesh.list, martin.petersen,
linux-ext4, linux-block, catherine.hoang
On Tue, Apr 08, 2025 at 10:41:59AM +0000, John Garry wrote:
> From: "Darrick J. Wong" <djwong@kernel.org>
>
> Add selected helpers to estimate the transaction reservation required to
> write various log intent and buffer items to the log. These helpers
> will be used by the online repair code for more precise estimations of
> how much work can be done in a single transaction.
>
> Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> fs/xfs/libxfs/xfs_trans_resv.c | 6 +++---
> fs/xfs/libxfs/xfs_trans_resv.h | 4 ++++
> fs/xfs/xfs_bmap_item.c | 10 ++++++++++
> fs/xfs/xfs_bmap_item.h | 3 +++
> fs/xfs/xfs_buf_item.c | 19 +++++++++++++++++++
> fs/xfs/xfs_buf_item.h | 3 +++
> fs/xfs/xfs_extfree_item.c | 10 ++++++++++
> fs/xfs/xfs_extfree_item.h | 3 +++
> fs/xfs/xfs_log_cil.c | 4 +---
> fs/xfs/xfs_log_priv.h | 13 +++++++++++++
> fs/xfs/xfs_refcount_item.c | 10 ++++++++++
> fs/xfs/xfs_refcount_item.h | 3 +++
> fs/xfs/xfs_rmap_item.c | 10 ++++++++++
> fs/xfs/xfs_rmap_item.h | 3 +++
> 14 files changed, 95 insertions(+), 6 deletions(-)
>
> diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
> index 13d00c7166e1..ce1393bd3561 100644
> --- a/fs/xfs/libxfs/xfs_trans_resv.c
> +++ b/fs/xfs/libxfs/xfs_trans_resv.c
> @@ -47,7 +47,7 @@ xfs_buf_log_overhead(void)
> * will be changed in a transaction. size is used to tell how many
> * bytes should be reserved per item.
> */
> -STATIC uint
> +uint
> xfs_calc_buf_res(
> uint nbufs,
> uint size)
> @@ -84,7 +84,7 @@ xfs_allocfree_block_count(
> * in the same transaction as an allocation or a free, so we compute them
> * separately.
> */
> -static unsigned int
> +unsigned int
> xfs_refcountbt_block_count(
> struct xfs_mount *mp,
> unsigned int num_ops)
> @@ -129,7 +129,7 @@ xfs_rtrefcountbt_block_count(
> * additional to the records and pointers that fit inside the inode
> * forks.
> */
> -STATIC uint
> +uint
> xfs_calc_inode_res(
> struct xfs_mount *mp,
> uint ninodes)
> diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
> index 0554b9d775d2..e76052028cc9 100644
> --- a/fs/xfs/libxfs/xfs_trans_resv.h
> +++ b/fs/xfs/libxfs/xfs_trans_resv.h
> @@ -97,6 +97,10 @@ struct xfs_trans_resv {
>
> void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
> uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
> +unsigned int xfs_refcountbt_block_count(struct xfs_mount *mp,
> + unsigned int num_ops);
> +uint xfs_calc_buf_res(uint nbufs, uint size);
> +uint xfs_calc_inode_res(struct xfs_mount *mp, uint ninodes);
Why are these exported? They aren't used in this patch, and any code
that does log reservation calculations should really be placed in
xfs_trans_resv.c along with all the existing log reservation
calculations...
-Dave
--
Dave Chinner
david@fromorbit.com
* Re: [PATCH v6 02/12] xfs: add helpers to compute log item overhead
From: Darrick J. Wong @ 2025-04-08 23:21 UTC
To: Dave Chinner
Cc: John Garry, brauner, hch, viro, jack, cem, linux-fsdevel,
dchinner, linux-xfs, linux-kernel, ojaswin, ritesh.list,
martin.petersen, linux-ext4, linux-block, catherine.hoang
On Wed, Apr 09, 2025 at 08:50:57AM +1000, Dave Chinner wrote:
> On Tue, Apr 08, 2025 at 10:41:59AM +0000, John Garry wrote:
> > From: "Darrick J. Wong" <djwong@kernel.org>
> >
> > Add selected helpers to estimate the transaction reservation required to
> > write various log intent and buffer items to the log. These helpers
> > will be used by the online repair code for more precise estimations of
> > how much work can be done in a single transaction.
> >
> > Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
> > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > ---
> > fs/xfs/libxfs/xfs_trans_resv.c | 6 +++---
> > fs/xfs/libxfs/xfs_trans_resv.h | 4 ++++
> > fs/xfs/xfs_bmap_item.c | 10 ++++++++++
> > fs/xfs/xfs_bmap_item.h | 3 +++
> > fs/xfs/xfs_buf_item.c | 19 +++++++++++++++++++
> > fs/xfs/xfs_buf_item.h | 3 +++
> > fs/xfs/xfs_extfree_item.c | 10 ++++++++++
> > fs/xfs/xfs_extfree_item.h | 3 +++
> > fs/xfs/xfs_log_cil.c | 4 +---
> > fs/xfs/xfs_log_priv.h | 13 +++++++++++++
> > fs/xfs/xfs_refcount_item.c | 10 ++++++++++
> > fs/xfs/xfs_refcount_item.h | 3 +++
> > fs/xfs/xfs_rmap_item.c | 10 ++++++++++
> > fs/xfs/xfs_rmap_item.h | 3 +++
> > 14 files changed, 95 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
> > index 13d00c7166e1..ce1393bd3561 100644
> > --- a/fs/xfs/libxfs/xfs_trans_resv.c
> > +++ b/fs/xfs/libxfs/xfs_trans_resv.c
> > @@ -47,7 +47,7 @@ xfs_buf_log_overhead(void)
> > * will be changed in a transaction. size is used to tell how many
> > * bytes should be reserved per item.
> > */
> > -STATIC uint
> > +uint
> > xfs_calc_buf_res(
> > uint nbufs,
> > uint size)
> > @@ -84,7 +84,7 @@ xfs_allocfree_block_count(
> > * in the same transaction as an allocation or a free, so we compute them
> > * separately.
> > */
> > -static unsigned int
> > +unsigned int
> > xfs_refcountbt_block_count(
> > struct xfs_mount *mp,
> > unsigned int num_ops)
> > @@ -129,7 +129,7 @@ xfs_rtrefcountbt_block_count(
> > * additional to the records and pointers that fit inside the inode
> > * forks.
> > */
> > -STATIC uint
> > +uint
> > xfs_calc_inode_res(
> > struct xfs_mount *mp,
> > uint ninodes)
> > diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
> > index 0554b9d775d2..e76052028cc9 100644
> > --- a/fs/xfs/libxfs/xfs_trans_resv.h
> > +++ b/fs/xfs/libxfs/xfs_trans_resv.h
> > @@ -97,6 +97,10 @@ struct xfs_trans_resv {
> >
> > void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
> > uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
> > +unsigned int xfs_refcountbt_block_count(struct xfs_mount *mp,
> > + unsigned int num_ops);
> > +uint xfs_calc_buf_res(uint nbufs, uint size);
> > +uint xfs_calc_inode_res(struct xfs_mount *mp, uint ninodes);
>
> Why are these exported? They aren't used in this patch, and any code
> that does log reservation calculations should really be placed in
> xfs_trans_resv.c along with all the existing log reservation
> calculations...
I've redone this in a different manner, will send a full patchset after
it runs through QA.
--D
> -Dave
> --
> Dave Chinner
> david@fromorbit.com
* [PATCH v6.1 02/12] xfs: add helpers to compute log item overhead
From: Darrick J. Wong @ 2025-04-09 2:25 UTC
To: John Garry
Cc: brauner, hch, viro, jack, cem, linux-fsdevel, dchinner, linux-xfs,
linux-kernel, ojaswin, ritesh.list, martin.petersen, linux-ext4,
linux-block, catherine.hoang
From: Darrick J. Wong <djwong@kernel.org>
Add selected helpers to estimate the transaction reservation required to
write various log intent and buffer items to the log. These helpers
will be used by the online repair code for more precise estimations of
how much work can be done in a single transaction.
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
6.1: ripped out the transaction reservation bits
---
fs/xfs/xfs_bmap_item.h | 3 +++
fs/xfs/xfs_buf_item.h | 3 +++
fs/xfs/xfs_extfree_item.h | 3 +++
fs/xfs/xfs_log_priv.h | 13 +++++++++++++
fs/xfs/xfs_refcount_item.h | 3 +++
fs/xfs/xfs_rmap_item.h | 3 +++
fs/xfs/xfs_bmap_item.c | 10 ++++++++++
fs/xfs/xfs_buf_item.c | 19 +++++++++++++++++++
fs/xfs/xfs_extfree_item.c | 10 ++++++++++
fs/xfs/xfs_log_cil.c | 4 +---
fs/xfs/xfs_refcount_item.c | 10 ++++++++++
fs/xfs/xfs_rmap_item.c | 10 ++++++++++
12 files changed, 88 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 6fee6a5083436b..b42fee06899df6 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -72,4 +72,7 @@ struct xfs_bmap_intent;
void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+unsigned int xfs_bui_log_space(unsigned int nr);
+unsigned int xfs_bud_log_space(void);
+
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 8cde85259a586d..e10e324cd24504 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -64,6 +64,9 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp)
void xfs_buf_iodone(struct xfs_buf *);
bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec);
+unsigned int xfs_buf_inval_log_space(unsigned int map_count,
+ unsigned int blocksize);
+
extern struct kmem_cache *xfs_buf_item_cache;
#endif /* __XFS_BUF_ITEM_H__ */
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 41b7c43060799b..c8402040410b54 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp,
struct xfs_extent_free_item *xefi,
struct xfs_defer_pending **dfpp);
+unsigned int xfs_efi_log_space(unsigned int nr);
+unsigned int xfs_efd_log_space(unsigned int nr);
+
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index f3d78869e5e5a3..39a102cc1b43e6 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -698,4 +698,17 @@ xlog_kvmalloc(
return p;
}
+/*
+ * Given a count of iovecs and space for a log item, compute the space we need
+ * in the log to store that data plus the log headers.
+ */
+static inline unsigned int
+xlog_item_space(
+ unsigned int niovecs,
+ unsigned int nbytes)
+{
+ nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header));
+ return round_up(nbytes, sizeof(uint64_t));
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index bfee8f30c63ce9..0fc3f493342bbd 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -76,4 +76,7 @@ struct xfs_refcount_intent;
void xfs_refcount_defer_add(struct xfs_trans *tp,
struct xfs_refcount_intent *ri);
+unsigned int xfs_cui_log_space(unsigned int nr);
+unsigned int xfs_cud_log_space(void);
+
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 40d331555675ba..3a99f0117f2d8a 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -75,4 +75,7 @@ struct xfs_rmap_intent;
void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri);
+unsigned int xfs_rui_log_space(unsigned int nr);
+unsigned int xfs_rud_log_space(void);
+
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 3d52e9d7ad571a..646c515ee35518 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -77,6 +77,11 @@ xfs_bui_item_size(
*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
}
+unsigned int xfs_bui_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_bui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bui log item. We use only 1 iovec, and we point that
@@ -168,6 +173,11 @@ xfs_bud_item_size(
*nbytes += sizeof(struct xfs_bud_log_format);
}
+unsigned int xfs_bud_log_space(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_bud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given bud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 47549cfa61cd82..254d0a072f983b 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -167,6 +167,25 @@ xfs_buf_item_size_segment(
}
}
+/*
+ * Compute the worst case log item overhead for an invalidated buffer with the
+ * given map count and block size.
+ */
+unsigned int
+xfs_buf_inval_log_space(
+ unsigned int map_count,
+ unsigned int blocksize)
+{
+ unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK);
+ unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD);
+ unsigned int ret =
+ offsetof(struct xfs_buf_log_format, blf_data_map) +
+ (bitmap_size * sizeof_field(struct xfs_buf_log_format,
+ blf_data_map[0]));
+
+ return ret * map_count;
+}
+
/*
* Return the number of log iovecs and space needed to log the given buf log
* item.
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index a25c713ff888c7..0163dd6426661d 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -82,6 +82,11 @@ xfs_efi_item_size(
*nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents);
}
+unsigned int xfs_efi_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_efi_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efi log item. We use only 1 iovec, and we point that
@@ -253,6 +258,11 @@ xfs_efd_item_size(
*nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents);
}
+unsigned int xfs_efd_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_efd_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given efd log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 1ca406ec1b40b3..f66d2d430e4f37 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs(
* Then round nbytes up to 64-bit alignment so that the initial
* buffer alignment is easy to calculate and verify.
*/
- nbytes += niovecs *
- (sizeof(uint64_t) + sizeof(struct xlog_op_header));
- nbytes = round_up(nbytes, sizeof(uint64_t));
+ nbytes = xlog_item_space(niovecs, nbytes);
/*
* The data buffer needs to start 64-bit aligned, so round up
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index fe2d7aab8554fc..076501123d89f0 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -78,6 +78,11 @@ xfs_cui_item_size(
*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
}
+unsigned int xfs_cui_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_cui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given cui log item. We use only 1 iovec, and we point that
@@ -179,6 +184,11 @@ xfs_cud_item_size(
*nbytes += sizeof(struct xfs_cud_log_format);
}
+unsigned int xfs_cud_log_space(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_cud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given cud log item. We use only 1 iovec, and we point that
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 89decffe76c8b5..c99700318ec24b 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -77,6 +77,11 @@ xfs_rui_item_size(
*nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents);
}
+unsigned int xfs_rui_log_space(unsigned int nr)
+{
+ return xlog_item_space(1, xfs_rui_log_format_sizeof(nr));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given rui log item. We use only 1 iovec, and we point that
@@ -180,6 +185,11 @@ xfs_rud_item_size(
*nbytes += sizeof(struct xfs_rud_log_format);
}
+unsigned int xfs_rud_log_space(void)
+{
+ return xlog_item_space(1, sizeof(struct xfs_rud_log_format));
+}
+
/*
* This is called to fill in the vector of log iovecs for the
* given rud log item. We use only 1 iovec, and we point that
* [PATCH v6.1 RFC 02.1/12] xfs: add helpers to compute transaction reservation for finishing intent items
From: Darrick J. Wong @ 2025-04-09 2:25 UTC
To: John Garry
Cc: brauner, hch, viro, jack, cem, linux-fsdevel, dchinner, linux-xfs,
linux-kernel, ojaswin, ritesh.list, martin.petersen, linux-ext4,
linux-block, catherine.hoang
From: Darrick J. Wong <djwong@kernel.org>
In the transaction reservation code, hoist the logic that computes the
reservation needed to finish one log intent item into separate helper
functions. These will be used in subsequent patches to estimate the
number of blocks that an online repair can commit to reaping in the same
transaction as the change committing the new data structure.
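
As a hypothetical illustration (not part of this patch), repair code could
use one of these helpers to estimate how many extents it can afford to
free per transaction (the function name below is made up):

	/*
	 * Hypothetical sketch only: find the largest 'nr' whose EFI
	 * finishing reservation still fits within 'logres' bytes.
	 */
	static unsigned int
	xrep_max_efi_extents_per_tx(struct xfs_mount *mp, unsigned int logres)
	{
		unsigned int	nr = 0;

		while (xfs_calc_finish_efi_reservation(mp, nr + 1) <= logres)
			nr++;
		return nr;
	}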
Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/xfs/libxfs/xfs_trans_resv.h | 18 ++++
fs/xfs/libxfs/xfs_trans_resv.c | 165 ++++++++++++++++++++++++++++++++--------
2 files changed, 152 insertions(+), 31 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0554b9d775d269..d9d0032cbbc5d4 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -98,6 +98,24 @@ struct xfs_trans_resv {
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
+unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
+unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp,
+ unsigned int nr_ops);
+
unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 13d00c7166e178..580d00ae28573d 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -263,6 +263,42 @@ xfs_rtalloc_block_count(
* register overflow from temporaries in the calculations.
*/
+/*
+ * Finishing data device refcount updates (t1):
+ * the agfs of the ags containing the blocks: nr_ops * sector size
+ * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_cui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ if (!xfs_has_reflink(mp))
+ return 0;
+
+ return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Realtime refcount updates (t2):
+ * the rt refcount inode
+ * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_rt_cui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr_ops)
+{
+ if (!xfs_has_rtreflink(mp))
+ return 0;
+
+ return xfs_calc_inode_res(mp, 1) +
+ xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
+ mp->m_sb.sb_blocksize);
+}
+
/*
* Compute the log reservation required to handle the refcount update
* transaction. Refcount updates are always done via deferred log items.
@@ -280,19 +316,10 @@ xfs_calc_refcountbt_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
- unsigned int blksz = XFS_FSB_TO_B(mp, 1);
- unsigned int t1, t2 = 0;
+ unsigned int t1, t2;
- if (!xfs_has_reflink(mp))
- return 0;
-
- t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz);
-
- if (xfs_has_realtime(mp))
- t2 = xfs_calc_inode_res(mp, 1) +
- xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops),
- blksz);
+ t1 = xfs_calc_finish_cui_reservation(mp, nr_ops);
+ t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops);
return max(t1, t2);
}
@@ -379,6 +406,96 @@ xfs_calc_write_reservation_minlogsize(
return xfs_calc_write_reservation(mp, true);
}
+/*
+ * Finishing an EFI can free the blocks and bmap blocks (t2):
+ * the agf for each of the ags: nr * sector size
+ * the agfl for each of the ags: nr * sector size
+ * the super block to reflect the freed blocks: sector size
+ * worst case split in allocation btrees per extent assuming nr extents:
+ * nr exts * 2 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_efi_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Or, if it's a realtime file (t3):
+ * the agf for each of the ags: 2 * sector size
+ * the agfl for each of the ags: 2 * sector size
+ * the super block to reflect the freed blocks: sector size
+ * the realtime bitmap:
+ * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes
+ * the realtime summary: 2 exts * 1 block
+ * worst case split in allocation btrees per extent assuming 2 extents:
+ * 2 exts * 2 trees * (2 * max depth - 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_rt_efi_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_realtime(mp))
+ return 0;
+
+ return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
+ xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr),
+ mp->m_sb.sb_blocksize) +
+ xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
+ mp->m_sb.sb_blocksize);
+}
+
+/*
+ * Finishing an RUI is the same as an EFI. We can split the rmap btree twice
+ * on each end of the record, and that can cause the AGFL to be refilled or
+ * emptied out.
+ */
+inline unsigned int
+xfs_calc_finish_rui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_rmapbt(mp))
+ return 0;
+ return xfs_calc_finish_efi_reservation(mp, nr);
+}
+
+/*
+ * Finishing an RUI is the same as an EFI. We can split the rmap btree twice
+ * on each end of the record, and that can cause the AGFL to be refilled or
+ * emptied out.
+ */
+inline unsigned int
+xfs_calc_finish_rt_rui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ if (!xfs_has_rtrmapbt(mp))
+ return 0;
+ return xfs_calc_finish_rt_efi_reservation(mp, nr);
+}
+
+/*
+ * In finishing a BUI, we can modify:
+ * the inode being truncated: inode size
+ * dquots
+ * the inode's bmap btree: (max depth + 1) * block size
+ */
+inline unsigned int
+xfs_calc_finish_bui_reservation(
+ struct xfs_mount *mp,
+ unsigned int nr)
+{
+ return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES +
+ xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
+ mp->m_sb.sb_blocksize);
+}
+
/*
* In truncating a file we free up to two extents at once. We can modify (t1):
* the inode being truncated: inode size
@@ -411,16 +528,8 @@ xfs_calc_itruncate_reservation(
t1 = xfs_calc_inode_res(mp, 1) +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
- t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz);
-
- if (xfs_has_realtime(mp)) {
- t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz);
- } else {
- t3 = 0;
- }
+ t2 = xfs_calc_finish_efi_reservation(mp, 4);
+ t3 = xfs_calc_finish_rt_efi_reservation(mp, 2);
/*
* In the early days of reflink, we included enough reservation to log
@@ -501,9 +610,7 @@ xfs_calc_rename_reservation(
xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp),
XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 3);
if (xfs_has_parent(mp)) {
unsigned int rename_overhead, exchange_overhead;
@@ -611,9 +718,7 @@ xfs_calc_link_reservation(
overhead += xfs_calc_iunlink_remove_reservation(mp);
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 1);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrsetm.tr_logres;
@@ -676,9 +781,7 @@ xfs_calc_remove_reservation(
t1 = xfs_calc_inode_res(mp, 2) +
xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1));
- t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) +
- xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2),
- XFS_FSB_TO_B(mp, 1));
+ t2 = xfs_calc_finish_efi_reservation(mp, 2);
if (xfs_has_parent(mp)) {
t3 = resp->tr_attrrm.tr_logres;
* [PATCH v6 03/12] xfs: rename xfs_inode_can_atomicwrite() -> xfs_inode_can_hw_atomicwrite()
From: John Garry @ 2025-04-08 10:42 UTC
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
In the future we will want to be able to check specifically whether HW
offload-based atomic writes are possible, so rename
xfs_inode_can_atomicwrite() -> xfs_inode_can_hw_atomicwrite().
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_file.c | 2 +-
fs/xfs/xfs_inode.h | 2 +-
fs/xfs/xfs_iops.c | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 84f08c976ac4..653e42ccc0c3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1488,7 +1488,7 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
- if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ if (xfs_inode_can_hw_atomicwrite(XFS_I(inode)))
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index eae0159983ca..cff643cd03fc 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -357,7 +357,7 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
static inline bool
-xfs_inode_can_atomicwrite(
+xfs_inode_can_hw_atomicwrite(
struct xfs_inode *ip)
{
struct xfs_mount *mp = ip->i_mount;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index f0e5d83195df..d324044a2225 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -608,7 +608,7 @@ xfs_report_atomic_write(
{
unsigned int unit_min = 0, unit_max = 0;
- if (xfs_inode_can_atomicwrite(ip))
+ if (xfs_inode_can_hw_atomicwrite(ip))
unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
generic_fill_statx_atomic_writes(stat, unit_min, unit_max, 0);
}
--
2.31.1
* Re: [PATCH v6 03/12] xfs: rename xfs_inode_can_atomicwrite() -> xfs_inode_can_hw_atomicwrite()
From: Darrick J. Wong @ 2025-04-09 2:02 UTC
To: John Garry
Cc: brauner, hch, viro, jack, cem, linux-fsdevel, dchinner, linux-xfs,
linux-kernel, ojaswin, ritesh.list, martin.petersen, linux-ext4,
linux-block, catherine.hoang
On Tue, Apr 08, 2025 at 10:42:00AM +0000, John Garry wrote:
> In the future we will want to be able to check specifically whether HW
> offload-based atomic writes are possible, so rename
> xfs_inode_can_atomicwrite() -> xfs_inode_can_hw_atomicwrite().
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
Looks ok,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_file.c | 2 +-
> fs/xfs/xfs_inode.h | 2 +-
> fs/xfs/xfs_iops.c | 2 +-
> 3 files changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 84f08c976ac4..653e42ccc0c3 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -1488,7 +1488,7 @@ xfs_file_open(
> if (xfs_is_shutdown(XFS_M(inode->i_sb)))
> return -EIO;
> file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
> - if (xfs_inode_can_atomicwrite(XFS_I(inode)))
> + if (xfs_inode_can_hw_atomicwrite(XFS_I(inode)))
> file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
> return generic_file_open(inode, file);
> }
> diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
> index eae0159983ca..cff643cd03fc 100644
> --- a/fs/xfs/xfs_inode.h
> +++ b/fs/xfs/xfs_inode.h
> @@ -357,7 +357,7 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip)
> (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
>
> static inline bool
> -xfs_inode_can_atomicwrite(
> +xfs_inode_can_hw_atomicwrite(
> struct xfs_inode *ip)
> {
> struct xfs_mount *mp = ip->i_mount;
> diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
> index f0e5d83195df..d324044a2225 100644
> --- a/fs/xfs/xfs_iops.c
> +++ b/fs/xfs/xfs_iops.c
> @@ -608,7 +608,7 @@ xfs_report_atomic_write(
> {
> unsigned int unit_min = 0, unit_max = 0;
>
> - if (xfs_inode_can_atomicwrite(ip))
> + if (xfs_inode_can_hw_atomicwrite(ip))
> unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
> generic_fill_statx_atomic_writes(stat, unit_min, unit_max, 0);
> }
> --
> 2.31.1
>
>
* Re: [PATCH v6 03/12] xfs: rename xfs_inode_can_atomicwrite() -> xfs_inode_can_hw_atomicwrite()
From: Christoph Hellwig @ 2025-04-09 10:46 UTC
To: John Garry
Cc: brauner, djwong, hch, viro, jack, cem, linux-fsdevel, dchinner,
linux-xfs, linux-kernel, ojaswin, ritesh.list, martin.petersen,
linux-ext4, linux-block, catherine.hoang
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH v6 04/12] xfs: allow block allocator to take an alignment hint
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (2 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 03/12] xfs: rename xfs_inode_can_atomicwrite() -> xfs_inode_can_hw_atomicwrite() John Garry
@ 2025-04-08 10:42 ` John Garry
2025-04-08 10:42 ` [PATCH v6 05/12] xfs: refactor xfs_reflink_end_cow_extent() John Garry
` (7 subsequent siblings)
11 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
Add a BMAPI flag to provide a hint to the block allocator to align extents
according to the extszhint.
This will be useful for atomic writes to ensure that we are not allocated
extents which are unsuitable for atomic writes (i.e. misaligned).
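As an illustration, a caller opts in by passing the new flag to
xfs_bmapi_write(); this sketch mirrors the CoW fork allocation added
later in this series:

	error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
			XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);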
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/libxfs/xfs_bmap.c | 5 +++++
fs/xfs/libxfs/xfs_bmap.h | 6 +++++-
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 63255820b58a..d954f9b8071f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3312,6 +3312,11 @@ xfs_bmap_compute_alignments(
align = xfs_get_cowextsz_hint(ap->ip);
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
+
+ /* Try to align start block to any minimum allocation alignment */
+ if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))
+ args->alignment = align;
+
if (align) {
if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
ap->eof, 0, ap->conv, &ap->offset,
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b4d9c6e0f3f9..d5f2729305fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -87,6 +87,9 @@ struct xfs_bmalloca {
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
#define XFS_BMAPI_NORMAP (1u << 10)
+/* Try to align allocations to the extent size hint */
+#define XFS_BMAPI_EXTSZALIGN (1u << 11)
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -98,7 +101,8 @@ struct xfs_bmalloca {
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
- { XFS_BMAPI_NORMAP, "NORMAP" }
+ { XFS_BMAPI_NORMAP, "NORMAP" },\
+ { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" }
static inline int xfs_bmapi_aflag(int w)
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v6 05/12] xfs: refactor xfs_reflink_end_cow_extent()
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (3 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 04/12] xfs: allow block allocator to take an alignment hint John Garry
@ 2025-04-08 10:42 ` John Garry
2025-04-08 10:42 ` [PATCH v6 06/12] xfs: refine atomic write size check in xfs_file_write_iter() John Garry
` (6 subsequent siblings)
11 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
Refactor xfs_reflink_end_cow_extent() into separate parts which process
the CoW range and commit the transaction.
This refactoring will be used in future when it is required to commit
a range of extents in a single transaction, similar to how it was done
before commit d6f215f359637.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_reflink.c | 72 ++++++++++++++++++++++++++------------------
1 file changed, 42 insertions(+), 30 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cc3b4df88110..bd711c5bb6bb 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -786,35 +786,19 @@ xfs_reflink_update_quota(
* requirements as low as possible.
*/
STATIC int
-xfs_reflink_end_cow_extent(
+xfs_reflink_end_cow_extent_locked(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got, del, data;
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
- unsigned int resblks;
int nmaps;
bool isrt = XFS_IS_REALTIME_INODE(ip);
int error;
- resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- /*
- * Lock the inode. We have to ijoin without automatic unlock because
- * the lead transaction is the refcountbt record deletion; the data
- * fork update follows as a deferred log item.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -823,7 +807,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
- goto out_cancel;
+ return 0;
}
/*
@@ -837,7 +821,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_next_extent(ifp, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
- goto out_cancel;
+ return 0;
}
}
del = got;
@@ -846,14 +830,14 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
- goto out_cancel;
+ return error;
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
&nmaps, 0);
if (error)
- goto out_cancel;
+ return error;
/* We can only remap the smaller of the two extent sizes. */
data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
@@ -882,7 +866,7 @@ xfs_reflink_end_cow_extent(
error = xfs_bunmapi(NULL, ip, data.br_startoff,
data.br_blockcount, 0, 1, &done);
if (error)
- goto out_cancel;
+ return error;
ASSERT(done);
}
@@ -899,17 +883,45 @@ xfs_reflink_end_cow_extent(
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
/* Update the caller about how much progress we made. */
*offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
+}
-out_cancel:
- xfs_trans_cancel(tp);
+/*
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can (at the end of the
+ * range) and update @end_fsb appropriately. Each remap gets its own
+ * transaction because we can end up merging and splitting bmbt blocks for
+ * every remap operation and we'd like to keep the block reservation
+ * requirements as low as possible.
+ */
+STATIC int
+xfs_reflink_end_cow_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+ int error;
+
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb);
+ if (error)
+ xfs_trans_cancel(tp);
+ else
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v6 06/12] xfs: refine atomic write size check in xfs_file_write_iter()
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (4 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 05/12] xfs: refactor xfs_reflink_end_cow_extent() John Garry
@ 2025-04-08 10:42 ` John Garry
2025-04-08 10:42 ` [PATCH v6 07/12] xfs: add xfs_atomic_write_cow_iomap_begin() John Garry
` (5 subsequent siblings)
11 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
Currently the size of atomic write allowed is fixed at the blocksize.
To start to lift this restriction, partly refactor
xfs_report_atomic_write() into helpers -
xfs_get_atomic_write_{min, max}() - and use those helpers to find the
per-inode atomic write limits and check the write size against them.
Also add xfs_get_atomic_write_max_opt() to return the optimal limit; it
just returns 0 since large atomic writes aren't supported yet.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
Please check new function xfs_get_atomic_write_max_opt(), added since RB
tags were granted
fs/xfs/xfs_file.c | 12 +++++-------
fs/xfs/xfs_iops.c | 36 +++++++++++++++++++++++++++++++-----
fs/xfs/xfs_iops.h | 3 +++
3 files changed, 39 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 653e42ccc0c3..1302783a7157 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1032,14 +1032,12 @@ xfs_file_write_iter(
return xfs_file_dax_write(iocb, from);
if (iocb->ki_flags & IOCB_ATOMIC) {
- /*
- * Currently only atomic writing of a single FS block is
- * supported. It would be possible to atomic write smaller than
- * a FS block, but there is no requirement to support this.
- * Note that iomap also does not support this yet.
- */
- if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ if (ocount < xfs_get_atomic_write_min(ip))
return -EINVAL;
+
+ if (ocount > xfs_get_atomic_write_max(ip))
+ return -EINVAL;
+
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index d324044a2225..3b5aa39dbfe9 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -601,16 +601,42 @@ xfs_report_dioalign(
stat->dio_offset_align = stat->dio_read_offset_align;
}
+unsigned int
+xfs_get_atomic_write_min(
+ struct xfs_inode *ip)
+{
+ if (!xfs_inode_can_hw_atomicwrite(ip))
+ return 0;
+
+ return ip->i_mount->m_sb.sb_blocksize;
+}
+
+unsigned int
+xfs_get_atomic_write_max(
+ struct xfs_inode *ip)
+{
+ if (!xfs_inode_can_hw_atomicwrite(ip))
+ return 0;
+
+ return ip->i_mount->m_sb.sb_blocksize;
+}
+
+unsigned int
+xfs_get_atomic_write_max_opt(
+ struct xfs_inode *ip)
+{
+ return 0;
+}
+
static void
xfs_report_atomic_write(
struct xfs_inode *ip,
struct kstat *stat)
{
- unsigned int unit_min = 0, unit_max = 0;
-
- if (xfs_inode_can_hw_atomicwrite(ip))
- unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
- generic_fill_statx_atomic_writes(stat, unit_min, unit_max, 0);
+ generic_fill_statx_atomic_writes(stat,
+ xfs_get_atomic_write_min(ip),
+ xfs_get_atomic_write_max(ip),
+ xfs_get_atomic_write_max_opt(ip));
}
STATIC int
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 3c1a2605ffd2..0896f6b8b3b8 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir,
extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip);
#endif /* __XFS_IOPS_H__ */
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v6 07/12] xfs: add xfs_atomic_write_cow_iomap_begin()
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (5 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 06/12] xfs: refine atomic write size check in xfs_file_write_iter() John Garry
@ 2025-04-08 10:42 ` John Garry
2025-04-08 10:42 ` [PATCH v6 08/12] xfs: add large atomic writes checks in xfs_direct_write_iomap_begin() John Garry
` (4 subsequent siblings)
11 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
For CoW-based atomic writes, reuse the infrastructure for reflink CoW fork
support.
Add ->iomap_begin() callback xfs_atomic_write_cow_iomap_begin() to create
staging mappings in the CoW fork for atomic write updates.
The general steps in the function are as follows:
- find extent mapping in the CoW fork for the FS block range being written
- if part or full extent is found, proceed to process found extent
- if no extent found, map in new blocks to the CoW fork
- convert unwritten blocks in extent if required
- update iomap extent mapping and return
The bulk of this function is quite similar to the processing in
xfs_reflink_allocate_cow(), where we try to find an extent mapping; if
none exists, then allocate a new extent in the CoW fork, convert unwritten
blocks, and return a mapping.
Performance testing has shown the XFS_ILOCK_EXCL locking to be quite
a bottleneck, so this is an area which could be optimised in future.
Christoph Hellwig contributed almost all of the code in
xfs_atomic_write_cow_iomap_begin().
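A later patch in this series wires these ops into the direct I/O write
path; roughly, the CoW-based atomic write is then issued as this sketch
shows:

	ret = iomap_dio_rw(iocb, from, &xfs_atomic_write_cow_iomap_ops,
			&xfs_dio_write_ops, 0, NULL, 0);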
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_iomap.c | 118 +++++++++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_iomap.h | 1 +
fs/xfs/xfs_reflink.c | 2 +-
fs/xfs/xfs_reflink.h | 2 +
fs/xfs/xfs_trace.h | 22 ++++++++
5 files changed, 144 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index cb23c8871f81..fab5078bbf00 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1022,6 +1022,124 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
};
#endif /* CONFIG_XFS_RT */
+static int
+xfs_atomic_write_cow_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ int nmaps = 1;
+ xfs_filblks_t resaligned;
+ struct xfs_bmbt_irec cmap;
+ struct xfs_iext_cursor icur;
+ struct xfs_trans *tp;
+ int error;
+ u64 seq;
+
+ ASSERT(!XFS_IS_REALTIME_INODE(ip));
+ ASSERT(flags & IOMAP_WRITE);
+ ASSERT(flags & IOMAP_DIRECT);
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (WARN_ON_ONCE(!xfs_has_reflink(mp)))
+ return -EINVAL;
+
+ /* blocks are always allocated in this path */
+ if (flags & IOMAP_NOWAIT)
+ return -EAGAIN;
+
+ trace_xfs_iomap_atomic_write_cow(ip, offset, length);
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cmap.br_startoff = end_fsb;
+ if (cmap.br_startoff <= offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+ goto found;
+ }
+
+ end_fsb = cmap.br_startoff;
+ count_fsb = end_fsb - offset_fsb;
+
+ resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+ xfs_get_cowextsz_hint(ip));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
+ if (error)
+ return error;
+
+ /* extent layout could have changed since the unlock, so check again */
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cmap.br_startoff = end_fsb;
+ if (cmap.br_startoff <= offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+ xfs_trans_cancel(tp);
+ goto found;
+ }
+
+ /*
+ * Allocate the entire reservation as unwritten blocks.
+ *
+ * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
+ * extszhint, such that there will be a greater chance that future
+ * atomic writes to that same range will be aligned (and don't require
+ * this COW-based method).
+ */
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
+ XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
+ if (error) {
+ xfs_trans_cancel(tp);
+ goto out_unlock;
+ }
+
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_unlock;
+
+found:
+ if (cmap.br_state != XFS_EXT_NORM) {
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb,
+ count_fsb);
+ if (error)
+ goto out_unlock;
+ cmap.br_state = XFS_EXT_NORM;
+ }
+
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+
+out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
+const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
+ .iomap_begin = xfs_atomic_write_cow_iomap_begin,
+};
+
static int
xfs_dax_write_iomap_end(
struct inode *inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index d330c4a581b1..674f8ac1b9bd 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
extern const struct iomap_ops xfs_dax_write_iomap_ops;
+extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index bd711c5bb6bb..f5d338916098 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -293,7 +293,7 @@ xfs_bmap_trim_cow(
return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-static int
+int
xfs_reflink_convert_cow_locked(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index cc4e92278279..379619f24247 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
bool convert_now);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_reflink_convert_cow_locked(struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb);
extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index e56ba1963160..9554578c6da4 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1657,6 +1657,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write);
+TRACE_EVENT(xfs_iomap_atomic_write_cow,
+ TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+ TP_ARGS(ip, offset, count),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_off_t, offset)
+ __field(ssize_t, count)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->offset = offset;
+ __entry->count = count;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->offset,
+ __entry->count)
+)
+
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v6 08/12] xfs: add large atomic writes checks in xfs_direct_write_iomap_begin()
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (6 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 07/12] xfs: add xfs_atomic_write_cow_iomap_begin() John Garry
@ 2025-04-08 10:42 ` John Garry
2025-04-08 10:42 ` [PATCH v6 09/12] xfs: commit CoW-based atomic writes atomically John Garry
` (3 subsequent siblings)
11 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
For when large atomic writes (> 1x FS block) are supported, there will be
various occasions when HW offload may not be possible.
Such instances include:
- unaligned extent mapping wrt write length
- extent mappings which do not cover the full write, e.g. the write spans
sparse or mixed-mapping extents
- the write length is greater than HW offload can support
In those cases, we need to fall back to the CoW-based atomic write mode.
For this, report special code -ENOPROTOOPT to inform the caller that the
HW offload-based method is not possible.
In addition to the occasions mentioned, if the write covers an unallocated
range, we again judge that we need to rely on the CoW-based method when we
would need to allocate anything more than 1x block. This is because if we
allocate fewer blocks than are required for the write, then again the HW
offload-based method would not be possible. So we are taking a pessimistic
approach to writes covering unallocated space.
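For reference, the caller-side fallback added later in this series
retries with the CoW-based ops when ->iomap_begin() returns
-ENOPROTOOPT; a sketch of that check:

	/* after issuing the DIO write with the REQ_ATOMIC-based ops */
	if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
		xfs_iunlock(ip, iolock);
		dops = &xfs_atomic_write_cow_iomap_ops;
		goto retry;
	}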
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_iomap.c | 65 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 63 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index fab5078bbf00..c6b5fb824f8b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -798,6 +798,41 @@ imap_spans_range(
return true;
}
+static bool
+xfs_bmap_hw_atomic_write_possible(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb);
+
+ /*
+ * atomic writes are required to be naturally aligned for disk blocks,
+ * which ensures that we adhere to block layer rules that we won't
+ * straddle any boundary or violate the write alignment requirement.
+ */
+ if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
+ return false;
+
+ /*
+ * Spanning multiple extents would mean that multiple BIOs would be
+ * issued, and so would lose the atomicity required for REQ_ATOMIC-based
+ * atomics.
+ */
+ if (!imap_spans_range(imap, offset_fsb, end_fsb))
+ return false;
+
+ /*
+ * The ->iomap_begin caller should ensure this, but check anyway.
+ */
+ if (len > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
+ return false;
+
+ return true;
+}
+
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -812,9 +847,11 @@ xfs_direct_write_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ xfs_fileoff_t orig_end_fsb = end_fsb;
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
+ bool needs_alloc;
unsigned int lockmode;
u64 seq;
@@ -875,13 +912,37 @@ xfs_direct_write_iomap_begin(
(flags & IOMAP_DIRECT) || IS_DAX(inode));
if (error)
goto out_unlock;
- if (shared)
+ if (shared) {
+ if ((flags & IOMAP_ATOMIC) &&
+ !xfs_bmap_hw_atomic_write_possible(ip, &cmap,
+ offset_fsb, end_fsb)) {
+ error = -ENOPROTOOPT;
+ goto out_unlock;
+ }
goto out_found_cow;
+ }
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- if (imap_needs_alloc(inode, flags, &imap, nimaps))
+ needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
+
+ if (flags & IOMAP_ATOMIC) {
+ error = -ENOPROTOOPT;
+ /*
+ * If we allocate less than what is required for the write
+ * then we may end up with multiple extents, which means that
+ * a REQ_ATOMIC-based write cannot be used, so avoid this possibility.
+ */
+ if (needs_alloc && orig_end_fsb - offset_fsb > 1)
+ goto out_unlock;
+
+ if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb,
+ orig_end_fsb))
+ goto out_unlock;
+ }
+
+ if (needs_alloc)
goto allocate_blocks;
/*
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v6 09/12] xfs: commit CoW-based atomic writes atomically
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (7 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 08/12] xfs: add large atomic writes checks in xfs_direct_write_iomap_begin() John Garry
@ 2025-04-08 10:42 ` John Garry
2025-04-08 10:42 ` [PATCH v6 10/12] xfs: add xfs_file_dio_write_atomic() John Garry
` (2 subsequent siblings)
11 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
When completing a CoW-based write, each extent range mapping update is
covered by a separate transaction.
For a CoW-based atomic write, all mappings must be changed at once, so
change to use a single transaction.
Note that there is a limit on the number of log intent items which can
fit into a single transaction, but this is being ignored for now since
the count of items for a typical atomic write would be much less than is
typically supported. A typical atomic write would be expected to be 64KB
or less, which means at most 16 extent unmaps, which is quite small.
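(As a worked example, assuming a 4KB filesystem block size: a 64KB write
covers 64KB / 4KB = 16 blocks, and in the worst case each block is mapped
by a separate extent, so at most 16 extents need to be unmapped.)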
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_file.c | 5 +++-
fs/xfs/xfs_reflink.c | 56 ++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_reflink.h | 2 ++
3 files changed, 62 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 1302783a7157..ba4b02abc6e4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -576,7 +576,10 @@ xfs_dio_write_end_io(
nofs_flag = memalloc_nofs_save();
if (flags & IOMAP_DIO_COW) {
- error = xfs_reflink_end_cow(ip, offset, size);
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ error = xfs_reflink_end_atomic_cow(ip, offset, size);
+ else
+ error = xfs_reflink_end_cow(ip, offset, size);
if (error)
goto out;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f5d338916098..01f1930fdde6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -984,6 +984,62 @@ xfs_reflink_end_cow(
return error;
}
+/*
+ * Fully remap all of the file's data fork at once, which is the critical part
+ * in achieving atomic behaviour.
+ * The regular CoW end path does not use this function so as to keep the
+ * block reservation per transaction as low as possible.
+ */
+int
+xfs_reflink_end_atomic_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+
+ trace_xfs_reflink_end_cow(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ end_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+ /*
+ * Each remapping operation could cause a btree split, so in the worst
+ * case that's one for each block.
+ */
+ resblks = (end_fsb - offset_fsb) *
+ XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ while (end_fsb > offset_fsb && !error) {
+ error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
+ end_fsb);
+ }
+ if (error) {
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+ goto out_cancel;
+ }
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
+
/*
* Free all CoW staging blocks that are still referenced by the ondisk refcount
* metadata. The ondisk metadata does not track which inode created the
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 379619f24247..412e9b6f2082 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -45,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count, bool cancel_real);
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v6 10/12] xfs: add xfs_file_dio_write_atomic()
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (8 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 09/12] xfs: commit CoW-based atomic writes atomically John Garry
@ 2025-04-08 10:42 ` John Garry
2025-04-08 10:42 ` [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max() John Garry
2025-04-08 10:42 ` [PATCH v6 12/12] xfs: update atomic write limits John Garry
11 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
Add xfs_file_dio_write_atomic() for dedicated handling of atomic writes.
The function works based on two operating modes:
- HW offload, i.e. REQ_ATOMIC-based
- CoW based with out-of-places write and atomic extent remapping
The preferred method is HW offload as it will be faster. If HW offload is
not possible, then we fall back to the CoW-based method.
HW offload would not be possible when the write length exceeds the HW
offload limit, when the write spans multiple extents, when disk blocks are
misaligned, etc.
Apart from the write exceeding the HW offload limit, other conditions for
HW offload can only be detected in the iomap handling for the write. As
such, we use a fallback method to issue the write if we detect in the
->iomap_begin() handler that HW offload is not possible. Special code
-ENOPROTOOPT is returned from ->iomap_begin() to indicate that HW offload
is not possible.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_file.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 68 insertions(+)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index ba4b02abc6e4..81a377f65aa3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -728,6 +728,72 @@ xfs_file_dio_write_zoned(
return ret;
}
+/*
+ * Handle block atomic writes
+ *
+ * Two methods of atomic writes are supported:
+ * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
+ * disk
+ * - COW-based, which uses a COW fork as a staging extent for data updates
+ * before atomically updating extent mappings for the range being written
+ *
+ */
+static noinline ssize_t
+xfs_file_dio_write_atomic(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ ssize_t ret, ocount = iov_iter_count(from);
+ const struct iomap_ops *dops;
+
+ /*
+ * HW offload should be faster, so try that first if it is already
+ * known that the write length is not too large.
+ */
+ if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max)
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ else
+ dops = &xfs_direct_write_iomap_ops;
+
+retry:
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
+ if (ret)
+ return ret;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
+ if (ret)
+ goto out_unlock;
+
+ /* Demote similar to xfs_file_dio_write_aligned() */
+ if (iolock == XFS_IOLOCK_EXCL) {
+ xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+ iolock = XFS_IOLOCK_SHARED;
+ }
+
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
+ 0, NULL, 0);
+
+ /*
+ * The retry mechanism is based on the ->iomap_begin method returning
+ * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not
+ * possible. The REQ_ATOMIC-based method would typically not be possible
+ * if the write spans multiple extents or the disk blocks are misaligned.
+ */
+ if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) {
+ xfs_iunlock(ip, iolock);
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ goto retry;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
/*
* Handle block unaligned direct I/O writes
*
@@ -843,6 +909,8 @@ xfs_file_dio_write(
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ return xfs_file_dio_write_atomic(ip, iocb, from);
return xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread
* [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (9 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 10/12] xfs: add xfs_file_dio_write_atomic() John Garry
@ 2025-04-08 10:42 ` John Garry
2025-04-08 21:28 ` Darrick J. Wong
2025-04-08 22:47 ` Dave Chinner
2025-04-08 10:42 ` [PATCH v6 12/12] xfs: update atomic write limits John Garry
11 siblings, 2 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
Now that CoW-based atomic writes are supported, update the max size of an
atomic write for the data device.
The limit of a CoW-based atomic write will be the limit of the number of
logitems which can fit into a single transaction.
In addition, the max atomic write size needs to be aligned to the agsize.
Limit the size of atomic writes to the greatest power-of-two factor of the
agsize so that allocations for an atomic write will always be aligned
compatibly with the alignment requirements of the storage.
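As a worked example of the agsize limit, with a hypothetical agsize of
245760 blocks (0x3C000) the lowest set bit is 2^14, so:

	max_agsize = 1 << (ffs(agsize) - 1);	/* ffs() == 15 -> 16384 */

i.e. atomic writes are capped at 16384 blocks, which always divides the
agsize evenly (245760 / 16384 == 15) and so keeps allocations aligned.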
rtvol is not commonly used, so it is not very important to support large
atomic writes there initially.
Furthermore, adding large atomic writes for rtvol would be complicated due
to the alignment already offered by rtextsize and also the limitation that
reflink support is only possible when rtextsize is a power-of-2.
Function xfs_atomic_write_logitems() is added to find the limit on the
number of log items which can fit in a single transaction.
Darrick Wong contributed the changes in xfs_atomic_write_logitems()
originally, but they may now be outdated by [0].
[0] https://lore.kernel.org/linux-xfs/20250406172227.GC6307@frogsfrogsfrogs/
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_mount.c | 36 ++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_mount.h | 5 +++++
fs/xfs/xfs_super.c | 22 ++++++++++++++++++++++
fs/xfs/xfs_super.h | 1 +
4 files changed, 64 insertions(+)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 00b53f479ece..27a737202637 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -666,6 +666,37 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
+static inline void
+xfs_compute_atomic_write_unit_max(
+ struct xfs_mount *mp)
+{
+ xfs_agblock_t agsize = mp->m_sb.sb_agblocks;
+ unsigned int max_extents_logitems;
+ unsigned int max_agsize;
+
+ if (!xfs_has_reflink(mp)) {
+ mp->m_atomic_write_unit_max = 1;
+ return;
+ }
+
+ /*
+ * Find limit according to logitems.
+ */
+ max_extents_logitems = xfs_atomic_write_logitems(mp);
+
+ /*
+ * Also limit the size of atomic writes to the greatest power-of-two
+ * factor of the agsize so that allocations for an atomic write will
+ * always be aligned compatibly with the alignment requirements of the
+ * storage.
+ * The greatest power-of-two is the value according to the lowest bit
+ * set.
+ */
+ max_agsize = 1 << (ffs(agsize) - 1);
+
+ mp->m_atomic_write_unit_max = min(max_extents_logitems, max_agsize);
+}
+
/* Compute maximum possible height for realtime btree types for this fs. */
static inline void
xfs_rtbtree_compute_maxlevels(
@@ -842,6 +873,11 @@ xfs_mountfs(
*/
xfs_trans_init(mp);
+ /*
+ * Pre-calculate atomic write unit max.
+ */
+ xfs_compute_atomic_write_unit_max(mp);
+
/*
* Allocate and initialize the per-ag data.
*/
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 799b84220ebb..4462bffbf0ff 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -230,6 +230,11 @@ typedef struct xfs_mount {
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
+ /*
+ * data device max atomic write.
+ */
+ xfs_extlen_t m_atomic_write_unit_max;
+
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
* Callers must hold m_sb_lock to access these two fields.
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b2dd0c0bf509..42b2b7540507 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
return -ENOMEM;
}
+unsigned int
+xfs_atomic_write_logitems(
+ struct xfs_mount *mp)
+{
+ unsigned int efi = xfs_efi_item_overhead(1);
+ unsigned int rui = xfs_rui_item_overhead(1);
+ unsigned int cui = xfs_cui_item_overhead(1);
+ unsigned int bui = xfs_bui_item_overhead(1);
+ unsigned int logres = M_RES(mp)->tr_write.tr_logres;
+
+ /*
+ * Maximum overhead to complete an atomic write ioend in software:
+ * remove data fork extent + remove cow fork extent +
+ * map extent into data fork
+ */
+ unsigned int atomic_logitems =
+ (bui + cui + rui + efi) + (cui + rui) + (bui + rui);
+
+ /* atomic write limits are always a power-of-2 */
+ return rounddown_pow_of_two(logres / (2 * atomic_logitems));
+}
+
STATIC void
xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index c0e85c1e42f2..e0f82be9093a 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -100,5 +100,6 @@ extern struct workqueue_struct *xfs_discard_wq;
#define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent);
+unsigned int xfs_atomic_write_logitems(struct xfs_mount *mp);
#endif /* __XFS_SUPER_H__ */
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread
* Re: [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-08 10:42 ` [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max() John Garry
@ 2025-04-08 21:28 ` Darrick J. Wong
2025-04-08 22:47 ` Dave Chinner
1 sibling, 0 replies; 29+ messages in thread
From: Darrick J. Wong @ 2025-04-08 21:28 UTC (permalink / raw)
To: John Garry
Cc: brauner, hch, viro, jack, cem, linux-fsdevel, dchinner, linux-xfs,
linux-kernel, ojaswin, ritesh.list, martin.petersen, linux-ext4,
linux-block, catherine.hoang
On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> Now that CoW-based atomic writes are supported, update the max size of an
> atomic write for the data device.
>
> The limit of a CoW-based atomic write will be the limit of the number of
> logitems which can fit into a single transaction.
>
> In addition, the max atomic write size needs to be aligned to the agsize.
> Limit the size of atomic writes to the greatest power-of-two factor of the
> agsize so that allocations for an atomic write will always be aligned
> compatibly with the alignment requirements of the storage.
>
> rtvol is not commonly used, so it is not very important to support large
> atomic writes there initially.
>
> Furthermore, adding large atomic writes for rtvol would be complicated due
> to alignment already offered by rtextsize and also the limitation of
> reflink support only be possible for rtextsize is a power-of-2.
>
> Function xfs_atomic_write_logitems() is added to find the limit the number
> of log items which can fit in a single transaction.
>
> Darrick Wong contributed the changes in xfs_atomic_write_logitems()
> originally, but may now be outdated by [0].
>
> [0] https://lore.kernel.org/linux-xfs/20250406172227.GC6307@frogsfrogsfrogs/
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> fs/xfs/xfs_mount.c | 36 ++++++++++++++++++++++++++++++++++++
> fs/xfs/xfs_mount.h | 5 +++++
> fs/xfs/xfs_super.c | 22 ++++++++++++++++++++++
> fs/xfs/xfs_super.h | 1 +
> 4 files changed, 64 insertions(+)
>
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index 00b53f479ece..27a737202637 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -666,6 +666,37 @@ xfs_agbtree_compute_maxlevels(
> mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
> }
>
> +static inline void
> +xfs_compute_atomic_write_unit_max(
> + struct xfs_mount *mp)
> +{
> + xfs_agblock_t agsize = mp->m_sb.sb_agblocks;
> + unsigned int max_extents_logitems;
> + unsigned int max_agsize;
> +
> + if (!xfs_has_reflink(mp)) {
> + mp->m_atomic_write_unit_max = 1;
> + return;
> + }
> +
> + /*
> + * Find limit according to logitems.
> + */
> + max_extents_logitems = xfs_atomic_write_logitems(mp);
> +
> + /*
> + * Also limit the size of atomic writes to the greatest power-of-two
> + * factor of the agsize so that allocations for an atomic write will
> + * always be aligned compatibly with the alignment requirements of the
> + * storage.
> + * The greatest power-of-two is the value according to the lowest bit
> + * set.
> + */
> + max_agsize = 1 << (ffs(agsize) - 1);
> +
> + mp->m_atomic_write_unit_max = min(max_extents_logitems, max_agsize);
> +}
> +
> /* Compute maximum possible height for realtime btree types for this fs. */
> static inline void
> xfs_rtbtree_compute_maxlevels(
> @@ -842,6 +873,11 @@ xfs_mountfs(
> */
> xfs_trans_init(mp);
>
> + /*
> + * Pre-calculate atomic write unit max.
> + */
> + xfs_compute_atomic_write_unit_max(mp);
> +
> /*
> * Allocate and initialize the per-ag data.
> */
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 799b84220ebb..4462bffbf0ff 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -230,6 +230,11 @@ typedef struct xfs_mount {
> bool m_update_sb; /* sb needs update in mount */
> unsigned int m_max_open_zones;
>
> + /*
> + * data device max atomic write.
> + */
> + xfs_extlen_t m_atomic_write_unit_max;
> +
> /*
> * Bitsets of per-fs metadata that have been checked and/or are sick.
> * Callers must hold m_sb_lock to access these two fields.
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b2dd0c0bf509..42b2b7540507 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
> return -ENOMEM;
> }
>
> +unsigned int
> +xfs_atomic_write_logitems(
> + struct xfs_mount *mp)
> +{
> + unsigned int efi = xfs_efi_item_overhead(1);
> + unsigned int rui = xfs_rui_item_overhead(1);
> + unsigned int cui = xfs_cui_item_overhead(1);
> + unsigned int bui = xfs_bui_item_overhead(1);
Intent items can be relogged during a transaction roll, so you need to
add the done item overhead too, e.g.
const unsigned int efi = xfs_efi_item_overhead(1) +
xfs_efd_item_overhead(1);
const unsigned int rui = xfs_rui_item_overhead(1) +
xfs_rud_item_overhead();
const unsigned int cui = xfs_cui_item_overhead(1) +
xfs_cud_item_overhead();
const unsigned int bui = xfs_bui_item_overhead(1) +
xfs_bud_item_overhead();
> + unsigned int logres = M_RES(mp)->tr_write.tr_logres;
> +
> + /*
> + * Maximum overhead to complete an atomic write ioend in software:
> + * remove data fork extent + remove cow fork extent +
> + * map extent into data fork
> + */
> + unsigned int atomic_logitems =
> + (bui + cui + rui + efi) + (cui + rui) + (bui + rui);
You still have to leave enough space to finish at least one step of the
intent types that can be attached to the untorn cow ioend. Assuming
that you have functions to compute the reservation needed to finish one
step of each of the four intent item types, the worst case reservation
to finish one item is:
/* Overhead to finish one step of each intent item type */
const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1);
const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1);
const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1);
const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1);
/* We only finish one item per transaction in a chain */
const unsigned int step_size = max(f4, max3(f1, f2, f3));
So the worst case limit on the number of loops through
xfs_reflink_remap_extent is:
return rounddown_pow_of_two((logres - step_size) /
atomic_logitems);
and that's the maximum software untorn write unit. On my system that
gets you 128 blocks, but YMMV. Those xfs_calc_finish_*_reservation
helpers look something like this:
/*
* Finishing an EFI can free the blocks and bmap blocks (t2):
* the agf for each of the ags: nr * sector size
* the agfl for each of the ags: nr * sector size
* the super block to reflect the freed blocks: sector size
* worst case split in allocation btrees per extent assuming nr extents:
* nr exts * 2 trees * (2 * max depth - 1) * block size
*/
inline unsigned int
xfs_calc_finish_efi_reservation(
struct xfs_mount *mp,
unsigned int nr)
{
return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr),
mp->m_sb.sb_blocksize);
}
/*
* Finishing an RUI is the same as an EFI. We can split the rmap btree twice
* on each end of the record, and that can cause the AGFL to be refilled or
* emptied out.
*/
inline unsigned int
xfs_calc_finish_rui_reservation(
struct xfs_mount *mp,
unsigned int nr)
{
if (!xfs_has_rmapbt(mp))
return 0;
return xfs_calc_finish_efi_reservation(mp, nr);
}
/*
* In finishing a BUI, we can modify:
* the inode being truncated: inode size
* dquots
* the inode's bmap btree: (max depth + 1) * block size
*/
inline unsigned int
xfs_calc_finish_bui_reservation(
struct xfs_mount *mp,
unsigned int nr)
{
return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES +
xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
mp->m_sb.sb_blocksize);
}
/*
* Finishing a data device refcount updates (t1):
* the agfs of the ags containing the blocks: nr_ops * sector size
* the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size
*/
inline unsigned int
xfs_calc_finish_cui_reservation(
struct xfs_mount *mp,
unsigned int nr_ops)
{
if (!xfs_has_reflink(mp))
return 0;
return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) +
xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops),
mp->m_sb.sb_blocksize);
}
--D
> +
> + /* atomic write limits are always a power-of-2 */
> + return rounddown_pow_of_two(logres / (2 * atomic_logitems));
> +}
> +
> STATIC void
> xfs_destroy_mount_workqueues(
> struct xfs_mount *mp)
> diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
> index c0e85c1e42f2..e0f82be9093a 100644
> --- a/fs/xfs/xfs_super.h
> +++ b/fs/xfs/xfs_super.h
> @@ -100,5 +100,6 @@ extern struct workqueue_struct *xfs_discard_wq;
> #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info))
>
> struct dentry *xfs_debugfs_mkdir(const char *name, struct dentry *parent);
> +unsigned int xfs_atomic_write_logitems(struct xfs_mount *mp);
>
> #endif /* __XFS_SUPER_H__ */
> --
> 2.31.1
>
>
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-08 10:42 ` [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max() John Garry
2025-04-08 21:28 ` Darrick J. Wong
@ 2025-04-08 22:47 ` Dave Chinner
2025-04-09 0:41 ` Darrick J. Wong
1 sibling, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2025-04-08 22:47 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, hch, viro, jack, cem, linux-fsdevel, dchinner,
linux-xfs, linux-kernel, ojaswin, ritesh.list, martin.petersen,
linux-ext4, linux-block, catherine.hoang
On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> Now that CoW-based atomic writes are supported, update the max size of an
> atomic write for the data device.
>
> The limit of a CoW-based atomic write will be the limit of the number of
> logitems which can fit into a single transaction.
I still think this is the wrong way to define the maximum
size of a COW-based atomic write because it is going to change from
filesystem to filesystem and that variability in supported maximum
length will be exposed to userspace...
i.e. Maximum supported atomic write size really should be defined as
a well documented fixed size (e.g. 16MB). Then the transaction
reservations sizes needed to perform that conversion can be
calculated directly from that maximum size and optimised directly
for the conversion operation that atomic writes need to perform.
.....
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index b2dd0c0bf509..42b2b7540507 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
> return -ENOMEM;
> }
>
> +unsigned int
> +xfs_atomic_write_logitems(
> + struct xfs_mount *mp)
> +{
> + unsigned int efi = xfs_efi_item_overhead(1);
> + unsigned int rui = xfs_rui_item_overhead(1);
> + unsigned int cui = xfs_cui_item_overhead(1);
> + unsigned int bui = xfs_bui_item_overhead(1);
> + unsigned int logres = M_RES(mp)->tr_write.tr_logres;
> +
> + /*
> + * Maximum overhead to complete an atomic write ioend in software:
> + * remove data fork extent + remove cow fork extent +
> + * map extent into data fork
> + */
> + unsigned int atomic_logitems =
> + (bui + cui + rui + efi) + (cui + rui) + (bui + rui);
This seems wrong. Unmap from the data fork only logs a (bui + cui)
pair, we don't log a RUI or an EFI until the transaction that
processes the BUI or CUI actually frees an extent from the BMBT
or removes a block from the refcount btree.
We also need to be able to relog all the intents and everything that
was modified, so we effectively have at least one
xfs_allocfree_block_count() reservation needed here as well. Even
finishing an invalidation BUI can result in BMBT block allocation
occurring if the operation splits an existing extent record and the
insert of the new record causes a BMBT block split....
> +
> + /* atomic write limits are always a power-of-2 */
> + return rounddown_pow_of_two(logres / (2 * atomic_logitems));
What is the magic 2 in that division?
> +}
Also this function does not belong in xfs_super.c - that file is for
interfacing with the VFS layer. Calculating log reservation
constants at mount time is done in xfs_trans_resv.c - I suspect most
of the code in this patch should probably be moved there and run
from xfs_trans_resv_calc()...
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-08 22:47 ` Dave Chinner
@ 2025-04-09 0:41 ` Darrick J. Wong
2025-04-09 5:30 ` Dave Chinner
0 siblings, 1 reply; 29+ messages in thread
From: Darrick J. Wong @ 2025-04-09 0:41 UTC (permalink / raw)
To: Dave Chinner
Cc: John Garry, brauner, hch, viro, jack, cem, linux-fsdevel,
dchinner, linux-xfs, linux-kernel, ojaswin, ritesh.list,
martin.petersen, linux-ext4, linux-block, catherine.hoang
On Wed, Apr 09, 2025 at 08:47:09AM +1000, Dave Chinner wrote:
> On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> > Now that CoW-based atomic writes are supported, update the max size of an
> > atomic write for the data device.
> >
> > The limit of a CoW-based atomic write will be the limit of the number of
> > logitems which can fit into a single transaction.
>
> I still think this is the wrong way to define the maximum
> size of a COW-based atomic write because it is going to change from
> filesystem to filesystem and that variability in supported maximum
> length will be exposed to userspace...
>
> i.e. Maximum supported atomic write size really should be defined as
> a well documented fixed size (e.g. 16MB). Then the transaction
> reservations sizes needed to perform that conversion can be
> calculated directly from that maximum size and optimised directly
> for the conversion operation that atomic writes need to perform.
I'll get to this below...
> .....
>
> > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > index b2dd0c0bf509..42b2b7540507 100644
> > --- a/fs/xfs/xfs_super.c
> > +++ b/fs/xfs/xfs_super.c
> > @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
> > return -ENOMEM;
> > }
> >
> > +unsigned int
> > +xfs_atomic_write_logitems(
> > + struct xfs_mount *mp)
> > +{
> > + unsigned int efi = xfs_efi_item_overhead(1);
> > + unsigned int rui = xfs_rui_item_overhead(1);
> > + unsigned int cui = xfs_cui_item_overhead(1);
> > + unsigned int bui = xfs_bui_item_overhead(1);
> > + unsigned int logres = M_RES(mp)->tr_write.tr_logres;
> > +
> > + /*
> > + * Maximum overhead to complete an atomic write ioend in software:
> > + * remove data fork extent + remove cow fork extent +
> > + * map extent into data fork
> > + */
> > + unsigned int atomic_logitems =
> > + (bui + cui + rui + efi) + (cui + rui) + (bui + rui);
>
> This seems wrong. Unmap from the data fork only logs a (bui + cui)
> pair, we don't log a RUI or an EFI until the transaction that
> processes the BUI or CUI actually frees an extent from the the BMBT
> or removes a block from the refcount btree.
This is tricky -- the first transaction in the chain creates a BUI and a
CUI and that's all it needs.
Then we roll to finish the BUI. The second transaction needs space for
the BUD, an RUI, and enough space to relog the CUI (== CUI + CUD).
Then we roll again to finish the RUI. This third transaction needs
space for the RUD and space to relog the CUI.
Roll again, fourth transaction needs space for the CUD and possibly a
new EFI.
Roll again, fifth transaction needs space for an EFD.
const unsigned int efi = xfs_efi_log_space(1);
const unsigned int efd = xfs_efd_log_space(1);
const unsigned int rui = xfs_rui_log_space(1);
const unsigned int rud = xfs_rud_log_space();
const unsigned int cui = xfs_cui_log_space(1);
const unsigned int cud = xfs_cud_log_space();
const unsigned int bui = xfs_bui_log_space(1);
const unsigned int bud = xfs_bud_log_space();
/*
* Maximum overhead to complete an atomic write ioend in software:
* remove data fork extent + remove cow fork extent + map extent into
* data fork.
*
* tx0: Creates a BUI and a CUI and that's all it needs.
*
* tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and
* enough space to relog the CUI (== CUI + CUD).
*
* tx2: Roll again to finish the RUI. Need space for the RUD and space
* to relog the CUI.
*
* tx3: Roll again, need space for the CUD and possibly a new EFI.
*
* tx4: Roll again, need space for an EFD.
*/
const unsigned int tx0 = bui + cui;
const unsigned int tx1 = bud + rui + cui + cud;
const unsigned int tx2 = rud + cui + cud;
const unsigned int tx3 = cud + efi;
const unsigned int tx4 = efd;
const unsigned int per_intent = max3(max3(tx0, tx1, tx2), tx3, tx4);
> We also need to be able to relog all the intents and everything that
> was modified, so we effectively have at least one
> xfs_allocfree_block_count() reservation needed here as well. Even
> finishing an invalidation BUI can result in BMBT block allocation
> occurring if the operation splits an existing extent record and the
> insert of the new record causes a BMBT block split....
Agreed.
> > +
> > + /* atomic write limits are always a power-of-2 */
> > + return rounddown_pow_of_two(logres / (2 * atomic_logitems));
>
> What is the magic 2 in that division?
That's handwaving the lack of a computation involving
xfs_allocfree_block_count. A better version would be to figure out the
log space needed:
/* Overhead to finish one step of each intent item type */
const unsigned int f1 = libxfs_calc_finish_efi_reservation(mp, 1);
const unsigned int f2 = libxfs_calc_finish_rui_reservation(mp, 1);
const unsigned int f3 = libxfs_calc_finish_cui_reservation(mp, 1);
const unsigned int f4 = libxfs_calc_finish_bui_reservation(mp, 1);
/* We only finish one item per transaction in a chain */
const unsigned int step_size = max(f4, max3(f1, f2, f3));
and then you have what you need to figure out the logres needed to
support a given awu_max, or vice versa:
if (desired_max) {
dbprintf(
"desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
desired_max, step_size, per_intent,
(desired_max * per_intent) + step_size);
} else if (logres) {
dbprintf(
"logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
logres, step_size, per_intent,
logres >= step_size ? (logres - step_size) / per_intent : 0);
}
I hacked this into xfs_db so that I could compute a variety of
scenarios. Let's pretend I have a 10T filesystem with 4k fsblocks and
the default configuration.
# xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 244216"
type 0 logres 244216 logcount 5 flags 0x4
type 1 logres 428928 logcount 5 flags 0x4
<snip>
minlogsize logres 648576 logcount 8
To emulate a 16MB untorn write, you'd need a logres of:
desired_max: 4096
step_size: 107520
per_intent: 208
logres: 959488
959488 > 648576, which would alter the minlogsize calculation. I know
we banned tiny filesystems a few years ago, but there's still a risk in
increasing it.
For the tr_write logres that John picked, the awu_max we can emulate is:
logres: 244216
step_size: 107520
per_intent: 208
max_awu: 657
That's enough to emulate a 2MB untorn write.
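(A quick sanity check of that, using the formula above and the fact
that the final limit is computed in fsblocks and rounded down to a
power of two:

	max_awu = (logres - step_size) / per_intent
	        = (244216 - 107520) / 208
	        = 657 blocks

	rounddown_pow_of_two(657) = 512 blocks = 2MB with 4k fsblocks.)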
Now let's try again with a realtime filesystem, because the log
reservations are absurdly huge there:
# xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 417400"
type 0 logres 417400 logcount 5 flags 0x4
type 1 logres 772736 logcount 5 flags 0x4
<snip>
minlogsize logres 772736 logcount 5
For 16MB, you'd need a logres of:
desired_max: 4096
step_size: 107520
per_intent: 208
logres: 959488
959488 > 772736, which would alter the minlogsize calculation.
For the tr_write logres that John picked, the awu_max we can emulate is:
logres: 417400
step_size: 107520
per_intent: 208
max_awu: 1489
This is more than enough to handle a 4MB atomic write without affecting
any other filesystem geometry. Now I'll try a 400MB filesystem:
# xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 142840"
type 0 logres 142840 logcount 5 flags 0x4
type 1 logres 226176 logcount 5 flags 0x4
<snip>
minlogsize logres 445824 logcount 8
desired_max: 4096
step_size: 56832
per_intent: 208
logres: 908800
Definitely still above minlogsize.
logres: 142840
step_size: 56832
per_intent: 208
max_awu: 413
We can still simulate a 1MB untorn write even on a tiny filesystem.
On a 1k fsblock 400MB filesystem:
# xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 63352"
type 0 logres 63352 logcount 5 flags 0x4
type 1 logres 103296 logcount 5 flags 0x4
<snip>
minlogsize logres 153984 logcount 8
desired_max: 4096
step_size: 26112
per_intent: 208
logres: 878080
Again we see that we'd have to increase the min logsize to support a
16MB untorn write.
logres: 63352
step_size: 26112
per_intent: 208
max_awu: 179
But we can still emulate a 128K untorn write in software.
This is why I don't agree with adding a static 16MB limit -- we clearly
don't need it to emulate current hardware, which can commit up to 64k
atomically. Future hardware can increase that by 64x and we'll still be
ok with using the existing tr_write transaction type.
By contrast, adding a 16MB limit would result in a much larger minimum
log size. If we add that to struct xfs_trans_resv for all filesystems
then we run the risk of some ancient filesystem with a 12M log
suddenly failing to mount on a new kernel.
I don't see the point.
--D
> > +}
>
> Also this function does not belong in xfs_super.c - that file is for
> interfacing with the VFS layer. Calculating log reservation
> constants at mount time is done in xfs_trans_resv.c - I suspect most
> of the code in this patch should probably be moved there and run
> from xfs_trans_resv_calc()...
>
> -Dave.
> --
> Dave Chinner
> david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-09 0:41 ` Darrick J. Wong
@ 2025-04-09 5:30 ` Dave Chinner
2025-04-09 8:15 ` John Garry
2025-04-09 23:46 ` Darrick J. Wong
0 siblings, 2 replies; 29+ messages in thread
From: Dave Chinner @ 2025-04-09 5:30 UTC (permalink / raw)
To: Darrick J. Wong
Cc: John Garry, brauner, hch, viro, jack, cem, linux-fsdevel,
dchinner, linux-xfs, linux-kernel, ojaswin, ritesh.list,
martin.petersen, linux-ext4, linux-block, catherine.hoang
On Tue, Apr 08, 2025 at 05:41:56PM -0700, Darrick J. Wong wrote:
> On Wed, Apr 09, 2025 at 08:47:09AM +1000, Dave Chinner wrote:
> > On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> > > Now that CoW-based atomic writes are supported, update the max size of an
> > > atomic write for the data device.
> > >
> > > The limit of a CoW-based atomic write will be the limit of the number of
> > > logitems which can fit into a single transaction.
> >
> > I still think this is the wrong way to define the maximum
> > size of a COW-based atomic write because it is going to change from
> > filesystem to filesystem and that variability in supported maximum
> > length will be exposed to userspace...
> >
> > i.e. Maximum supported atomic write size really should be defined as
> > a well documented fixed size (e.g. 16MB). Then the transaction
> > reservations sizes needed to perform that conversion can be
> > calculated directly from that maximum size and optimised directly
> > for the conversion operation that atomic writes need to perform.
>
> I'll get to this below...
I'll paste it here so it's all in one context.
> This is why I don't agree with adding a static 16MB limit -- we clearly
> don't need it to emulate current hardware, which can commit up to 64k
> atomically. Future hardware can increase that by 64x and we'll still be
> ok with using the existing tr_write transaction type.
>
> By contrast, adding a 16MB limit would result in a much larger minimum
> log size. If we add that to struct xfs_trans_resv for all filesystems
> then we run the risk of some ancient filesystem with a 12M log
> suddenly failing to mount on a new kernel.
>
> I don't see the point.
You've got stuck on the example size of 16MB I gave, not
the actual reason I gave that example.
That is: we should not be exposing some unpredictable,
filesystem-geometry-dependent maximum atomic size to a userspace
API. We need
to define a maximum size that we support, that we can clearly
document and guarantee will work on all filesystem geometries.
What size that is needs to be discussed - all you've done is
demonstrate that 16MB would require a larger minimum log size for a
small set of historic/legacy filesystem configs.
I'm actually quite fine with adding new reservations that change
minimum required log sizes - this isn't a show-stopper in any way.
Atomic writes are optional functionality, so if the log size is too
small for the atomic write transaction requirements, then we don't
enable the atomic write functionality on that filesystem. Hence the
minimum required log size for the filesystem -does not change- and
the filesystem mounts as normal..
i.e. the user hasn't lost anything on these tiny log filesystems
when the kernel is upgraded to support software atomic writes.
All that happens is that the legacy filesystem will never support
atomic writes....
Such a mount time check allows us to design the atomic write
functionality around a fixed upper size bound that we can guarantee
will work correctly. i.e. the size of the transaction reservation
needed for it is irrelevant because we guarantee to only run that
transaction on filesystems with logs large enough to support it.
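As a sketch of what such a mount-time gate could look like
(hypothetical helper and constant names, not code from this series):

	/* Hypothetical fixed upper bound; the actual value is TBD. */
	#define XFS_ATOMIC_WRITE_MAX_BYTES	(16U << 20)

	static void
	xfs_atomic_write_init(
		struct xfs_mount	*mp)
	{
		/* illustrative helper, not a real function */
		unsigned int		logres =
			xfs_calc_atomic_ioend_reservation(mp,
					XFS_ATOMIC_WRITE_MAX_BYTES);

		/*
		 * Log too small for the fixed bound: mount as normal,
		 * just never advertise atomic writes on this fs.
		 */
		if (logres > mp->m_log->l_logsize) {
			mp->m_atomic_write_unit_max = 0;
			return;
		}

		mp->m_atomic_write_unit_max =
			XFS_B_TO_FSBT(mp, XFS_ATOMIC_WRITE_MAX_BYTES);
	}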
Having a consistent maximum atomic write size makes it easier for
userspace to create logic and algorithms around, and it makes it
much easier for us to do boundary condition testing as we don't have
to reverse engineer the expected boundaries from filesystem geometry
in test code. Fixed sizes make API verification and testing a whole
lot simpler.
And, finally, if everything is sized from a single API constant, it
makes it easy to modify the supported size in future. The 64MB
minimum log size gives us lots of headroom to increase supported
atomic write sizes, so if userspace finds that it really needs
larger sizes than what we initially support, it's trivial to change
both the kernel and the test code to support a larger size...
> > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > index b2dd0c0bf509..42b2b7540507 100644
> > > --- a/fs/xfs/xfs_super.c
> > > +++ b/fs/xfs/xfs_super.c
> > > @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
> > > return -ENOMEM;
> > > }
> > >
> > > +unsigned int
> > > +xfs_atomic_write_logitems(
> > > + struct xfs_mount *mp)
> > > +{
> > > + unsigned int efi = xfs_efi_item_overhead(1);
> > > + unsigned int rui = xfs_rui_item_overhead(1);
> > > + unsigned int cui = xfs_cui_item_overhead(1);
> > > + unsigned int bui = xfs_bui_item_overhead(1);
> > > + unsigned int logres = M_RES(mp)->tr_write.tr_logres;
> > > +
> > > + /*
> > > + * Maximum overhead to complete an atomic write ioend in software:
> > > + * remove data fork extent + remove cow fork extent +
> > > + * map extent into data fork
> > > + */
> > > + unsigned int atomic_logitems =
> > > + (bui + cui + rui + efi) + (cui + rui) + (bui + rui);
> >
> > This seems wrong. Unmap from the data fork only logs a (bui + cui)
> > pair, we don't log a RUI or an EFI until the transaction that
> > > processes the BUI or CUI actually frees an extent from the BMBT
> > or removes a block from the refcount btree.
>
> This is tricky -- the first transaction in the chain creates a BUI and a
> CUI and that's all it needs.
>
> Then we roll to finish the BUI. The second transaction needs space for
> the BUD, an RUI, and enough space to relog the CUI (== CUI + CUD).
>
> Then we roll again to finish the RUI. This third transaction needs
> space for the RUD and space to relog the CUI.
>
> Roll again, fourth transaction needs space for the CUD and possibly a
> new EFI.
>
> Roll again, fifth transaction needs space for an EFD.
Yes, that is exactly the point I was making.
> > > +
> > > + /* atomic write limits are always a power-of-2 */
> > > + return rounddown_pow_of_two(logres / (2 * atomic_logitems));
> >
> > What is the magic 2 in that division?
>
> That's handwaving the lack of a computation involving
> xfs_allocfree_block_count. A better version would be to figure out the
> log space needed:
>
> /* Overhead to finish one step of each intent item type */
> const unsigned int f1 = libxfs_calc_finish_efi_reservation(mp, 1);
> const unsigned int f2 = libxfs_calc_finish_rui_reservation(mp, 1);
Yup, those should be a single call to xfs_calc_buf_res(xfs_allocfree_block_count())
> const unsigned int f3 = libxfs_calc_finish_cui_reservation(mp, 1);
Similarly, xfs_calc_refcountbt_reservation().
> const unsigned int f4 = libxfs_calc_finish_bui_reservation(mp, 1);
xfs_calc_write_reservation() but for a single extent instead of 2.
Also, I think the bui finish needs to take into account dquots, too.
> /* We only finish one item per transaction in a chain */
> const unsigned int step_size = max(f4, max3(f1, f2, f3));
And that is xfs_calc_itruncate_reservation(), except it reserves
space for 1 extent to be processed instead of 4.
FWIW, I'd suggest that these helpers make for a better way of
calculating high level reservations such as
xfs_calc_write_reservation() and xfs_calc_itruncate_reservation()
because those functions currently don't take into account how
intents are relogged during those operations.
> and then you have what you need to figure out the logres needed to
> support a given awu_max, or vice versa:
>
> if (desired_max) {
> dbprintf(
> "desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
> desired_max, step_size, per_intent,
> (desired_max * per_intent) + step_size);
> } else if (logres) {
> dbprintf(
> "logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
> logres, step_size, per_intent,
> logres >= step_size ? (logres - step_size) / per_intent : 0);
> }
>
> I hacked this into xfs_db so that I could compute a variety of
> scenarios. Let's pretend I have a 10T filesystem with 4k fsblocks and
> the default configuration.
>
> # xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 244216"
> type 0 logres 244216 logcount 5 flags 0x4
> type 1 logres 428928 logcount 5 flags 0x4
> <snip>
> minlogsize logres 648576 logcount 8
>
> To emulate a 16MB untorn write, you'd need a logres of:
>
> desired_max: 4096
> step_size: 107520
> per_intent: 208
> logres: 959488
>
> 959488 > 648576, which would alter the minlogsize calculation. I know
> we banned tiny filesystems a few years ago, but there's still a risk in
> increasing it.
Yeah, it's a bit under a megabyte of reservation space. That's no
big deal, as log reservations get much larger than this as block
size and log stripe units are increased.
<snip the rest, they all show roughly the same thing>
Ok, these numbers pretty much prove my point - that a fixed max
atomic write size somewhere around 16MB isn't going to be a problem
for any filesystem that uses the historic small fs default log size
(10MB) or larger.
Let's avoid the internal XFS minlogsize problems by disabling the
atomic write functionality on the increasingly rare legacy
filesystems where the log is too small. That allows us to design and
implement sane userspace API bounds and guarantees for atomic writes
and not expose internal filesystem constraints and inconsistencies
to applications...
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-09 5:30 ` Dave Chinner
@ 2025-04-09 8:15 ` John Garry
2025-04-09 22:49 ` Dave Chinner
2025-04-09 23:46 ` Darrick J. Wong
1 sibling, 1 reply; 29+ messages in thread
From: John Garry @ 2025-04-09 8:15 UTC (permalink / raw)
To: Dave Chinner, Darrick J. Wong
Cc: brauner, hch, viro, jack, cem, linux-fsdevel, dchinner, linux-xfs,
linux-kernel, ojaswin, ritesh.list, martin.petersen, linux-ext4,
linux-block, catherine.hoang
On 09/04/2025 06:30, Dave Chinner wrote:
>> This is why I don't agree with adding a static 16MB limit -- we clearly
>> don't need it to emulate current hardware, which can commit up to 64k
>> atomically. Future hardware can increase that by 64x and we'll still be
>> ok with using the existing tr_write transaction type.
>>
>> By contrast, adding a 16MB limit would result in a much larger minimum
>> log size. If we add that to struct xfs_trans_resv for all filesystems
>> then we run the risk of some ancient filesystem with a 12M log
>> suddenly failing to mount on a new kernel.
>>
>> I don't see the point.
> You've got stuck on the example size of 16MB I gave, not
> the actual reason I gave that example.
You did provide a relatively large value of 16MB. When I say relative, I
mean relative to what can be achieved with HW offload today.
The target user we see for this feature is DBs, and they want to do
writes in the 16/32/64KB size range. Indeed, these are the sort of sizes
we see supported in terms of disk atomic write support today.
Furthermore, they (DBs) want fast and predictable performance which HW
offload provides. They do not want to use a slow software-based
solution. Such a software-based solution will always be slower, as we
need to deal with block alloc/de-alloc and extent remapping for every write.
So are there people who really want very large atomic write support and
will tolerate slow performance, i.e. slower than what can be achieved
with double-write buffer or some other application logging?
Thanks,
John
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-09 8:15 ` John Garry
@ 2025-04-09 22:49 ` Dave Chinner
2025-04-10 8:58 ` John Garry
0 siblings, 1 reply; 29+ messages in thread
From: Dave Chinner @ 2025-04-09 22:49 UTC (permalink / raw)
To: John Garry
Cc: Darrick J. Wong, brauner, hch, viro, jack, cem, linux-fsdevel,
dchinner, linux-xfs, linux-kernel, ojaswin, ritesh.list,
martin.petersen, linux-ext4, linux-block, catherine.hoang
On Wed, Apr 09, 2025 at 09:15:23AM +0100, John Garry wrote:
> On 09/04/2025 06:30, Dave Chinner wrote:
> > > This is why I don't agree with adding a static 16MB limit -- we clearly
> > > don't need it to emulate current hardware, which can commit up to 64k
> > > atomically. Future hardware can increase that by 64x and we'll still be
> > > ok with using the existing tr_write transaction type.
> > >
> > > By contrast, adding a 16MB limit would result in a much larger minimum
> > > log size. If we add that to struct xfs_trans_resv for all filesystems
> > > then we run the risk of some ancient filesystem with a 12M log
> > > suddenly failing to mount on a new kernel.
> > >
> > > I don't see the point.
> > You've got stuck on the example size of 16MB I gave, not
> > the actual reason I gave that example.
>
> You did provide a relatively large value of 16MB. When I say relative, I
> mean relative to what can be achieved with HW offload today.
>
> The target user we see for this feature is DBs, and they want to do writes
> in the 16/32/64KB size range. Indeed, these are the sort of sizes we see
> supported in terms of disk atomic write support today.
The target user I see for RWF_ATOMIC writes is applications
overwriting files safely (e.g. config files, documents, etc).
This requires an atomic write operation that is large enough to
overwrite the file entirely in one go.
i.e. we need to think about how RWF_ATOMIC is applicable to the
entire userspace ecosystem, not just a narrow database-specific
niche. Databases really want atomic writes to avoid the need for
WAL, whereas application developers keep asking us for safe file
overwrites without fsync() for arbitrarily sized files and IO.
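For reference, the userspace side of that model is tiny -- a minimal
sketch, assuming a libc new enough to define RWF_ATOMIC and a file
whose advertised atomic write unit max covers the whole file:

	#define _GNU_SOURCE
	#include <sys/uio.h>

	/*
	 * Overwrite an entire (power-of-2 sized) file in one untorn
	 * write: either all of it reaches stable storage or none of
	 * it does. The fd would typically be opened O_DIRECT and buf
	 * allocated with the required alignment.
	 */
	static ssize_t
	overwrite_file_atomically(int fd, const void *buf, size_t len)
	{
		struct iovec iov = {
			.iov_base = (void *)buf,
			.iov_len = len,
		};

		return pwritev2(fd, &iov, 1, 0, RWF_ATOMIC);
	}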
> Furthermore, they (DBs) want fast and predictable performance which HW
> offload provides. They do not want to use a slow software-based solution.
> Such a software-based solution will always be slower, as we need to deal
> with block alloc/de-alloc and extent remapping for every write.
"slow" is relative to the use case for atomic writes.
> So are there people who really want very large atomic write support and will
> tolerate slow performance, i.e. slower than what can be achieved with
> double-write buffer or some other application logging?
Large atomic write support solves the O_PONIES problem, which is
fundamentally a performance problem w.r.t. ensuring data integrity.
I'll quote myself from when you asked this exact same question back about
4 months ago:
| "At this point we actually provide app developers with what they've
| been repeatedly asking kernel filesystem engineers to provide them
| for the past 20 years: a way of overwriting arbitrary file data
| safely without needing an expensive fdatasync operation on every
| file that gets modified.
|
| Put simply: atomic writes have a huge potential to fundamentally
| change the way applications interact with Linux filesystems and to
| make it *much* simpler for applications to safely overwrite user
| data. Hence there is an imperative here to make the foundational
| support for this technology solid and robust because atomic writes
| are going to be with us for the next few decades..."
https://lwn.net/Articles/1001770/
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-09 22:49 ` Dave Chinner
@ 2025-04-10 8:58 ` John Garry
0 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-10 8:58 UTC (permalink / raw)
To: Dave Chinner
Cc: Darrick J. Wong, brauner, hch, viro, jack, cem, linux-fsdevel,
dchinner, linux-xfs, linux-kernel, ojaswin, ritesh.list,
martin.petersen, linux-ext4, linux-block, catherine.hoang
On 09/04/2025 23:49, Dave Chinner wrote:
>> You did provide a relatively large value of 16MB. When I say relative, I
>> mean relative to what can be achieved with HW offload today.
>>
>> The target user we see for this feature is DBs, and they want to do writes
>> in the 16/32/64KB size range. Indeed, these are the sort of sizes we see
>> supported in terms of disk atomic write support today.
> The target user I see for RWF_ATOMIC writes is applications
> overwriting files safely (e.g. config files, documents, etc).
>
> This requires an atomic write operation that is large enough to
> overwrite the file entirely in one go.
>
> i.e. we need to think about how RWF_ATOMIC is applicable to the
> entire userspace ecosystem, not just a narrow database-specific
> niche. Databases really want atomic writes to avoid the need for
> WAL, whereas application developers keep asking us for safe file
> overwrites without fsync() for arbitrarily sized files and IO.
If they want to use this API, then arbitrary-sized files will need to
be power-of-2 sized (if the user is happy to atomically update all of
the file). A 5000-byte config file, for example, could not be
overwritten whole in a single atomic write, since atomic writes must
be a power-of-2 size and naturally aligned.
I would have thought that the work Christoph did with O_ATOMIC a few
years ago for atomic file updates would be better suited to the
scenario you mention.
Anyway, back to the main topic...
What is the method you propose that the admin can use to fix this upper
atomic write limit? Is this an mkfs and/or mount option?
I appreciate that I am asking the same question as Darrick did in his
follow-up mail.
John
^ permalink raw reply [flat|nested] 29+ messages in thread
* Re: [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max()
2025-04-09 5:30 ` Dave Chinner
2025-04-09 8:15 ` John Garry
@ 2025-04-09 23:46 ` Darrick J. Wong
1 sibling, 0 replies; 29+ messages in thread
From: Darrick J. Wong @ 2025-04-09 23:46 UTC (permalink / raw)
To: Dave Chinner
Cc: John Garry, brauner, hch, viro, jack, cem, linux-fsdevel,
dchinner, linux-xfs, linux-kernel, ojaswin, ritesh.list,
martin.petersen, linux-ext4, linux-block, catherine.hoang
On Wed, Apr 09, 2025 at 03:30:28PM +1000, Dave Chinner wrote:
> On Tue, Apr 08, 2025 at 05:41:56PM -0700, Darrick J. Wong wrote:
> > On Wed, Apr 09, 2025 at 08:47:09AM +1000, Dave Chinner wrote:
> > > On Tue, Apr 08, 2025 at 10:42:08AM +0000, John Garry wrote:
> > > > Now that CoW-based atomic writes are supported, update the max size of an
> > > > atomic write for the data device.
> > > >
> > > > The limit of a CoW-based atomic write will be the limit of the number of
> > > > logitems which can fit into a single transaction.
> > >
> > > I still think this is the wrong way to define the maximum
> > > size of a COW-based atomic write because it is going to change from
> > > filesystem to filesystem and that variability in supported maximum
> > > length will be exposed to userspace...
> > >
> > > i.e. Maximum supported atomic write size really should be defined as
> > > a well documented fixed size (e.g. 16MB). Then the transaction
> > > reservations sizes needed to perform that conversion can be
> > > calculated directly from that maximum size and optimised directly
> > > for the conversion operation that atomic writes need to perform.
> >
> > I'll get to this below...
>
> I'll paste it here so it's all in one context.
>
> > This is why I don't agree with adding a static 16MB limit -- we clearly
> > don't need it to emulate current hardware, which can commit up to 64k
> > atomically. Future hardware can increase that by 64x and we'll still be
> > ok with using the existing tr_write transaction type.
> >
> > By contrast, adding a 16MB limit would result in a much larger minimum
> > log size. If we add that to struct xfs_trans_resv for all filesystems
> > then we run the risk of some ancient filesystem with a 12M log
> > suddenly failing to mount on a new kernel.
> >
> > I don't see the point.
>
> You've got stuck on the example size of 16MB I gave, not
> the actual reason I gave that example.
Well you kept saying 16MB, so that's why I ran the numbers on it.
> That is: we should not be exposing some unpredictable,
> filesystem-geometry-dependent maximum atomic size to a userspace
> API. We need
> to define a maximum size that we support, that we can clearly
> document and guarantee will work on all filesystem geometries.
>
> What size that is needs to be discussed - all you've done is
> demonstrate that 16MB would require a larger minimum log size for a
> small set of historic/legacy filesystem configs.
No, I've done more than that. I've shown that it's possible to
determine the maximum permissible size of software untorn write
completions given a log reservation, which means that we can:
1) Determine if we can backstop hardware untorn writes with software
untorn writes.
2) Determine the largest untorn write we can complete in software for
an existing filesystem with no other changes required.
3) Determine if we can land an untorn write of a specific size.
We care about (1). (2) is basically done (albeit with calculation
errors) by this patchset. (3) is not something we're interested in.
If you believe that having a fixed upper size bound that isn't tied to
existing log reservations is worth pursuing, then please send patches
building off this series.
I think it wouldn't be much work to have a larger fixed limit that
doesn't depend on any of the existing static transaction reservations.
Use the computation I provided to decide if that limit won't push the
filesystem too close to the maximum transaction size, then dynamically
compute the reservation at ioend time for the specific IO being
completed.
But again, our users don't want that, they're fine with 64k. They
really don't want the software fallback at all. If you have users who
prefer a well known static limit, then please work with them.
> I'm actually quite fine with adding new reservations that change
> minimum required log sizes - this isn't a show-stopper in any way.
I'm not convinced. The minimum log size footgun still exists; it's just
that the new(ish) 64M floor makes it much less likely to happen. The
image factories have been upgraded with newer xfsprogs so the number of
complaints will (at long last) slowly go down, but 12M logs are still
causing problems.
> Atomic writes are optional functionality, so if the log size is too
> small for the atomic write transaction requirements, then we don't
> enable the atomic write functionality on that filesystem. Hence the
> minimum required log size for the filesystem -does not change- and
> the filesystem mounts as normal..
>
> i.e. the user hasn't lost anything on these tiny log filesystems
> when the kernel is upgraded to support software atomic writes.
> All that happens is that the legacy filesystem will never support
> atomic writes....
>
> Such a mount time check allows us to design the atomic write
> functionality around a fixed upper size bound that we can guarantee
> will work correctly. i.e. the size of the transaction reservation
> needed for it is irrelevant because we guarantee to only run that
> transaction on filesystems with logs large enough to support it.
Yes, that's what I've been trying to say all along.
> Having a consistent maximum atomic write size makes it easier for
> userspace to create logic and algorithms around, and it makes it
> much easier for us to do boundary condition testing as we don't have
> to reverse engineer the expected boundaries from filesystem geometry
> in test code. Fixed sizes make API verification and testing a whole
> lot simpler.
John added a statx field so that the kernel can advertise the hardware
and software untorn write capabilities. Are you saying that's not good
enough even for (1) and (2) from above?
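(For context, the query side of that interface -- a minimal sketch;
stx_atomic_write_unit_max_opt is the field added earlier in this
series, the other two predate it:)

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/stat.h>

	static void
	print_atomic_write_limits(const char *path)
	{
		struct statx stx;

		if (syscall(SYS_statx, AT_FDCWD, path, 0,
			    STATX_WRITE_ATOMIC, &stx) < 0)
			return;

		if (stx.stx_attributes & STATX_ATTR_WRITE_ATOMIC)
			printf("unit min %u max %u max_opt %u\n",
			       stx.stx_atomic_write_unit_min,
			       stx.stx_atomic_write_unit_max,
			       stx.stx_atomic_write_unit_max_opt);
	}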
If it turns out that users want a higher limit, they can come talk to us
about adding a mkfs option or whatever to declare that they want more,
and then we can enforce that.
> And, finally, if everything is sized from a single API constant, it
> makes it easy to modify the supported size in future. The 64MB
> minimum log size gives us lots of headroom to increase supported
> atomic write sizes, so if userspace finds that it really needs
> larger sizes than what we initially support, it's trivial to change
> both the kernel and the test code to support a larger size...
>
> > > > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> > > > index b2dd0c0bf509..42b2b7540507 100644
> > > > --- a/fs/xfs/xfs_super.c
> > > > +++ b/fs/xfs/xfs_super.c
> > > > @@ -615,6 +615,28 @@ xfs_init_mount_workqueues(
> > > > return -ENOMEM;
> > > > }
> > > >
> > > > +unsigned int
> > > > +xfs_atomic_write_logitems(
> > > > + struct xfs_mount *mp)
> > > > +{
> > > > + unsigned int efi = xfs_efi_item_overhead(1);
> > > > + unsigned int rui = xfs_rui_item_overhead(1);
> > > > + unsigned int cui = xfs_cui_item_overhead(1);
> > > > + unsigned int bui = xfs_bui_item_overhead(1);
> > > > + unsigned int logres = M_RES(mp)->tr_write.tr_logres;
> > > > +
> > > > + /*
> > > > + * Maximum overhead to complete an atomic write ioend in software:
> > > > + * remove data fork extent + remove cow fork extent +
> > > > + * map extent into data fork
> > > > + */
> > > > + unsigned int atomic_logitems =
> > > > + (bui + cui + rui + efi) + (cui + rui) + (bui + rui);
> > >
> > > This seems wrong. Unmap from the data fork only logs a (bui + cui)
> > > pair, we don't log a RUI or an EFI until the transaction that
> > > processes the BUI or CUI actually frees an extent from the BMBT
> > > or removes a block from the refcount btree.
> >
> > This is tricky -- the first transaction in the chain creates a BUI and a
> > CUI and that's all it needs.
> >
> > Then we roll to finish the BUI. The second transaction needs space for
> > the BUD, an RUI, and enough space to relog the CUI (== CUI + CUD).
> >
> > Then we roll again to finish the RUI. This third transaction needs
> > space for the RUD and space to relog the CUI.
> >
> > Roll again, fourth transaction needs space for the CUD and possibly a
> > new EFI.
> >
> > Roll again, fifth transaction needs space for an EFD.
>
> Yes, that is exactly the point I was making.
It's also slightly wrong -- if an extent referenced by the chain isn't
actively being worked on, its BUI/CUI can get relogged. So the actual
computation is:
relog = bui + bud + cui + cud;
per_intent = max(tx0..tx4, relog);
That doesn't seem to change the output though.
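(Spelled out against the tx0..tx4 definitions upthread, that
shorthand amounts to:

	const unsigned int relog = bui + bud + cui + cud;
	const unsigned int per_intent =
		max(relog, max3(max3(tx0, tx1, tx2), tx3, tx4));
)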
> > > > +
> > > > + /* atomic write limits are always a power-of-2 */
> > > > + return rounddown_pow_of_two(logres / (2 * atomic_logitems));
> > >
> > > What is the magic 2 in that division?
> >
> > That's handwaving the lack of a computation involving
> > xfs_allocfree_block_count. A better version would be to figure out the
> > log space needed:
> >
> > /* Overhead to finish one step of each intent item type */
> > const unsigned int f1 = libxfs_calc_finish_efi_reservation(mp, 1);
> > const unsigned int f2 = libxfs_calc_finish_rui_reservation(mp, 1);
>
> Yup, those should be a single call to xfs_calc_buf_res(xfs_allocfree_block_count())
>
> > const unsigned int f3 = libxfs_calc_finish_cui_reservation(mp, 1);
>
> Similarly, xfs_calc_refcountbt_reservation().
>
> > const unsigned int f4 = libxfs_calc_finish_bui_reservation(mp, 1);
>
> xfs_calc_write_reservation() but for a single extent instead of 2.
> Also, I think the bui finish needs to take into account dquots, too.
>
> > /* We only finish one item per transaction in a chain */
> > const unsigned int step_size = max(f4, max3(f1, f2, f3));
>
> And that is xfs_calc_itruncate_reservation(), except it reserves
> space for 1 extent to be processed instead of 4.
>
> FWIW, I'd suggest that these helpers make for a better way of
> calculating high level reservations such as
> xfs_calc_write_reservation() and xfs_calc_itruncate_reservation()
> because those functions currently don't take into account how
> intents are relogged during those operations.
>
> > and then you have what you need to figure out the logres needed to
> > support a given awu_max, or vice versa:
> >
> > if (desired_max) {
> > dbprintf(
> > "desired_max: %u\nstep_size: %u\nper_intent: %u\nlogres: %u\n",
> > desired_max, step_size, per_intent,
> > (desired_max * per_intent) + step_size);
> > } else if (logres) {
> > dbprintf(
> > "logres: %u\nstep_size: %u\nper_intent: %u\nmax_awu: %u\n",
> > logres, step_size, per_intent,
> > logres >= step_size ? (logres - step_size) / per_intent : 0);
> > }
> >
> > I hacked this into xfs_db so that I could compute a variety of
> > scenarios. Let's pretend I have a 10T filesystem with 4k fsblocks and
> > the default configuration.
> >
> > # xfs_db /dev/mapper/moo -c logres -c "untorn_max -b $(( (16 * 1048576) / 4096 ))" -c "untorn_max -l 244216"
> > type 0 logres 244216 logcount 5 flags 0x4
> > type 1 logres 428928 logcount 5 flags 0x4
> > <snip>
> > minlogsize logres 648576 logcount 8
> >
> > To emulate a 16MB untorn write, you'd need a logres of:
> >
> > desired_max: 4096
> > step_size: 107520
> > per_intent: 208
> > logres: 959488
> >
> > 959488 > 648576, which would alter the minlogsize calculation. I know
> > we banned tiny filesystems a few years ago, but there's still a risk in
> > increasing it.
>
> Yeah, it's a bit under a megabyte of reservation space. That's no
> big deal, as log reservations get much larger than this as block
> size and log stripe units are increased.
>
> <snip the rest, they all show roughly the same thing>
>
> Ok, these numbers pretty much prove my point - that a fixed max
> atomic write size somewhere around 16MB isn't going to be a problem
> for any filesystem that uses the historic small fs default log size
> (10MB) or larger.
>
> Let's avoid the internal XFS minlogsize problems by disabling the
> atomic write functionality on the increasingly rare legacy
> filesystems where the log is too small. That allows us to design and
> implement sane userspace API bounds and guarantees for atomic writes
> and not expose internal filesystem constraints and inconsistencies
> to applications...
Ok then.
--D
>
> -Dave.
> --
> Dave Chinner
> david@fromorbit.com
>
^ permalink raw reply [flat|nested] 29+ messages in thread
* [PATCH v6 12/12] xfs: update atomic write limits
2025-04-08 10:41 [PATCH v6 00/12] large atomic writes for xfs John Garry
` (10 preceding siblings ...)
2025-04-08 10:42 ` [PATCH v6 11/12] xfs: add xfs_compute_atomic_write_unit_max() John Garry
@ 2025-04-08 10:42 ` John Garry
11 siblings, 0 replies; 29+ messages in thread
From: John Garry @ 2025-04-08 10:42 UTC (permalink / raw)
To: brauner, djwong, hch, viro, jack, cem
Cc: linux-fsdevel, dchinner, linux-xfs, linux-kernel, ojaswin,
ritesh.list, martin.petersen, linux-ext4, linux-block,
catherine.hoang, John Garry
Update the limits returned from xfs_get_atomic_write_{min, max, max_opt}().
No reflink support always means no CoW-based atomic writes.
For updating xfs_get_atomic_write_min(), we support blocksize only and that
depends on HW or reflink support.
For updating xfs_get_atomic_write_max(), for rtvol or no reflink, we are
limited to blocksize, but only if there is HW support. Otherwise we are
limited to the combined limit in mp->m_atomic_write_unit_max.
For updating xfs_get_atomic_write_max_opt(), ultimately we are limited by
the bdev atomic write limit. If xfs_get_atomic_write_max() does not
report more than 1x blocksize, then just continue to report 0 as before.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_file.c | 2 +-
fs/xfs/xfs_iops.c | 37 +++++++++++++++++++++++++++++++------
2 files changed, 32 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 81a377f65aa3..d1ddbc4a98c3 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1557,7 +1557,7 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
- if (xfs_inode_can_hw_atomicwrite(XFS_I(inode)))
+ if (xfs_get_atomic_write_min(XFS_I(inode)))
file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 3b5aa39dbfe9..894f56f1a830 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -605,27 +605,52 @@ unsigned int
xfs_get_atomic_write_min(
struct xfs_inode *ip)
{
- if (!xfs_inode_can_hw_atomicwrite(ip))
- return 0;
+ if (xfs_inode_can_hw_atomicwrite(ip) || xfs_has_reflink(ip->i_mount))
+ return ip->i_mount->m_sb.sb_blocksize;
- return ip->i_mount->m_sb.sb_blocksize;
+ return 0;
}
unsigned int
xfs_get_atomic_write_max(
struct xfs_inode *ip)
{
- if (!xfs_inode_can_hw_atomicwrite(ip))
+ struct xfs_mount *mp = ip->i_mount;
+
+ /*
+ * If no reflink, then the best we can do is 1x block, as there is
+ * no CoW fallback for when HW offload is not possible.
+ *
+ * rtvol is not commonly used and large atomic writes would also be
+ * complicated to support there, so limit to a single
+ * block for now.
+ */
+ if (!xfs_has_reflink(mp) || XFS_IS_REALTIME_INODE(ip)) {
+ if (xfs_inode_can_hw_atomicwrite(ip))
+ return ip->i_mount->m_sb.sb_blocksize;
return 0;
+ }
- return ip->i_mount->m_sb.sb_blocksize;
+ /*
+ * Even though HW support could be larger (than CoW), we rely on the
+ * CoW-based method as a fallback for when HW-based is not possible,
+ * so always limit at m_atomic_write_unit_max (which is evaluated
+ * according to the CoW-based limit).
+ */
+ return XFS_FSB_TO_B(mp, mp->m_atomic_write_unit_max);
}
unsigned int
xfs_get_atomic_write_max_opt(
struct xfs_inode *ip)
{
- return 0;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ /* if the max is 1x block, then keep the existing behaviour: opt is 0 */
+ if (xfs_get_atomic_write_max(ip) <= ip->i_mount->m_sb.sb_blocksize)
+ return 0;
+
+ return min(xfs_get_atomic_write_max(ip), target->bt_bdev_awu_max);
}
static void
--
2.31.1
^ permalink raw reply related [flat|nested] 29+ messages in thread