* [PATCH v6 01/13] iomap: inline iomap_dio_bio_opflags()
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
@ 2025-03-13 17:12 ` John Garry
2025-03-16 13:40 ` Ritesh Harjani
2025-03-17 6:07 ` Christoph Hellwig
2025-03-13 17:12 ` [PATCH v6 02/13] iomap: comment on atomic write checks in iomap_dio_bio_iter() John Garry
` (12 subsequent siblings)
13 siblings, 2 replies; 65+ messages in thread
From: John Garry @ 2025-03-13 17:12 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
It is neater to build blk_opf_t fully in one place, so inline
iomap_dio_bio_opflags() in iomap_dio_bio_iter().
Also tidy up the logic in dealing with IOMAP_DIO_CALLER_COMP, in generally
separate the logic in dealing with flags associated with reads and writes.
Originally-from: Christoph Hellwig <hch@lst.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
Should I change author?
fs/iomap/direct-io.c | 112 +++++++++++++++++++------------------------
1 file changed, 49 insertions(+), 63 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 5299f70428ef..8c1bec473586 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -312,27 +312,20 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
}
/*
- * Figure out the bio's operation flags from the dio request, the
- * mapping, and whether or not we want FUA. Note that we can end up
- * clearing the WRITE_THROUGH flag in the dio request.
+ * Use a FUA write if we need datasync semantics and this is a pure data I/O
+ * that doesn't require any metadata updates (including after I/O completion
+ * such as unwritten extent conversion) and the underlying device either
+ * doesn't have a volatile write cache or supports FUA.
+ * This allows us to avoid cache flushes on I/O completion.
*/
-static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua, bool atomic_hw)
+static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
+ struct iomap_dio *dio)
{
- blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
-
- if (!(dio->flags & IOMAP_DIO_WRITE))
- return REQ_OP_READ;
-
- opflags |= REQ_OP_WRITE;
- if (use_fua)
- opflags |= REQ_FUA;
- else
- dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
- if (atomic_hw)
- opflags |= REQ_ATOMIC;
-
- return opflags;
+ if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
+ return false;
+ if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
+ return false;
+ return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
}
static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
@@ -340,52 +333,59 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
- bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW;
const loff_t length = iomap_length(iter);
loff_t pos = iter->pos;
- blk_opf_t bio_opf;
+ blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bool need_zeroout = false;
- bool use_fua = false;
int nr_pages, ret = 0;
u64 copied = 0;
size_t orig_count;
- if (atomic_hw && length != iter->len)
- return -EINVAL;
-
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
- if (iomap->type == IOMAP_UNWRITTEN) {
- dio->flags |= IOMAP_DIO_UNWRITTEN;
- need_zeroout = true;
- }
+ if (dio->flags & IOMAP_DIO_WRITE) {
+ bio_opf |= REQ_OP_WRITE;
+
+ if (iter->flags & IOMAP_ATOMIC_HW) {
+ if (length != iter->len)
+ return -EINVAL;
+ bio_opf |= REQ_ATOMIC;
+ }
+
+ if (iomap->type == IOMAP_UNWRITTEN) {
+ dio->flags |= IOMAP_DIO_UNWRITTEN;
+ need_zeroout = true;
+ }
- if (iomap->flags & IOMAP_F_SHARED)
- dio->flags |= IOMAP_DIO_COW;
+ if (iomap->flags & IOMAP_F_SHARED)
+ dio->flags |= IOMAP_DIO_COW;
+
+ if (iomap->flags & IOMAP_F_NEW) {
+ need_zeroout = true;
+ } else if (iomap->type == IOMAP_MAPPED) {
+ if (iomap_dio_can_use_fua(iomap, dio))
+ bio_opf |= REQ_FUA;
+ else
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ }
- if (iomap->flags & IOMAP_F_NEW) {
- need_zeroout = true;
- } else if (iomap->type == IOMAP_MAPPED) {
/*
- * Use a FUA write if we need datasync semantics, this is a pure
- * data IO that doesn't require any metadata updates (including
- * after IO completion such as unwritten extent conversion) and
- * the underlying device either supports FUA or doesn't have
- * a volatile write cache. This allows us to avoid cache flushes
- * on IO completion. If we can't use writethrough and need to
- * sync, disable in-task completions as dio completion will
- * need to call generic_write_sync() which will do a blocking
- * fsync / cache flush call.
+ * We can only do deferred completion for pure overwrites that
+ * don't require additional I/O at completion time.
+ *
+ * This rules out writes that need zeroing or extent conversion,
+ * extend the file size, or issue metadata I/O or cache flushes
+ * during completion processing.
*/
- if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
- (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
- use_fua = true;
- else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ if (need_zeroout || (pos >= i_size_read(inode)) ||
+ ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
+ !(bio_opf & REQ_FUA)))
dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ } else {
+ bio_opf |= REQ_OP_READ;
}
/*
@@ -399,18 +399,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
if (!iov_iter_count(dio->submit.iter))
goto out;
- /*
- * We can only do deferred completion for pure overwrites that
- * don't require additional IO at completion. This rules out
- * writes that need zeroing or extent conversion, extend
- * the file size, or issue journal IO or cache flushes
- * during completion processing.
- */
- if (need_zeroout ||
- ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
- ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
- dio->flags &= ~IOMAP_DIO_CALLER_COMP;
-
/*
* The rules for polled IO completions follow the guidelines as the
* ones we set for inline and deferred completions. If none of those
@@ -428,8 +416,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
goto out;
}
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw);
-
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
size_t n;
@@ -461,7 +447,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
}
n = bio->bi_iter.bi_size;
- if (WARN_ON_ONCE(atomic_hw && n != length)) {
+ if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
/*
* This bio should have covered the complete length,
* which it doesn't, so error. We may need to zero out
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 01/13] iomap: inline iomap_dio_bio_opflags()
2025-03-13 17:12 ` [PATCH v6 01/13] iomap: inline iomap_dio_bio_opflags() John Garry
@ 2025-03-16 13:40 ` Ritesh Harjani
2025-03-17 6:07 ` Christoph Hellwig
1 sibling, 0 replies; 65+ messages in thread
From: Ritesh Harjani @ 2025-03-16 13:40 UTC (permalink / raw)
To: John Garry, brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4, John Garry
John Garry <john.g.garry@oracle.com> writes:
> It is neater to build blk_opf_t fully in one place, so inline
> iomap_dio_bio_opflags() in iomap_dio_bio_iter().
>
> Also tidy up the logic in dealing with IOMAP_DIO_CALLER_COMP, in generally
> separate the logic in dealing with flags associated with reads and writes.
>
Indeed it clean things up and separates the logic required for
IOMAP_DIO_WRITE v/s reads.
The change looks good to me. Please feel free to add -
Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
> Originally-from: Christoph Hellwig <hch@lst.de>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> Should I change author?
> fs/iomap/direct-io.c | 112 +++++++++++++++++++------------------------
> 1 file changed, 49 insertions(+), 63 deletions(-)
>
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 5299f70428ef..8c1bec473586 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -312,27 +312,20 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
> }
>
> /*
> - * Figure out the bio's operation flags from the dio request, the
> - * mapping, and whether or not we want FUA. Note that we can end up
> - * clearing the WRITE_THROUGH flag in the dio request.
> + * Use a FUA write if we need datasync semantics and this is a pure data I/O
> + * that doesn't require any metadata updates (including after I/O completion
> + * such as unwritten extent conversion) and the underlying device either
> + * doesn't have a volatile write cache or supports FUA.
> + * This allows us to avoid cache flushes on I/O completion.
> */
> -static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
> - const struct iomap *iomap, bool use_fua, bool atomic_hw)
> +static inline bool iomap_dio_can_use_fua(const struct iomap *iomap,
> + struct iomap_dio *dio)
> {
> - blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
> -
> - if (!(dio->flags & IOMAP_DIO_WRITE))
> - return REQ_OP_READ;
> -
> - opflags |= REQ_OP_WRITE;
> - if (use_fua)
> - opflags |= REQ_FUA;
> - else
> - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
> - if (atomic_hw)
> - opflags |= REQ_ATOMIC;
> -
> - return opflags;
> + if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY))
> + return false;
> + if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH))
> + return false;
> + return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev);
> }
>
> static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> @@ -340,52 +333,59 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> const struct iomap *iomap = &iter->iomap;
> struct inode *inode = iter->inode;
> unsigned int fs_block_size = i_blocksize(inode), pad;
> - bool atomic_hw = iter->flags & IOMAP_ATOMIC_HW;
> const loff_t length = iomap_length(iter);
> loff_t pos = iter->pos;
> - blk_opf_t bio_opf;
> + blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE;
> struct bio *bio;
> bool need_zeroout = false;
> - bool use_fua = false;
> int nr_pages, ret = 0;
> u64 copied = 0;
> size_t orig_count;
>
> - if (atomic_hw && length != iter->len)
> - return -EINVAL;
> -
> if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
> !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
> return -EINVAL;
>
> - if (iomap->type == IOMAP_UNWRITTEN) {
> - dio->flags |= IOMAP_DIO_UNWRITTEN;
> - need_zeroout = true;
> - }
> + if (dio->flags & IOMAP_DIO_WRITE) {
> + bio_opf |= REQ_OP_WRITE;
> +
> + if (iter->flags & IOMAP_ATOMIC_HW) {
> + if (length != iter->len)
> + return -EINVAL;
> + bio_opf |= REQ_ATOMIC;
> + }
> +
> + if (iomap->type == IOMAP_UNWRITTEN) {
> + dio->flags |= IOMAP_DIO_UNWRITTEN;
> + need_zeroout = true;
> + }
>
> - if (iomap->flags & IOMAP_F_SHARED)
> - dio->flags |= IOMAP_DIO_COW;
> + if (iomap->flags & IOMAP_F_SHARED)
> + dio->flags |= IOMAP_DIO_COW;
> +
> + if (iomap->flags & IOMAP_F_NEW) {
> + need_zeroout = true;
> + } else if (iomap->type == IOMAP_MAPPED) {
> + if (iomap_dio_can_use_fua(iomap, dio))
> + bio_opf |= REQ_FUA;
> + else
> + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
> + }
>
> - if (iomap->flags & IOMAP_F_NEW) {
> - need_zeroout = true;
> - } else if (iomap->type == IOMAP_MAPPED) {
> /*
> - * Use a FUA write if we need datasync semantics, this is a pure
> - * data IO that doesn't require any metadata updates (including
> - * after IO completion such as unwritten extent conversion) and
> - * the underlying device either supports FUA or doesn't have
> - * a volatile write cache. This allows us to avoid cache flushes
> - * on IO completion. If we can't use writethrough and need to
> - * sync, disable in-task completions as dio completion will
> - * need to call generic_write_sync() which will do a blocking
> - * fsync / cache flush call.
> + * We can only do deferred completion for pure overwrites that
> + * don't require additional I/O at completion time.
> + *
> + * This rules out writes that need zeroing or extent conversion,
> + * extend the file size, or issue metadata I/O or cache flushes
> + * during completion processing.
> */
> - if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
> - (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
> - (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
> - use_fua = true;
> - else if (dio->flags & IOMAP_DIO_NEED_SYNC)
> + if (need_zeroout || (pos >= i_size_read(inode)) ||
> + ((dio->flags & IOMAP_DIO_NEED_SYNC) &&
> + !(bio_opf & REQ_FUA)))
> dio->flags &= ~IOMAP_DIO_CALLER_COMP;
> + } else {
> + bio_opf |= REQ_OP_READ;
> }
>
> /*
> @@ -399,18 +399,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> if (!iov_iter_count(dio->submit.iter))
> goto out;
>
> - /*
> - * We can only do deferred completion for pure overwrites that
> - * don't require additional IO at completion. This rules out
> - * writes that need zeroing or extent conversion, extend
> - * the file size, or issue journal IO or cache flushes
> - * during completion processing.
> - */
> - if (need_zeroout ||
> - ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
> - ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
> - dio->flags &= ~IOMAP_DIO_CALLER_COMP;
> -
> /*
> * The rules for polled IO completions follow the guidelines as the
> * ones we set for inline and deferred completions. If none of those
> @@ -428,8 +416,6 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> goto out;
> }
>
> - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic_hw);
> -
> nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
> do {
> size_t n;
> @@ -461,7 +447,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> }
>
> n = bio->bi_iter.bi_size;
> - if (WARN_ON_ONCE(atomic_hw && n != length)) {
> + if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
> /*
> * This bio should have covered the complete length,
> * which it doesn't, so error. We may need to zero out
> --
> 2.31.1
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 01/13] iomap: inline iomap_dio_bio_opflags()
2025-03-13 17:12 ` [PATCH v6 01/13] iomap: inline iomap_dio_bio_opflags() John Garry
2025-03-16 13:40 ` Ritesh Harjani
@ 2025-03-17 6:07 ` Christoph Hellwig
1 sibling, 0 replies; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:07 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On Thu, Mar 13, 2025 at 05:12:58PM +0000, John Garry wrote:
> It is neater to build blk_opf_t fully in one place, so inline
> iomap_dio_bio_opflags() in iomap_dio_bio_iter().
>
> Also tidy up the logic in dealing with IOMAP_DIO_CALLER_COMP, in generally
> separate the logic in dealing with flags associated with reads and writes.
No review from me as that would feel weird having draftead this, but
it obviously looks good.
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 02/13] iomap: comment on atomic write checks in iomap_dio_bio_iter()
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
2025-03-13 17:12 ` [PATCH v6 01/13] iomap: inline iomap_dio_bio_opflags() John Garry
@ 2025-03-13 17:12 ` John Garry
2025-03-17 6:08 ` Christoph Hellwig
2025-03-17 14:16 ` Ritesh Harjani
2025-03-13 17:13 ` [PATCH v6 03/13] iomap: rework IOMAP atomic flags John Garry
` (11 subsequent siblings)
13 siblings, 2 replies; 65+ messages in thread
From: John Garry @ 2025-03-13 17:12 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
Help explain the code.
Also clarify the comment for bio size check.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/iomap/direct-io.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 8c1bec473586..9d72b99cb447 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -350,6 +350,11 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
bio_opf |= REQ_OP_WRITE;
if (iter->flags & IOMAP_ATOMIC_HW) {
+ /*
+ * Ensure that the mapping covers the full write length,
+ * otherwise we will submit multiple BIOs, which is
+ * disallowed.
+ */
if (length != iter->len)
return -EINVAL;
bio_opf |= REQ_ATOMIC;
@@ -449,7 +454,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
n = bio->bi_iter.bi_size;
if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
/*
- * This bio should have covered the complete length,
+ * An atomic write bio must cover the complete length,
* which it doesn't, so error. We may need to zero out
* the tail (complete FS block), similar to when
* bio_iov_iter_get_pages() returns an error, above.
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 02/13] iomap: comment on atomic write checks in iomap_dio_bio_iter()
2025-03-13 17:12 ` [PATCH v6 02/13] iomap: comment on atomic write checks in iomap_dio_bio_iter() John Garry
@ 2025-03-17 6:08 ` Christoph Hellwig
2025-03-17 8:22 ` John Garry
2025-03-17 14:16 ` Ritesh Harjani
1 sibling, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:08 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On Thu, Mar 13, 2025 at 05:12:59PM +0000, John Garry wrote:
> if (iter->flags & IOMAP_ATOMIC_HW) {
> + /*
> + * Ensure that the mapping covers the full write length,
> + * otherwise we will submit multiple BIOs, which is
> + * disallowed.
> + */
"disallowed" doesn't really explain anything, why is it disallowed?
Maybe:
* Ensure that the mapping covers the full write length,
* otherwise it can't be submitted as a single bio,
* which is required to use hardware atomics.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 02/13] iomap: comment on atomic write checks in iomap_dio_bio_iter()
2025-03-17 6:08 ` Christoph Hellwig
@ 2025-03-17 8:22 ` John Garry
0 siblings, 0 replies; 65+ messages in thread
From: John Garry @ 2025-03-17 8:22 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 17/03/2025 06:08, Christoph Hellwig wrote:
> On Thu, Mar 13, 2025 at 05:12:59PM +0000, John Garry wrote:
>> if (iter->flags & IOMAP_ATOMIC_HW) {
>> + /*
>> + * Ensure that the mapping covers the full write length,
>> + * otherwise we will submit multiple BIOs, which is
>> + * disallowed.
>> + */
>
> "disallowed" doesn't really explain anything, why is it disallowed?
>
> Maybe:
>
> * Ensure that the mapping covers the full write length,
> * otherwise it can't be submitted as a single bio,
> * which is required to use hardware atomics.
ok, fine.
>
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 02/13] iomap: comment on atomic write checks in iomap_dio_bio_iter()
2025-03-13 17:12 ` [PATCH v6 02/13] iomap: comment on atomic write checks in iomap_dio_bio_iter() John Garry
2025-03-17 6:08 ` Christoph Hellwig
@ 2025-03-17 14:16 ` Ritesh Harjani
1 sibling, 0 replies; 65+ messages in thread
From: Ritesh Harjani @ 2025-03-17 14:16 UTC (permalink / raw)
To: John Garry, brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4, John Garry
John Garry <john.g.garry@oracle.com> writes:
> Help explain the code.
>
> Also clarify the comment for bio size check.
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> fs/iomap/direct-io.c | 7 ++++++-
> 1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 8c1bec473586..9d72b99cb447 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -350,6 +350,11 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> bio_opf |= REQ_OP_WRITE;
>
> if (iter->flags & IOMAP_ATOMIC_HW) {
> + /*
> + * Ensure that the mapping covers the full write length,
> + * otherwise we will submit multiple BIOs, which is
> + * disallowed.
> + */
Nit: IMO, this can be clubbed together with your next patch PATCH-03 itself.
But either ways, no strong preference.
-ritesh
> if (length != iter->len)
> return -EINVAL;
> bio_opf |= REQ_ATOMIC;
> @@ -449,7 +454,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> n = bio->bi_iter.bi_size;
> if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) {
> /*
> - * This bio should have covered the complete length,
> + * An atomic write bio must cover the complete length,
> * which it doesn't, so error. We may need to zero out
> * the tail (complete FS block), similar to when
> * bio_iov_iter_get_pages() returns an error, above.
> --
> 2.31.1
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 03/13] iomap: rework IOMAP atomic flags
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
2025-03-13 17:12 ` [PATCH v6 01/13] iomap: inline iomap_dio_bio_opflags() John Garry
2025-03-13 17:12 ` [PATCH v6 02/13] iomap: comment on atomic write checks in iomap_dio_bio_iter() John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 6:11 ` Christoph Hellwig
2025-03-17 13:44 ` Ritesh Harjani
2025-03-13 17:13 ` [PATCH v6 04/13] xfs: pass flags to xfs_reflink_allocate_cow() John Garry
` (10 subsequent siblings)
13 siblings, 2 replies; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
Flag IOMAP_ATOMIC_SW is not really required. The idea of having this flag
is that the FS ->iomap_begin callback could check if this flag is set to
decide whether to do a SW (FS-based) atomic write. But the FS can set
which ->iomap_begin callback it wants when deciding to do a FS-based
atomic write.
Furthermore, it was thought that IOMAP_ATOMIC_HW is not a proper name, as
the block driver can use SW-methods to emulate an atomic write. So change
back to IOMAP_ATOMIC.
The ->iomap_begin callback needs though to indicate to iomap core that
REQ_ATOMIC needs to be set, so add IOMAP_F_ATOMIC_BIO for that.
These changes were suggested by Christoph Hellwig and Dave Chinner.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/ext4/inode.c | 5 ++++-
fs/iomap/direct-io.c | 8 +++-----
fs/iomap/trace.h | 2 +-
fs/xfs/xfs_iomap.c | 3 +++
include/linux/iomap.h | 12 +++++-------
5 files changed, 16 insertions(+), 14 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index ba2f1e3db7c7..949d74d34926 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3290,6 +3290,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
if (map->m_flags & EXT4_MAP_NEW)
iomap->flags |= IOMAP_F_NEW;
+ if (flags & IOMAP_ATOMIC)
+ iomap->flags |= IOMAP_F_ATOMIC_BIO;
+
if (flags & IOMAP_DAX)
iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
else
@@ -3467,7 +3470,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
return false;
/* atomic writes are all-or-nothing */
- if (flags & IOMAP_ATOMIC_HW)
+ if (flags & IOMAP_ATOMIC)
return false;
/* can only try again if we wrote nothing */
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 9d72b99cb447..c28685fd3362 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -349,7 +349,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
if (dio->flags & IOMAP_DIO_WRITE) {
bio_opf |= REQ_OP_WRITE;
- if (iter->flags & IOMAP_ATOMIC_HW) {
+ if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
/*
* Ensure that the mapping covers the full write length,
* otherwise we will submit multiple BIOs, which is
@@ -677,10 +677,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
- if (dio_flags & IOMAP_DIO_ATOMIC_SW)
- iomi.flags |= IOMAP_ATOMIC_SW;
- else if (iocb->ki_flags & IOCB_ATOMIC)
- iomi.flags |= IOMAP_ATOMIC_HW;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iomi.flags |= IOMAP_ATOMIC;
/* for data sync or sync, we need sync completion processing */
if (iocb_is_dsync(iocb)) {
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 69af89044ebd..9eab2c8ac3c5 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
{ IOMAP_NOWAIT, "NOWAIT" }, \
- { IOMAP_ATOMIC_HW, "ATOMIC_HW" }
+ { IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 30e257f683bb..9a22ecd794eb 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -831,6 +831,9 @@ xfs_direct_write_iomap_begin(
if (offset + length > i_size_read(inode))
iomap_flags |= IOMAP_F_DIRTY;
+ if (flags & IOMAP_ATOMIC)
+ iomap_flags |= IOMAP_F_ATOMIC_BIO;
+
/*
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 9cd93530013c..51f4c13bd17a 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -60,6 +60,9 @@ struct vm_fault;
* IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
* assigned to it yet and the file system will do that in the bio submission
* handler, splitting the I/O as needed.
+ *
+ * IOMAP_F_ATOMIC_BIO indicates that (write) I/O needs to be issued as an
+ * atomic bio, i.e. set REQ_ATOMIC.
*/
#define IOMAP_F_NEW (1U << 0)
#define IOMAP_F_DIRTY (1U << 1)
@@ -73,6 +76,7 @@ struct vm_fault;
#define IOMAP_F_XATTR (1U << 5)
#define IOMAP_F_BOUNDARY (1U << 6)
#define IOMAP_F_ANON_WRITE (1U << 7)
+#define IOMAP_F_ATOMIC_BIO (1U << 8)
/*
* Flags set by the core iomap code during operations:
@@ -189,9 +193,8 @@ struct iomap_folio_ops {
#else
#define IOMAP_DAX 0
#endif /* CONFIG_FS_DAX */
-#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */
+#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */
#define IOMAP_DONTCACHE (1 << 10)
-#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */
struct iomap_ops {
/*
@@ -503,11 +506,6 @@ struct iomap_dio_ops {
*/
#define IOMAP_DIO_PARTIAL (1 << 2)
-/*
- * Use software-based torn-write protection.
- */
-#define IOMAP_DIO_ATOMIC_SW (1 << 3)
-
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before);
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 03/13] iomap: rework IOMAP atomic flags
2025-03-13 17:13 ` [PATCH v6 03/13] iomap: rework IOMAP atomic flags John Garry
@ 2025-03-17 6:11 ` Christoph Hellwig
2025-03-17 9:05 ` John Garry
2025-03-17 13:44 ` Ritesh Harjani
1 sibling, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:11 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
> iomap->flags |= IOMAP_F_NEW;
>
> + if (flags & IOMAP_ATOMIC)
> + iomap->flags |= IOMAP_F_ATOMIC_BIO;
> +
Add a comment here that ext4 is always using hardware atomics?
> + if (flags & IOMAP_ATOMIC)
> + iomap_flags |= IOMAP_F_ATOMIC_BIO;
Same here (at least for now until it is changed later).
> + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O needs to be issued as an
> + * atomic bio, i.e. set REQ_ATOMIC.
s/needs to/will be/ ?
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 03/13] iomap: rework IOMAP atomic flags
2025-03-17 6:11 ` Christoph Hellwig
@ 2025-03-17 9:05 ` John Garry
2025-03-18 5:32 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-17 9:05 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 17/03/2025 06:11, Christoph Hellwig wrote:
>> iomap->flags |= IOMAP_F_NEW;
>>
>> + if (flags & IOMAP_ATOMIC)
>> + iomap->flags |= IOMAP_F_ATOMIC_BIO;
>> +
>
> Add a comment here that ext4 is always using hardware atomics?
>
>> + if (flags & IOMAP_ATOMIC)
>> + iomap_flags |= IOMAP_F_ATOMIC_BIO;
>
> Same here (at least for now until it is changed later).
Please note that Christian plans on sending the earlier iomap changes
related to this work for 6.15. Those changes are also in the xfs queue.
We are kinda reverting those changes here, so I think that it would
still make sense for the iomap changes in this series to make 6.15
The xfs changes in this series are unlikely to make 6.15
As such, if we say that ext4 always uses hardware atomics, then we
should mention that xfs does also (until it doesn't).
So, in the end, I'd rather not add those comments at all - ok?
>
>> + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O needs to be issued as an
>> + * atomic bio, i.e. set REQ_ATOMIC.
>
> s/needs to/will be/ ?
>
ok
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 03/13] iomap: rework IOMAP atomic flags
2025-03-17 9:05 ` John Garry
@ 2025-03-18 5:32 ` Christoph Hellwig
2025-03-18 8:11 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 5:32 UTC (permalink / raw)
To: John Garry
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On Mon, Mar 17, 2025 at 09:05:39AM +0000, John Garry wrote:
>> Same here (at least for now until it is changed later).
>
> Please note that Christian plans on sending the earlier iomap changes
> related to this work for 6.15. Those changes are also in the xfs queue. We
> are kinda reverting those changes here, so I think that it would still make
> sense for the iomap changes in this series to make 6.15
>
> The xfs changes in this series are unlikely to make 6.15
>
> As such, if we say that ext4 always uses hardware atomics, then we should
> mention that xfs does also (until it doesn't).
That's what I meant.
> So, in the end, I'd rather not add those comments at all - ok?
If I read through this code it would be kinda nice to figure out why
we're instructing the iomap code to do it. If you look at
xfs_direct_write_iomap_begin it also generally comments on why we
set specific flags.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 03/13] iomap: rework IOMAP atomic flags
2025-03-18 5:32 ` Christoph Hellwig
@ 2025-03-18 8:11 ` John Garry
0 siblings, 0 replies; 65+ messages in thread
From: John Garry @ 2025-03-18 8:11 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 18/03/2025 05:32, Christoph Hellwig wrote:
> On Mon, Mar 17, 2025 at 09:05:39AM +0000, John Garry wrote:
>>> Same here (at least for now until it is changed later).
>> Please note that Christian plans on sending the earlier iomap changes
>> related to this work for 6.15. Those changes are also in the xfs queue. We
>> are kinda reverting those changes here, so I think that it would still make
>> sense for the iomap changes in this series to make 6.15
>>
>> The xfs changes in this series are unlikely to make 6.15
>>
>> As such, if we say that ext4 always uses hardware atomics, then we should
>> mention that xfs does also (until it doesn't).
> That's what I meant.
ok
>
>> So, in the end, I'd rather not add those comments at all - ok?
> If I read through this code it would be kinda nice to figure out why
> we're instructing the iomap code to do it. If you look at
> xfs_direct_write_iomap_begin it also generally comments on why we
> set specific flags.
understood
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 03/13] iomap: rework IOMAP atomic flags
2025-03-13 17:13 ` [PATCH v6 03/13] iomap: rework IOMAP atomic flags John Garry
2025-03-17 6:11 ` Christoph Hellwig
@ 2025-03-17 13:44 ` Ritesh Harjani
2025-03-17 14:25 ` John Garry
1 sibling, 1 reply; 65+ messages in thread
From: Ritesh Harjani @ 2025-03-17 13:44 UTC (permalink / raw)
To: John Garry, brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4, John Garry
John Garry <john.g.garry@oracle.com> writes:
> Flag IOMAP_ATOMIC_SW is not really required. The idea of having this flag
> is that the FS ->iomap_begin callback could check if this flag is set to
> decide whether to do a SW (FS-based) atomic write. But the FS can set
> which ->iomap_begin callback it wants when deciding to do a FS-based
> atomic write.
>
> Furthermore, it was thought that IOMAP_ATOMIC_HW is not a proper name, as
> the block driver can use SW-methods to emulate an atomic write. So change
> back to IOMAP_ATOMIC.
>
> The ->iomap_begin callback needs though to indicate to iomap core that
> REQ_ATOMIC needs to be set, so add IOMAP_F_ATOMIC_BIO for that.
>
> These changes were suggested by Christoph Hellwig and Dave Chinner.
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> fs/ext4/inode.c | 5 ++++-
> fs/iomap/direct-io.c | 8 +++-----
> fs/iomap/trace.h | 2 +-
> fs/xfs/xfs_iomap.c | 3 +++
> include/linux/iomap.h | 12 +++++-------
> 5 files changed, 16 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index ba2f1e3db7c7..949d74d34926 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3290,6 +3290,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
> if (map->m_flags & EXT4_MAP_NEW)
> iomap->flags |= IOMAP_F_NEW;
>
> + if (flags & IOMAP_ATOMIC)
> + iomap->flags |= IOMAP_F_ATOMIC_BIO;
> +
> if (flags & IOMAP_DAX)
> iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
> else
> @@ -3467,7 +3470,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
> return false;
>
> /* atomic writes are all-or-nothing */
> - if (flags & IOMAP_ATOMIC_HW)
> + if (flags & IOMAP_ATOMIC)
> return false;
>
The changes in ext4 is mostly straight forward. Essentially for
an IOMAP_ATOMIC write requests we are always setting IOMAP_F_ATOMIC_BIO in
the ->iomap_begin() routine. This is done to inform the iomap that this
write request needs to issue an atomic bio, so iomap then goes and sets
REQ_ATOMIC flag in the bio.
> /* can only try again if we wrote nothing */
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 9d72b99cb447..c28685fd3362 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -349,7 +349,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
> if (dio->flags & IOMAP_DIO_WRITE) {
> bio_opf |= REQ_OP_WRITE;
>
> - if (iter->flags & IOMAP_ATOMIC_HW) {
> + if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
> /*
> * Ensure that the mapping covers the full write length,
> * otherwise we will submit multiple BIOs, which is
> @@ -677,10 +677,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> iomi.flags |= IOMAP_OVERWRITE_ONLY;
> }
>
> - if (dio_flags & IOMAP_DIO_ATOMIC_SW)
> - iomi.flags |= IOMAP_ATOMIC_SW;
> - else if (iocb->ki_flags & IOCB_ATOMIC)
> - iomi.flags |= IOMAP_ATOMIC_HW;
> + if (iocb->ki_flags & IOCB_ATOMIC)
> + iomi.flags |= IOMAP_ATOMIC;
>
> /* for data sync or sync, we need sync completion processing */
> if (iocb_is_dsync(iocb)) {
> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
> index 69af89044ebd..9eab2c8ac3c5 100644
> --- a/fs/iomap/trace.h
> +++ b/fs/iomap/trace.h
> @@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
> { IOMAP_FAULT, "FAULT" }, \
> { IOMAP_DIRECT, "DIRECT" }, \
> { IOMAP_NOWAIT, "NOWAIT" }, \
> - { IOMAP_ATOMIC_HW, "ATOMIC_HW" }
> + { IOMAP_ATOMIC, "ATOMIC" }
>
> #define IOMAP_F_FLAGS_STRINGS \
> { IOMAP_F_NEW, "NEW" }, \
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 30e257f683bb..9a22ecd794eb 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -831,6 +831,9 @@ xfs_direct_write_iomap_begin(
> if (offset + length > i_size_read(inode))
> iomap_flags |= IOMAP_F_DIRTY;
>
> + if (flags & IOMAP_ATOMIC)
> + iomap_flags |= IOMAP_F_ATOMIC_BIO;
> +
> /*
> * COW writes may allocate delalloc space or convert unwritten COW
> * extents, so we need to make sure to take the lock exclusively here.
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 9cd93530013c..51f4c13bd17a 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -60,6 +60,9 @@ struct vm_fault;
> * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
> * assigned to it yet and the file system will do that in the bio submission
> * handler, splitting the I/O as needed.
> + *
> + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O needs to be issued as an
> + * atomic bio, i.e. set REQ_ATOMIC.
> */
Maybe we can be more explicit here?
IOMAP_F_ATOMIC_BIO flag indicates that write I/O must be issued as an
atomic bio by setting the REQ_ATOMIC flag. Filesystems need to set this
flag to inform iomap that the write I/O operation should be submitted as
an atomic bio.
This definition (or whatever you feel is the better version), should also
go in Documentation/filesystems/iomap/design.rst
> #define IOMAP_F_NEW (1U << 0)
> #define IOMAP_F_DIRTY (1U << 1)
> @@ -73,6 +76,7 @@ struct vm_fault;
> #define IOMAP_F_XATTR (1U << 5)
> #define IOMAP_F_BOUNDARY (1U << 6)
> #define IOMAP_F_ANON_WRITE (1U << 7)
> +#define IOMAP_F_ATOMIC_BIO (1U << 8)
>
> /*
> * Flags set by the core iomap code during operations:
> @@ -189,9 +193,8 @@ struct iomap_folio_ops {
> #else
> #define IOMAP_DAX 0
> #endif /* CONFIG_FS_DAX */
> -#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */
> +#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */
> #define IOMAP_DONTCACHE (1 << 10)
> -#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */
Now that we are killing separate IOMAP_ATOMIC_** names, we may would
like to update the iomap design document as well. Otherwise it will
carry use of IOMAP_ATOMIC_HW & IOMAP_ATOMIC_SW definitions. Instead we
should only keep IOMAP_ATOMIC and update the design info there.
-ritesh
>
> struct iomap_ops {
> /*
> @@ -503,11 +506,6 @@ struct iomap_dio_ops {
> */
> #define IOMAP_DIO_PARTIAL (1 << 2)
>
> -/*
> - * Use software-based torn-write protection.
> - */
> -#define IOMAP_DIO_ATOMIC_SW (1 << 3)
> -
> ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
> const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
> unsigned int dio_flags, void *private, size_t done_before);
> --
> 2.31.1
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 03/13] iomap: rework IOMAP atomic flags
2025-03-17 13:44 ` Ritesh Harjani
@ 2025-03-17 14:25 ` John Garry
0 siblings, 0 replies; 65+ messages in thread
From: John Garry @ 2025-03-17 14:25 UTC (permalink / raw)
To: Ritesh Harjani (IBM), brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4
On 17/03/2025 13:44, Ritesh Harjani (IBM) wrote:
>> if (flags & IOMAP_DAX)
>> iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev;
>> else
>> @@ -3467,7 +3470,7 @@ static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
>> return false;
>>
>> /* atomic writes are all-or-nothing */
>> - if (flags & IOMAP_ATOMIC_HW)
>> + if (flags & IOMAP_ATOMIC)
>> return false;
>>
> The changes in ext4 is mostly straight forward. Essentially for
> an IOMAP_ATOMIC write requests we are always setting IOMAP_F_ATOMIC_BIO in
> the ->iomap_begin() routine. This is done to inform the iomap that this
> write request needs to issue an atomic bio, so iomap then goes and sets
> REQ_ATOMIC flag in the bio.
Right
>
>
>> /* can only try again if we wrote nothing */
>> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
>> index 9d72b99cb447..c28685fd3362 100644
>> --- a/fs/iomap/direct-io.c
>> +++ b/fs/iomap/direct-io.c
>> @@ -349,7 +349,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
>> if (dio->flags & IOMAP_DIO_WRITE) {
>> bio_opf |= REQ_OP_WRITE;
>>
>> - if (iter->flags & IOMAP_ATOMIC_HW) {
>> + if (iomap->flags & IOMAP_F_ATOMIC_BIO) {
>> /*
>> * Ensure that the mapping covers the full write length,
>> * otherwise we will submit multiple BIOs, which is
>> @@ -677,10 +677,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>> iomi.flags |= IOMAP_OVERWRITE_ONLY;
>> }
>>
>> - if (dio_flags & IOMAP_DIO_ATOMIC_SW)
>> - iomi.flags |= IOMAP_ATOMIC_SW;
>> - else if (iocb->ki_flags & IOCB_ATOMIC)
>> - iomi.flags |= IOMAP_ATOMIC_HW;
>> + if (iocb->ki_flags & IOCB_ATOMIC)
>> + iomi.flags |= IOMAP_ATOMIC;
>>
>> /* for data sync or sync, we need sync completion processing */
>> if (iocb_is_dsync(iocb)) {
>> diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
>> index 69af89044ebd..9eab2c8ac3c5 100644
>> --- a/fs/iomap/trace.h
>> +++ b/fs/iomap/trace.h
>> @@ -99,7 +99,7 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
>> { IOMAP_FAULT, "FAULT" }, \
>> { IOMAP_DIRECT, "DIRECT" }, \
>> { IOMAP_NOWAIT, "NOWAIT" }, \
>> - { IOMAP_ATOMIC_HW, "ATOMIC_HW" }
>> + { IOMAP_ATOMIC, "ATOMIC" }
>>
>> #define IOMAP_F_FLAGS_STRINGS \
>> { IOMAP_F_NEW, "NEW" }, \
>> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
>> index 30e257f683bb..9a22ecd794eb 100644
>> --- a/fs/xfs/xfs_iomap.c
>> +++ b/fs/xfs/xfs_iomap.c
>> @@ -831,6 +831,9 @@ xfs_direct_write_iomap_begin(
>> if (offset + length > i_size_read(inode))
>> iomap_flags |= IOMAP_F_DIRTY;
>>
>> + if (flags & IOMAP_ATOMIC)
>> + iomap_flags |= IOMAP_F_ATOMIC_BIO;
>> +
>> /*
>> * COW writes may allocate delalloc space or convert unwritten COW
>> * extents, so we need to make sure to take the lock exclusively here.
>> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
>> index 9cd93530013c..51f4c13bd17a 100644
>> --- a/include/linux/iomap.h
>> +++ b/include/linux/iomap.h
>> @@ -60,6 +60,9 @@ struct vm_fault;
>> * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
>> * assigned to it yet and the file system will do that in the bio submission
>> * handler, splitting the I/O as needed.
>> + *
>> + * IOMAP_F_ATOMIC_BIO indicates that (write) I/O needs to be issued as an
>> + * atomic bio, i.e. set REQ_ATOMIC.
>> */
>
> Maybe we can be more explicit here?
>
> IOMAP_F_ATOMIC_BIO flag indicates that write I/O must be issued as an
> atomic bio by setting the REQ_ATOMIC flag. Filesystems need to set this
> flag to inform iomap that the write I/O operation should be submitted as
> an atomic bio.
The comment for all these flags is that they should be set by the FS:
"Flags reported by the file system from iomap_begin"
So the second sentence seems to just repeat what is already said.
>
> This definition (or whatever you feel is the better version), should also
> go in Documentation/filesystems/iomap/design.rst
Yes, I need to update that again
>
>> #define IOMAP_F_NEW (1U << 0)
>> #define IOMAP_F_DIRTY (1U << 1)
>> @@ -73,6 +76,7 @@ struct vm_fault;
>> #define IOMAP_F_XATTR (1U << 5)
>> #define IOMAP_F_BOUNDARY (1U << 6)
>> #define IOMAP_F_ANON_WRITE (1U << 7)
>> +#define IOMAP_F_ATOMIC_BIO (1U << 8)
>>
>> /*
>> * Flags set by the core iomap code during operations:
>> @@ -189,9 +193,8 @@ struct iomap_folio_ops {
>> #else
>> #define IOMAP_DAX 0
>> #endif /* CONFIG_FS_DAX */
>> -#define IOMAP_ATOMIC_HW (1 << 9) /* HW-based torn-write protection */
>> +#define IOMAP_ATOMIC (1 << 9) /* torn-write protection */
>> #define IOMAP_DONTCACHE (1 << 10)
>> -#define IOMAP_ATOMIC_SW (1 << 11)/* SW-based torn-write protection */
> Now that we are killing separate IOMAP_ATOMIC_** names, we may would
> like to update the iomap design document as well. Otherwise it will
> carry use of IOMAP_ATOMIC_HW & IOMAP_ATOMIC_SW definitions. Instead we
> should only keep IOMAP_ATOMIC and update the design info there.
Yes, I will update it.
Thanks for the reminder.
John
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 04/13] xfs: pass flags to xfs_reflink_allocate_cow()
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (2 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 03/13] iomap: rework IOMAP atomic flags John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 6:15 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 05/13] xfs: allow block allocator to take an alignment hint John Garry
` (9 subsequent siblings)
13 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
In future we will want more boolean options for xfs_reflink_allocate_cow(),
so just prepare for this by passing a flags arg for @convert_now.
Suggested-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_iomap.c | 7 +++++--
fs/xfs/xfs_reflink.c | 12 ++++++++----
fs/xfs/xfs_reflink.h | 8 +++++++-
3 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 9a22ecd794eb..8196e66b099b 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -813,6 +813,7 @@ xfs_direct_write_iomap_begin(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0;
+ unsigned int reflink_flags = 0;
bool shared = false;
u16 iomap_flags = 0;
unsigned int lockmode;
@@ -823,6 +824,9 @@ xfs_direct_write_iomap_begin(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (flags & IOMAP_DIRECT || IS_DAX(inode))
+ reflink_flags |= XFS_REFLINK_CONVERT_UNWRITTEN;
+
/*
* Writes that span EOF might trigger an IO size update on completion,
* so consider them to be dirty for the purposes of O_DSYNC even if
@@ -870,8 +874,7 @@ xfs_direct_write_iomap_begin(
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode,
- (flags & IOMAP_DIRECT) || IS_DAX(inode));
+ &lockmode, reflink_flags);
if (error)
goto out_unlock;
if (shared)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index cc3b4df88110..f8363c6b0f39 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -435,7 +435,7 @@ xfs_reflink_fill_cow_hole(
struct xfs_bmbt_irec *cmap,
bool *shared,
uint *lockmode,
- bool convert_now)
+ unsigned int flags)
{
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
@@ -488,7 +488,8 @@ xfs_reflink_fill_cow_hole(
return error;
convert:
- return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
+ return xfs_reflink_convert_unwritten(ip, imap, cmap,
+ flags & XFS_REFLINK_CONVERT_UNWRITTEN);
out_trans_cancel:
xfs_trans_cancel(tp);
@@ -566,10 +567,13 @@ xfs_reflink_allocate_cow(
struct xfs_bmbt_irec *cmap,
bool *shared,
uint *lockmode,
- bool convert_now)
+ unsigned int flags)
{
int error;
bool found;
+ bool convert_now;
+
+ convert_now = flags & XFS_REFLINK_CONVERT_UNWRITTEN;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
if (!ip->i_cowfp) {
@@ -592,7 +596,7 @@ xfs_reflink_allocate_cow(
*/
if (cmap->br_startoff > imap->br_startoff)
return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
- lockmode, convert_now);
+ lockmode, flags);
/*
* CoW fork has a delalloc reservation. Replace it with a real extent.
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index cc4e92278279..18f9624017cd 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -6,6 +6,12 @@
#ifndef __XFS_REFLINK_H
#define __XFS_REFLINK_H 1
+/*
+ * Flags for xfs_reflink_allocate_cow() and callees
+ */
+/* convert unwritten extents now */
+#define XFS_REFLINK_CONVERT_UNWRITTEN (1u << 0)
+
/*
* Check whether it is safe to free COW fork blocks from an inode. It is unsafe
* to do so when an inode has dirty cache or I/O in-flight, even if no shared
@@ -32,7 +38,7 @@ int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
struct xfs_bmbt_irec *cmap, bool *shared, uint *lockmode,
- bool convert_now);
+ unsigned int flags);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 04/13] xfs: pass flags to xfs_reflink_allocate_cow()
2025-03-13 17:13 ` [PATCH v6 04/13] xfs: pass flags to xfs_reflink_allocate_cow() John Garry
@ 2025-03-17 6:15 ` Christoph Hellwig
2025-03-17 9:17 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:15 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On Thu, Mar 13, 2025 at 05:13:01PM +0000, John Garry wrote:
> @@ -823,6 +824,9 @@ xfs_direct_write_iomap_begin(
> if (xfs_is_shutdown(mp))
> return -EIO;
>
> + if (flags & IOMAP_DIRECT || IS_DAX(inode))
> + reflink_flags |= XFS_REFLINK_CONVERT_UNWRITTEN;
Given that this is where the policy is implemented now, this comment:
/*
* COW fork extents are supposed to remain unwritten until we're ready
* to initiate a disk write. For direct I/O we are going to write the
* data and need the conversion, but for buffered writes we're done.
*/
from xfs_reflink_convert_unwritten should probably move here now.
> - return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
> + return xfs_reflink_convert_unwritten(ip, imap, cmap,
> + flags & XFS_REFLINK_CONVERT_UNWRITTEN);
I'd probably thread the flags argument all the way through
xfs_reflink_convert_unwritten as that documents the intent better.
> +/*
> + * Flags for xfs_reflink_allocate_cow() and callees
> + */
And the full sentence with a .?
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 04/13] xfs: pass flags to xfs_reflink_allocate_cow()
2025-03-17 6:15 ` Christoph Hellwig
@ 2025-03-17 9:17 ` John Garry
2025-03-18 5:33 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-17 9:17 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 17/03/2025 06:15, Christoph Hellwig wrote:
> On Thu, Mar 13, 2025 at 05:13:01PM +0000, John Garry wrote:
>> @@ -823,6 +824,9 @@ xfs_direct_write_iomap_begin(
>> if (xfs_is_shutdown(mp))
>> return -EIO;
>>
>> + if (flags & IOMAP_DIRECT || IS_DAX(inode))
>> + reflink_flags |= XFS_REFLINK_CONVERT_UNWRITTEN;
>
> Given that this is where the policy is implemented now, this comment:
>
> /*
> * COW fork extents are supposed to remain unwritten until we're ready
> * to initiate a disk write. For direct I/O we are going to write the
> * data and need the conversion, but for buffered writes we're done.
> */
>
> from xfs_reflink_convert_unwritten should probably move here now.
ok, fine, I can relocate this comment to xfs_direct_write_iomap_begin(),
but please let me know if you prefer an rewording.
>
>> - return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
>> + return xfs_reflink_convert_unwritten(ip, imap, cmap,
>> + flags & XFS_REFLINK_CONVERT_UNWRITTEN);
>
> I'd probably thread the flags argument all the way through
> xfs_reflink_convert_unwritten as that documents the intent better.
ok
>
>> +/*
>> + * Flags for xfs_reflink_allocate_cow() and callees
>> + */
>
> And the full sentence with a .?
>
ok
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 04/13] xfs: pass flags to xfs_reflink_allocate_cow()
2025-03-17 9:17 ` John Garry
@ 2025-03-18 5:33 ` Christoph Hellwig
2025-03-18 8:12 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 5:33 UTC (permalink / raw)
To: John Garry
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On Mon, Mar 17, 2025 at 09:17:10AM +0000, John Garry wrote:
>>
>> Given that this is where the policy is implemented now, this comment:
>>
>> /*
>> * COW fork extents are supposed to remain unwritten until we're ready
>> * to initiate a disk write. For direct I/O we are going to write the
>> * data and need the conversion, but for buffered writes we're done.
>> */
>>
>> from xfs_reflink_convert_unwritten should probably move here now.
>
> ok, fine, I can relocate this comment to xfs_direct_write_iomap_begin(),
> but please let me know if you prefer an rewording.
I have to admit I found the wording a bit odd, but I failed to come up
with something significantly better.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 04/13] xfs: pass flags to xfs_reflink_allocate_cow()
2025-03-18 5:33 ` Christoph Hellwig
@ 2025-03-18 8:12 ` John Garry
0 siblings, 0 replies; 65+ messages in thread
From: John Garry @ 2025-03-18 8:12 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 18/03/2025 05:33, Christoph Hellwig wrote:
> On Mon, Mar 17, 2025 at 09:17:10AM +0000, John Garry wrote:
>>>
>>> Given that this is where the policy is implemented now, this comment:
>>>
>>> /*
>>> * COW fork extents are supposed to remain unwritten until we're ready
>>> * to initiate a disk write. For direct I/O we are going to write the
>>> * data and need the conversion, but for buffered writes we're done.
>>> */
>>>
>>> from xfs_reflink_convert_unwritten should probably move here now.
>>
>> ok, fine, I can relocate this comment to xfs_direct_write_iomap_begin(),
>> but please let me know if you prefer an rewording.
>
> I have to admit I found the wording a bit odd, but I failed to come up
> with something significantly better.
>
maybe when you see the code, you could have a better suggestion, but
I'll keep the wording the same verbatim for now.
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 05/13] xfs: allow block allocator to take an alignment hint
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (3 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 04/13] xfs: pass flags to xfs_reflink_allocate_cow() John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 6:16 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 06/13] xfs: switch atomic write size check in xfs_file_write_iter() John Garry
` (8 subsequent siblings)
13 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
Add a BMAPI flag to provide a hint to the block allocator to align extents
according to the extszhint.
This will be useful for atomic writes to ensure that we are not being
allocated extents which are not suitable (for atomic writes).
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/libxfs/xfs_bmap.c | 5 +++++
fs/xfs/libxfs/xfs_bmap.h | 6 +++++-
2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 63255820b58a..d954f9b8071f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3312,6 +3312,11 @@ xfs_bmap_compute_alignments(
align = xfs_get_cowextsz_hint(ap->ip);
else if (ap->datatype & XFS_ALLOC_USERDATA)
align = xfs_get_extsz_hint(ap->ip);
+
+ /* Try to align start block to any minimum allocation alignment */
+ if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN))
+ args->alignment = align;
+
if (align) {
if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0,
ap->eof, 0, ap->conv, &ap->offset,
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b4d9c6e0f3f9..d5f2729305fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -87,6 +87,9 @@ struct xfs_bmalloca {
/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */
#define XFS_BMAPI_NORMAP (1u << 10)
+/* Try to align allocations to the extent size hint */
+#define XFS_BMAPI_EXTSZALIGN (1u << 11)
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -98,7 +101,8 @@ struct xfs_bmalloca {
{ XFS_BMAPI_REMAP, "REMAP" }, \
{ XFS_BMAPI_COWFORK, "COWFORK" }, \
{ XFS_BMAPI_NODISCARD, "NODISCARD" }, \
- { XFS_BMAPI_NORMAP, "NORMAP" }
+ { XFS_BMAPI_NORMAP, "NORMAP" },\
+ { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" }
static inline int xfs_bmapi_aflag(int w)
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 05/13] xfs: allow block allocator to take an alignment hint
2025-03-13 17:13 ` [PATCH v6 05/13] xfs: allow block allocator to take an alignment hint John Garry
@ 2025-03-17 6:16 ` Christoph Hellwig
0 siblings, 0 replies; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:16 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 06/13] xfs: switch atomic write size check in xfs_file_write_iter()
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (4 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 05/13] xfs: allow block allocator to take an alignment hint John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 6:18 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 07/13] xfs: refactor xfs_reflink_end_cow_extent() John Garry
` (7 subsequent siblings)
13 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
Currently the size of atomic write allowed is fixed at the blocksize.
To start to lift this restriction, partly refactor
xfs_report_atomic_write() to into helpers -
xfs_get_atomic_write_{min, max}_attr() - and use those helpers to find the
per-inode atomic write limits and check according to that.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_file.c | 12 +++++-------
fs/xfs/xfs_iops.c | 28 +++++++++++++++++++++++-----
fs/xfs/xfs_iops.h | 2 ++
3 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index fe8cf9d96eb0..7a56ddb86fd2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1032,14 +1032,12 @@ xfs_file_write_iter(
return xfs_file_dax_write(iocb, from);
if (iocb->ki_flags & IOCB_ATOMIC) {
- /*
- * Currently only atomic writing of a single FS block is
- * supported. It would be possible to atomic write smaller than
- * a FS block, but there is no requirement to support this.
- * Note that iomap also does not support this yet.
- */
- if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ if (ocount < xfs_get_atomic_write_min_attr(ip))
return -EINVAL;
+
+ if (ocount > xfs_get_atomic_write_max_attr(ip))
+ return -EINVAL;
+
ret = generic_atomic_write_valid(iocb, from);
if (ret)
return ret;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 444193f543ef..64b1f8c73824 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -601,16 +601,34 @@ xfs_report_dioalign(
stat->dio_offset_align = stat->dio_read_offset_align;
}
+unsigned int
+xfs_get_atomic_write_min_attr(
+ struct xfs_inode *ip)
+{
+ if (!xfs_inode_can_atomicwrite(ip))
+ return 0;
+
+ return ip->i_mount->m_sb.sb_blocksize;
+}
+
+unsigned int
+xfs_get_atomic_write_max_attr(
+ struct xfs_inode *ip)
+{
+ if (!xfs_inode_can_atomicwrite(ip))
+ return 0;
+
+ return ip->i_mount->m_sb.sb_blocksize;
+}
+
static void
xfs_report_atomic_write(
struct xfs_inode *ip,
struct kstat *stat)
{
- unsigned int unit_min = 0, unit_max = 0;
-
- if (xfs_inode_can_atomicwrite(ip))
- unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize;
- generic_fill_statx_atomic_writes(stat, unit_min, unit_max);
+ generic_fill_statx_atomic_writes(stat,
+ xfs_get_atomic_write_min_attr(ip),
+ xfs_get_atomic_write_max_attr(ip));
}
STATIC int
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 3c1a2605ffd2..3ef3bb632ad8 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -19,5 +19,7 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir,
extern void xfs_setup_inode(struct xfs_inode *ip);
extern void xfs_setup_iops(struct xfs_inode *ip);
extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init);
+unsigned int xfs_get_atomic_write_min_attr(struct xfs_inode *ip);
+unsigned int xfs_get_atomic_write_max_attr(struct xfs_inode *ip);
#endif /* __XFS_IOPS_H__ */
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 06/13] xfs: switch atomic write size check in xfs_file_write_iter()
2025-03-13 17:13 ` [PATCH v6 06/13] xfs: switch atomic write size check in xfs_file_write_iter() John Garry
@ 2025-03-17 6:18 ` Christoph Hellwig
2025-03-17 9:17 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:18 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
switch seems like odd wording here (but then again I'm not a native
speaker). What about something like "refine" instead?
Otherwise looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 06/13] xfs: switch atomic write size check in xfs_file_write_iter()
2025-03-17 6:18 ` Christoph Hellwig
@ 2025-03-17 9:17 ` John Garry
0 siblings, 0 replies; 65+ messages in thread
From: John Garry @ 2025-03-17 9:17 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 17/03/2025 06:18, Christoph Hellwig wrote:
> switch seems like odd wording here (but then again I'm not a native
> speaker). What about something like "refine" instead?
Sure, I can change it
>
> Otherwise looks good:
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
Thanks
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 07/13] xfs: refactor xfs_reflink_end_cow_extent()
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (5 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 06/13] xfs: switch atomic write size check in xfs_file_write_iter() John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 6:19 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 08/13] xfs: reflink CoW-based atomic write support John Garry
` (6 subsequent siblings)
13 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
Refactor xfs_reflink_end_cow_extent() into separate parts which process
the CoW range and commit the transaction.
This refactoring will be used in future for when it is required to commit
a range of extents as a single transaction, similar to how it was done
pre-commit d6f215f359637.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_reflink.c | 73 ++++++++++++++++++++++++++------------------
1 file changed, 43 insertions(+), 30 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index f8363c6b0f39..674a812ecb4f 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -790,35 +790,19 @@ xfs_reflink_update_quota(
* requirements as low as possible.
*/
STATIC int
-xfs_reflink_end_cow_extent(
+xfs_reflink_end_cow_extent_locked(
+ struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t *offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got, del, data;
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
- unsigned int resblks;
int nmaps;
bool isrt = XFS_IS_REALTIME_INODE(ip);
int error;
- resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
- XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
-
- /*
- * Lock the inode. We have to ijoin without automatic unlock because
- * the lead transaction is the refcountbt record deletion; the data
- * fork update follows as a deferred log item.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, 0);
-
/*
* In case of racing, overlapping AIO writes no COW extents might be
* left by the time I/O completes for the loser of the race. In that
@@ -827,7 +811,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
- goto out_cancel;
+ return 0;
}
/*
@@ -841,7 +825,7 @@ xfs_reflink_end_cow_extent(
if (!xfs_iext_next_extent(ifp, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
- goto out_cancel;
+ return 0;
}
}
del = got;
@@ -850,14 +834,14 @@ xfs_reflink_end_cow_extent(
error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
XFS_IEXT_REFLINK_END_COW_CNT);
if (error)
- goto out_cancel;
+ return error;
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
&nmaps, 0);
if (error)
- goto out_cancel;
+ return error;
/* We can only remap the smaller of the two extent sizes. */
data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
@@ -886,7 +870,7 @@ xfs_reflink_end_cow_extent(
error = xfs_bunmapi(NULL, ip, data.br_startoff,
data.br_blockcount, 0, 1, &done);
if (error)
- goto out_cancel;
+ return error;
ASSERT(done);
}
@@ -903,17 +887,46 @@ xfs_reflink_end_cow_extent(
/* Remove the mapping from the CoW fork. */
xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
- error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- return error;
-
/* Update the caller about how much progress we made. */
*offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
+}
-out_cancel:
- xfs_trans_cancel(tp);
+
+/*
+ * Remap part of the CoW fork into the data fork.
+ *
+ * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
+ * into the data fork; this function will remap what it can (at the end of the
+ * range) and update @end_fsb appropriately. Each remap gets its own
+ * transaction because we can end up merging and splitting bmbt blocks for
+ * every remap operation and we'd like to keep the block reservation
+ * requirements as low as possible.
+ */
+STATIC int
+xfs_reflink_end_cow_extent(
+ struct xfs_inode *ip,
+ xfs_fileoff_t *offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+ int error;
+
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb);
+ if (error)
+ xfs_trans_cancel(tp);
+ else
+ error = xfs_trans_commit(tp);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 07/13] xfs: refactor xfs_reflink_end_cow_extent()
2025-03-13 17:13 ` [PATCH v6 07/13] xfs: refactor xfs_reflink_end_cow_extent() John Garry
@ 2025-03-17 6:19 ` Christoph Hellwig
0 siblings, 0 replies; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:19 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 08/13] xfs: reflink CoW-based atomic write support
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (6 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 07/13] xfs: refactor xfs_reflink_end_cow_extent() John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 6:20 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 09/13] xfs: add XFS_REFLINK_ALLOC_EXTSZALIGN John Garry
` (5 subsequent siblings)
13 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
Add reflink support for CoW-based atomic writes.
A new flag - XFS_REFLINK_FORCE_COW - is added to indicate that a
COW fork extent mapping must be returned from xfs_reflink_allocate_cow().
For atomic writes, the idea is that first a CoW fork staging extent is
allocated and then the data is written to the new extent before finally
atomically the mapping is updated.
The semantics are that if XFS_REFLINK_FORCE_COW is set, we will be passed
a CoW fork extent mapping for no error returned.
If XFS_REFLINK_FORCE_COW is set and we find a real extent in the COW fork,
then continue to return that directly, as this would this belong to either
a. the same CoW fork extent which the atomic write previously allocated.
b. a pre-existing real cow extent which is unwritten
A atomic write cow fork extent should not be shared with other inodes,
and will only exist for the lifetime of the atomic write.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_reflink.c | 18 ++++++++++++++++--
fs/xfs/xfs_reflink.h | 2 ++
2 files changed, 18 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 674a812ecb4f..690b1eefeb0e 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -466,7 +466,7 @@ xfs_reflink_fill_cow_hole(
*lockmode = XFS_ILOCK_EXCL;
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
- if (error || !*shared)
+ if (error || (!*shared && !(flags & XFS_REFLINK_FORCE_COW)))
goto out_trans_cancel;
if (found) {
@@ -582,9 +582,23 @@ xfs_reflink_allocate_cow(
}
error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
- if (error || !*shared)
+ if (error)
return error;
+ /*
+ * For no shared data extent, return only as long as
+ * XFS_REFLINK_FORCE_COW is not set.
+ *
+ * For XFS_REFLINK_FORCE_COW set, we always return a COW fork extent
+ * mapping. That would be from either a previously allocated unwritten
+ * COW fork extent, or else a new COW fork extent needs to be
+ * allocated. A previously allocated unwritten COW fork extent could be
+ * from an earlier call with XFS_REFLINK_FORCE_COW set or from a
+ * earlier normal unshare of a data extent.
+ */
+ if (!*shared && !(flags & XFS_REFLINK_FORCE_COW))
+ return 0;
+
/* CoW fork has a real extent */
if (found)
return xfs_reflink_convert_unwritten(ip, imap, cmap,
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 18f9624017cd..f4115836064b 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -11,6 +11,8 @@
*/
/* convert unwritten extents now */
#define XFS_REFLINK_CONVERT_UNWRITTEN (1u << 0)
+/* force a new COW mapping to be allocated */
+#define XFS_REFLINK_FORCE_COW (1u << 1)
/*
* Check whether it is safe to free COW fork blocks from an inode. It is unsafe
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 08/13] xfs: reflink CoW-based atomic write support
2025-03-13 17:13 ` [PATCH v6 08/13] xfs: reflink CoW-based atomic write support John Garry
@ 2025-03-17 6:20 ` Christoph Hellwig
0 siblings, 0 replies; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:20 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
Looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 09/13] xfs: add XFS_REFLINK_ALLOC_EXTSZALIGN
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (7 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 08/13] xfs: reflink CoW-based atomic write support John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-13 18:03 ` Darrick J. Wong
2025-03-17 6:23 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 10/13] xfs: iomap COW-based atomic write support John Garry
` (4 subsequent siblings)
13 siblings, 2 replies; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
Add a flag for the xfs_reflink_allocate_cow() API to allow the caller
indirectly set XFS_BMAPI_EXTSZALIGN.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_reflink.c | 8 ++++++--
fs/xfs/xfs_reflink.h | 2 ++
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 690b1eefeb0e..9a419af89949 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -444,6 +444,11 @@ xfs_reflink_fill_cow_hole(
int nimaps;
int error;
bool found;
+ uint32_t bmapi_flags = XFS_BMAPI_COWFORK |
+ XFS_BMAPI_PREALLOC;
+
+ if (flags & XFS_REFLINK_ALLOC_EXTSZALIGN)
+ bmapi_flags |= XFS_BMAPI_EXTSZALIGN;
resaligned = xfs_aligned_fsb_count(imap->br_startoff,
imap->br_blockcount, xfs_get_cowextsz_hint(ip));
@@ -477,8 +482,7 @@ xfs_reflink_fill_cow_hole(
/* Allocate the entire reservation as unwritten blocks. */
nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
- XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
- &nimaps);
+ bmapi_flags, 0, cmap, &nimaps);
if (error)
goto out_trans_cancel;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index f4115836064b..0ab1857074e5 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -13,6 +13,8 @@
#define XFS_REFLINK_CONVERT_UNWRITTEN (1u << 0)
/* force a new COW mapping to be allocated */
#define XFS_REFLINK_FORCE_COW (1u << 1)
+/* request block allocations aligned to extszhint */
+#define XFS_REFLINK_ALLOC_EXTSZALIGN (1u << 2)
/*
* Check whether it is safe to free COW fork blocks from an inode. It is unsafe
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 09/13] xfs: add XFS_REFLINK_ALLOC_EXTSZALIGN
2025-03-13 17:13 ` [PATCH v6 09/13] xfs: add XFS_REFLINK_ALLOC_EXTSZALIGN John Garry
@ 2025-03-13 18:03 ` Darrick J. Wong
2025-03-17 6:23 ` Christoph Hellwig
1 sibling, 0 replies; 65+ messages in thread
From: Darrick J. Wong @ 2025-03-13 18:03 UTC (permalink / raw)
To: John Garry
Cc: brauner, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On Thu, Mar 13, 2025 at 05:13:06PM +0000, John Garry wrote:
> Add a flag for the xfs_reflink_allocate_cow() API to allow the caller
> indirectly set XFS_BMAPI_EXTSZALIGN.
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
Looks pretty straightforward to me...
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/xfs/xfs_reflink.c | 8 ++++++--
> fs/xfs/xfs_reflink.h | 2 ++
> 2 files changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index 690b1eefeb0e..9a419af89949 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -444,6 +444,11 @@ xfs_reflink_fill_cow_hole(
> int nimaps;
> int error;
> bool found;
> + uint32_t bmapi_flags = XFS_BMAPI_COWFORK |
> + XFS_BMAPI_PREALLOC;
> +
> + if (flags & XFS_REFLINK_ALLOC_EXTSZALIGN)
> + bmapi_flags |= XFS_BMAPI_EXTSZALIGN;
>
> resaligned = xfs_aligned_fsb_count(imap->br_startoff,
> imap->br_blockcount, xfs_get_cowextsz_hint(ip));
> @@ -477,8 +482,7 @@ xfs_reflink_fill_cow_hole(
> /* Allocate the entire reservation as unwritten blocks. */
> nimaps = 1;
> error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
> - XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
> - &nimaps);
> + bmapi_flags, 0, cmap, &nimaps);
> if (error)
> goto out_trans_cancel;
>
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index f4115836064b..0ab1857074e5 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -13,6 +13,8 @@
> #define XFS_REFLINK_CONVERT_UNWRITTEN (1u << 0)
> /* force a new COW mapping to be allocated */
> #define XFS_REFLINK_FORCE_COW (1u << 1)
> +/* request block allocations aligned to extszhint */
> +#define XFS_REFLINK_ALLOC_EXTSZALIGN (1u << 2)
>
> /*
> * Check whether it is safe to free COW fork blocks from an inode. It is unsafe
> --
> 2.31.1
>
>
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 09/13] xfs: add XFS_REFLINK_ALLOC_EXTSZALIGN
2025-03-13 17:13 ` [PATCH v6 09/13] xfs: add XFS_REFLINK_ALLOC_EXTSZALIGN John Garry
2025-03-13 18:03 ` Darrick J. Wong
@ 2025-03-17 6:23 ` Christoph Hellwig
1 sibling, 0 replies; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:23 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On Thu, Mar 13, 2025 at 05:13:06PM +0000, John Garry wrote:
> + if (flags & XFS_REFLINK_ALLOC_EXTSZALIGN)
This line has an extra whitespace.
Otherwise looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (8 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 09/13] xfs: add XFS_REFLINK_ALLOC_EXTSZALIGN John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-16 6:53 ` Ritesh Harjani
2025-03-17 7:26 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic() John Garry
` (3 subsequent siblings)
13 siblings, 2 replies; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
In cases of an atomic write covering misaligned or discontiguous disk
blocks, we will use a CoW-based method to issue the atomic write.
So, for that case, return -EAGAIN to request that the write be issued in
CoW atomic write mode. The dio write path should detect this, similar to
how misaligned regular DIO writes are handled.
For normal REQ_ATOMIC-based mode, when the range which we are atomic
writing to covers a shared data extent, try to allocate a new CoW fork.
However, if we find that what we allocated does not meet atomic write
requirements in terms of length and alignment, then fallback on the
CoW-based mode for the atomic write.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_iomap.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-
fs/xfs/xfs_iomap.h | 1 +
2 files changed, 130 insertions(+), 2 deletions(-)
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 8196e66b099b..88d86cabb8a1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -798,6 +798,23 @@ imap_spans_range(
return true;
}
+static bool
+xfs_bmap_valid_for_atomic_write(
+ struct xfs_bmbt_irec *imap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ /* Misaligned start block wrt size */
+ if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
+ return false;
+
+ /* Discontiguous extents */
+ if (!imap_spans_range(imap, offset_fsb, end_fsb))
+ return false;
+
+ return true;
+}
+
static int
xfs_direct_write_iomap_begin(
struct inode *inode,
@@ -812,10 +829,12 @@ xfs_direct_write_iomap_begin(
struct xfs_bmbt_irec imap, cmap;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ xfs_fileoff_t orig_end_fsb = end_fsb;
int nimaps = 1, error = 0;
unsigned int reflink_flags = 0;
bool shared = false;
u16 iomap_flags = 0;
+ bool needs_alloc;
unsigned int lockmode;
u64 seq;
@@ -877,13 +896,44 @@ xfs_direct_write_iomap_begin(
&lockmode, reflink_flags);
if (error)
goto out_unlock;
- if (shared)
+ if (shared) {
+ /*
+ * Since we got a CoW fork extent mapping, ensure that
+ * the mapping is actually suitable for an
+ * REQ_ATOMIC-based atomic write, i.e. properly aligned
+ * and covers the full range of the write. Otherwise,
+ * we need to use the COW-based atomic write mode.
+ */
+ if ((flags & IOMAP_ATOMIC) &&
+ !xfs_bmap_valid_for_atomic_write(&cmap,
+ offset_fsb, end_fsb)) {
+ error = -EAGAIN;
+ goto out_unlock;
+ }
goto out_found_cow;
+ }
end_fsb = imap.br_startoff + imap.br_blockcount;
length = XFS_FSB_TO_B(mp, end_fsb) - offset;
}
- if (imap_needs_alloc(inode, flags, &imap, nimaps))
+ needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
+
+ if (flags & IOMAP_ATOMIC) {
+ error = -EAGAIN;
+ /*
+ * If we allocate less than what is required for the write
+ * then we may end up with multiple mappings, which means that
+ * REQ_ATOMIC-based cannot be used, so avoid this possibility.
+ */
+ if (needs_alloc && orig_end_fsb - offset_fsb > 1)
+ goto out_unlock;
+
+ if (!xfs_bmap_valid_for_atomic_write(&imap, offset_fsb,
+ orig_end_fsb))
+ goto out_unlock;
+ }
+
+ if (needs_alloc)
goto allocate_blocks;
/*
@@ -1024,6 +1074,83 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
};
#endif /* CONFIG_XFS_RT */
+static int
+xfs_atomic_write_cow_iomap_begin(
+ struct inode *inode,
+ loff_t offset,
+ loff_t length,
+ unsigned flags,
+ struct iomap *iomap,
+ struct iomap *srcmap)
+{
+ ASSERT(flags & IOMAP_WRITE);
+ ASSERT(flags & IOMAP_DIRECT);
+
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_bmbt_irec imap, cmap;
+ xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
+ int nimaps = 1, error;
+ bool shared = false;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
+ u64 seq;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+
+ if (!xfs_has_reflink(mp))
+ return -EINVAL;
+
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, 0);
+ if (error)
+ goto out_unlock;
+
+ /*
+ * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
+ * according to extszhint, such that there will be a greater chance
+ * that future atomic writes to that same range will be aligned (and
+ * don't require this COW-based method).
+ */
+ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
+ &lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
+ XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
+ /*
+ * Don't check @shared. For atomic writes, we should error when
+ * we don't get a COW fork extent mapping.
+ */
+ if (error)
+ goto out_unlock;
+
+ end_fsb = imap.br_startoff + imap.br_blockcount;
+
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ if (imap.br_startblock != HOLESTARTBLOCK) {
+ seq = xfs_iomap_inode_sequence(ip, 0);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+ if (error)
+ goto out_unlock;
+ }
+ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ xfs_iunlock(ip, lockmode);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+
+out_unlock:
+ if (lockmode)
+ xfs_iunlock(ip, lockmode);
+ return error;
+}
+
+const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
+ .iomap_begin = xfs_atomic_write_cow_iomap_begin,
+};
+
static int
xfs_dax_write_iomap_end(
struct inode *inode,
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index d330c4a581b1..674f8ac1b9bd 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops;
extern const struct iomap_ops xfs_seek_iomap_ops;
extern const struct iomap_ops xfs_xattr_iomap_ops;
extern const struct iomap_ops xfs_dax_write_iomap_ops;
+extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
#endif /* __XFS_IOMAP_H__*/
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-13 17:13 ` [PATCH v6 10/13] xfs: iomap COW-based atomic write support John Garry
@ 2025-03-16 6:53 ` Ritesh Harjani
2025-03-17 8:54 ` John Garry
2025-03-17 7:26 ` Christoph Hellwig
1 sibling, 1 reply; 65+ messages in thread
From: Ritesh Harjani @ 2025-03-16 6:53 UTC (permalink / raw)
To: John Garry, brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4, John Garry
Hello,
John Garry <john.g.garry@oracle.com> writes:
> In cases of an atomic write covering misaligned or discontiguous disk
> blocks, we will use a CoW-based method to issue the atomic write.
Looks like the 1st time write to a given logical range of a file (e.g an
append write or writes on a hole), will also result into CoW based
fallback method, right?. More on that ask below. The commit msg should
capture that as well IMO.
>
> So, for that case, return -EAGAIN to request that the write be issued in
> CoW atomic write mode. The dio write path should detect this, similar to
> how misaligned regular DIO writes are handled.
>
> For normal REQ_ATOMIC-based mode, when the range which we are atomic
> writing to covers a shared data extent, try to allocate a new CoW fork.
> However, if we find that what we allocated does not meet atomic write
> requirements in terms of length and alignment, then fallback on the
> CoW-based mode for the atomic write.
>
> Reviewed-by: Darrick J. Wong <djwong@kernel.org>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> fs/xfs/xfs_iomap.c | 131 ++++++++++++++++++++++++++++++++++++++++++++-
> fs/xfs/xfs_iomap.h | 1 +
> 2 files changed, 130 insertions(+), 2 deletions(-)
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 8196e66b099b..88d86cabb8a1 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -798,6 +798,23 @@ imap_spans_range(
> return true;
> }
>
> +static bool
> +xfs_bmap_valid_for_atomic_write(
> + struct xfs_bmbt_irec *imap,
> + xfs_fileoff_t offset_fsb,
> + xfs_fileoff_t end_fsb)
> +{
> + /* Misaligned start block wrt size */
> + if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
> + return false;
> +
> + /* Discontiguous extents */
> + if (!imap_spans_range(imap, offset_fsb, end_fsb))
> + return false;
> +
> + return true;
> +}
> +
> static int
> xfs_direct_write_iomap_begin(
> struct inode *inode,
> @@ -812,10 +829,12 @@ xfs_direct_write_iomap_begin(
> struct xfs_bmbt_irec imap, cmap;
> xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
> xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
> + xfs_fileoff_t orig_end_fsb = end_fsb;
> int nimaps = 1, error = 0;
> unsigned int reflink_flags = 0;
> bool shared = false;
> u16 iomap_flags = 0;
> + bool needs_alloc;
> unsigned int lockmode;
> u64 seq;
>
> @@ -877,13 +896,44 @@ xfs_direct_write_iomap_begin(
> &lockmode, reflink_flags);
> if (error)
> goto out_unlock;
> - if (shared)
> + if (shared) {
> + /*
> + * Since we got a CoW fork extent mapping, ensure that
> + * the mapping is actually suitable for an
> + * REQ_ATOMIC-based atomic write, i.e. properly aligned
> + * and covers the full range of the write. Otherwise,
> + * we need to use the COW-based atomic write mode.
> + */
> + if ((flags & IOMAP_ATOMIC) &&
> + !xfs_bmap_valid_for_atomic_write(&cmap,
> + offset_fsb, end_fsb)) {
> + error = -EAGAIN;
> + goto out_unlock;
> + }
> goto out_found_cow;
> + }
> end_fsb = imap.br_startoff + imap.br_blockcount;
> length = XFS_FSB_TO_B(mp, end_fsb) - offset;
> }
>
> - if (imap_needs_alloc(inode, flags, &imap, nimaps))
> + needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
> +
> + if (flags & IOMAP_ATOMIC) {
> + error = -EAGAIN;
> + /*
> + * If we allocate less than what is required for the write
> + * then we may end up with multiple mappings, which means that
> + * REQ_ATOMIC-based cannot be used, so avoid this possibility.
> + */
> + if (needs_alloc && orig_end_fsb - offset_fsb > 1)
> + goto out_unlock;
I have a quick question here. Based on above check it looks like
allocation requests on a hole or the 1st time allocation (append writes)
for a given logical range will always be done using CoW fallback
mechanism, isn't it? So that means HW based multi-fsblock atomic write
request will only happen for over writes (non-discontigous extent),
correct?
Now, it's not always necessary that if we try to allocate an extent for
the given range, it results into discontiguous extents. e.g. say, if the
entire range being written to is a hole or append writes, then it might
just allocate a single unwritten extent which is valid for doing an
atomic write using HW/BIOs right?
And it is valid to write using unwritten extent as long as we don't have
mixed mappings i.e. the entire range should either be unwritten or
written for the atomic write to be untorned, correct?
I am guessing this is kept intentional?
-ritesh
> +
> + if (!xfs_bmap_valid_for_atomic_write(&imap, offset_fsb,
> + orig_end_fsb))
> + goto out_unlock;
> + }
> +
> + if (needs_alloc)
> goto allocate_blocks;
>
> /*
> @@ -1024,6 +1074,83 @@ const struct iomap_ops xfs_zoned_direct_write_iomap_ops = {
> };
> #endif /* CONFIG_XFS_RT */
>
> +static int
> +xfs_atomic_write_cow_iomap_begin(
> + struct inode *inode,
> + loff_t offset,
> + loff_t length,
> + unsigned flags,
> + struct iomap *iomap,
> + struct iomap *srcmap)
> +{
> + ASSERT(flags & IOMAP_WRITE);
> + ASSERT(flags & IOMAP_DIRECT);
> +
> + struct xfs_inode *ip = XFS_I(inode);
> + struct xfs_mount *mp = ip->i_mount;
> + struct xfs_bmbt_irec imap, cmap;
> + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
> + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
> + int nimaps = 1, error;
> + bool shared = false;
> + unsigned int lockmode = XFS_ILOCK_EXCL;
> + u64 seq;
> +
> + if (xfs_is_shutdown(mp))
> + return -EIO;
> +
> + if (!xfs_has_reflink(mp))
> + return -EINVAL;
> +
> + error = xfs_ilock_for_iomap(ip, flags, &lockmode);
> + if (error)
> + return error;
> +
> + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> + &nimaps, 0);
> + if (error)
> + goto out_unlock;
> +
> + /*
> + * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
> + * according to extszhint, such that there will be a greater chance
> + * that future atomic writes to that same range will be aligned (and
> + * don't require this COW-based method).
> + */
> + error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
> + &lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
> + XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
> + /*
> + * Don't check @shared. For atomic writes, we should error when
> + * we don't get a COW fork extent mapping.
> + */
> + if (error)
> + goto out_unlock;
> +
> + end_fsb = imap.br_startoff + imap.br_blockcount;
> +
> + length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
> + trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
> + if (imap.br_startblock != HOLESTARTBLOCK) {
> + seq = xfs_iomap_inode_sequence(ip, 0);
> + error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
> + if (error)
> + goto out_unlock;
> + }
> + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
> + xfs_iunlock(ip, lockmode);
> + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
> +
> +out_unlock:
> + if (lockmode)
> + xfs_iunlock(ip, lockmode);
> + return error;
> +}
> +
> +const struct iomap_ops xfs_atomic_write_cow_iomap_ops = {
> + .iomap_begin = xfs_atomic_write_cow_iomap_begin,
> +};
> +
> static int
> xfs_dax_write_iomap_end(
> struct inode *inode,
> diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
> index d330c4a581b1..674f8ac1b9bd 100644
> --- a/fs/xfs/xfs_iomap.h
> +++ b/fs/xfs/xfs_iomap.h
> @@ -56,5 +56,6 @@ extern const struct iomap_ops xfs_read_iomap_ops;
> extern const struct iomap_ops xfs_seek_iomap_ops;
> extern const struct iomap_ops xfs_xattr_iomap_ops;
> extern const struct iomap_ops xfs_dax_write_iomap_ops;
> +extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops;
>
> #endif /* __XFS_IOMAP_H__*/
> --
> 2.31.1
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-16 6:53 ` Ritesh Harjani
@ 2025-03-17 8:54 ` John Garry
2025-03-17 14:20 ` Ritesh Harjani
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-17 8:54 UTC (permalink / raw)
To: Ritesh Harjani (IBM), brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4
>> + }
>> end_fsb = imap.br_startoff + imap.br_blockcount;
>> length = XFS_FSB_TO_B(mp, end_fsb) - offset;
>> }
>>
>> - if (imap_needs_alloc(inode, flags, &imap, nimaps))
>> + needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
>> +
>> + if (flags & IOMAP_ATOMIC) {
>> + error = -EAGAIN;
>> + /*
>> + * If we allocate less than what is required for the write
>> + * then we may end up with multiple mappings, which means that
>> + * REQ_ATOMIC-based cannot be used, so avoid this possibility.
>> + */
>> + if (needs_alloc && orig_end_fsb - offset_fsb > 1)
>> + goto out_unlock;
>
> I have a quick question here. Based on above check it looks like
> allocation requests on a hole or the 1st time allocation (append writes)
> for a given logical range will always be done using CoW fallback
> mechanism, isn't it?
Right, but...
> So that means HW based multi-fsblock atomic write
> request will only happen for over writes (non-discontigous extent),
> correct?
For an unwritten pre-allocated extent, we can use the REQ_ATOMIC method.
fallocate (without ZERO RANGE) would give a pre-allocated unwritten
extent, and a write there would not technically be an overwrite.
>
> Now, it's not always necessary that if we try to allocate an extent for
> the given range, it results into discontiguous extents. e.g. say, if the
> entire range being written to is a hole or append writes, then it might
> just allocate a single unwritten extent which is valid for doing an
> atomic write using HW/BIOs right?
Right
> And it is valid to write using unwritten extent as long as we don't have
> mixed mappings i.e. the entire range should either be unwritten or
> written for the atomic write to be untorned, correct?
>
We can't write to discontiguous extents, and a mixed mapping would mean
discontiguous extents.
And, as mentioned earlier, it is ok to use REQ_ATOMIC method on an
unwritten extent.
> I am guessing this is kept intentional?
>
Yes
Thanks,
John
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-17 8:54 ` John Garry
@ 2025-03-17 14:20 ` Ritesh Harjani
2025-03-17 14:56 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Ritesh Harjani @ 2025-03-17 14:20 UTC (permalink / raw)
To: John Garry, brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4
John Garry <john.g.garry@oracle.com> writes:
>>> + }
>>> end_fsb = imap.br_startoff + imap.br_blockcount;
>>> length = XFS_FSB_TO_B(mp, end_fsb) - offset;
>>> }
>>>
>>> - if (imap_needs_alloc(inode, flags, &imap, nimaps))
>>> + needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps);
>>> +
>>> + if (flags & IOMAP_ATOMIC) {
>>> + error = -EAGAIN;
>>> + /*
>>> + * If we allocate less than what is required for the write
>>> + * then we may end up with multiple mappings, which means that
>>> + * REQ_ATOMIC-based cannot be used, so avoid this possibility.
>>> + */
>>> + if (needs_alloc && orig_end_fsb - offset_fsb > 1)
>>> + goto out_unlock;
>>
>> I have a quick question here. Based on above check it looks like
>> allocation requests on a hole or the 1st time allocation (append writes)
>> for a given logical range will always be done using CoW fallback
>> mechanism, isn't it?
>
> Right, but...
>
>
>> So that means HW based multi-fsblock atomic write
>> request will only happen for over writes (non-discontigous extent),
>> correct?
>
> For an unwritten pre-allocated extent, we can use the REQ_ATOMIC method.
>
> fallocate (without ZERO RANGE) would give a pre-allocated unwritten
> extent, and a write there would not technically be an overwrite.
>
>>
>> Now, it's not always necessary that if we try to allocate an extent for
>> the given range, it results into discontiguous extents. e.g. say, if the
>> entire range being written to is a hole or append writes, then it might
>> just allocate a single unwritten extent which is valid for doing an
>> atomic write using HW/BIOs right?
>
> Right
>
>> And it is valid to write using unwritten extent as long as we don't have
>> mixed mappings i.e. the entire range should either be unwritten or
>> written for the atomic write to be untorned, correct?
>>
>
> We can't write to discontiguous extents, and a mixed mapping would mean
> discontiguous extents.
>
> And, as mentioned earlier, it is ok to use REQ_ATOMIC method on an
> unwritten extent.
>
>> I am guessing this is kept intentional?
>>
> Yes
Thanks, John for addressing the queries. It would be helpful to include
this information in the commit message as well then right? Otherwise
IMO, the original commit message looks incomplete.
Maybe we can add this too?
=========================
This patch adds CoW based atomic write support which will be used as a
SW fallback in following scenarios:
- All append write scenarios.
- Any new writes on the region containing holes.
- Writes to any misaligned regions
- Writes to discontiguous extents.
<original commit msg snip>
=========================
In cases of an atomic write covering misaligned or discontiguous disk
blocks, we will use a CoW-based method to issue the atomic write.
-ritesh
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-17 14:20 ` Ritesh Harjani
@ 2025-03-17 14:56 ` John Garry
2025-03-18 5:35 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-17 14:56 UTC (permalink / raw)
To: Ritesh Harjani (IBM), brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4
On 17/03/2025 14:20, Ritesh Harjani (IBM) wrote:
>> And, as mentioned earlier, it is ok to use REQ_ATOMIC method on an
>> unwritten extent.
>>
>>> I am guessing this is kept intentional?
>>>
>> Yes
> Thanks, John for addressing the queries. It would be helpful to include
> this information in the commit message as well then right? Otherwise
> IMO, the original commit message looks incomplete.
ok, fine. I am just worried that these commit messages become too wordy.
But, if people want this info, then I can provide it.
>
> Maybe we can add this too?
> =========================
> This patch adds CoW based atomic write support which will be used as a
> SW fallback in following scenarios:
>
> - All append write scenarios.
> - Any new writes on the region containing holes.
but only for > 1x block.
if we must be able to alloc at least a block :)
> - Writes to any misaligned regions
> - Writes to discontiguous extents.
ok, fine
>
>
> <original commit msg snip>
> =========================
> In cases of an atomic write covering misaligned or discontiguous disk
> blocks, we will use a CoW-based method to issue the atomic write.
sure
Thanks,
John
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-17 14:56 ` John Garry
@ 2025-03-18 5:35 ` Christoph Hellwig
0 siblings, 0 replies; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 5:35 UTC (permalink / raw)
To: John Garry
Cc: Ritesh Harjani (IBM), brauner, djwong, cem, dchinner, hch,
linux-xfs, linux-fsdevel, linux-kernel, ojaswin, martin.petersen,
tytso, linux-ext4
On Mon, Mar 17, 2025 at 02:56:52PM +0000, John Garry wrote:
> ok, fine. I am just worried that these commit messages become too wordy.
> But, if people want this info, then I can provide it.
While too wordy commit messages do exit, you're really far from that
threshold. So yes, please explain this.
I think we also really need a document outside of commit logs and
comments that explains the exact atomic write semantics and how
we implement them between the hardware offload and always COW writes.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-13 17:13 ` [PATCH v6 10/13] xfs: iomap COW-based atomic write support John Garry
2025-03-16 6:53 ` Ritesh Harjani
@ 2025-03-17 7:26 ` Christoph Hellwig
2025-03-17 10:18 ` John Garry
1 sibling, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 7:26 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
> +static bool
> +xfs_bmap_valid_for_atomic_write(
This is misnamed. It checks if the hardware offload an be used.
> + /* Misaligned start block wrt size */
> + if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
> + return false;
It is very obvious that this checks for a natural alignment of the
block number. But it fails to explain why it checks for that.
> +
> + /* Discontiguous extents */
> + if (!imap_spans_range(imap, offset_fsb, end_fsb))
Same here.
> + if (shared) {
> + /*
> + * Since we got a CoW fork extent mapping, ensure that
> + * the mapping is actually suitable for an
> + * REQ_ATOMIC-based atomic write, i.e. properly aligned
> + * and covers the full range of the write. Otherwise,
> + * we need to use the COW-based atomic write mode.
> + */
> + if ((flags & IOMAP_ATOMIC) &&
> + !xfs_bmap_valid_for_atomic_write(&cmap,
The "Since.." implies there is something special about COW fork mappings.
But I don't think there is, or am I missing something?
If xfs_bmap_valid_for_atomic_write was properly named and documented
this comment should probably just go away.
> +static int
> +xfs_atomic_write_cow_iomap_begin(
Should the atomic and cow be together for coherent naming?
But even if the naming is coherent it isn't really
self-explanatory, so please add a little top of the function
comment introducing it.
> + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> + &nimaps, 0);
> + if (error)
> + goto out_unlock;
Why does this need to read the existing data for mapping? You'll
overwrite everything through the COW fork anyway.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-17 7:26 ` Christoph Hellwig
@ 2025-03-17 10:18 ` John Garry
2025-03-18 5:39 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-17 10:18 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 17/03/2025 07:26, Christoph Hellwig wrote:
>> +static bool
>> +xfs_bmap_valid_for_atomic_write(
>
> This is misnamed. It checks if the hardware offload an be used.
ok, so maybe:
xfs_bmap_atomic_write_hw_possible()?
>
>> + /* Misaligned start block wrt size */
>> + if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount))
>> + return false;
>
> It is very obvious that this checks for a natural alignment of the
> block number. But it fails to explain why it checks for that.
Fine, so it will be something like "atomic writes are required to be
naturally aligned for disk blocks, which is a block layer rule to ensure
that we won't straddle any boundary or violate write alignment requirement".
>
>> +
>> + /* Discontiguous extents */
>> + if (!imap_spans_range(imap, offset_fsb, end_fsb))
>
> Same here.
>
>> + if (shared) {
>> + /*
>> + * Since we got a CoW fork extent mapping, ensure that
>> + * the mapping is actually suitable for an
>> + * REQ_ATOMIC-based atomic write, i.e. properly aligned
>> + * and covers the full range of the write. Otherwise,
>> + * we need to use the COW-based atomic write mode.
>> + */
>> + if ((flags & IOMAP_ATOMIC) &&
>> + !xfs_bmap_valid_for_atomic_write(&cmap,
>
> The "Since.." implies there is something special about COW fork mappings.
> But I don't think there is, or am I missing something?
nothing special
> If xfs_bmap_valid_for_atomic_write was properly named and documented
> this comment should probably just go away.
sure
>
>> +static int
>> +xfs_atomic_write_cow_iomap_begin(
>
> Should the atomic and cow be together for coherent naming?
> But even if the naming is coherent it isn't really
> self-explanatory, so please add a little top of the function
> comment introducing it.
I can add a comment, but please let me know of any name suggestion
>
>> + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>> + &nimaps, 0);
>> + if (error)
>> + goto out_unlock;
>
> Why does this need to read the existing data for mapping? You'll
> overwrite everything through the COW fork anyway.
>
We next call xfs_reflink_allocate_cow(), which uses the imap as the
basis to carry the offset and count.
Are you hinting at statically declaring imap, like:
struct xfs_bmbt_irec imap = {
.br_startoff = offset_fsb,
.br_startblock = HOLESTARTBLOCK, //?
.br_blockcount = end_fsb - offset_fsb,
.br_state = XFS_EXT_UNWRITTEN,
};
Note I am not sure what problems which could be encountered in such an
approach.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-17 10:18 ` John Garry
@ 2025-03-18 5:39 ` Christoph Hellwig
2025-03-18 8:22 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 5:39 UTC (permalink / raw)
To: John Garry
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, hch, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On Mon, Mar 17, 2025 at 10:18:58AM +0000, John Garry wrote:
> On 17/03/2025 07:26, Christoph Hellwig wrote:
>>> +static bool
>>> +xfs_bmap_valid_for_atomic_write(
>>
>> This is misnamed. It checks if the hardware offload an be used.
>
> ok, so maybe:
>
> xfs_bmap_atomic_write_hw_possible()?
That does sound better.
> Fine, so it will be something like "atomic writes are required to be
> naturally aligned for disk blocks, which is a block layer rule to ensure
> that we won't straddle any boundary or violate write alignment
> requirement".
Much better! Maybe spell out the actual block layer rule, though?
>>
>> Should the atomic and cow be together for coherent naming?
>> But even if the naming is coherent it isn't really
>> self-explanatory, so please add a little top of the function
>> comment introducing it.
>
> I can add a comment, but please let me know of any name suggestion
/*
* Handler for atomic writes implemented by writing out of place through
* the COW fork. If possible we try to use hardware provided atomicy
* instead, which is handled directly in xfs_direct_write_iomap_begin.
*/
>
>>
>>> + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>>> + &nimaps, 0);
>>> + if (error)
>>> + goto out_unlock;
>>
>> Why does this need to read the existing data for mapping? You'll
>> overwrite everything through the COW fork anyway.
>>
>
> We next call xfs_reflink_allocate_cow(), which uses the imap as the basis
> to carry the offset and count.
Is xfs_reflink_allocate_cow even the right helper to use? We know we
absolutely want a a COW fork extent, we know there can't be delalloc
extent to convert as we flushed dirty data, so most of the logic in it
is pointless.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-18 5:39 ` Christoph Hellwig
@ 2025-03-18 8:22 ` John Garry
2025-03-18 8:32 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-18 8:22 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On 18/03/2025 05:39, Christoph Hellwig wrote:
> On Mon, Mar 17, 2025 at 10:18:58AM +0000, John Garry wrote:
>> On 17/03/2025 07:26, Christoph Hellwig wrote:
>>>> +static bool
>>>> +xfs_bmap_valid_for_atomic_write(
>>>
>>> This is misnamed. It checks if the hardware offload an be used.
>>
>> ok, so maybe:
>>
>> xfs_bmap_atomic_write_hw_possible()?
>
> That does sound better.
>
>> Fine, so it will be something like "atomic writes are required to be
>> naturally aligned for disk blocks, which is a block layer rule to ensure
>> that we won't straddle any boundary or violate write alignment
>> requirement".
>
> Much better!
good
> Maybe spell out the actual block layer rule, though?
ok, fine, so that will be something like "writes need to be naturally
aligned to ensure that we don't straddle any bdev atomic boundary".
>
>>>
>>> Should the atomic and cow be together for coherent naming?
>>> But even if the naming is coherent it isn't really
>>> self-explanatory, so please add a little top of the function
>>> comment introducing it.
>>
>> I can add a comment, but please let me know of any name suggestion
>
> /*
> * Handler for atomic writes implemented by writing out of place through
> * the COW fork. If possible we try to use hardware provided atomicy
> * instead, which is handled directly in xfs_direct_write_iomap_begin.
> */
ok, fine
>
>>
>>>
>>>> + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>>>> + &nimaps, 0);
>>>> + if (error)
>>>> + goto out_unlock;
>>>
>>> Why does this need to read the existing data for mapping? You'll
>>> overwrite everything through the COW fork anyway.
>>>
>>
>> We next call xfs_reflink_allocate_cow(), which uses the imap as the basis
>> to carry the offset and count.
>
> Is xfs_reflink_allocate_cow even the right helper to use? We know we
> absolutely want a a COW fork extent, we know there can't be delalloc
> extent to convert as we flushed dirty data, so most of the logic in it
> is pointless.
Well xfs_reflink_allocate_cow gives us what we want when we set some
flag (XFS_REFLINK_FORCE_COW).
Are you hinting at a dedicated helper? Note that
xfs_reflink_fill_cow_hole() also handles the XFS_REFLINK_FORCE_COW flag.
>
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-18 8:22 ` John Garry
@ 2025-03-18 8:32 ` Christoph Hellwig
2025-03-18 17:44 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 8:32 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On Tue, Mar 18, 2025 at 08:22:53AM +0000, John Garry wrote:
>> Is xfs_reflink_allocate_cow even the right helper to use? We know we
>> absolutely want a a COW fork extent, we know there can't be delalloc
>> extent to convert as we flushed dirty data, so most of the logic in it
>> is pointless.
>
> Well xfs_reflink_allocate_cow gives us what we want when we set some flag
> (XFS_REFLINK_FORCE_COW).
>
> Are you hinting at a dedicated helper? Note that
> xfs_reflink_fill_cow_hole() also handles the XFS_REFLINK_FORCE_COW flag.
We might not even need much of a helper. This all comes from my
experience with the zoned code that always writes out of place as well.
I initially tried to reuse the existing iomap_begin methods and all
these helpers, but in the end it turned out to much, much simpler to
just open code the logic. Now the atomic cow code will be a little
more complex in some aspect (unwritten extents, speculative
preallocations), but also simpler in others (no need to support buffered
I/O including zeroing and sub-block writes that require the ugly
srcmap based read-modify-write), but I suspect the overall trade-off
will be similar.
After all the high-level logic for the atomic COW iomap_begin really
should just be:
- take the ilock
- check the COW fork if there is an extent for the start of the range.
- If yes:
- if the extent is unwritten, convert the part overlapping with
the range to written
- return the extent
- If no:
- allocate a new extent covering the range in the COW fork
Doing this without going down multiple levels of helpers designed for
an entirely different use case will probably simplify things
significantly.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-18 8:32 ` Christoph Hellwig
@ 2025-03-18 17:44 ` John Garry
2025-03-19 7:30 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-18 17:44 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 18/03/2025 08:32, Christoph Hellwig wrote:
> On Tue, Mar 18, 2025 at 08:22:53AM +0000, John Garry wrote:
>>> Is xfs_reflink_allocate_cow even the right helper to use? We know we
>>> absolutely want a a COW fork extent, we know there can't be delalloc
>>> extent to convert as we flushed dirty data, so most of the logic in it
>>> is pointless.
>>
>> Well xfs_reflink_allocate_cow gives us what we want when we set some flag
>> (XFS_REFLINK_FORCE_COW).
>>
>> Are you hinting at a dedicated helper? Note that
>> xfs_reflink_fill_cow_hole() also handles the XFS_REFLINK_FORCE_COW flag.
>
> We might not even need much of a helper. This all comes from my
> experience with the zoned code that always writes out of place as well.
> I initially tried to reuse the existing iomap_begin methods and all
> these helpers, but in the end it turned out to much, much simpler to
> just open code the logic. Now the atomic cow code will be a little
> more complex in some aspect (unwritten extents, speculative
> preallocations), but also simpler in others (no need to support buffered
> I/O including zeroing and sub-block writes that require the ugly
> srcmap based read-modify-write), but I suspect the overall trade-off
> will be similar.
>
> After all the high-level logic for the atomic COW iomap_begin really
> should just be:
>
> - take the ilock
> - check the COW fork if there is an extent for the start of the range.
> - If yes:
> - if the extent is unwritten, convert the part overlapping with
> the range to written
I was wondering when it could be written, and then I figured it could be
if we had this sort of scenario:
- initially we have a mixed of shared and unshared file, like:
mnt/file:
EXT: FILE-OFFSET BLOCK-RANGE AG AG-OFFSET TOTAL FLAGS
0: [0..7]: 192..199 0 (192..199) 8 100000
1: [8..15]: 32840..32847 0 (32840..32847) 8 000000
2: [16..20479]: 208..20671 0 (208..20671) 20464 100000
FLAG Values:
0100000 Shared extent
0010000 Unwritten preallocated extent
0001000 Doesn't begin on stripe unit
0000100 Doesn't end on stripe unit
0000010 Doesn't begin on stripe width
0000001 Doesn't end on stripe width
- try atomic write of size 32K @ offset 0
- we first try HW atomics method and call
xfs_direct_write_iomap_begin() -> xfs_reflink_allocate_cow()
- CoW fork mapping is no good for atomics (too short), so try CoW
atomic method
- in CoW atomic method, we find that pre-alloc'ed CoW fork extent (which
was converted to written already)
> - return the extent
> - If no:
> - allocate a new extent covering the range in the COW fork
>
> Doing this without going down multiple levels of helpers designed for
> an entirely different use case will probably simplify things
> significantly.
Please suggest any further modifications to the following attempt. I
have XFS_REFLINK_FORCE_COW still being passed to
xfs_reflink_fill_cow_hole(), but xfs_reflink_fill_cow_hole() is quite a
large function and I am not sure if I want to duplicate lots of it.
static int
xfs_atomic_write_cow_iomap_begin(
struct inode *inode,
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap,
struct iomap *srcmap)
{
ASSERT(flags & IOMAP_WRITE);
ASSERT(flags & IOMAP_DIRECT);
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
xfs_fileoff_t count_fsb = end_fsb - offset_fsb;
struct xfs_bmbt_irec imap = {
.br_startoff = offset_fsb,
.br_startblock = HOLESTARTBLOCK,
.br_blockcount = end_fsb - offset_fsb,
.br_state = XFS_EXT_UNWRITTEN,
};
struct xfs_bmbt_irec cmap;
bool shared = false;
unsigned int lockmode = XFS_ILOCK_EXCL;
u64 seq;
struct xfs_iext_cursor icur;
if (xfs_is_shutdown(mp))
return -EIO;
if (!xfs_has_reflink(mp))
return -EINVAL;
if (!ip->i_cowfp) {
ASSERT(!xfs_is_reflink_inode(ip));
xfs_ifork_init_cow(ip);
}
error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)
&& cmap.br_startoff <= offset_fsb) {
if (cmap.br_state == XFS_EXT_UNWRITTEN) {
xfs_trim_extent(&cmap, offset_fsb, count_fsb);
error = xfs_reflink_convert_unwritten(ip, &imap, &cmap,
XFS_REFLINK_CONVERT_UNWRITTEN);
if (error)
goto out_unlock;
}
} else {
error = xfs_reflink_fill_cow_hole(ip, &imap, &cmap, &shared,
&lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
if (error)
goto out_unlock;
}
finish:
... // as before
}
xfs_reflink_convert_unwritten() and others are private to reflink.c, so
this function should also prob live there.
thanks
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-18 17:44 ` John Garry
@ 2025-03-19 7:30 ` Christoph Hellwig
2025-03-19 10:24 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-19 7:30 UTC (permalink / raw)
To: John Garry
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On Tue, Mar 18, 2025 at 05:44:46PM +0000, John Garry wrote:
> Please suggest any further modifications to the following attempt. I have
> XFS_REFLINK_FORCE_COW still being passed to xfs_reflink_fill_cow_hole(),
> but xfs_reflink_fill_cow_hole() is quite a large function and I am not sure
> if I want to duplicate lots of it.
As said I'd do away with the helpers. Below is my completely
untested whiteboard coding attempt, based against the series you
sent out.
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 88d86cabb8a1..06ece7070cfd 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1083,67 +1083,104 @@ xfs_atomic_write_cow_iomap_begin(
struct iomap *iomap,
struct iomap *srcmap)
{
- ASSERT(flags & IOMAP_WRITE);
- ASSERT(flags & IOMAP_DIRECT);
-
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec imap, cmap;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
- int nimaps = 1, error;
- bool shared = false;
- unsigned int lockmode = XFS_ILOCK_EXCL;
+ xfs_filblks_t count_fsb = end_fsb - offset_fsb;
+ int nmaps = 1;
+ xfs_filblks_t resaligned;
+ struct xfs_bmbt_irec cmap;
+ struct xfs_iext_cursor icur;
+ struct xfs_trans *tp;
+ int error;
u64 seq;
+ ASSERT(!XFS_IS_REALTIME_INODE(ip));
+ ASSERT(flags & IOMAP_WRITE);
+ ASSERT(flags & IOMAP_DIRECT);
+
if (xfs_is_shutdown(mp))
return -EIO;
- if (!xfs_has_reflink(mp))
+ if (WARN_ON_ONCE(!xfs_has_reflink(mp)))
return -EINVAL;
- error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ if (!ip->i_cowfp) {
+ ASSERT(!xfs_is_reflink_inode(ip));
+ xfs_ifork_init_cow(ip);
+ }
+
+ /*
+ * If we don't find an overlapping extent, trim the range we need to
+ * allocate to fit the hole we found.
+ */
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cmap.br_startoff = end_fsb;
+ if (cmap.br_startoff <= offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+ goto found;
+ }
+
+ end_fsb = cmap.br_startoff;
+ count_fsb = end_fsb - offset_fsb;
+ resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
+ xfs_get_cowextsz_hint(ip));
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
if (error)
return error;
- error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
- &nimaps, 0);
- if (error)
- goto out_unlock;
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
+ cmap.br_startoff = end_fsb;
+ if (cmap.br_startoff <= offset_fsb) {
+ xfs_trim_extent(&cmap, offset_fsb, count_fsb);
+ xfs_trans_cancel(tp);
+ goto found;
+ }
- /*
- * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
- * according to extszhint, such that there will be a greater chance
- * that future atomic writes to that same range will be aligned (and
- * don't require this COW-based method).
- */
- error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
- &lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
- XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
/*
- * Don't check @shared. For atomic writes, we should error when
- * we don't get a COW fork extent mapping.
+ * Allocate the entire reservation as unwritten blocks.
+ *
+ * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
+ * extszhint, such that there will be a greater chance that future
+ * atomic writes to that same range will be aligned (and don't require
+ * this COW-based method).
*/
- if (error)
+ error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
+ XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
+ XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
+ if (error) {
+ xfs_trans_cancel(tp);
goto out_unlock;
+ }
- end_fsb = imap.br_startoff + imap.br_blockcount;
+ xfs_inode_set_cowblocks_tag(ip);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out_unlock;
- length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
- trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
- if (imap.br_startblock != HOLESTARTBLOCK) {
- seq = xfs_iomap_inode_sequence(ip, 0);
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+found:
+ if (cmap.br_state != XFS_EXT_NORM) {
+ error = xfs_reflink_convert_cow_locked(ip, offset_fsb,
+ count_fsb);
if (error)
goto out_unlock;
+ cmap.br_state = XFS_EXT_NORM;
}
+
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
- xfs_iunlock(ip, lockmode);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
out_unlock:
- if (lockmode)
- xfs_iunlock(ip, lockmode);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index b983f5413be6..71116e6a692c 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -293,7 +293,7 @@ xfs_bmap_trim_cow(
return xfs_reflink_trim_around_shared(ip, imap, shared);
}
-static int
+int
xfs_reflink_convert_cow_locked(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 969006661a3f..ab3fa3c95196 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -45,6 +45,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
unsigned int flags);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_reflink_convert_cow_locked(struct xfs_inode *ip,
+ xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb);
extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-19 7:30 ` Christoph Hellwig
@ 2025-03-19 10:24 ` John Garry
2025-03-20 5:29 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-19 10:24 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 19/03/2025 07:30, Christoph Hellwig wrote:
> On Tue, Mar 18, 2025 at 05:44:46PM +0000, John Garry wrote:
>> Please suggest any further modifications to the following attempt. I have
>> XFS_REFLINK_FORCE_COW still being passed to xfs_reflink_fill_cow_hole(),
>> but xfs_reflink_fill_cow_hole() is quite a large function and I am not sure
>> if I want to duplicate lots of it.
>
> As said I'd do away with the helpers. Below is my completely
> untested whiteboard coding attempt, based against the series you
> sent out.
it seems to work ok, cheers
Just a query, below
>
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 88d86cabb8a1..06ece7070cfd 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -1083,67 +1083,104 @@ xfs_atomic_write_cow_iomap_begin(
> struct iomap *iomap,
> struct iomap *srcmap)
> {
> - ASSERT(flags & IOMAP_WRITE);
> - ASSERT(flags & IOMAP_DIRECT);
> -
> struct xfs_inode *ip = XFS_I(inode);
> struct xfs_mount *mp = ip->i_mount;
> - struct xfs_bmbt_irec imap, cmap;
> xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
> xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
> - int nimaps = 1, error;
> - bool shared = false;
> - unsigned int lockmode = XFS_ILOCK_EXCL;
> + xfs_filblks_t count_fsb = end_fsb - offset_fsb;
> + int nmaps = 1;
> + xfs_filblks_t resaligned;
> + struct xfs_bmbt_irec cmap;
> + struct xfs_iext_cursor icur;
> + struct xfs_trans *tp;
> + int error;
> u64 seq;
>
> + ASSERT(!XFS_IS_REALTIME_INODE(ip));
> + ASSERT(flags & IOMAP_WRITE);
> + ASSERT(flags & IOMAP_DIRECT);
> +
> if (xfs_is_shutdown(mp))
> return -EIO;
>
> - if (!xfs_has_reflink(mp))
> + if (WARN_ON_ONCE(!xfs_has_reflink(mp)))
> return -EINVAL;
>
> - error = xfs_ilock_for_iomap(ip, flags, &lockmode);
> + xfs_ilock(ip, XFS_ILOCK_EXCL);
> +
> + if (!ip->i_cowfp) {
> + ASSERT(!xfs_is_reflink_inode(ip));
> + xfs_ifork_init_cow(ip);
> + }
> +
> + /*
> + * If we don't find an overlapping extent, trim the range we need to
> + * allocate to fit the hole we found.
> + */
> + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
> + cmap.br_startoff = end_fsb;
> + if (cmap.br_startoff <= offset_fsb) {
> + xfs_trim_extent(&cmap, offset_fsb, count_fsb);
> + goto found;
> + }
> +
> + end_fsb = cmap.br_startoff;
> + count_fsb = end_fsb - offset_fsb;
> + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
> + xfs_get_cowextsz_hint(ip));
> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +
> + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
> + XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
> if (error)
> return error;
>
> - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
> - &nimaps, 0);
> - if (error)
> - goto out_unlock;
> + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
> + cmap.br_startoff = end_fsb;
Do we really need this logic?
offset_fsb does not change, and logically cmap.br_startoff == end_fsb
already, right?
> + if (cmap.br_startoff <= offset_fsb) {
> + xfs_trim_extent(&cmap, offset_fsb, count_fsb);
> + xfs_trans_cancel(tp);
> + goto found;
> + }
>
> - /*
> - * Use XFS_REFLINK_ALLOC_EXTSZALIGN to hint at aligning new extents
> - * according to extszhint, such that there will be a greater chance
> - * that future atomic writes to that same range will be aligned (and
> - * don't require this COW-based method).
> - */
> - error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared,
> - &lockmode, XFS_REFLINK_CONVERT_UNWRITTEN |
> - XFS_REFLINK_FORCE_COW | XFS_REFLINK_ALLOC_EXTSZALIGN);
> /*
> - * Don't check @shared. For atomic writes, we should error when
> - * we don't get a COW fork extent mapping.
> + * Allocate the entire reservation as unwritten blocks.
> + *
> + * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to
> + * extszhint, such that there will be a greater chance that future
> + * atomic writes to that same range will be aligned (and don't require
> + * this COW-based method).
> */
> - if (error)
> + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
> + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
> + XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps);
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-19 10:24 ` John Garry
@ 2025-03-20 5:29 ` Christoph Hellwig
2025-03-20 9:49 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-20 5:29 UTC (permalink / raw)
To: John Garry
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On Wed, Mar 19, 2025 at 10:24:55AM +0000, John Garry wrote:
> it seems to work ok, cheers
Better test it very well, this was really just intended as a sketch..
>> + count_fsb = end_fsb - offset_fsb;
>> + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
>> + xfs_get_cowextsz_hint(ip));
>> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
>> +
>> + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
>> + XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
>> if (error)
>> return error;
>> - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>> - &nimaps, 0);
>> - if (error)
>> - goto out_unlock;
>> + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
>> + cmap.br_startoff = end_fsb;
>
> Do we really need this logic?
>
> offset_fsb does not change, and logically cmap.br_startoff == end_fsb
> already, right?
Afte unlocking and relocking the ilock the extent layout could have
changed.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-20 5:29 ` Christoph Hellwig
@ 2025-03-20 9:49 ` John Garry
2025-03-20 14:12 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-20 9:49 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 20/03/2025 05:29, Christoph Hellwig wrote:
> On Wed, Mar 19, 2025 at 10:24:55AM +0000, John Garry wrote:
>> it seems to work ok, cheers
>
> Better test it very well, this was really just intended as a sketch..
Sure, I have been testing a lot so far.
I had been using fio in verify mode as a method to check racing threads
reading and atomically writing the same file range, so I need to ensure
that it covers the various paths in this function.
>
>>> + count_fsb = end_fsb - offset_fsb;
>>> + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb,
>>> + xfs_get_cowextsz_hint(ip));
>>> + xfs_iunlock(ip, XFS_ILOCK_EXCL);
>>> +
>>> + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
>>> + XFS_DIOSTRAT_SPACE_RES(mp, resaligned), 0, false, &tp);
>>> if (error)
>>> return error;
>>> - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
>>> - &nimaps, 0);
>>> - if (error)
>>> - goto out_unlock;
>>> + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap))
>>> + cmap.br_startoff = end_fsb;
>>
>> Do we really need this logic?
>>
>> offset_fsb does not change, and logically cmap.br_startoff == end_fsb
>> already, right?
>
> Afte unlocking and relocking the ilock the extent layout could have
> changed.
ok, understood. Maybe a comment will help understanding that.
>
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 10/13] xfs: iomap COW-based atomic write support
2025-03-20 9:49 ` John Garry
@ 2025-03-20 14:12 ` Christoph Hellwig
0 siblings, 0 replies; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-20 14:12 UTC (permalink / raw)
To: John Garry
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On Thu, Mar 20, 2025 at 09:49:44AM +0000, John Garry wrote:
>>> offset_fsb does not change, and logically cmap.br_startoff == end_fsb
>>> already, right?
>>
>> Afte unlocking and relocking the ilock the extent layout could have
>> changed.
>
> ok, understood. Maybe a comment will help understanding that.
Sure. As said this was just intended as draft code. Maybe factoring
out a helper or two would also be useful, although I couldn't think
of a really obvious one.
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic()
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (9 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 10/13] xfs: iomap COW-based atomic write support John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 6:41 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 12/13] xfs: commit CoW-based atomic writes atomically John Garry
` (2 subsequent siblings)
13 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
Add xfs_file_dio_write_atomic() for dedicated handling of atomic writes.
In case of -EAGAIN being returned from iomap_dio_rw(), reissue the write
in CoW-based atomic write mode.
For CoW-based mode, ensure that we have no outstanding IOs which we
may trample on.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_file.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 73 insertions(+)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 7a56ddb86fd2..029684b54dda 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -725,6 +725,75 @@ xfs_file_dio_write_zoned(
return ret;
}
+/*
+ * Handle block atomic writes
+ *
+ * Two methods of atomic writes are supported:
+ * - REQ_ATOMIC-based, which would typically use some form of HW offload in the
+ * disk
+ * - COW-based, which uses a COW fork as a staging extent for data updates
+ * before atomically updating extent mappings for the range being written
+ *
+ * REQ_ATOMIC-based is the preferred method, and is attempted first. If this
+ * method fails due to REQ_ATOMIC-related constraints, then we retry with the
+ * COW-based method. The REQ_ATOMIC-based method typically will fail if the
+ * write spans multiple extents or the disk blocks are misaligned.
+ *
+ * Similar to xfs_file_dio_write_unaligned(), the retry mechanism is based on
+ * the ->iomap_begin method returning -EAGAIN, which would be when the
+ * REQ_ATOMIC-based write is not possible. In the case of IOCB_NOWAIT being set,
+ * then we will not retry with the COW-based method, and instead pass that
+ * error code back to the caller immediately.
+ *
+ * REQ_ATOMIC-based atomic writes behave such that a racing read which overlaps
+ * with range being atomically written will see all or none of the old data.
+ * Emulate this behaviour for COW-based atomic writes by using
+ * IOMAP_DIO_FORCE_WAIT and inode_dio_wait() to ensure active reads. This also
+ * locks out racing writes, which could trample on the COW fork extent.
+ */
+
+static noinline ssize_t
+xfs_file_dio_write_atomic(
+ struct xfs_inode *ip,
+ struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ unsigned int iolock = XFS_IOLOCK_SHARED;
+ unsigned int dio_flags = 0;
+ const struct iomap_ops *dops = &xfs_direct_write_iomap_ops;
+ ssize_t ret;
+
+retry:
+ ret = xfs_ilock_iocb_for_write(iocb, &iolock);
+ if (ret)
+ return ret;
+
+ ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
+ if (ret)
+ goto out_unlock;
+
+ if (dio_flags & IOMAP_DIO_FORCE_WAIT)
+ inode_dio_wait(VFS_I(ip));
+
+ trace_xfs_file_direct_write(iocb, from);
+ ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
+ dio_flags, NULL, 0);
+
+ if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
+ dops == &xfs_direct_write_iomap_ops) {
+ xfs_iunlock(ip, iolock);
+ dio_flags = IOMAP_DIO_FORCE_WAIT;
+ dops = &xfs_atomic_write_cow_iomap_ops;
+ iolock = XFS_IOLOCK_EXCL;
+ goto retry;
+ }
+
+out_unlock:
+ if (iolock)
+ xfs_iunlock(ip, iolock);
+ return ret;
+}
+
/*
* Handle block unaligned direct I/O writes
*
@@ -840,6 +909,10 @@ xfs_file_dio_write(
return xfs_file_dio_write_unaligned(ip, iocb, from);
if (xfs_is_zoned_inode(ip))
return xfs_file_dio_write_zoned(ip, iocb, from);
+
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ return xfs_file_dio_write_atomic(ip, iocb, from);
+
return xfs_file_dio_write_aligned(ip, iocb, from,
&xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL);
}
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic()
2025-03-13 17:13 ` [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic() John Garry
@ 2025-03-17 6:41 ` Christoph Hellwig
2025-03-17 9:36 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:41 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On Thu, Mar 13, 2025 at 05:13:08PM +0000, John Garry wrote:
> + * REQ_ATOMIC-based is the preferred method, and is attempted first. If this
> + * method fails due to REQ_ATOMIC-related constraints, then we retry with the
> + * COW-based method. The REQ_ATOMIC-based method typically will fail if the
> + * write spans multiple extents or the disk blocks are misaligned.
It is only preferred if actually supported by the underlying hardware.
If it isn't it really shouldn't even be tried, as that is just a waste
of cycles.
Also a lot of comment should probably be near the code not on top
of the function as that's where people would look for them.
> +static noinline ssize_t
> +xfs_file_dio_write_atomic(
> + struct xfs_inode *ip,
> + struct kiocb *iocb,
> + struct iov_iter *from)
> +{
> + unsigned int iolock = XFS_IOLOCK_SHARED;
> + unsigned int dio_flags = 0;
> + const struct iomap_ops *dops = &xfs_direct_write_iomap_ops;
> + ssize_t ret;
> +
> +retry:
> + ret = xfs_ilock_iocb_for_write(iocb, &iolock);
> + if (ret)
> + return ret;
> +
> + ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
> + if (ret)
> + goto out_unlock;
> +
> + if (dio_flags & IOMAP_DIO_FORCE_WAIT)
> + inode_dio_wait(VFS_I(ip));
> +
> + trace_xfs_file_direct_write(iocb, from);
> + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
> + dio_flags, NULL, 0);
The normal direct I/O path downgrades the iolock to shared before
doing the I/O here. Why isn't that done here?
> + if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
> + dops == &xfs_direct_write_iomap_ops) {
This should probably explain the unusual use of EGAIN. Although I
still feel that picking a different error code for the fallback would
be much more maintainable.
> + xfs_iunlock(ip, iolock);
> + dio_flags = IOMAP_DIO_FORCE_WAIT;
I notice the top of function comment mentions the IOMAP_DIO_FORCE_WAIT
flag. Maybe use the chance to write a full sentence here or where
it is checked to explain the logic a bit better?
> * Handle block unaligned direct I/O writes
> *
> @@ -840,6 +909,10 @@ xfs_file_dio_write(
> return xfs_file_dio_write_unaligned(ip, iocb, from);
> if (xfs_is_zoned_inode(ip))
> return xfs_file_dio_write_zoned(ip, iocb, from);
> +
> + if (iocb->ki_flags & IOCB_ATOMIC)
> + return xfs_file_dio_write_atomic(ip, iocb, from);
> +
Either keep space between all the conditional calls or none. I doubt
just stick to the existing style.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic()
2025-03-17 6:41 ` Christoph Hellwig
@ 2025-03-17 9:36 ` John Garry
2025-03-18 5:43 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-17 9:36 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 17/03/2025 06:41, Christoph Hellwig wrote:
> On Thu, Mar 13, 2025 at 05:13:08PM +0000, John Garry wrote:
>> + * REQ_ATOMIC-based is the preferred method, and is attempted first. If this
>> + * method fails due to REQ_ATOMIC-related constraints, then we retry with the
>> + * COW-based method. The REQ_ATOMIC-based method typically will fail if the
>> + * write spans multiple extents or the disk blocks are misaligned.
>
> It is only preferred if actually supported by the underlying hardware.
> If it isn't it really shouldn't even be tried, as that is just a waste
> of cycles.
We should not even call this function if atomics are not supported by HW
- please see IOCB_ATOMIC checks in xfs_file_write_iter(). So maybe I
will mention that the caller must ensure atomics are supported for the
write size.
>
> Also a lot of comment should probably be near the code not on top
> of the function as that's where people would look for them.
sure, if you prefer
>
>> +static noinline ssize_t
>> +xfs_file_dio_write_atomic(
>> + struct xfs_inode *ip,
>> + struct kiocb *iocb,
>> + struct iov_iter *from)
>> +{
>> + unsigned int iolock = XFS_IOLOCK_SHARED;
>> + unsigned int dio_flags = 0;
>> + const struct iomap_ops *dops = &xfs_direct_write_iomap_ops;
>> + ssize_t ret;
>> +
>> +retry:
>> + ret = xfs_ilock_iocb_for_write(iocb, &iolock);
>> + if (ret)
>> + return ret;
>> +
>> + ret = xfs_file_write_checks(iocb, from, &iolock, NULL);
>> + if (ret)
>> + goto out_unlock;
>> +
>> + if (dio_flags & IOMAP_DIO_FORCE_WAIT)
>> + inode_dio_wait(VFS_I(ip));
>> +
>> + trace_xfs_file_direct_write(iocb, from);
>> + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
>> + dio_flags, NULL, 0);
>
> The normal direct I/O path downgrades the iolock to shared before
> doing the I/O here. Why isn't that done here?
OK, I can do that. But we still require exclusive lock always for the
CoW-based method.
>
>> + if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
>> + dops == &xfs_direct_write_iomap_ops) {
>
> This should probably explain the unusual use of EGAIN. Although I
> still feel that picking a different error code for the fallback would
> be much more maintainable.
I could try another error code - can you suggest one? Is it going to be
something unrelated to storage stack, like EREMOTEIO?
>
>> + xfs_iunlock(ip, iolock);
>> + dio_flags = IOMAP_DIO_FORCE_WAIT;
>
> I notice the top of function comment mentions the IOMAP_DIO_FORCE_WAIT
> flag. Maybe use the chance to write a full sentence here or where
> it is checked to explain the logic a bit better?
ok, fine
>
>> * Handle block unaligned direct I/O writes
>> *
>> @@ -840,6 +909,10 @@ xfs_file_dio_write(
>> return xfs_file_dio_write_unaligned(ip, iocb, from);
>> if (xfs_is_zoned_inode(ip))
>> return xfs_file_dio_write_zoned(ip, iocb, from);
>> +
>> + if (iocb->ki_flags & IOCB_ATOMIC)
>> + return xfs_file_dio_write_atomic(ip, iocb, from);
>> +
>
> Either keep space between all the conditional calls or none. I doubt
> just stick to the existing style.
Sure
>
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic()
2025-03-17 9:36 ` John Garry
@ 2025-03-18 5:43 ` Christoph Hellwig
2025-03-18 8:42 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 5:43 UTC (permalink / raw)
To: John Garry
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On Mon, Mar 17, 2025 at 09:36:13AM +0000, John Garry wrote:
>> It is only preferred if actually supported by the underlying hardware.
>> If it isn't it really shouldn't even be tried, as that is just a waste
>> of cycles.
>
> We should not even call this function if atomics are not supported by HW -
> please see IOCB_ATOMIC checks in xfs_file_write_iter(). So maybe I will
> mention that the caller must ensure atomics are supported for the write
> size.
I see that this is what's done in the current series now. But that feels
very wrong. Why do you want to deprive the user of this nice and useful
code if they don't have the right hardware? Why do we limit us to the
hardware supported size when we support more in software? How do you
force test the software code if you require the hardware support?
>>> + trace_xfs_file_direct_write(iocb, from);
>>> + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
>>> + dio_flags, NULL, 0);
>>
>> The normal direct I/O path downgrades the iolock to shared before
>> doing the I/O here. Why isn't that done here?
>
> OK, I can do that. But we still require exclusive lock always for the
> CoW-based method.
If you can do away with the lock that's great and useful to get good
performance. But if not at least document why this is different from
others. Similarly if the COW path needs an exclusive lock document why
in the code.
>
>>
>>> + if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
>>> + dops == &xfs_direct_write_iomap_ops) {
>>
>> This should probably explain the unusual use of EGAIN. Although I
>> still feel that picking a different error code for the fallback would
>> be much more maintainable.
>
> I could try another error code - can you suggest one? Is it going to be
> something unrelated to storage stack, like EREMOTEIO?
Yes, the funky networking codes tends to be good candidates. E.g.
ENOPROTOOPT for something that sounds at least vaguely related.
>>> +
>>> + if (iocb->ki_flags & IOCB_ATOMIC)
>>> + return xfs_file_dio_write_atomic(ip, iocb, from);
>>> +
>>
>> Either keep space between all the conditional calls or none. I doubt
>> just stick to the existing style.
>
> Sure
FYI, that I doubt should have been in doubt. I was just so happy to
finally get the mail out after a flakey connection on the train.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic()
2025-03-18 5:43 ` Christoph Hellwig
@ 2025-03-18 8:42 ` John Garry
2025-03-18 8:46 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-18 8:42 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 18/03/2025 05:43, Christoph Hellwig wrote:
> On Mon, Mar 17, 2025 at 09:36:13AM +0000, John Garry wrote:
>>> It is only preferred if actually supported by the underlying hardware.
>>> If it isn't it really shouldn't even be tried, as that is just a waste
>>> of cycles.
>>
>> We should not even call this function if atomics are not supported by HW -
>> please see IOCB_ATOMIC checks in xfs_file_write_iter(). So maybe I will
>> mention that the caller must ensure atomics are supported for the write
>> size.
>
> I see that this is what's done in the current series now. But that feels
> very wrong. Why do you want to deprive the user of this nice and useful
> code if they don't have the right hardware?
I don't think it's fair to say that we deprive the user - so far we just
don't and nobody has asked for atomics without HW support.
> Why do we limit us to the
> hardware supported size when we support more in software?
As I see, HW offload gives fast and predictable performance.
The COW method is just a (slow) fallback is when HW offload is not possible.
If we want to allow the user to avail of atomics greater than the
mounted bdev, then we should have a method to tell the user of the
optimised threshold. They could read the bdev atomic limits and infer
this, but that is not a good user experience.
> How do you
> force test the software code if you require the hardware support?
>
>>>> + trace_xfs_file_direct_write(iocb, from);
>>>> + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops,
>>>> + dio_flags, NULL, 0);
>>>
>>> The normal direct I/O path downgrades the iolock to shared before
>>> doing the I/O here. Why isn't that done here?
>>
>> OK, I can do that. But we still require exclusive lock always for the
>> CoW-based method.
>
> If you can do away with the lock that's great and useful to get good
> performance. But if not at least document why this is different from
> others. Similarly if the COW path needs an exclusive lock document why
> in the code.
ok, I'll do that.
>
>
>>
>>>
>>>> + if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT) &&
>>>> + dops == &xfs_direct_write_iomap_ops) {
>>>
>>> This should probably explain the unusual use of EGAIN. Although I
>>> still feel that picking a different error code for the fallback would
>>> be much more maintainable.
>>
>> I could try another error code - can you suggest one? Is it going to be
>> something unrelated to storage stack, like EREMOTEIO?
>
> Yes, the funky networking codes tends to be good candidates. E.g.
> ENOPROTOOPT for something that sounds at least vaguely related.
ok
>
>>>> +
>>>> + if (iocb->ki_flags & IOCB_ATOMIC)
>>>> + return xfs_file_dio_write_atomic(ip, iocb, from);
>>>> +
>>>
>>> Either keep space between all the conditional calls or none. I doubt
>>> just stick to the existing style.
>>
>> Sure
>
> FYI, that I doubt should have been in doubt. I was just so happy to
> finally get the mail out after a flakey connection on the train.
>
thanks
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic()
2025-03-18 8:42 ` John Garry
@ 2025-03-18 8:46 ` Christoph Hellwig
2025-03-18 9:12 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 8:46 UTC (permalink / raw)
To: John Garry
Cc: Christoph Hellwig, brauner, djwong, cem, dchinner, linux-xfs,
linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4
On Tue, Mar 18, 2025 at 08:42:36AM +0000, John Garry wrote:
>> I see that this is what's done in the current series now. But that feels
>> very wrong. Why do you want to deprive the user of this nice and useful
>> code if they don't have the right hardware?
>
> I don't think it's fair to say that we deprive the user - so far we just
> don't and nobody has asked for atomics without HW support.
You're still keeping this nice functionality from the users..
>
>> Why do we limit us to the
>> hardware supported size when we support more in software?
>
> As I see, HW offload gives fast and predictable performance.
>
> The COW method is just a (slow) fallback is when HW offload is not possible.
>
> If we want to allow the user to avail of atomics greater than the mounted
> bdev, then we should have a method to tell the user of the optimised
> threshold. They could read the bdev atomic limits and infer this, but that
> is not a good user experience.
Yes, there should be an interface to expose that. But even without
the hardware acceleration a guaranteed untorn write is a really nice
feature to have.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic()
2025-03-18 8:46 ` Christoph Hellwig
@ 2025-03-18 9:12 ` John Garry
0 siblings, 0 replies; 65+ messages in thread
From: John Garry @ 2025-03-18 9:12 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 18/03/2025 08:46, Christoph Hellwig wrote:
> On Tue, Mar 18, 2025 at 08:42:36AM +0000, John Garry wrote:
>>> I see that this is what's done in the current series now. But that feels
>>> very wrong. Why do you want to deprive the user of this nice and useful
>>> code if they don't have the right hardware?
>>
>> I don't think it's fair to say that we deprive the user - so far we just
>> don't and nobody has asked for atomics without HW support.
>
> You're still keeping this nice functionality from the users..
>
>>
>>> Why do we limit us to the
>>> hardware supported size when we support more in software?
>>
>> As I see, HW offload gives fast and predictable performance.
>>
>> The COW method is just a (slow) fallback is when HW offload is not possible.
>>
>> If we want to allow the user to avail of atomics greater than the mounted
>> bdev, then we should have a method to tell the user of the optimised
>> threshold. They could read the bdev atomic limits and infer this, but that
>> is not a good user experience.
>
> Yes, there should be an interface to expose that.
So we could add to statx.atomic_write_unit_max_opt, which is the
optimised/fast limit, i.e. bdev limit. Is that sane?
Note that we already have STATX_ATTR_WRITE_ATOMIC to get the atomic
limits. I wouldn't want to add a new flag just to get this new member.
So I suppose that if we added statx.atomic_write_unit_max_opt, the API
would be: if statx.atomic_write_unit_max_opt is zero, then
statx.atomic_write_unit_max is optimised limit also.
> But even without
> the hardware acceleration a guaranteed untorn write is a really nice
> feature to have.
>
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 12/13] xfs: commit CoW-based atomic writes atomically
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (10 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 11/13] xfs: add xfs_file_dio_write_atomic() John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 6:56 ` Christoph Hellwig
2025-03-13 17:13 ` [PATCH v6 13/13] xfs: update atomic write max size John Garry
2025-03-18 5:48 ` [PATCH v6 00/13] large atomic writes for xfs with CoW Christoph Hellwig
13 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry
When completing a CoW-based write, each extent range mapping update is
covered by a separate transaction.
For a CoW-based atomic write, all mappings must be changed at once, so
change to use a single transaction.
Note that there is a limit on the amount of log intent items which can be
fit into a single transaction, but this is being ignored for now since
the count of items for a typical atomic write would be much less than is
typically supported. A typical atomic write would be expected to be 64KB
or less, which means only 16 possible extents unmaps, which is quite
small.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_file.c | 5 ++++-
fs/xfs/xfs_reflink.c | 49 ++++++++++++++++++++++++++++++++++++++++++++
fs/xfs/xfs_reflink.h | 2 ++
3 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 029684b54dda..b6b714693d1a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -576,7 +576,10 @@ xfs_dio_write_end_io(
nofs_flag = memalloc_nofs_save();
if (flags & IOMAP_DIO_COW) {
- error = xfs_reflink_end_cow(ip, offset, size);
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ error = xfs_reflink_end_atomic_cow(ip, offset, size);
+ else
+ error = xfs_reflink_end_cow(ip, offset, size);
if (error)
goto out;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 9a419af89949..b983f5413be6 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1006,6 +1006,55 @@ xfs_reflink_end_cow(
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
return error;
}
+int
+xfs_reflink_end_atomic_cow(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t count)
+{
+ xfs_fileoff_t offset_fsb;
+ xfs_fileoff_t end_fsb;
+ int error = 0;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+ unsigned int resblks;
+
+ trace_xfs_reflink_end_cow(ip, offset, count);
+
+ offset_fsb = XFS_B_TO_FSBT(mp, offset);
+ end_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+ /*
+ * Each remapping operation could cause a btree split, so in the worst
+ * case that's one for each block.
+ */
+ resblks = (end_fsb - offset_fsb) *
+ XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK);
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
+ XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
+ while (end_fsb > offset_fsb && !error) {
+ error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb,
+ end_fsb);
+ }
+ if (error) {
+ trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+ goto out_cancel;
+ }
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+out_cancel:
+ xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+}
/*
* Free all CoW staging blocks that are still referenced by the ondisk refcount
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index 0ab1857074e5..969006661a3f 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -53,6 +53,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count, bool cancel_real);
extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
xfs_off_t count);
+int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset,
+ xfs_off_t count);
extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out, loff_t len,
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 12/13] xfs: commit CoW-based atomic writes atomically
2025-03-13 17:13 ` [PATCH v6 12/13] xfs: commit CoW-based atomic writes atomically John Garry
@ 2025-03-17 6:56 ` Christoph Hellwig
2025-03-17 9:43 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 6:56 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
> trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
> return error;
> }
> +int
Still a missing empty line after the previous function. Also
please add a comment what is atomic about this function and why
the regular path doesn't use it.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 12/13] xfs: commit CoW-based atomic writes atomically
2025-03-17 6:56 ` Christoph Hellwig
@ 2025-03-17 9:43 ` John Garry
0 siblings, 0 replies; 65+ messages in thread
From: John Garry @ 2025-03-17 9:43 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 17/03/2025 06:56, Christoph Hellwig wrote:
>> trace_xfs_reflink_end_cow_error(ip, error,_RET_IP_);
>> return error;
>> }
>> +int
> Still a missing empty line after the previous function.
Will fix.
> Also
> please add a comment what is atomic about this function and why
> the regular path doesn't use it.
ok, fine
^ permalink raw reply [flat|nested] 65+ messages in thread
* [PATCH v6 13/13] xfs: update atomic write max size
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (11 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 12/13] xfs: commit CoW-based atomic writes atomically John Garry
@ 2025-03-13 17:13 ` John Garry
2025-03-17 7:25 ` Christoph Hellwig
2025-03-18 5:48 ` [PATCH v6 00/13] large atomic writes for xfs with CoW Christoph Hellwig
13 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-13 17:13 UTC (permalink / raw)
To: brauner, djwong, cem, dchinner, hch
Cc: linux-xfs, linux-fsdevel, linux-kernel, ojaswin, ritesh.list,
martin.petersen, tytso, linux-ext4, John Garry, Carlos Maiolino
Now that CoW-based atomic writes are supported, update the max size of an
atomic write.
For simplicity, limit at the max of what the mounted bdev can support in
terms of atomic write limits. Maybe in future we will have a better way
to advertise this optimised limit.
In addition, the max atomic write size needs to be aligned to the agsize.
Limit the size of atomic writes to the greatest power-of-two factor of the
agsize so that allocations for an atomic write will always be aligned
compatibly with the alignment requirements of the storage.
rtvol is not commonly used, so it is not very important to support large
atomic writes there initially.
Furthermore, adding large atomic writes for rtvol would be complicated due
to alignment already offered by rtextsize and also the limitation of
reflink support only be possible for rtextsize is a power-of-2.
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
fs/xfs/xfs_iops.c | 14 +++++++++++++-
fs/xfs/xfs_mount.c | 29 +++++++++++++++++++++++++++++
fs/xfs/xfs_mount.h | 1 +
3 files changed, 43 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 64b1f8c73824..7c22eefd6b89 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -615,10 +615,22 @@ unsigned int
xfs_get_atomic_write_max_attr(
struct xfs_inode *ip)
{
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+ struct xfs_mount *mp = ip->i_mount;
+
if (!xfs_inode_can_atomicwrite(ip))
return 0;
- return ip->i_mount->m_sb.sb_blocksize;
+ /*
+ * rtvol is not commonly used and supporting large atomic writes
+ * would also be complicated to support there, so limit to a single
+ * block for now.
+ */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return mp->m_sb.sb_blocksize;
+
+ return min_t(unsigned int, XFS_FSB_TO_B(mp, mp->m_awu_max),
+ target->bt_bdev_awu_max);
}
static void
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index e65a659901d5..fd89cb7a83fd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -666,6 +666,33 @@ xfs_agbtree_compute_maxlevels(
mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels);
}
+static inline void
+xfs_compute_awu_max(
+ struct xfs_mount *mp)
+{
+ xfs_agblock_t agsize = mp->m_sb.sb_agblocks;
+ xfs_agblock_t awu_max;
+
+ if (!xfs_has_reflink(mp)) {
+ mp->m_awu_max = 1;
+ return;
+ }
+
+ /*
+ * Find highest power-of-2 evenly divisible into agsize and which
+ * also fits into an unsigned int field.
+ */
+ awu_max = 1;
+ while (1) {
+ if (agsize % (awu_max * 2))
+ break;
+ if (XFS_FSB_TO_B(mp, awu_max * 2) > UINT_MAX)
+ break;
+ awu_max *= 2;
+ }
+ mp->m_awu_max = awu_max;
+}
+
/* Compute maximum possible height for realtime btree types for this fs. */
static inline void
xfs_rtbtree_compute_maxlevels(
@@ -751,6 +778,8 @@ xfs_mountfs(
xfs_agbtree_compute_maxlevels(mp);
xfs_rtbtree_compute_maxlevels(mp);
+ xfs_compute_awu_max(mp);
+
/*
* Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
* is NOT aligned turn off m_dalign since allocator alignment is within
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 799b84220ebb..1b0136da2aec 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -229,6 +229,7 @@ typedef struct xfs_mount {
bool m_finobt_nores; /* no per-AG finobt resv. */
bool m_update_sb; /* sb needs update in mount */
unsigned int m_max_open_zones;
+ xfs_extlen_t m_awu_max; /* data device max atomic write */
/*
* Bitsets of per-fs metadata that have been checked and/or are sick.
--
2.31.1
^ permalink raw reply related [flat|nested] 65+ messages in thread
* Re: [PATCH v6 13/13] xfs: update atomic write max size
2025-03-13 17:13 ` [PATCH v6 13/13] xfs: update atomic write max size John Garry
@ 2025-03-17 7:25 ` Christoph Hellwig
2025-03-17 9:57 ` John Garry
0 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-17 7:25 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4, Carlos Maiolino
On Thu, Mar 13, 2025 at 05:13:10PM +0000, John Garry wrote:
> For simplicity, limit at the max of what the mounted bdev can support in
> terms of atomic write limits. Maybe in future we will have a better way
> to advertise this optimised limit.
You'll still need to cover limit this by the amount that can
be commited in a single transactions. And handle the case where there
is no hardware support at all.
> xfs_get_atomic_write_max_attr(
I missed it in the previous version, but can be drop the
pointless _attr for these two helpers?
> +static inline void
> +xfs_compute_awu_max(
And use a more descriptive name than AWU, wich really just is a
nvme field name.
> + awu_max = 1;
> + while (1) {
> + if (agsize % (awu_max * 2))
> + break;
while ((agsize % (awu_max * 2) == 0)) {
?
> + xfs_extlen_t m_awu_max; /* data device max atomic write */
overly long line.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 13/13] xfs: update atomic write max size
2025-03-17 7:25 ` Christoph Hellwig
@ 2025-03-17 9:57 ` John Garry
2025-03-18 5:47 ` Christoph Hellwig
0 siblings, 1 reply; 65+ messages in thread
From: John Garry @ 2025-03-17 9:57 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4, Carlos Maiolino
On 17/03/2025 07:25, Christoph Hellwig wrote:
> On Thu, Mar 13, 2025 at 05:13:10PM +0000, John Garry wrote:
>> For simplicity, limit at the max of what the mounted bdev can support in
>> terms of atomic write limits. Maybe in future we will have a better way
>> to advertise this optimised limit.
>
> You'll still need to cover limit this by the amount that can
> be commited in a single transactions.
yeah ... I'll revisit that
> And handle the case where there
> is no hardware support at all.
So xfs_get_atomic_write_max_attr() -> xfs_inode_can_atomicwrite() covers
no HW support.
The point of this function is just to calc atomic write limits according
to mount point geometry and features.
Do you think that it is necessary to call xfs_inode_can_atomicwrite()
here also? [And remove the xfs_get_atomic_write_max_attr() ->
xfs_inode_can_atomicwrite()?]
>
>> xfs_get_atomic_write_max_attr(
>
> I missed it in the previous version, but can be drop the
> pointless _attr for these two helpers?
ok, fine
>
>> +static inline void
>> +xfs_compute_awu_max(
>
> And use a more descriptive name than AWU, wich really just is a
> nvme field name.
I am just trying to be concise to limit spilling lines.
Maybe atomicwrite_unit_max is preferred
>
>> + awu_max = 1;
>> + while (1) {
>> + if (agsize % (awu_max * 2))
>> + break;
>
> while ((agsize % (awu_max * 2) == 0)) {
>
> ?
>
>> + xfs_extlen_t m_awu_max; /* data device max atomic write */
>
> overly long line.
there are a few overly long lines here (so following that example), but
since there is a request to change the name, I'll be definitely using a
newline for the comment
Thanks,
John
>
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 13/13] xfs: update atomic write max size
2025-03-17 9:57 ` John Garry
@ 2025-03-18 5:47 ` Christoph Hellwig
0 siblings, 0 replies; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 5:47 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4, Carlos Maiolino
On Mon, Mar 17, 2025 at 09:57:45AM +0000, John Garry wrote:
>> And handle the case where there
>> is no hardware support at all.
>
> So xfs_get_atomic_write_max_attr() -> xfs_inode_can_atomicwrite() covers no
> HW support.
>
> The point of this function is just to calc atomic write limits according to
> mount point geometry and features.
>
> Do you think that it is necessary to call xfs_inode_can_atomicwrite() here
> also? [And remove the xfs_get_atomic_write_max_attr() ->
> xfs_inode_can_atomicwrite()?]
At least document what it does..
>>> +static inline void
>>> +xfs_compute_awu_max(
>>
>> And use a more descriptive name than AWU, wich really just is a
>> nvme field name.
>
> I am just trying to be concise to limit spilling lines.
>
> Maybe atomicwrite_unit_max is preferred
I guess if we ant to stick to the unit encoded in awu and used by the
block layer, yes.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 00/13] large atomic writes for xfs with CoW
2025-03-13 17:12 [PATCH v6 00/13] large atomic writes for xfs with CoW John Garry
` (12 preceding siblings ...)
2025-03-13 17:13 ` [PATCH v6 13/13] xfs: update atomic write max size John Garry
@ 2025-03-18 5:48 ` Christoph Hellwig
2025-03-18 8:44 ` John Garry
13 siblings, 1 reply; 65+ messages in thread
From: Christoph Hellwig @ 2025-03-18 5:48 UTC (permalink / raw)
To: John Garry
Cc: brauner, djwong, cem, dchinner, hch, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
As said before what I'd really like to see going along with this is
a good set of test for xfstests, including exercising the worst case
of the atomic always COW writes.
^ permalink raw reply [flat|nested] 65+ messages in thread
* Re: [PATCH v6 00/13] large atomic writes for xfs with CoW
2025-03-18 5:48 ` [PATCH v6 00/13] large atomic writes for xfs with CoW Christoph Hellwig
@ 2025-03-18 8:44 ` John Garry
0 siblings, 0 replies; 65+ messages in thread
From: John Garry @ 2025-03-18 8:44 UTC (permalink / raw)
To: Christoph Hellwig
Cc: brauner, djwong, cem, dchinner, linux-xfs, linux-fsdevel,
linux-kernel, ojaswin, ritesh.list, martin.petersen, tytso,
linux-ext4
On 18/03/2025 05:48, Christoph Hellwig wrote:
> As said before what I'd really like to see going along with this is
> a good set of test for xfstests, including exercising the worst case
> of the atomic always COW writes.
>
ok, fine. We already had something for the forcealign stuff, so can
rework that.
Thanks
^ permalink raw reply [flat|nested] 65+ messages in thread