* [PATCH 01/10] iomap: allow the file system to submit the writeback bios
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 17:54 ` Darrick J. Wong
2024-12-19 17:39 ` [PATCH 02/10] iomap: simplify io_flags and io_type in struct iomap_ioend Christoph Hellwig
` (8 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
Change ->prepare_ioend to ->submit_ioend and require file systems that
implement it to submit the bio. This is needed for file systems that
do their own work on the bios before submitting them to the block layer
like btrfs or zoned xfs. To make this easier also pass the writeback
context to the method.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
Documentation/filesystems/iomap/operations.rst | 11 +++++------
fs/iomap/buffered-io.c | 10 +++++-----
fs/xfs/xfs_aops.c | 13 +++++++++----
include/linux/iomap.h | 12 +++++++-----
4 files changed, 26 insertions(+), 20 deletions(-)
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index ef082e5a4e0c..7ef39b13e65c 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -283,7 +283,7 @@ The ``ops`` structure must be specified and is as follows:
struct iomap_writeback_ops {
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
loff_t offset, unsigned len);
- int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
+ int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
void (*discard_folio)(struct folio *folio, loff_t pos);
};
@@ -306,13 +306,12 @@ The fields are as follows:
purpose.
This function must be supplied by the filesystem.
- - ``prepare_ioend``: Enables filesystems to transform the writeback
- ioend or perform any other preparatory work before the writeback I/O
- is submitted.
+ - ``submit_ioend``: Allows the file systems to hook into writeback bio
+ submission.
This might include pre-write space accounting updates, or installing
a custom ``->bi_end_io`` function for internal purposes, such as
deferring the ioend completion to a workqueue to run metadata update
- transactions from process context.
+ transactions from process context before submitting the bio.
This function is optional.
- ``discard_folio``: iomap calls this function after ``->map_blocks``
@@ -341,7 +340,7 @@ This can happen in interrupt or process context, depending on the
storage device.
Filesystems that need to update internal bookkeeping (e.g. unwritten
-extent conversions) should provide a ``->prepare_ioend`` function to
+extent conversions) should provide a ``->submit_ioend`` function to
set ``struct iomap_end::bio::bi_end_io`` to its own function.
This function should call ``iomap_finish_ioends`` after finishing its
own work (e.g. unwritten extent conversion).
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 955f19e27e47..cdccf11bb3be 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1675,7 +1675,7 @@ static void iomap_writepage_end_bio(struct bio *bio)
}
/*
- * Submit the final bio for an ioend.
+ * Submit an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
* the submission process has failed after we've marked pages for writeback.
@@ -1694,14 +1694,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
* failure happened so that the file system end I/O handler gets called
* to clean up.
*/
- if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(wpc->ioend, error);
+ if (wpc->ops->submit_ioend)
+ error = wpc->ops->submit_ioend(wpc, error);
+ else if (!error)
+ submit_bio(&wpc->ioend->io_bio);
if (error) {
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
bio_endio(&wpc->ioend->io_bio);
- } else {
- submit_bio(&wpc->ioend->io_bio);
}
wpc->ioend = NULL;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 559a3a577097..d175853da5ae 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -395,10 +395,11 @@ xfs_map_blocks(
}
static int
-xfs_prepare_ioend(
- struct iomap_ioend *ioend,
+xfs_submit_ioend(
+ struct iomap_writepage_ctx *wpc,
int status)
{
+ struct iomap_ioend *ioend = wpc->ioend;
unsigned int nofs_flag;
/*
@@ -420,7 +421,11 @@ xfs_prepare_ioend(
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
ioend->io_bio.bi_end_io = xfs_end_bio;
- return status;
+
+ if (status)
+ return status;
+ submit_bio(&ioend->io_bio);
+ return 0;
}
/*
@@ -462,7 +467,7 @@ xfs_discard_folio(
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
- .prepare_ioend = xfs_prepare_ioend,
+ .submit_ioend = xfs_submit_ioend,
.discard_folio = xfs_discard_folio,
};
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 5675af6b740c..c0339678d798 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -362,12 +362,14 @@ struct iomap_writeback_ops {
loff_t offset, unsigned len);
/*
- * Optional, allows the file systems to perform actions just before
- * submitting the bio and/or override the bio end_io handler for complex
- * operations like copy on write extent manipulation or unwritten extent
- * conversions.
+ * Optional, allows the file systems to hook into bio submission,
+ * including overriding the bi_end_io handler.
+ *
+ * Returns 0 if the bio was successfully submitted, or a negative
+ * error code if status was non-zero or another error happened and
+ * the bio could not be submitted.
*/
- int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
+ int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
/*
* Optional, allows the file system to discard state on a page where
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 01/10] iomap: allow the file system to submit the writeback bios
2024-12-19 17:39 ` [PATCH 01/10] iomap: allow the file system to submit the writeback bios Christoph Hellwig
@ 2024-12-19 17:54 ` Darrick J. Wong
0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2024-12-19 17:54 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christian Brauner, Carlos Maiolino, linux-xfs, linux-fsdevel
On Thu, Dec 19, 2024 at 05:39:06PM +0000, Christoph Hellwig wrote:
> Change ->prepare_ioend to ->submit_ioend and require file systems that
> implement it to submit the bio. This is needed for file systems that
> do their own work on the bios before submitting them to the block layer
> like btrfs or zoned xfs. To make this easier also pass the writeback
> context to the method.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
Looks good now,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> Documentation/filesystems/iomap/operations.rst | 11 +++++------
> fs/iomap/buffered-io.c | 10 +++++-----
> fs/xfs/xfs_aops.c | 13 +++++++++----
> include/linux/iomap.h | 12 +++++++-----
> 4 files changed, 26 insertions(+), 20 deletions(-)
>
> diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
> index ef082e5a4e0c..7ef39b13e65c 100644
> --- a/Documentation/filesystems/iomap/operations.rst
> +++ b/Documentation/filesystems/iomap/operations.rst
> @@ -283,7 +283,7 @@ The ``ops`` structure must be specified and is as follows:
> struct iomap_writeback_ops {
> int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
> loff_t offset, unsigned len);
> - int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
> + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
> void (*discard_folio)(struct folio *folio, loff_t pos);
> };
>
> @@ -306,13 +306,12 @@ The fields are as follows:
> purpose.
> This function must be supplied by the filesystem.
>
> - - ``prepare_ioend``: Enables filesystems to transform the writeback
> - ioend or perform any other preparatory work before the writeback I/O
> - is submitted.
> + - ``submit_ioend``: Allows the file systems to hook into writeback bio
> + submission.
> This might include pre-write space accounting updates, or installing
> a custom ``->bi_end_io`` function for internal purposes, such as
> deferring the ioend completion to a workqueue to run metadata update
> - transactions from process context.
> + transactions from process context before submitting the bio.
> This function is optional.
>
> - ``discard_folio``: iomap calls this function after ``->map_blocks``
> @@ -341,7 +340,7 @@ This can happen in interrupt or process context, depending on the
> storage device.
>
> Filesystems that need to update internal bookkeeping (e.g. unwritten
> -extent conversions) should provide a ``->prepare_ioend`` function to
> +extent conversions) should provide a ``->submit_ioend`` function to
> set ``struct iomap_end::bio::bi_end_io`` to its own function.
> This function should call ``iomap_finish_ioends`` after finishing its
> own work (e.g. unwritten extent conversion).
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 955f19e27e47..cdccf11bb3be 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1675,7 +1675,7 @@ static void iomap_writepage_end_bio(struct bio *bio)
> }
>
> /*
> - * Submit the final bio for an ioend.
> + * Submit an ioend.
> *
> * If @error is non-zero, it means that we have a situation where some part of
> * the submission process has failed after we've marked pages for writeback.
> @@ -1694,14 +1694,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
> * failure happened so that the file system end I/O handler gets called
> * to clean up.
> */
> - if (wpc->ops->prepare_ioend)
> - error = wpc->ops->prepare_ioend(wpc->ioend, error);
> + if (wpc->ops->submit_ioend)
> + error = wpc->ops->submit_ioend(wpc, error);
> + else if (!error)
> + submit_bio(&wpc->ioend->io_bio);
>
> if (error) {
> wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
> bio_endio(&wpc->ioend->io_bio);
> - } else {
> - submit_bio(&wpc->ioend->io_bio);
> }
>
> wpc->ioend = NULL;
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 559a3a577097..d175853da5ae 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -395,10 +395,11 @@ xfs_map_blocks(
> }
>
> static int
> -xfs_prepare_ioend(
> - struct iomap_ioend *ioend,
> +xfs_submit_ioend(
> + struct iomap_writepage_ctx *wpc,
> int status)
> {
> + struct iomap_ioend *ioend = wpc->ioend;
> unsigned int nofs_flag;
>
> /*
> @@ -420,7 +421,11 @@ xfs_prepare_ioend(
> if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
> (ioend->io_flags & IOMAP_F_SHARED))
> ioend->io_bio.bi_end_io = xfs_end_bio;
> - return status;
> +
> + if (status)
> + return status;
> + submit_bio(&ioend->io_bio);
> + return 0;
> }
>
> /*
> @@ -462,7 +467,7 @@ xfs_discard_folio(
>
> static const struct iomap_writeback_ops xfs_writeback_ops = {
> .map_blocks = xfs_map_blocks,
> - .prepare_ioend = xfs_prepare_ioend,
> + .submit_ioend = xfs_submit_ioend,
> .discard_folio = xfs_discard_folio,
> };
>
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 5675af6b740c..c0339678d798 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -362,12 +362,14 @@ struct iomap_writeback_ops {
> loff_t offset, unsigned len);
>
> /*
> - * Optional, allows the file systems to perform actions just before
> - * submitting the bio and/or override the bio end_io handler for complex
> - * operations like copy on write extent manipulation or unwritten extent
> - * conversions.
> + * Optional, allows the file systems to hook into bio submission,
> + * including overriding the bi_end_io handler.
> + *
> + * Returns 0 if the bio was successfully submitted, or a negative
> + * error code if status was non-zero or another error happened and
> + * the bio could not be submitted.
> */
> - int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
> + int (*submit_ioend)(struct iomap_writepage_ctx *wpc, int status);
>
> /*
> * Optional, allows the file system to discard state on a page where
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 02/10] iomap: simplify io_flags and io_type in struct iomap_ioend
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
2024-12-19 17:39 ` [PATCH 01/10] iomap: allow the file system to submit the writeback bios Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 17:56 ` Darrick J. Wong
2024-12-19 17:39 ` [PATCH 03/10] iomap: add a IOMAP_F_ANON_WRITE flag Christoph Hellwig
` (7 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
The ioend fields for distinct types of I/O are a bit complicated.
Consolidate them into a single io_flag field with its own flags
decoupled from the iomap flags. This also prepares for adding a new
flag that is unrelated to both of the iomap namespaces.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/buffered-io.c | 39 ++++++++++++++++++++++-----------------
fs/xfs/xfs_aops.c | 12 ++++++------
include/linux/iomap.h | 20 ++++++++++++++++++--
3 files changed, 46 insertions(+), 25 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index cdccf11bb3be..3176dc996fb7 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1605,13 +1605,10 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
if (ioend->io_bio.bi_status != next->io_bio.bi_status)
return false;
- if (next->io_flags & IOMAP_F_BOUNDARY)
+ if (next->io_flags & IOMAP_IOEND_BOUNDARY)
return false;
- if ((ioend->io_flags & IOMAP_F_SHARED) ^
- (next->io_flags & IOMAP_F_SHARED))
- return false;
- if ((ioend->io_type == IOMAP_UNWRITTEN) ^
- (next->io_type == IOMAP_UNWRITTEN))
+ if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
if (ioend->io_offset + ioend->io_size != next->io_offset)
return false;
@@ -1709,7 +1706,8 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
}
static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos)
+ struct writeback_control *wbc, struct inode *inode, loff_t pos,
+ u16 ioend_flags)
{
struct iomap_ioend *ioend;
struct bio *bio;
@@ -1724,8 +1722,7 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
ioend = iomap_ioend_from_bio(bio);
INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_type = wpc->iomap.type;
- ioend->io_flags = wpc->iomap.flags;
+ ioend->io_flags = ioend_flags;
if (pos > wpc->iomap.offset)
wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
ioend->io_inode = inode;
@@ -1737,14 +1734,13 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
return ioend;
}
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+ u16 ioend_flags)
{
- if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
- return false;
- if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
- (wpc->ioend->io_flags & IOMAP_F_SHARED))
+ if (ioend_flags & IOMAP_IOEND_BOUNDARY)
return false;
- if (wpc->iomap.type != wpc->ioend->io_type)
+ if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
return false;
if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
@@ -1778,14 +1774,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
{
struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
new_ioend:
error = iomap_submit_ioend(wpc, 0);
if (error)
return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+ ioend_flags);
}
if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index d175853da5ae..d35ac4c19fb2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -114,7 +114,7 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED) {
+ if (ioend->io_flags & IOMAP_IOEND_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
@@ -125,9 +125,9 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_IOEND_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
- else if (ioend->io_type == IOMAP_UNWRITTEN)
+ else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
if (!error && xfs_ioend_is_append(ioend))
@@ -410,7 +410,7 @@ xfs_submit_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
- if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
+ if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
@@ -418,8 +418,8 @@ xfs_submit_ioend(
memalloc_nofs_restore(nofs_flag);
/* send ioends that might require a transaction to the completion wq */
- if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
- (ioend->io_flags & IOMAP_F_SHARED))
+ if (xfs_ioend_is_append(ioend) ||
+ (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
ioend->io_bio.bi_end_io = xfs_end_bio;
if (status)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index c0339678d798..31857d4750a9 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -327,13 +327,29 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset,
sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
const struct iomap_ops *ops);
+/*
+ * Flags for iomap_ioend->io_flags.
+ */
+/* shared COW extent */
+#define IOMAP_IOEND_SHARED (1U << 0)
+/* unwritten extent */
+#define IOMAP_IOEND_UNWRITTEN (1U << 1)
+/* don't merge into previous ioend */
+#define IOMAP_IOEND_BOUNDARY (1U << 2)
+
+/*
+ * Flags that if set on either ioend prevent the merge of two ioends.
+ * (IOMAP_IOEND_BOUNDARY also prevents merges, but only one-way)
+ */
+#define IOMAP_IOEND_NOMERGE_FLAGS \
+ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN)
+
/*
* Structure for writeback I/O completions.
*/
struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
- u16 io_type;
- u16 io_flags; /* IOMAP_F_* */
+ u16 io_flags; /* IOMAP_IOEND_* */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
loff_t io_offset; /* offset in the file */
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 02/10] iomap: simplify io_flags and io_type in struct iomap_ioend
2024-12-19 17:39 ` [PATCH 02/10] iomap: simplify io_flags and io_type in struct iomap_ioend Christoph Hellwig
@ 2024-12-19 17:56 ` Darrick J. Wong
0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2024-12-19 17:56 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christian Brauner, Carlos Maiolino, linux-xfs, linux-fsdevel
On Thu, Dec 19, 2024 at 05:39:07PM +0000, Christoph Hellwig wrote:
> The ioend fields for distinct types of I/O are a bit complicated.
> Consolidate them into a single io_flag field with it's own flags
> decoupled from the iomap flags. This also prepares for adding a new
> flag that is unrelated to both of the iomap namespaces.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/iomap/buffered-io.c | 39 ++++++++++++++++++++++-----------------
> fs/xfs/xfs_aops.c | 12 ++++++------
> include/linux/iomap.h | 20 ++++++++++++++++++--
> 3 files changed, 46 insertions(+), 25 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index cdccf11bb3be..3176dc996fb7 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1605,13 +1605,10 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
> {
> if (ioend->io_bio.bi_status != next->io_bio.bi_status)
> return false;
> - if (next->io_flags & IOMAP_F_BOUNDARY)
> + if (next->io_flags & IOMAP_IOEND_BOUNDARY)
> return false;
> - if ((ioend->io_flags & IOMAP_F_SHARED) ^
> - (next->io_flags & IOMAP_F_SHARED))
> - return false;
> - if ((ioend->io_type == IOMAP_UNWRITTEN) ^
> - (next->io_type == IOMAP_UNWRITTEN))
> + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
> + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
> return false;
> if (ioend->io_offset + ioend->io_size != next->io_offset)
> return false;
> @@ -1709,7 +1706,8 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
> }
>
> static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
> - struct writeback_control *wbc, struct inode *inode, loff_t pos)
> + struct writeback_control *wbc, struct inode *inode, loff_t pos,
> + u16 ioend_flags)
> {
> struct iomap_ioend *ioend;
> struct bio *bio;
> @@ -1724,8 +1722,7 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
>
> ioend = iomap_ioend_from_bio(bio);
> INIT_LIST_HEAD(&ioend->io_list);
> - ioend->io_type = wpc->iomap.type;
> - ioend->io_flags = wpc->iomap.flags;
> + ioend->io_flags = ioend_flags;
> if (pos > wpc->iomap.offset)
> wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
> ioend->io_inode = inode;
> @@ -1737,14 +1734,13 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
> return ioend;
> }
>
> -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
> +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
> + u16 ioend_flags)
> {
> - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
> - return false;
> - if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
> - (wpc->ioend->io_flags & IOMAP_F_SHARED))
> + if (ioend_flags & IOMAP_IOEND_BOUNDARY)
> return false;
> - if (wpc->iomap.type != wpc->ioend->io_type)
> + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
> + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
> return false;
> if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
> return false;
> @@ -1778,14 +1774,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
> {
> struct iomap_folio_state *ifs = folio->private;
> size_t poff = offset_in_folio(folio, pos);
> + unsigned int ioend_flags = 0;
> int error;
>
> - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
> + if (wpc->iomap.type == IOMAP_UNWRITTEN)
> + ioend_flags |= IOMAP_IOEND_UNWRITTEN;
> + if (wpc->iomap.flags & IOMAP_F_SHARED)
> + ioend_flags |= IOMAP_IOEND_SHARED;
> + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
> + ioend_flags |= IOMAP_IOEND_BOUNDARY;
> +
> + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
> new_ioend:
> error = iomap_submit_ioend(wpc, 0);
> if (error)
> return error;
> - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
> + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
> + ioend_flags);
> }
>
> if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index d175853da5ae..d35ac4c19fb2 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -114,7 +114,7 @@ xfs_end_ioend(
> */
> error = blk_status_to_errno(ioend->io_bio.bi_status);
> if (unlikely(error)) {
> - if (ioend->io_flags & IOMAP_F_SHARED) {
> + if (ioend->io_flags & IOMAP_IOEND_SHARED) {
> xfs_reflink_cancel_cow_range(ip, offset, size, true);
> xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
> offset + size);
> @@ -125,9 +125,9 @@ xfs_end_ioend(
> /*
> * Success: commit the COW or unwritten blocks if needed.
> */
> - if (ioend->io_flags & IOMAP_F_SHARED)
> + if (ioend->io_flags & IOMAP_IOEND_SHARED)
> error = xfs_reflink_end_cow(ip, offset, size);
> - else if (ioend->io_type == IOMAP_UNWRITTEN)
> + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN)
> error = xfs_iomap_write_unwritten(ip, offset, size, false);
>
> if (!error && xfs_ioend_is_append(ioend))
> @@ -410,7 +410,7 @@ xfs_submit_ioend(
> nofs_flag = memalloc_nofs_save();
>
> /* Convert CoW extents to regular */
> - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
> + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) {
> status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
> ioend->io_offset, ioend->io_size);
> }
> @@ -418,8 +418,8 @@ xfs_submit_ioend(
> memalloc_nofs_restore(nofs_flag);
>
> /* send ioends that might require a transaction to the completion wq */
> - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
> - (ioend->io_flags & IOMAP_F_SHARED))
> + if (xfs_ioend_is_append(ioend) ||
> + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)))
> ioend->io_bio.bi_end_io = xfs_end_bio;
>
> if (status)
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index c0339678d798..31857d4750a9 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -327,13 +327,29 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset,
> sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
> const struct iomap_ops *ops);
>
> +/*
> + * Flags for iomap_ioend->io_flags.
> + */
> +/* shared COW extent */
> +#define IOMAP_IOEND_SHARED (1U << 0)
> +/* unwritten extent */
> +#define IOMAP_IOEND_UNWRITTEN (1U << 1)
> +/* don't merge into previous ioend */
> +#define IOMAP_IOEND_BOUNDARY (1U << 2)
> +
> +/*
> + * Flags that if set on either ioend prevent the merge of two ioends.
> + * (IOMAP_IOEND_BOUNDARY also prevents merged, but only one-way)
merges
With that fixed,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> + */
> +#define IOMAP_IOEND_NOMERGE_FLAGS \
> + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN)
> +
> /*
> * Structure for writeback I/O completions.
> */
> struct iomap_ioend {
> struct list_head io_list; /* next ioend in chain */
> - u16 io_type;
> - u16 io_flags; /* IOMAP_F_* */
> + u16 io_flags; /* IOMAP_IOEND_* */
> struct inode *io_inode; /* file being written to */
> size_t io_size; /* size of the extent */
> loff_t io_offset; /* offset in the file */
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 03/10] iomap: add a IOMAP_F_ANON_WRITE flag
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
2024-12-19 17:39 ` [PATCH 01/10] iomap: allow the file system to submit the writeback bios Christoph Hellwig
2024-12-19 17:39 ` [PATCH 02/10] iomap: simplify io_flags and io_type in struct iomap_ioend Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 18:02 ` Darrick J. Wong
2024-12-19 17:39 ` [PATCH 04/10] iomap: split bios to zone append limits in the submission handlers Christoph Hellwig
` (6 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
Add a IOMAP_F_ANON_WRITE flag that indicates that the write I/O does not
have a target block assigned to it yet at iomap time and the file system
will do that in the bio submission handler, splitting the I/O as needed.
This is used to implement Zone Append based I/O for zoned XFS, where
splitting writes to the hardware limits and assigning a zone to them
happens just before sending the I/O off to the block layer, but could
also be useful for other things like compressed I/O.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
Documentation/filesystems/iomap/design.rst | 4 ++++
fs/iomap/buffered-io.c | 13 +++++++++----
fs/iomap/direct-io.c | 6 ++++--
include/linux/iomap.h | 7 +++++++
4 files changed, 24 insertions(+), 6 deletions(-)
diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst
index b0d0188a095e..28ab3758c474 100644
--- a/Documentation/filesystems/iomap/design.rst
+++ b/Documentation/filesystems/iomap/design.rst
@@ -246,6 +246,10 @@ The fields are as follows:
* **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
be set by the filesystem for its own purposes.
+ * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
+ block assigned to it yet and the file system will do that in the bio
+ submission handler, splitting the I/O as needed.
+
These flags can be set by iomap itself during file operations.
The filesystem should supply an ``->iomap_end`` function if it needs
to observe these flags:
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3176dc996fb7..8c18fb2a82e0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1691,10 +1691,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
* failure happened so that the file system end I/O handler gets called
* to clean up.
*/
- if (wpc->ops->submit_ioend)
+ if (wpc->ops->submit_ioend) {
error = wpc->ops->submit_ioend(wpc, error);
- else if (!error)
- submit_bio(&wpc->ioend->io_bio);
+ } else {
+ if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
+ error = -EIO;
+ if (!error)
+ submit_bio(&wpc->ioend->io_bio);
+ }
if (error) {
wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
@@ -1744,7 +1748,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
return false;
if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
- if (iomap_sector(&wpc->iomap, pos) !=
+ if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+ iomap_sector(&wpc->iomap, pos) !=
bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b521eb15759e..641649a04614 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -81,10 +81,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
WRITE_ONCE(iocb->private, bio);
}
- if (dio->dops && dio->dops->submit_io)
+ if (dio->dops && dio->dops->submit_io) {
dio->dops->submit_io(iter, bio, pos);
- else
+ } else {
+ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
submit_bio(bio);
+ }
}
ssize_t iomap_dio_complete(struct iomap_dio *dio)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 31857d4750a9..36a7298b6cea 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -56,6 +56,10 @@ struct vm_fault;
*
* IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
* never be merged with the mapping before it.
+ *
+ * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
+ * assigned to it yet and the file system will do that in the bio submission
+ * handler, splitting the I/O as needed.
*/
#define IOMAP_F_NEW (1U << 0)
#define IOMAP_F_DIRTY (1U << 1)
@@ -68,6 +72,7 @@ struct vm_fault;
#endif /* CONFIG_BUFFER_HEAD */
#define IOMAP_F_XATTR (1U << 5)
#define IOMAP_F_BOUNDARY (1U << 6)
+#define IOMAP_F_ANON_WRITE (1U << 7)
/*
* Flags set by the core iomap code during operations:
@@ -111,6 +116,8 @@ struct iomap {
static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
{
+ if (iomap->flags & IOMAP_F_ANON_WRITE)
+ return U64_MAX; /* invalid */
return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
}
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 03/10] iomap: add a IOMAP_F_ANON_WRITE flag
2024-12-19 17:39 ` [PATCH 03/10] iomap: add a IOMAP_F_ANON_WRITE flag Christoph Hellwig
@ 2024-12-19 18:02 ` Darrick J. Wong
2024-12-19 18:24 ` Christoph Hellwig
0 siblings, 1 reply; 20+ messages in thread
From: Darrick J. Wong @ 2024-12-19 18:02 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christian Brauner, Carlos Maiolino, linux-xfs, linux-fsdevel
On Thu, Dec 19, 2024 at 05:39:08PM +0000, Christoph Hellwig wrote:
> Add a IOMAP_F_ANON_WRITE flag that indicates that the write I/O does not
> have a target block assigned to it yet at iomap time and the file system
> will do that in the bio submission handler, splitting the I/O as needed.
>
> This is used to implement Zone Append based I/O for zoned XFS, where
> splitting writes to the hardware limits and assigning a zone to them
> happens just before sending the I/O off to the block layer, but could
> also be useful for other things like compressed I/O.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> Documentation/filesystems/iomap/design.rst | 4 ++++
> fs/iomap/buffered-io.c | 13 +++++++++----
> fs/iomap/direct-io.c | 6 ++++--
> include/linux/iomap.h | 7 +++++++
> 4 files changed, 24 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/filesystems/iomap/design.rst b/Documentation/filesystems/iomap/design.rst
> index b0d0188a095e..28ab3758c474 100644
> --- a/Documentation/filesystems/iomap/design.rst
> +++ b/Documentation/filesystems/iomap/design.rst
> @@ -246,6 +246,10 @@ The fields are as follows:
> * **IOMAP_F_PRIVATE**: Starting with this value, the upper bits can
> be set by the filesystem for its own purposes.
>
> + * **IOMAP_F_ANON_WRITE**: Indicates that (write) I/O does not have a target
> + block assigned to it yet and the file system will do that in the bio
> + submission handler, splitting the I/O as needed.
> +
> These flags can be set by iomap itself during file operations.
> The filesystem should supply an ``->iomap_end`` function if it needs
> to observe these flags:
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 3176dc996fb7..8c18fb2a82e0 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -1691,10 +1691,14 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
> * failure happened so that the file system end I/O handler gets called
> * to clean up.
> */
> - if (wpc->ops->submit_ioend)
> + if (wpc->ops->submit_ioend) {
> error = wpc->ops->submit_ioend(wpc, error);
> - else if (!error)
> - submit_bio(&wpc->ioend->io_bio);
> + } else {
> + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
> + error = -EIO;
> + if (!error)
> + submit_bio(&wpc->ioend->io_bio);
> + }
>
> if (error) {
> wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
> @@ -1744,7 +1748,8 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
> return false;
> if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
> return false;
> - if (iomap_sector(&wpc->iomap, pos) !=
> + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
> + iomap_sector(&wpc->iomap, pos) !=
> bio_end_sector(&wpc->ioend->io_bio))
> return false;
> /*
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index b521eb15759e..641649a04614 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -81,10 +81,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter,
> WRITE_ONCE(iocb->private, bio);
> }
>
> - if (dio->dops && dio->dops->submit_io)
> + if (dio->dops && dio->dops->submit_io) {
> dio->dops->submit_io(iter, bio, pos);
> - else
> + } else {
> + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE);
> submit_bio(bio);
Do we need to error the bio instead of submitting it if
IOMAP_F_ANON_WRITE is set here? Or are we relying on the block
layer/device to reject an IO to U64_MAX and produce the EIO for us?
If yes, then that's acceptable to me
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> + }
> }
>
> ssize_t iomap_dio_complete(struct iomap_dio *dio)
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 31857d4750a9..36a7298b6cea 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -56,6 +56,10 @@ struct vm_fault;
> *
> * IOMAP_F_BOUNDARY indicates that I/O and I/O completions for this iomap must
> * never be merged with the mapping before it.
> + *
> + * IOMAP_F_ANON_WRITE indicates that (write) I/O does not have a target block
> + * assigned to it yet and the file system will do that in the bio submission
> + * handler, splitting the I/O as needed.
> */
> #define IOMAP_F_NEW (1U << 0)
> #define IOMAP_F_DIRTY (1U << 1)
> @@ -68,6 +72,7 @@ struct vm_fault;
> #endif /* CONFIG_BUFFER_HEAD */
> #define IOMAP_F_XATTR (1U << 5)
> #define IOMAP_F_BOUNDARY (1U << 6)
> +#define IOMAP_F_ANON_WRITE (1U << 7)
>
> /*
> * Flags set by the core iomap code during operations:
> @@ -111,6 +116,8 @@ struct iomap {
>
> static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
> {
> + if (iomap->flags & IOMAP_F_ANON_WRITE)
> + return U64_MAX; /* invalid */
> return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
> }
>
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 03/10] iomap: add a IOMAP_F_ANON_WRITE flag
2024-12-19 18:02 ` Darrick J. Wong
@ 2024-12-19 18:24 ` Christoph Hellwig
0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 18:24 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Christian Brauner, Carlos Maiolino, linux-xfs,
linux-fsdevel
On Thu, Dec 19, 2024 at 10:02:08AM -0800, Darrick J. Wong wrote:
> Do we need to error the bio instead of submitting it if
> IOMAP_F_ANON_WRITE is set here? Or are we relying on the block
> layer/device to reject an IO to U64_MAX and produce the EIO for us?
I'd rather not add an error return where we previously didn't for
catching a grave programmer error. I wonder if I should also drop
the error handling in the writeback submission to be consistent?
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 04/10] iomap: split bios to zone append limits in the submission handlers
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
` (2 preceding siblings ...)
2024-12-19 17:39 ` [PATCH 03/10] iomap: add a IOMAP_F_ANON_WRITE flag Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 18:17 ` Darrick J. Wong
2024-12-19 17:39 ` [PATCH 05/10] iomap: move common ioend code to ioend.c Christoph Hellwig
` (5 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
Provide helpers for file systems to split bios in the direct I/O and
writeback I/O submission handlers.
This follows btrfs' lead and doesn't try to build bios to hardware limits
for zone append commands, but instead build them as normal unconstrained
bios and split them to the hardware limits in the I/O submission handler.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/Makefile | 1 +
fs/iomap/buffered-io.c | 49 ++++++++++++++----------
fs/iomap/ioend.c | 86 ++++++++++++++++++++++++++++++++++++++++++
include/linux/iomap.h | 9 +++++
4 files changed, 125 insertions(+), 20 deletions(-)
create mode 100644 fs/iomap/ioend.c
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 381d76c5c232..69e8ebb41302 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -12,6 +12,7 @@ iomap-y += trace.o \
iter.o
iomap-$(CONFIG_BLOCK) += buffered-io.o \
direct-io.o \
+ ioend.o \
fiemap.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 8c18fb2a82e0..0b68c9584a7f 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -40,7 +40,8 @@ struct iomap_folio_state {
unsigned long state[];
};
-static struct bio_set iomap_ioend_bioset;
+struct bio_set iomap_ioend_bioset;
+EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
@@ -1539,15 +1540,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
* ioend after this.
*/
static u32
-iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_bio;
struct folio_iter fi;
u32 folio_count = 0;
- if (error) {
- mapping_set_error(inode->i_mapping, error);
+ if (ioend->io_error) {
+ mapping_set_error(inode->i_mapping, ioend->io_error);
if (!bio_flagged(bio, BIO_QUIET)) {
pr_err_ratelimited(
"%s: writeback error on inode %lu, offset %lld, sector %llu",
@@ -1566,6 +1567,24 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
return folio_count;
}
+static u32
+iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+ if (ioend->io_parent) {
+ struct bio *bio = &ioend->io_bio;
+
+ ioend = ioend->io_parent;
+ bio_put(bio);
+ }
+
+ if (error)
+ cmpxchg(&ioend->io_error, 0, error);
+
+ if (!atomic_dec_and_test(&ioend->io_remaining))
+ return 0;
+ return iomap_finish_ioend_buffered(ioend);
+}
+
/*
* Ioend completion routine for merged bios. This can only be called from task
* contexts as merged ioends can be of unbound length. Hence we have to break up
@@ -1667,8 +1686,10 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends);
static void iomap_writepage_end_bio(struct bio *bio)
{
- iomap_finish_ioend(iomap_ioend_from_bio(bio),
- blk_status_to_errno(bio->bi_status));
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ ioend->io_error = blk_status_to_errno(bio->bi_status);
+ iomap_finish_ioend_buffered(ioend);
}
/*
@@ -1713,7 +1734,6 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode, loff_t pos,
u16 ioend_flags)
{
- struct iomap_ioend *ioend;
struct bio *bio;
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
@@ -1721,21 +1741,10 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
GFP_NOFS, &iomap_ioend_bioset);
bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
bio->bi_end_io = iomap_writepage_end_bio;
- wbc_init_bio(wbc, bio);
bio->bi_write_hint = inode->i_write_hint;
-
- ioend = iomap_ioend_from_bio(bio);
- INIT_LIST_HEAD(&ioend->io_list);
- ioend->io_flags = ioend_flags;
- if (pos > wpc->iomap.offset)
- wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
- ioend->io_inode = inode;
- ioend->io_size = 0;
- ioend->io_offset = pos;
- ioend->io_sector = bio->bi_iter.bi_sector;
-
+ wbc_init_bio(wbc, bio);
wpc->nr_folios = 0;
- return ioend;
+ return iomap_init_ioend(inode, bio, pos, ioend_flags);
}
static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
new file mode 100644
index 000000000000..1b032323ee4e
--- /dev/null
+++ b/fs/iomap/ioend.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2024 Christoph Hellwig.
+ */
+#include <linux/iomap.h>
+
+struct iomap_ioend *iomap_init_ioend(struct inode *inode,
+ struct bio *bio, loff_t file_offset, u16 ioend_flags)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ atomic_set(&ioend->io_remaining, 1);
+ ioend->io_error = 0;
+ ioend->io_parent = NULL;
+ INIT_LIST_HEAD(&ioend->io_list);
+ ioend->io_flags = ioend_flags;
+ ioend->io_inode = inode;
+ ioend->io_offset = file_offset;
+ ioend->io_size = bio->bi_iter.bi_size;
+ ioend->io_sector = bio->bi_iter.bi_sector;
+ return ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_init_ioend);
+
+/*
+ * Split up to the first @max_len bytes from @ioend if the ioend covers more
+ * than @max_len bytes.
+ *
+ * If @is_append is set, the split will be based on the hardware limits for
+ * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
+ * limits don't allow the entire @max_len length.
+ *
+ * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
+ * does not allow splitting REQ_OP_ZONE_APPEND bios. The file system has to
+ * switch the operation after this call, but before submitting the bio.
+ */
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
+ unsigned int max_len, bool is_append)
+{
+ struct bio *bio = &ioend->io_bio;
+ struct iomap_ioend *split_ioend;
+ unsigned int nr_segs;
+ int sector_offset;
+ struct bio *split;
+
+ if (is_append) {
+ struct queue_limits *lim = bdev_limits(bio->bi_bdev);
+
+ max_len = min(max_len,
+ lim->max_zone_append_sectors << SECTOR_SHIFT);
+
+ sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
+ if (unlikely(sector_offset < 0))
+ return ERR_PTR(sector_offset);
+ if (!sector_offset)
+ return NULL;
+ } else {
+ if (bio->bi_iter.bi_size <= max_len)
+ return NULL;
+ sector_offset = max_len >> SECTOR_SHIFT;
+ }
+
+ /* ensure the split ioend is still block size aligned */
+ sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
+ i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
+
+ split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
+ if (IS_ERR_OR_NULL(split))
+ return ERR_CAST(split);
+ split->bi_private = bio->bi_private;
+ split->bi_end_io = bio->bi_end_io;
+
+ split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
+ ioend->io_flags);
+ split_ioend->io_parent = ioend;
+
+ atomic_inc(&ioend->io_remaining);
+ ioend->io_offset += split_ioend->io_size;
+ ioend->io_size -= split_ioend->io_size;
+
+ split_ioend->io_sector = ioend->io_sector;
+ if (!is_append)
+ ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
+ return split_ioend;
+}
+EXPORT_SYMBOL_GPL(iomap_split_ioend);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 36a7298b6cea..0d221fbe0eb3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -358,6 +358,9 @@ struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
u16 io_flags; /* IOMAP_IOEND_* */
struct inode *io_inode; /* file being written to */
+ atomic_t io_remaining; /* completion defer count */
+ int io_error; /* stashed away status */
+ struct iomap_ioend *io_parent; /* parent for completions */
size_t io_size; /* size of the extent */
loff_t io_offset; /* offset in the file */
sector_t io_sector; /* start sector of ioend */
@@ -408,6 +411,10 @@ struct iomap_writepage_ctx {
u32 nr_folios; /* folios added to the ioend */
};
+struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
+ loff_t file_offset, u16 ioend_flags);
+struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
+ unsigned int max_len, bool is_append);
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends);
@@ -479,4 +486,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO)
#endif /* CONFIG_SWAP */
+extern struct bio_set iomap_ioend_bioset;
+
#endif /* LINUX_IOMAP_H */
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 04/10] iomap: split bios to zone append limits in the submission handlers
2024-12-19 17:39 ` [PATCH 04/10] iomap: split bios to zone append limits in the submission handlers Christoph Hellwig
@ 2024-12-19 18:17 ` Darrick J. Wong
2024-12-19 18:19 ` Christoph Hellwig
0 siblings, 1 reply; 20+ messages in thread
From: Darrick J. Wong @ 2024-12-19 18:17 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christian Brauner, Carlos Maiolino, linux-xfs, linux-fsdevel
On Thu, Dec 19, 2024 at 05:39:09PM +0000, Christoph Hellwig wrote:
> Provide helpers for file systems to split bios in the direct I/O and
> writeback I/O submission handlers.
>
> This follows btrfs' lead and doesn't try to build bios to hardware limits
> for zone append commands, but instead build them as normal unconstrained
> bios and split them to the hardware limits in the I/O submission handler.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
I wonder what iomap_split_ioend callsites look like now that the
alloc_len outparam from the previous version is gone, but I guess I'll
have to wait to see that.
> ---
> fs/iomap/Makefile | 1 +
> fs/iomap/buffered-io.c | 49 ++++++++++++++----------
> fs/iomap/ioend.c | 86 ++++++++++++++++++++++++++++++++++++++++++
> include/linux/iomap.h | 9 +++++
> 4 files changed, 125 insertions(+), 20 deletions(-)
> create mode 100644 fs/iomap/ioend.c
>
> diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
> index 381d76c5c232..69e8ebb41302 100644
> --- a/fs/iomap/Makefile
> +++ b/fs/iomap/Makefile
> @@ -12,6 +12,7 @@ iomap-y += trace.o \
> iter.o
> iomap-$(CONFIG_BLOCK) += buffered-io.o \
> direct-io.o \
> + ioend.o \
> fiemap.o \
> seek.o
> iomap-$(CONFIG_SWAP) += swapfile.o
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 8c18fb2a82e0..0b68c9584a7f 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -40,7 +40,8 @@ struct iomap_folio_state {
> unsigned long state[];
> };
>
> -static struct bio_set iomap_ioend_bioset;
> +struct bio_set iomap_ioend_bioset;
> +EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
>
> static inline bool ifs_is_fully_uptodate(struct folio *folio,
> struct iomap_folio_state *ifs)
> @@ -1539,15 +1540,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
> * ioend after this.
> */
> static u32
> -iomap_finish_ioend(struct iomap_ioend *ioend, int error)
> +iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
> {
> struct inode *inode = ioend->io_inode;
> struct bio *bio = &ioend->io_bio;
> struct folio_iter fi;
> u32 folio_count = 0;
>
> - if (error) {
> - mapping_set_error(inode->i_mapping, error);
> + if (ioend->io_error) {
> + mapping_set_error(inode->i_mapping, ioend->io_error);
> if (!bio_flagged(bio, BIO_QUIET)) {
> pr_err_ratelimited(
> "%s: writeback error on inode %lu, offset %lld, sector %llu",
> @@ -1566,6 +1567,24 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error)
> return folio_count;
> }
>
> +static u32
> +iomap_finish_ioend(struct iomap_ioend *ioend, int error)
> +{
> + if (ioend->io_parent) {
> + struct bio *bio = &ioend->io_bio;
> +
> + ioend = ioend->io_parent;
> + bio_put(bio);
> + }
> +
> + if (error)
> + cmpxchg(&ioend->io_error, 0, error);
> +
> + if (!atomic_dec_and_test(&ioend->io_remaining))
> + return 0;
> + return iomap_finish_ioend_buffered(ioend);
> +}
> +
> /*
> * Ioend completion routine for merged bios. This can only be called from task
> * contexts as merged ioends can be of unbound length. Hence we have to break up
> @@ -1667,8 +1686,10 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends);
>
> static void iomap_writepage_end_bio(struct bio *bio)
> {
> - iomap_finish_ioend(iomap_ioend_from_bio(bio),
> - blk_status_to_errno(bio->bi_status));
> + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
> +
> + ioend->io_error = blk_status_to_errno(bio->bi_status);
> + iomap_finish_ioend_buffered(ioend);
Hmm. This wasn't in the previous version of the patch. But my guess is
that anyone using the io_parent chaining has its own ->submit_ioend
function and therefore set its own bi_end_io function? IOWs, letting
iomap submit the bio itself is not compatible with io_parent != NULL.
If so, then you might want to note that in the declaration of io_parent
in iomap.h; and with that,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> }
>
> /*
> @@ -1713,7 +1734,6 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
> struct writeback_control *wbc, struct inode *inode, loff_t pos,
> u16 ioend_flags)
> {
> - struct iomap_ioend *ioend;
> struct bio *bio;
>
> bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
> @@ -1721,21 +1741,10 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
> GFP_NOFS, &iomap_ioend_bioset);
> bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
> bio->bi_end_io = iomap_writepage_end_bio;
> - wbc_init_bio(wbc, bio);
> bio->bi_write_hint = inode->i_write_hint;
> -
> - ioend = iomap_ioend_from_bio(bio);
> - INIT_LIST_HEAD(&ioend->io_list);
> - ioend->io_flags = ioend_flags;
> - if (pos > wpc->iomap.offset)
> - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY;
> - ioend->io_inode = inode;
> - ioend->io_size = 0;
> - ioend->io_offset = pos;
> - ioend->io_sector = bio->bi_iter.bi_sector;
> -
> + wbc_init_bio(wbc, bio);
> wpc->nr_folios = 0;
> - return ioend;
> + return iomap_init_ioend(inode, bio, pos, ioend_flags);
> }
>
> static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
> diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
> new file mode 100644
> index 000000000000..1b032323ee4e
> --- /dev/null
> +++ b/fs/iomap/ioend.c
> @@ -0,0 +1,86 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (c) 2024 Christoph Hellwig.
> + */
> +#include <linux/iomap.h>
> +
> +struct iomap_ioend *iomap_init_ioend(struct inode *inode,
> + struct bio *bio, loff_t file_offset, u16 ioend_flags)
> +{
> + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
> +
> + atomic_set(&ioend->io_remaining, 1);
> + ioend->io_error = 0;
> + ioend->io_parent = NULL;
> + INIT_LIST_HEAD(&ioend->io_list);
> + ioend->io_flags = ioend_flags;
> + ioend->io_inode = inode;
> + ioend->io_offset = file_offset;
> + ioend->io_size = bio->bi_iter.bi_size;
> + ioend->io_sector = bio->bi_iter.bi_sector;
> + return ioend;
> +}
> +EXPORT_SYMBOL_GPL(iomap_init_ioend);
> +
> +/*
> + * Split up to the first @max_len bytes from @ioend if the ioend covers more
> + * than @max_len bytes.
> + *
> + * If @is_append is set, the split will be based on the hardware limits for
> + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware
> + * limits don't allow the entire @max_len length.
> + *
> + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer
> + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file system has to
> + * switch the operation after this call, but before submitting the bio.
> + */
> +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
> + unsigned int max_len, bool is_append)
> +{
> + struct bio *bio = &ioend->io_bio;
> + struct iomap_ioend *split_ioend;
> + unsigned int nr_segs;
> + int sector_offset;
> + struct bio *split;
> +
> + if (is_append) {
> + struct queue_limits *lim = bdev_limits(bio->bi_bdev);
> +
> + max_len = min(max_len,
> + lim->max_zone_append_sectors << SECTOR_SHIFT);
> +
> + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len);
> + if (unlikely(sector_offset < 0))
> + return ERR_PTR(sector_offset);
> + if (!sector_offset)
> + return NULL;
> + } else {
> + if (bio->bi_iter.bi_size <= max_len)
> + return NULL;
> + sector_offset = max_len >> SECTOR_SHIFT;
> + }
> +
> + /* ensure the split ioend is still block size aligned */
> + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT,
> + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT;
> +
> + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset);
> + if (IS_ERR_OR_NULL(split))
> + return ERR_CAST(split);
> + split->bi_private = bio->bi_private;
> + split->bi_end_io = bio->bi_end_io;
> +
> + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset,
> + ioend->io_flags);
> + split_ioend->io_parent = ioend;
> +
> + atomic_inc(&ioend->io_remaining);
> + ioend->io_offset += split_ioend->io_size;
> + ioend->io_size -= split_ioend->io_size;
> +
> + split_ioend->io_sector = ioend->io_sector;
> + if (!is_append)
> + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT);
> + return split_ioend;
> +}
> +EXPORT_SYMBOL_GPL(iomap_split_ioend);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 36a7298b6cea..0d221fbe0eb3 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -358,6 +358,9 @@ struct iomap_ioend {
> struct list_head io_list; /* next ioend in chain */
> u16 io_flags; /* IOMAP_IOEND_* */
> struct inode *io_inode; /* file being written to */
> + atomic_t io_remaining; /* completion defer count */
> + int io_error; /* stashed away status */
> + struct iomap_ioend *io_parent; /* parent for completions */
> size_t io_size; /* size of the extent */
> loff_t io_offset; /* offset in the file */
> sector_t io_sector; /* start sector of ioend */
> @@ -408,6 +411,10 @@ struct iomap_writepage_ctx {
> u32 nr_folios; /* folios added to the ioend */
> };
>
> +struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
> + loff_t file_offset, u16 ioend_flags);
> +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
> + unsigned int max_len, bool is_append);
> void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
> void iomap_ioend_try_merge(struct iomap_ioend *ioend,
> struct list_head *more_ioends);
> @@ -479,4 +486,6 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
> # define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO)
> #endif /* CONFIG_SWAP */
>
> +extern struct bio_set iomap_ioend_bioset;
> +
> #endif /* LINUX_IOMAP_H */
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread* Re: [PATCH 04/10] iomap: split bios to zone append limits in the submission handlers
2024-12-19 18:17 ` Darrick J. Wong
@ 2024-12-19 18:19 ` Christoph Hellwig
0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 18:19 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Christian Brauner, Carlos Maiolino, linux-xfs,
linux-fsdevel
On Thu, Dec 19, 2024 at 10:17:25AM -0800, Darrick J. Wong wrote:
> On Thu, Dec 19, 2024 at 05:39:09PM +0000, Christoph Hellwig wrote:
> > Provide helpers for file systems to split bios in the direct I/O and
> > writeback I/O submission handlers.
> >
> > This follows btrfs' lead and doesn't try to build bios to hardware limits
> > for zone append commands, but instead build them as normal unconstrained
> > bios and split them to the hardware limits in the I/O submission handler.
> >
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
>
> I wonder what iomap_split_ioend callsites look like now that the
> alloc_len outparam from the previous version is gone, but I guess I'll
> have to wait to see that.
The initial callsite in XFS looks like this now:
http://git.infradead.org/?p=users/hch/xfs.git;a=blob;f=fs/xfs/xfs_zone_alloc.c;h=00b4df1e628f5d65c7aba44326e5f62306ffbd98;hb=refs/heads/xfs-zoned-rebase#l790
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 05/10] iomap: move common ioend code to ioend.c
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
` (3 preceding siblings ...)
2024-12-19 17:39 ` [PATCH 04/10] iomap: split bios to zone append limits in the submission handlers Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 18:20 ` Darrick J. Wong
2024-12-19 17:39 ` [PATCH 06/10] iomap: factor out a iomap_dio_done helper Christoph Hellwig
` (4 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
This code will be reused for direct I/O soon, so split it out of
buffered-io.c.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/buffered-io.c | 135 +----------------------------------------
fs/iomap/internal.h | 9 +++
fs/iomap/ioend.c | 127 ++++++++++++++++++++++++++++++++++++++
3 files changed, 138 insertions(+), 133 deletions(-)
create mode 100644 fs/iomap/internal.h
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0b68c9584a7f..06b90d859d74 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -12,17 +12,15 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
-#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
-#define IOEND_BATCH_SIZE 4096
-
/*
* Structure allocated for each folio to track per-block uptodate, dirty state
* and I/O completions.
@@ -40,9 +38,6 @@ struct iomap_folio_state {
unsigned long state[];
};
-struct bio_set iomap_ioend_bioset;
-EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
-
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@@ -1539,8 +1534,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
-static u32
-iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_bio;
@@ -1567,123 +1561,6 @@ iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
return folio_count;
}
-static u32
-iomap_finish_ioend(struct iomap_ioend *ioend, int error)
-{
- if (ioend->io_parent) {
- struct bio *bio = &ioend->io_bio;
-
- ioend = ioend->io_parent;
- bio_put(bio);
- }
-
- if (error)
- cmpxchg(&ioend->io_error, 0, error);
-
- if (!atomic_dec_and_test(&ioend->io_remaining))
- return 0;
- return iomap_finish_ioend_buffered(ioend);
-}
-
-/*
- * Ioend completion routine for merged bios. This can only be called from task
- * contexts as merged ioends can be of unbound length. Hence we have to break up
- * the writeback completions into manageable chunks to avoid long scheduler
- * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
- * good batch processing throughput without creating adverse scheduler latency
- * conditions.
- */
-void
-iomap_finish_ioends(struct iomap_ioend *ioend, int error)
-{
- struct list_head tmp;
- u32 completions;
-
- might_sleep();
-
- list_replace_init(&ioend->io_list, &tmp);
- completions = iomap_finish_ioend(ioend, error);
-
- while (!list_empty(&tmp)) {
- if (completions > IOEND_BATCH_SIZE * 8) {
- cond_resched();
- completions = 0;
- }
- ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
- list_del_init(&ioend->io_list);
- completions += iomap_finish_ioend(ioend, error);
- }
-}
-EXPORT_SYMBOL_GPL(iomap_finish_ioends);
-
-/*
- * We can merge two adjacent ioends if they have the same set of work to do.
- */
-static bool
-iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
-{
- if (ioend->io_bio.bi_status != next->io_bio.bi_status)
- return false;
- if (next->io_flags & IOMAP_IOEND_BOUNDARY)
- return false;
- if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
- (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
- return false;
- if (ioend->io_offset + ioend->io_size != next->io_offset)
- return false;
- /*
- * Do not merge physically discontiguous ioends. The filesystem
- * completion functions will have to iterate the physical
- * discontiguities even if we merge the ioends at a logical level, so
- * we don't gain anything by merging physical discontiguities here.
- *
- * We cannot use bio->bi_iter.bi_sector here as it is modified during
- * submission so does not point to the start sector of the bio at
- * completion.
- */
- if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
- return false;
- return true;
-}
-
-void
-iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
-{
- struct iomap_ioend *next;
-
- INIT_LIST_HEAD(&ioend->io_list);
-
- while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
- io_list))) {
- if (!iomap_ioend_can_merge(ioend, next))
- break;
- list_move_tail(&next->io_list, &ioend->io_list);
- ioend->io_size += next->io_size;
- }
-}
-EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
-
-static int
-iomap_ioend_compare(void *priv, const struct list_head *a,
- const struct list_head *b)
-{
- struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
- struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
-
- if (ia->io_offset < ib->io_offset)
- return -1;
- if (ia->io_offset > ib->io_offset)
- return 1;
- return 0;
-}
-
-void
-iomap_sort_ioends(struct list_head *ioend_list)
-{
- list_sort(NULL, ioend_list, iomap_ioend_compare);
-}
-EXPORT_SYMBOL_GPL(iomap_sort_ioends);
-
static void iomap_writepage_end_bio(struct bio *bio)
{
struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
@@ -2033,11 +1910,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
-
-static int __init iomap_buffered_init(void)
-{
- return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_bio),
- BIOSET_NEED_BVECS);
-}
-fs_initcall(iomap_buffered_init);
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
new file mode 100644
index 000000000000..36d5c56e073e
--- /dev/null
+++ b/fs/iomap/internal.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _IOMAP_INTERNAL_H
+#define _IOMAP_INTERNAL_H 1
+
+#define IOEND_BATCH_SIZE 4096
+
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
+
+#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index 1b032323ee4e..b4f6dd9e319a 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -3,6 +3,11 @@
* Copyright (c) 2024 Christoph Hellwig.
*/
#include <linux/iomap.h>
+#include <linux/list_sort.h>
+#include "internal.h"
+
+struct bio_set iomap_ioend_bioset;
+EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
struct iomap_ioend *iomap_init_ioend(struct inode *inode,
struct bio *bio, loff_t file_offset, u16 ioend_flags)
@@ -22,6 +27,120 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode,
}
EXPORT_SYMBOL_GPL(iomap_init_ioend);
+static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
+{
+ if (ioend->io_parent) {
+ struct bio *bio = &ioend->io_bio;
+
+ ioend = ioend->io_parent;
+ bio_put(bio);
+ }
+
+ if (error)
+ cmpxchg(&ioend->io_error, 0, error);
+
+ if (!atomic_dec_and_test(&ioend->io_remaining))
+ return 0;
+ return iomap_finish_ioend_buffered(ioend);
+}
+
+/*
+ * Ioend completion routine for merged bios. This can only be called from task
+ * contexts as merged ioends can be of unbound length. Hence we have to break up
+ * the writeback completions into manageable chunks to avoid long scheduler
+ * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
+ * good batch processing throughput without creating adverse scheduler latency
+ * conditions.
+ */
+void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
+{
+ struct list_head tmp;
+ u32 completions;
+
+ might_sleep();
+
+ list_replace_init(&ioend->io_list, &tmp);
+ completions = iomap_finish_ioend(ioend, error);
+
+ while (!list_empty(&tmp)) {
+ if (completions > IOEND_BATCH_SIZE * 8) {
+ cond_resched();
+ completions = 0;
+ }
+ ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
+ list_del_init(&ioend->io_list);
+ completions += iomap_finish_ioend(ioend, error);
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_finish_ioends);
+
+/*
+ * We can merge two adjacent ioends if they have the same set of work to do.
+ */
+static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
+ struct iomap_ioend *next)
+{
+ if (ioend->io_bio.bi_status != next->io_bio.bi_status)
+ return false;
+ if (next->io_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (ioend->io_offset + ioend->io_size != next->io_offset)
+ return false;
+ /*
+ * Do not merge physically discontiguous ioends. The filesystem
+ * completion functions will have to iterate the physical
+ * discontiguities even if we merge the ioends at a logical level, so
+ * we don't gain anything by merging physical discontiguities here.
+ *
+ * We cannot use bio->bi_iter.bi_sector here as it is modified during
+ * submission so does not point to the start sector of the bio at
+ * completion.
+ */
+ if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
+ next->io_sector)
+ return false;
+ return true;
+}
+
+void iomap_ioend_try_merge(struct iomap_ioend *ioend,
+ struct list_head *more_ioends)
+{
+ struct iomap_ioend *next;
+
+ INIT_LIST_HEAD(&ioend->io_list);
+
+ while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
+ io_list))) {
+ if (!iomap_ioend_can_merge(ioend, next))
+ break;
+ list_move_tail(&next->io_list, &ioend->io_list);
+ ioend->io_size += next->io_size;
+ }
+}
+EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
+
+static int iomap_ioend_compare(void *priv, const struct list_head *a,
+ const struct list_head *b)
+{
+ struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
+ struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
+
+ if (ia->io_offset < ib->io_offset)
+ return -1;
+ if (ia->io_offset > ib->io_offset)
+ return 1;
+ return 0;
+}
+
+void iomap_sort_ioends(struct list_head *ioend_list)
+{
+ list_sort(NULL, ioend_list, iomap_ioend_compare);
+}
+EXPORT_SYMBOL_GPL(iomap_sort_ioends);
+
/*
* Split up to the first @max_len bytes from @ioend if the ioend covers more
* than @max_len bytes.
@@ -84,3 +203,11 @@ struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
return split_ioend;
}
EXPORT_SYMBOL_GPL(iomap_split_ioend);
+
+static int __init iomap_ioend_init(void)
+{
+ return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
+ offsetof(struct iomap_ioend, io_bio),
+ BIOSET_NEED_BVECS);
+}
+fs_initcall(iomap_ioend_init);
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 05/10] iomap: move common ioend code to ioend.c
2024-12-19 17:39 ` [PATCH 05/10] iomap: move common ioend code to ioend.c Christoph Hellwig
@ 2024-12-19 18:20 ` Darrick J. Wong
0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2024-12-19 18:20 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christian Brauner, Carlos Maiolino, linux-xfs, linux-fsdevel
On Thu, Dec 19, 2024 at 05:39:10PM +0000, Christoph Hellwig wrote:
> This code will be reused for direct I/O soon, so split it out of
> buffered-io.c.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
This appears to be a straight rearrangement, so
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> ---
> fs/iomap/buffered-io.c | 135 +----------------------------------------
> fs/iomap/internal.h | 9 +++
> fs/iomap/ioend.c | 127 ++++++++++++++++++++++++++++++++++++++
> 3 files changed, 138 insertions(+), 133 deletions(-)
> create mode 100644 fs/iomap/internal.h
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 0b68c9584a7f..06b90d859d74 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -12,17 +12,15 @@
> #include <linux/buffer_head.h>
> #include <linux/dax.h>
> #include <linux/writeback.h>
> -#include <linux/list_sort.h>
> #include <linux/swap.h>
> #include <linux/bio.h>
> #include <linux/sched/signal.h>
> #include <linux/migrate.h>
> +#include "internal.h"
> #include "trace.h"
>
> #include "../internal.h"
>
> -#define IOEND_BATCH_SIZE 4096
> -
> /*
> * Structure allocated for each folio to track per-block uptodate, dirty state
> * and I/O completions.
> @@ -40,9 +38,6 @@ struct iomap_folio_state {
> unsigned long state[];
> };
>
> -struct bio_set iomap_ioend_bioset;
> -EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
> -
> static inline bool ifs_is_fully_uptodate(struct folio *folio,
> struct iomap_folio_state *ifs)
> {
> @@ -1539,8 +1534,7 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
> * state, release holds on bios, and finally free up memory. Do not use the
> * ioend after this.
> */
> -static u32
> -iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
> +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
> {
> struct inode *inode = ioend->io_inode;
> struct bio *bio = &ioend->io_bio;
> @@ -1567,123 +1561,6 @@ iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
> return folio_count;
> }
>
> -static u32
> -iomap_finish_ioend(struct iomap_ioend *ioend, int error)
> -{
> - if (ioend->io_parent) {
> - struct bio *bio = &ioend->io_bio;
> -
> - ioend = ioend->io_parent;
> - bio_put(bio);
> - }
> -
> - if (error)
> - cmpxchg(&ioend->io_error, 0, error);
> -
> - if (!atomic_dec_and_test(&ioend->io_remaining))
> - return 0;
> - return iomap_finish_ioend_buffered(ioend);
> -}
> -
> -/*
> - * Ioend completion routine for merged bios. This can only be called from task
> - * contexts as merged ioends can be of unbound length. Hence we have to break up
> - * the writeback completions into manageable chunks to avoid long scheduler
> - * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
> - * good batch processing throughput without creating adverse scheduler latency
> - * conditions.
> - */
> -void
> -iomap_finish_ioends(struct iomap_ioend *ioend, int error)
> -{
> - struct list_head tmp;
> - u32 completions;
> -
> - might_sleep();
> -
> - list_replace_init(&ioend->io_list, &tmp);
> - completions = iomap_finish_ioend(ioend, error);
> -
> - while (!list_empty(&tmp)) {
> - if (completions > IOEND_BATCH_SIZE * 8) {
> - cond_resched();
> - completions = 0;
> - }
> - ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
> - list_del_init(&ioend->io_list);
> - completions += iomap_finish_ioend(ioend, error);
> - }
> -}
> -EXPORT_SYMBOL_GPL(iomap_finish_ioends);
> -
> -/*
> - * We can merge two adjacent ioends if they have the same set of work to do.
> - */
> -static bool
> -iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
> -{
> - if (ioend->io_bio.bi_status != next->io_bio.bi_status)
> - return false;
> - if (next->io_flags & IOMAP_IOEND_BOUNDARY)
> - return false;
> - if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
> - (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
> - return false;
> - if (ioend->io_offset + ioend->io_size != next->io_offset)
> - return false;
> - /*
> - * Do not merge physically discontiguous ioends. The filesystem
> - * completion functions will have to iterate the physical
> - * discontiguities even if we merge the ioends at a logical level, so
> - * we don't gain anything by merging physical discontiguities here.
> - *
> - * We cannot use bio->bi_iter.bi_sector here as it is modified during
> - * submission so does not point to the start sector of the bio at
> - * completion.
> - */
> - if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector)
> - return false;
> - return true;
> -}
> -
> -void
> -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends)
> -{
> - struct iomap_ioend *next;
> -
> - INIT_LIST_HEAD(&ioend->io_list);
> -
> - while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
> - io_list))) {
> - if (!iomap_ioend_can_merge(ioend, next))
> - break;
> - list_move_tail(&next->io_list, &ioend->io_list);
> - ioend->io_size += next->io_size;
> - }
> -}
> -EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
> -
> -static int
> -iomap_ioend_compare(void *priv, const struct list_head *a,
> - const struct list_head *b)
> -{
> - struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
> - struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
> -
> - if (ia->io_offset < ib->io_offset)
> - return -1;
> - if (ia->io_offset > ib->io_offset)
> - return 1;
> - return 0;
> -}
> -
> -void
> -iomap_sort_ioends(struct list_head *ioend_list)
> -{
> - list_sort(NULL, ioend_list, iomap_ioend_compare);
> -}
> -EXPORT_SYMBOL_GPL(iomap_sort_ioends);
> -
> static void iomap_writepage_end_bio(struct bio *bio)
> {
> struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
> @@ -2033,11 +1910,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
> return iomap_submit_ioend(wpc, error);
> }
> EXPORT_SYMBOL_GPL(iomap_writepages);
> -
> -static int __init iomap_buffered_init(void)
> -{
> - return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
> - offsetof(struct iomap_ioend, io_bio),
> - BIOSET_NEED_BVECS);
> -}
> -fs_initcall(iomap_buffered_init);
> diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
> new file mode 100644
> index 000000000000..36d5c56e073e
> --- /dev/null
> +++ b/fs/iomap/internal.h
> @@ -0,0 +1,9 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _IOMAP_INTERNAL_H
> +#define _IOMAP_INTERNAL_H 1
> +
> +#define IOEND_BATCH_SIZE 4096
> +
> +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
> +
> +#endif /* _IOMAP_INTERNAL_H */
> diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
> index 1b032323ee4e..b4f6dd9e319a 100644
> --- a/fs/iomap/ioend.c
> +++ b/fs/iomap/ioend.c
> @@ -3,6 +3,11 @@
> * Copyright (c) 2024 Christoph Hellwig.
> */
> #include <linux/iomap.h>
> +#include <linux/list_sort.h>
> +#include "internal.h"
> +
> +struct bio_set iomap_ioend_bioset;
> +EXPORT_SYMBOL_GPL(iomap_ioend_bioset);
>
> struct iomap_ioend *iomap_init_ioend(struct inode *inode,
> struct bio *bio, loff_t file_offset, u16 ioend_flags)
> @@ -22,6 +27,120 @@ struct iomap_ioend *iomap_init_ioend(struct inode *inode,
> }
> EXPORT_SYMBOL_GPL(iomap_init_ioend);
>
> +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
> +{
> + if (ioend->io_parent) {
> + struct bio *bio = &ioend->io_bio;
> +
> + ioend = ioend->io_parent;
> + bio_put(bio);
> + }
> +
> + if (error)
> + cmpxchg(&ioend->io_error, 0, error);
> +
> + if (!atomic_dec_and_test(&ioend->io_remaining))
> + return 0;
> + return iomap_finish_ioend_buffered(ioend);
> +}
> +
> +/*
> + * Ioend completion routine for merged bios. This can only be called from task
> + * contexts as merged ioends can be of unbound length. Hence we have to break up
> + * the writeback completions into manageable chunks to avoid long scheduler
> + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get
> + * good batch processing throughput without creating adverse scheduler latency
> + * conditions.
> + */
> +void iomap_finish_ioends(struct iomap_ioend *ioend, int error)
> +{
> + struct list_head tmp;
> + u32 completions;
> +
> + might_sleep();
> +
> + list_replace_init(&ioend->io_list, &tmp);
> + completions = iomap_finish_ioend(ioend, error);
> +
> + while (!list_empty(&tmp)) {
> + if (completions > IOEND_BATCH_SIZE * 8) {
> + cond_resched();
> + completions = 0;
> + }
> + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
> + list_del_init(&ioend->io_list);
> + completions += iomap_finish_ioend(ioend, error);
> + }
> +}
> +EXPORT_SYMBOL_GPL(iomap_finish_ioends);
> +
> +/*
> + * We can merge two adjacent ioends if they have the same set of work to do.
> + */
> +static bool iomap_ioend_can_merge(struct iomap_ioend *ioend,
> + struct iomap_ioend *next)
> +{
> + if (ioend->io_bio.bi_status != next->io_bio.bi_status)
> + return false;
> + if (next->io_flags & IOMAP_IOEND_BOUNDARY)
> + return false;
> + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
> + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
> + return false;
> + if (ioend->io_offset + ioend->io_size != next->io_offset)
> + return false;
> + /*
> + * Do not merge physically discontiguous ioends. The filesystem
> + * completion functions will have to iterate the physical
> + * discontiguities even if we merge the ioends at a logical level, so
> + * we don't gain anything by merging physical discontiguities here.
> + *
> + * We cannot use bio->bi_iter.bi_sector here as it is modified during
> + * submission so does not point to the start sector of the bio at
> + * completion.
> + */
> + if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) !=
> + next->io_sector)
> + return false;
> + return true;
> +}
> +
> +void iomap_ioend_try_merge(struct iomap_ioend *ioend,
> + struct list_head *more_ioends)
> +{
> + struct iomap_ioend *next;
> +
> + INIT_LIST_HEAD(&ioend->io_list);
> +
> + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
> + io_list))) {
> + if (!iomap_ioend_can_merge(ioend, next))
> + break;
> + list_move_tail(&next->io_list, &ioend->io_list);
> + ioend->io_size += next->io_size;
> + }
> +}
> +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
> +
> +static int iomap_ioend_compare(void *priv, const struct list_head *a,
> + const struct list_head *b)
> +{
> + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
> + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
> +
> + if (ia->io_offset < ib->io_offset)
> + return -1;
> + if (ia->io_offset > ib->io_offset)
> + return 1;
> + return 0;
> +}
> +
> +void iomap_sort_ioends(struct list_head *ioend_list)
> +{
> + list_sort(NULL, ioend_list, iomap_ioend_compare);
> +}
> +EXPORT_SYMBOL_GPL(iomap_sort_ioends);
> +
> /*
> * Split up to the first @max_len bytes from @ioend if the ioend covers more
> * than @max_len bytes.
> @@ -84,3 +203,11 @@ struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend,
> return split_ioend;
> }
> EXPORT_SYMBOL_GPL(iomap_split_ioend);
> +
> +static int __init iomap_ioend_init(void)
> +{
> + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
> + offsetof(struct iomap_ioend, io_bio),
> + BIOSET_NEED_BVECS);
> +}
> +fs_initcall(iomap_ioend_init);
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 06/10] iomap: factor out a iomap_dio_done helper
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
` (4 preceding siblings ...)
2024-12-19 17:39 ` [PATCH 05/10] iomap: move common ioend code to ioend.c Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 18:22 ` Darrick J. Wong
2024-12-19 17:39 ` [PATCH 07/10] iomap: optionally use ioends for direct I/O Christoph Hellwig
` (3 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
Split out the struct iomap_dio level final completion from
iomap_dio_bio_end_io into a helper to clean up the code and make it
reusable.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/direct-io.c | 76 ++++++++++++++++++++++----------------------
1 file changed, 38 insertions(+), 38 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 641649a04614..ed658eb09a1a 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -165,43 +165,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
-void iomap_dio_bio_end_io(struct bio *bio)
+/*
+ * Called when dio->ref reaches zero from an I/O completion.
+ */
+static void iomap_dio_done(struct iomap_dio *dio)
{
- struct iomap_dio *dio = bio->bi_private;
- bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
struct kiocb *iocb = dio->iocb;
- if (bio->bi_status)
- iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
- if (!atomic_dec_and_test(&dio->ref))
- goto release_bio;
-
- /*
- * Synchronous dio, task itself will handle any completion work
- * that needs after IO. All we need to do is wake the task.
- */
if (dio->wait_for_completion) {
+ /*
+ * Synchronous I/O, task itself will handle any completion work
+ * that needs after IO. All we need to do is wake the task.
+ */
struct task_struct *waiter = dio->submit.waiter;
WRITE_ONCE(dio->submit.waiter, NULL);
blk_wake_io_task(waiter);
- goto release_bio;
- }
-
- /*
- * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
- */
- if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+ } else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
WRITE_ONCE(iocb->private, NULL);
iomap_dio_complete_work(&dio->aio.work);
- goto release_bio;
- }
-
- /*
- * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
- * our completion that way to avoid an async punt to a workqueue.
- */
- if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ } else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
+ * schedule our completion that way to avoid an async punt to a
+ * workqueue.
+ */
/* only polled IO cares about private cleared */
iocb->private = dio;
iocb->dio_complete = iomap_dio_deferred_complete;
@@ -219,19 +207,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
* issuer.
*/
iocb->ki_complete(iocb, 0);
- goto release_bio;
+ } else {
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ /*
+ * Async DIO completion that requires filesystem level
+ * completion work gets punted to a work queue to complete as
+ * the operation may require more IO to be issued to finalise
+ * filesystem metadata changes or guarantee data integrity.
+ */
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
}
+}
+
+void iomap_dio_bio_end_io(struct bio *bio)
+{
+ struct iomap_dio *dio = bio->bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+
+ if (bio->bi_status)
+ iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+
+ if (atomic_dec_and_test(&dio->ref))
+ iomap_dio_done(dio);
- /*
- * Async DIO completion that requires filesystem level completion work
- * gets punted to a work queue to complete as the operation may require
- * more IO to be issued to finalise filesystem metadata changes or
- * guarantee data integrity.
- */
- INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
- queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
- &dio->aio.work);
-release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 06/10] iomap: factor out a iomap_dio_done helper
2024-12-19 17:39 ` [PATCH 06/10] iomap: factor out a iomap_dio_done helper Christoph Hellwig
@ 2024-12-19 18:22 ` Darrick J. Wong
0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2024-12-19 18:22 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christian Brauner, Carlos Maiolino, linux-xfs, linux-fsdevel
On Thu, Dec 19, 2024 at 05:39:11PM +0000, Christoph Hellwig wrote:
> Split out the struct iomap-dio level final completion from
> iomap_dio_bio_end_io into a helper to clean up the code and make it
> reusable.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/iomap/direct-io.c | 76 ++++++++++++++++++++++----------------------
> 1 file changed, 38 insertions(+), 38 deletions(-)
>
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index 641649a04614..ed658eb09a1a 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -165,43 +165,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
> cmpxchg(&dio->error, 0, ret);
> }
>
> -void iomap_dio_bio_end_io(struct bio *bio)
> +/*
> + * Called when dio->ref reaches zero from and I/O completion.
an I/O completion
This hoist looks fine to me, so with that fixed:
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> + */
> +static void iomap_dio_done(struct iomap_dio *dio)
> {
> - struct iomap_dio *dio = bio->bi_private;
> - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> struct kiocb *iocb = dio->iocb;
>
> - if (bio->bi_status)
> - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
> - if (!atomic_dec_and_test(&dio->ref))
> - goto release_bio;
> -
> - /*
> - * Synchronous dio, task itself will handle any completion work
> - * that needs after IO. All we need to do is wake the task.
> - */
> if (dio->wait_for_completion) {
> + /*
> + * Synchronous I/O, task itself will handle any completion work
> + * that needs after IO. All we need to do is wake the task.
> + */
> struct task_struct *waiter = dio->submit.waiter;
>
> WRITE_ONCE(dio->submit.waiter, NULL);
> blk_wake_io_task(waiter);
> - goto release_bio;
> - }
> -
> - /*
> - * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
> - */
> - if (dio->flags & IOMAP_DIO_INLINE_COMP) {
> + } else if (dio->flags & IOMAP_DIO_INLINE_COMP) {
> WRITE_ONCE(iocb->private, NULL);
> iomap_dio_complete_work(&dio->aio.work);
> - goto release_bio;
> - }
> -
> - /*
> - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
> - * our completion that way to avoid an async punt to a workqueue.
> - */
> - if (dio->flags & IOMAP_DIO_CALLER_COMP) {
> + } else if (dio->flags & IOMAP_DIO_CALLER_COMP) {
> + /*
> + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then
> + * schedule our completion that way to avoid an async punt to a
> + * workqueue.
> + */
> /* only polled IO cares about private cleared */
> iocb->private = dio;
> iocb->dio_complete = iomap_dio_deferred_complete;
> @@ -219,19 +207,31 @@ void iomap_dio_bio_end_io(struct bio *bio)
> * issuer.
> */
> iocb->ki_complete(iocb, 0);
> - goto release_bio;
> + } else {
> + struct inode *inode = file_inode(iocb->ki_filp);
> +
> + /*
> + * Async DIO completion that requires filesystem level
> + * completion work gets punted to a work queue to complete as
> + * the operation may require more IO to be issued to finalise
> + * filesystem metadata changes or guarantee data integrity.
> + */
> + INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
> + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
> }
> +}
> +
> +void iomap_dio_bio_end_io(struct bio *bio)
> +{
> + struct iomap_dio *dio = bio->bi_private;
> + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> +
> + if (bio->bi_status)
> + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
> +
> + if (atomic_dec_and_test(&dio->ref))
> + iomap_dio_done(dio);
>
> - /*
> - * Async DIO completion that requires filesystem level completion work
> - * gets punted to a work queue to complete as the operation may require
> - * more IO to be issued to finalise filesystem metadata changes or
> - * guarantee data integrity.
> - */
> - INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
> - queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
> - &dio->aio.work);
> -release_bio:
> if (should_dirty) {
> bio_check_pages_dirty(bio);
> } else {
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 07/10] iomap: optionally use ioends for direct I/O
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
` (5 preceding siblings ...)
2024-12-19 17:39 ` [PATCH 06/10] iomap: factor out a iomap_dio_done helper Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 18:25 ` Darrick J. Wong
2024-12-19 17:39 ` [PATCH 08/10] iomap: pass private data to iomap_page_mkwrite Christoph Hellwig
` (2 subsequent siblings)
9 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
struct iomap_ioend currently tracks outstanding buffered writes and has
some really nice code in core iomap and XFS to merge contiguous I/Os
and defer them to userspace for completion in a very efficient way.
For zoned writes we'll also need a per-bio user context completion to
record the written blocks, and the infrastructure for that would look
basically like the ioend handling for buffered I/O.
So instead of reinventing the wheel, reuse the existing infrastructure.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++--
fs/iomap/internal.h | 1 +
fs/iomap/ioend.c | 2 ++
include/linux/iomap.h | 4 +++-
4 files changed, 53 insertions(+), 3 deletions(-)
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ed658eb09a1a..dd521f4edf55 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (c) 2016-2021 Christoph Hellwig.
+ * Copyright (c) 2016-2024 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -12,6 +12,7 @@
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
+#include "internal.h"
#include "trace.h"
#include "../internal.h"
@@ -20,6 +21,7 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
+#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
#define IOMAP_DIO_CALLER_COMP (1U << 26)
#define IOMAP_DIO_INLINE_COMP (1U << 27)
#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
@@ -119,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
* ->end_io() when necessary, otherwise a racing buffer read would cache
* zeros from unwritten extents.
*/
- if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+ if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
+ !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
kiocb_invalidate_post_direct_write(iocb, dio->size);
inode_dio_end(file_inode(iocb->ki_filp));
@@ -221,6 +224,7 @@ static void iomap_dio_done(struct iomap_dio *dio)
}
}
+
void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
@@ -241,6 +245,47 @@ void iomap_dio_bio_end_io(struct bio *bio)
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
+{
+ struct iomap_dio *dio = ioend->io_bio.bi_private;
+ bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+ u32 vec_count = ioend->io_bio.bi_vcnt;
+
+ if (ioend->io_error)
+ iomap_dio_set_error(dio, ioend->io_error);
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ /*
+ * Try to avoid another context switch for the completion given
+ * that we are already called from the ioend completion
+ * workqueue, but never invalidate pages from this thread to
+ * avoid deadlocks with buffered I/O completions. Tough luck if
+ * you hit the tiny race with someone dirtying the range now
+ * between this check and the actual completion.
+ */
+ if (!dio->iocb->ki_filp->f_mapping->nrpages) {
+ dio->flags |= IOMAP_DIO_INLINE_COMP;
+ dio->flags |= IOMAP_DIO_NO_INVALIDATE;
+ }
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+ iomap_dio_done(dio);
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(&ioend->io_bio);
+ } else {
+ bio_release_pages(&ioend->io_bio, false);
+ bio_put(&ioend->io_bio);
+ }
+
+ /*
+ * Return the number of bvecs completed as even direct I/O completions
+ * do significant per-folio work and we'll still want to give up the
+ * CPU after a lot of completions.
+ */
+ return vec_count;
+}
+
static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
index 36d5c56e073e..f6992a3bf66a 100644
--- a/fs/iomap/internal.h
+++ b/fs/iomap/internal.h
@@ -5,5 +5,6 @@
#define IOEND_BATCH_SIZE 4096
u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
+u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
#endif /* _IOMAP_INTERNAL_H */
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index b4f6dd9e319a..158fa685d81f 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -41,6 +41,8 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
if (!atomic_dec_and_test(&ioend->io_remaining))
return 0;
+ if (ioend->io_flags & IOMAP_IOEND_DIRECT)
+ return iomap_finish_ioend_direct(ioend);
return iomap_finish_ioend_buffered(ioend);
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0d221fbe0eb3..1ef4c44fa36f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -343,13 +343,15 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
#define IOMAP_IOEND_UNWRITTEN (1U << 1)
/* don't merge into previous ioend */
#define IOMAP_IOEND_BOUNDARY (1U << 2)
+/* is direct I/O */
+#define IOMAP_IOEND_DIRECT (1U << 3)
/*
* Flags that if set on either ioend prevent the merge of two ioends.
* (IOMAP_IOEND_BOUNDARY also prevents merged, but only one-way)
*/
#define IOMAP_IOEND_NOMERGE_FLAGS \
- (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN)
+ (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
/*
* Structure for writeback I/O completions.
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* Re: [PATCH 07/10] iomap: optionally use ioends for direct I/O
2024-12-19 17:39 ` [PATCH 07/10] iomap: optionally use ioends for direct I/O Christoph Hellwig
@ 2024-12-19 18:25 ` Darrick J. Wong
0 siblings, 0 replies; 20+ messages in thread
From: Darrick J. Wong @ 2024-12-19 18:25 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Christian Brauner, Carlos Maiolino, linux-xfs, linux-fsdevel
On Thu, Dec 19, 2024 at 05:39:12PM +0000, Christoph Hellwig wrote:
> struct iomap_ioend currently tracks outstanding buffered writes and has
> some really nice code in core iomap and XFS to merge contiguous I/Os
> and defer them to userspace for completion in a very efficient way.
>
> For zoned writes we'll also need a per-bio user context completion to
> record the written blocks, and the infrastructure for that would look
> basically like the ioend handling for buffered I/O.
>
> So instead of reinventing the wheel, reuse the existing infrastructure.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/iomap/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++--
> fs/iomap/internal.h | 1 +
> fs/iomap/ioend.c | 2 ++
> include/linux/iomap.h | 4 +++-
> 4 files changed, 53 insertions(+), 3 deletions(-)
>
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index ed658eb09a1a..dd521f4edf55 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -1,7 +1,7 @@
> // SPDX-License-Identifier: GPL-2.0
> /*
> * Copyright (C) 2010 Red Hat, Inc.
> - * Copyright (c) 2016-2021 Christoph Hellwig.
> + * Copyright (c) 2016-2024 Christoph Hellwig.
> */
> #include <linux/module.h>
> #include <linux/compiler.h>
> @@ -12,6 +12,7 @@
> #include <linux/backing-dev.h>
> #include <linux/uio.h>
> #include <linux/task_io_accounting_ops.h>
> +#include "internal.h"
> #include "trace.h"
>
> #include "../internal.h"
> @@ -20,6 +21,7 @@
> * Private flags for iomap_dio, must not overlap with the public ones in
> * iomap.h:
> */
> +#define IOMAP_DIO_NO_INVALIDATE (1U << 25)
> #define IOMAP_DIO_CALLER_COMP (1U << 26)
> #define IOMAP_DIO_INLINE_COMP (1U << 27)
> #define IOMAP_DIO_WRITE_THROUGH (1U << 28)
> @@ -119,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
> * ->end_io() when necessary, otherwise a racing buffer read would cache
> * zeros from unwritten extents.
> */
> - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
> + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) &&
> + !(dio->flags & IOMAP_DIO_NO_INVALIDATE))
> kiocb_invalidate_post_direct_write(iocb, dio->size);
>
> inode_dio_end(file_inode(iocb->ki_filp));
> @@ -221,6 +224,7 @@ static void iomap_dio_done(struct iomap_dio *dio)
> }
> }
>
> +
> void iomap_dio_bio_end_io(struct bio *bio)
> {
> struct iomap_dio *dio = bio->bi_private;
> @@ -241,6 +245,47 @@ void iomap_dio_bio_end_io(struct bio *bio)
> }
> EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
>
> +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend)
> +{
> + struct iomap_dio *dio = ioend->io_bio.bi_private;
> + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
> + u32 vec_count = ioend->io_bio.bi_vcnt;
> +
> + if (ioend->io_error)
> + iomap_dio_set_error(dio, ioend->io_error);
> +
> + if (atomic_dec_and_test(&dio->ref)) {
> + /*
> + * Try to avoid another context switch for the completion given
> + * that we are already called from the ioend completion
> + * workqueue, but never invalidate pages from this thread to
> + * avoid deadlocks with buffered I/O completions. Tough luck if
> + * yoy hit the tiny race with someone dirtying the range now
you
> + * betweem this check and the actual completion.
between
With those fixed,
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
--D
> + */
> + if (!dio->iocb->ki_filp->f_mapping->nrpages) {
> + dio->flags |= IOMAP_DIO_INLINE_COMP;
> + dio->flags |= IOMAP_DIO_NO_INVALIDATE;
> + }
> + dio->flags &= ~IOMAP_DIO_CALLER_COMP;
> + iomap_dio_done(dio);
> + }
> +
> + if (should_dirty) {
> + bio_check_pages_dirty(&ioend->io_bio);
> + } else {
> + bio_release_pages(&ioend->io_bio, false);
> + bio_put(&ioend->io_bio);
> + }
> +
> + /*
> + * Return the number of bvecs completed as even direct I/O completions
> + * do significant per-folio work and we'll still want to give up the
> + * CPU after a lot of completions.
> + */
> + return vec_count;
> +}
> +
> static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
> loff_t pos, unsigned len)
> {
> diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
> index 36d5c56e073e..f6992a3bf66a 100644
> --- a/fs/iomap/internal.h
> +++ b/fs/iomap/internal.h
> @@ -5,5 +5,6 @@
> #define IOEND_BATCH_SIZE 4096
>
> u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
> +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
>
> #endif /* _IOMAP_INTERNAL_H */
> diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
> index b4f6dd9e319a..158fa685d81f 100644
> --- a/fs/iomap/ioend.c
> +++ b/fs/iomap/ioend.c
> @@ -41,6 +41,8 @@ static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
>
> if (!atomic_dec_and_test(&ioend->io_remaining))
> return 0;
> + if (ioend->io_flags & IOMAP_IOEND_DIRECT)
> + return iomap_finish_ioend_direct(ioend);
> return iomap_finish_ioend_buffered(ioend);
> }
>
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 0d221fbe0eb3..1ef4c44fa36f 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -343,13 +343,15 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
> #define IOMAP_IOEND_UNWRITTEN (1U << 1)
> /* don't merge into previous ioend */
> #define IOMAP_IOEND_BOUNDARY (1U << 2)
> +/* is direct I/O */
> +#define IOMAP_IOEND_DIRECT (1U << 3)
>
> /*
> * Flags that if set on either ioend prevent the merge of two ioends.
> * (IOMAP_IOEND_BOUNDARY also prevents merged, but only one-way)
> */
> #define IOMAP_IOEND_NOMERGE_FLAGS \
> - (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN)
> + (IOMAP_IOEND_SHARED | IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_DIRECT)
>
> /*
> * Structure for writeback I/O completions.
> --
> 2.45.2
>
>
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 08/10] iomap: pass private data to iomap_page_mkwrite
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
` (6 preceding siblings ...)
2024-12-19 17:39 ` [PATCH 07/10] iomap: optionally use ioends for direct I/O Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 17:39 ` [PATCH 09/10] iomap: pass private data to iomap_zero_range Christoph Hellwig
2024-12-19 17:39 ` [PATCH 10/10] iomap: pass private data to iomap_truncate_page Christoph Hellwig
9 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
Allow the file system to pass private data which can be used by the
iomap_begin and iomap_end methods through the private pointer in the
iomap_iter structure.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/iomap/buffered-io.c | 4 +++-
fs/xfs/xfs_file.c | 3 ++-
fs/zonefs/file.c | 2 +-
include/linux/iomap.h | 5 ++---
4 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 06b90d859d74..e33309fa0a11 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1489,11 +1489,13 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter,
return length;
}
-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+ void *private)
{
struct iomap_iter iter = {
.inode = file_inode(vmf->vma->vm_file),
.flags = IOMAP_WRITE | IOMAP_FAULT,
+ .private = private,
};
struct folio *folio = page_folio(vmf->page);
ssize_t ret;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a435b1ff264..2b6d4c71994d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1498,7 +1498,8 @@ xfs_write_fault(
if (IS_DAX(inode))
ret = xfs_dax_fault_locked(vmf, order, true);
else
- ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops,
+ NULL);
xfs_iunlock(ip, lock_mode);
sb_end_pagefault(inode->i_sb);
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 35166c92420c..42e2c0065bb3 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
/* Serialize against truncates */
filemap_invalidate_lock_shared(inode->i_mapping);
- ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 1ef4c44fa36f..6697da4610cc 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -316,9 +316,8 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
-vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf,
- const struct iomap_ops *ops);
-
+vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
+ void *private);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
struct iomap *iomap);
void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 09/10] iomap: pass private data to iomap_zero_range
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
` (7 preceding siblings ...)
2024-12-19 17:39 ` [PATCH 08/10] iomap: pass private data to iomap_page_mkwrite Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
2024-12-19 17:39 ` [PATCH 10/10] iomap: pass private data to iomap_truncate_page Christoph Hellwig
9 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
Allow the file system to pass private data which can be used by the
iomap_begin and iomap_end methods through the private pointer in the
iomap_iter structure.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/gfs2/bmap.c | 3 ++-
fs/iomap/buffered-io.c | 6 ++++--
fs/xfs/xfs_iomap.c | 2 +-
include/linux/iomap.h | 2 +-
4 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1795c4e8dbf6..366516b98b3f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1300,7 +1300,8 @@ static int gfs2_block_zero_range(struct inode *inode, loff_t from,
unsigned int length)
{
BUG_ON(current->journal_info);
- return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
+ return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops,
+ NULL);
}
#define GFS2_JTRUNC_REVOKES 8192
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index e33309fa0a11..c2957ec9ab8b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1391,13 +1391,14 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
int
iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
struct iomap_iter iter = {
.inode = inode,
.pos = pos,
.len = len,
.flags = IOMAP_ZERO,
+ .private = private,
};
struct address_space *mapping = inode->i_mapping;
unsigned int blocksize = i_blocksize(inode);
@@ -1465,7 +1466,8 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
/* Block boundary? Nothing to do */
if (!off)
return 0;
- return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
+ return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
+ NULL);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 50fa3ef89f6c..3410c55f544a 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1497,7 +1497,7 @@ xfs_zero_range(
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_zero_range(inode, pos, len, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
}
int
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6697da4610cc..d2debb7f4344 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -313,7 +313,7 @@ bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
- bool *did_zero, const struct iomap_ops *ops);
+ bool *did_zero, const struct iomap_ops *ops, void *private);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread* [PATCH 10/10] iomap: pass private data to iomap_truncate_page
2024-12-19 17:39 iomap patches for zoned XFS v1 Christoph Hellwig
` (8 preceding siblings ...)
2024-12-19 17:39 ` [PATCH 09/10] iomap: pass private data to iomap_zero_range Christoph Hellwig
@ 2024-12-19 17:39 ` Christoph Hellwig
9 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2024-12-19 17:39 UTC (permalink / raw)
To: Christian Brauner
Cc: Darrick J. Wong, Carlos Maiolino, linux-xfs, linux-fsdevel
Allow the file system to pass private data which can be used by the
iomap_begin and iomap_end methods through the private pointer in the
iomap_iter structure.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
fs/iomap/buffered-io.c | 4 ++--
fs/xfs/xfs_iomap.c | 2 +-
include/linux/iomap.h | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index c2957ec9ab8b..54c42a88155a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1458,7 +1458,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range);
int
iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops)
+ const struct iomap_ops *ops, void *private)
{
unsigned int blocksize = i_blocksize(inode);
unsigned int off = pos & (blocksize - 1);
@@ -1467,7 +1467,7 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
if (!off)
return 0;
return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops,
- NULL);
+ private);
}
EXPORT_SYMBOL_GPL(iomap_truncate_page);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 3410c55f544a..5dd0922fe2d1 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1512,5 +1512,5 @@ xfs_truncate_page(
return dax_truncate_page(inode, pos, did_zero,
&xfs_dax_write_iomap_ops);
return iomap_truncate_page(inode, pos, did_zero,
- &xfs_buffered_write_iomap_ops);
+ &xfs_buffered_write_iomap_ops, NULL);
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index d2debb7f4344..de2ecde47aea 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -315,7 +315,7 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops, void *private);
int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
- const struct iomap_ops *ops);
+ const struct iomap_ops *ops, void *private);
vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
void *private);
typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length,
--
2.45.2
^ permalink raw reply related [flat|nested] 20+ messages in thread