From: Joanne Koong <joannelkoong@gmail.com>
To: miklos@szeredi.hu
Cc: djwong@kernel.org, brauner@kernel.org,
linux-fsdevel@vger.kernel.org, linux-xfs@vger.kernel.org,
bernd.schubert@fastmail.fm, kernel-team@meta.com
Subject: [PATCH v1 1/8] iomap: move buffered io bio logic into separate file
Date: Fri, 6 Jun 2025 16:37:56 -0700
Message-ID: <20250606233803.1421259-2-joannelkoong@gmail.com>
In-Reply-To: <20250606233803.1421259-1-joannelkoong@gmail.com>
Move the buffered I/O bio logic into a separate file, buffered-io-bio.c,
so that callers that do not have CONFIG_BLOCK set and do not depend on
bios can also use iomap and hook into some of its internal features,
such as granular dirty tracking for large folios.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
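Not part of this patch, just a note on how the split is meant to compile
out: the shared buffered-io.c only ever reaches the iomap_bio_* helpers
through the declarations in fs/iomap/internal.h, and without CONFIG_BLOCK
those collapse to no-op / error-returning macros. The standalone sketch
below shows the same compile-out pattern in isolation; it is a minimal
userspace illustration, and the demo_* names are made up here rather than
taken from the kernel tree:

/*
 * Minimal userspace sketch of the compile-out pattern used in
 * fs/iomap/internal.h.  With HAVE_BLOCK defined the real helper is
 * called; without it the variadic macro swallows the arguments and
 * evaluates to an error code, so the shared call site builds either way.
 */
#include <errno.h>
#include <stdio.h>

#ifdef HAVE_BLOCK
static int demo_bio_read_folio_sync(long pos, unsigned int len)
{
	/* stand-in for building a bio and waiting for it */
	printf("bio read: pos=%ld len=%u\n", pos, len);
	return 0;
}
#else
/* no block layer: callers get an error instead of a bio submission */
#define demo_bio_read_folio_sync(...)	(-ENOSYS)
#endif

int main(void)
{
	int ret = demo_bio_read_folio_sync(0L, 4096U);

	printf("ret = %d\n", ret);
	return 0;
}

Building it both ways (e.g. "gcc -DHAVE_BLOCK demo.c" vs "gcc demo.c")
shows the same call site returning 0 in one configuration and -ENOSYS in
the other, which mirrors how the !CONFIG_BLOCK stubs keep buffered-io.c
compiling and linking.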
fs/iomap/Makefile | 5 +-
fs/iomap/buffered-io-bio.c | 291 ++++++++++++++++++++++++++++++++++
fs/iomap/buffered-io.c | 309 ++-----------------------------------
fs/iomap/internal.h | 46 ++++++
4 files changed, 356 insertions(+), 295 deletions(-)
create mode 100644 fs/iomap/buffered-io-bio.c
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 69e8ebb41302..477d78c58807 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -9,8 +9,9 @@ ccflags-y += -I $(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o
iomap-y += trace.o \
- iter.o
-iomap-$(CONFIG_BLOCK) += buffered-io.o \
+ iter.o \
+ buffered-io.o
+iomap-$(CONFIG_BLOCK) += buffered-io-bio.o \
direct-io.o \
ioend.o \
fiemap.o \
diff --git a/fs/iomap/buffered-io-bio.c b/fs/iomap/buffered-io-bio.c
new file mode 100644
index 000000000000..03841bed72e7
--- /dev/null
+++ b/fs/iomap/buffered-io-bio.c
@@ -0,0 +1,291 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/iomap.h>
+#include <linux/writeback.h>
+
+#include "internal.h"
+
+static void iomap_finish_folio_read(struct folio *folio, size_t off,
+ size_t len, int error)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ bool uptodate = !error;
+ bool finished = true;
+
+ if (ifs) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ if (!error)
+ uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ ifs->read_bytes_pending -= len;
+ finished = !ifs->read_bytes_pending;
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+ }
+
+ if (finished)
+ folio_end_read(folio, uptodate);
+}
+
+static void iomap_read_end_io(struct bio *bio)
+{
+ int error = blk_status_to_errno(bio->bi_status);
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
+ bio_put(bio);
+}
+
+int iomap_bio_readpage(const struct iomap *iomap, loff_t pos,
+ struct iomap_readpage_ctx *ctx,
+ size_t poff, size_t plen,
+ loff_t length)
+{
+ struct folio *folio = ctx->cur_folio;
+ sector_t sector;
+
+ sector = iomap_sector(iomap, pos);
+ if (!ctx->bio ||
+ bio_end_sector(ctx->bio) != sector ||
+ !bio_add_folio(ctx->bio, folio, plen, poff)) {
+ gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
+ gfp_t orig_gfp = gfp;
+ unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
+
+ if (ctx->bio)
+ submit_bio(ctx->bio);
+
+ if (ctx->rac) /* same as readahead_gfp_mask */
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
+ REQ_OP_READ, gfp);
+ /*
+ * If the bio_alloc fails, try it again for a single page to
+ * avoid having to deal with partial page reads. This emulates
+ * what do_mpage_read_folio does.
+ */
+ if (!ctx->bio) {
+ ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
+ orig_gfp);
+ }
+ if (ctx->rac)
+ ctx->bio->bi_opf |= REQ_RAHEAD;
+ ctx->bio->bi_iter.bi_sector = sector;
+ ctx->bio->bi_end_io = iomap_read_end_io;
+ bio_add_folio_nofail(ctx->bio, folio, plen, poff);
+ }
+ return 0;
+}
+
+void iomap_submit_bio(struct bio *bio)
+{
+ submit_bio(bio);
+}
+
+int iomap_bio_read_folio_sync(loff_t block_start, struct folio *folio, size_t poff,
+ size_t plen, const struct iomap *iomap)
+{
+ struct bio_vec bvec;
+ struct bio bio;
+
+ bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
+ bio_add_folio_nofail(&bio, folio, plen, poff);
+ return submit_bio_wait(&bio);
+}
+
+static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+ size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
+
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+ folio_end_writeback(folio);
+}
+
+/*
+ * We're now finished for good with this ioend structure. Update the page
+ * state, release holds on bios, and finally free up memory. Do not use the
+ * ioend after this.
+ */
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
+{
+ struct inode *inode = ioend->io_inode;
+ struct bio *bio = &ioend->io_bio;
+ struct folio_iter fi;
+ u32 folio_count = 0;
+
+ if (ioend->io_error) {
+ mapping_set_error(inode->i_mapping, ioend->io_error);
+ if (!bio_flagged(bio, BIO_QUIET)) {
+ pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino,
+ ioend->io_offset, ioend->io_sector);
+ }
+ }
+
+ /* walk all folios in bio, ending page IO on them */
+ bio_for_each_folio_all(fi, bio) {
+ iomap_finish_folio_write(inode, fi.folio, fi.length);
+ folio_count++;
+ }
+
+ bio_put(bio); /* frees the ioend */
+ return folio_count;
+}
+
+static void iomap_writepage_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ ioend->io_error = blk_status_to_errno(bio->bi_status);
+ iomap_finish_ioend_buffered(ioend);
+}
+
+void iomap_bio_ioend_error(struct iomap_writepage_ctx *wpc, int error)
+{
+ wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&wpc->ioend->io_bio);
+}
+
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc,
+ struct inode *inode, loff_t pos,
+ u16 ioend_flags)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS, &iomap_ioend_bioset);
+ bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+ bio->bi_end_io = iomap_writepage_end_bio;
+ bio->bi_write_hint = inode->i_write_hint;
+ wbc_init_bio(wbc, bio);
+ wpc->nr_folios = 0;
+ return iomap_init_ioend(inode, bio, pos, ioend_flags);
+}
+
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+ u16 ioend_flags)
+{
+ if (ioend_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
+ return false;
+ if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+ iomap_sector(&wpc->iomap, pos) !=
+ bio_end_sector(&wpc->ioend->io_bio))
+ return false;
+ /*
+ * Limit ioend bio chain lengths to minimise IO completion latency. This
+ * also prevents long tight loops ending page writeback on all the
+ * folios in the ioend.
+ */
+ if (wpc->nr_folios >= IOEND_BATCH_SIZE)
+ return false;
+ return true;
+}
+
+/*
+ * Test to see if we have an existing ioend structure that we could append to
+ * first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly. Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+int iomap_bio_add_to_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, loff_t pos, loff_t end_pos,
+ unsigned len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
+ int error;
+
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
+new_ioend:
+ error = iomap_submit_ioend(wpc, 0);
+ if (error)
+ return error;
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+ ioend_flags);
+ if (!wpc->ioend)
+ return -ENOTSUPP;
+ }
+
+ if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+ goto new_ioend;
+
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
+
+ /*
+ * Clamp io_offset and io_size to the incore EOF so that ondisk
+ * file size updates in the ioend completion are byte-accurate.
+ * This avoids recovering files with zeroed tail regions when
+ * writeback races with appending writes:
+ *
+ * Thread 1: Thread 2:
+ * ------------ -----------
+ * write [A, A+B]
+ * update inode size to A+B
+ * submit I/O [A, A+BS]
+ * write [A+B, A+B+C]
+ * update inode size to A+B+C
+ * <I/O completes, updates disk size to min(A+B+C, A+BS)>
+ * <power failure>
+ *
+ * After reboot:
+ * 1) with A+B+C < A+BS, the file has zero padding in range
+ * [A+B, A+B+C]
+ *
+ * |< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|
+ * ^ ^ ^
+ * A A+B A+B+C
+ * (EOF)
+ *
+ * 2) with A+B+C > A+BS, the file has zero padding in range
+ * [A+B, A+BS]
+ *
+ * |< Block Size (BS) >|< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+ * ^ ^ ^ ^
+ * A A+B A+BS A+B+C
+ * (EOF)
+ *
+ * D = Valid Data
+ * 0 = Zero Padding
+ *
+ * Note that this defeats the ability to chain the ioends of
+ * appending writes.
+ */
+ wpc->ioend->io_size += len;
+ if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
+ wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+
+ wbc_account_cgroup_owner(wbc, folio, len);
+ return 0;
+}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 31553372b33a..1caeb4921035 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -13,7 +13,6 @@
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/swap.h>
-#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "internal.h"
@@ -21,23 +20,6 @@
#include "../internal.h"
-/*
- * Structure allocated for each folio to track per-block uptodate, dirty state
- * and I/O completions.
- */
-struct iomap_folio_state {
- spinlock_t state_lock;
- unsigned int read_bytes_pending;
- atomic_t write_bytes_pending;
-
- /*
- * Each block has two bits in this bitmap:
- * Bits [0..blocks_per_folio) has the uptodate status.
- * Bits [b_p_f...(2*b_p_f)) has the dirty status.
- */
- unsigned long state[];
-};
-
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@@ -52,8 +34,8 @@ static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
return test_bit(block, ifs->state);
}
-static bool ifs_set_range_uptodate(struct folio *folio,
- struct iomap_folio_state *ifs, size_t off, size_t len)
+bool ifs_set_range_uptodate(struct folio *folio, struct iomap_folio_state *ifs,
+ size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
unsigned int first_blk = off >> inode->i_blkbits;
@@ -284,45 +266,6 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
-static void iomap_finish_folio_read(struct folio *folio, size_t off,
- size_t len, int error)
-{
- struct iomap_folio_state *ifs = folio->private;
- bool uptodate = !error;
- bool finished = true;
-
- if (ifs) {
- unsigned long flags;
-
- spin_lock_irqsave(&ifs->state_lock, flags);
- if (!error)
- uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
- ifs->read_bytes_pending -= len;
- finished = !ifs->read_bytes_pending;
- spin_unlock_irqrestore(&ifs->state_lock, flags);
- }
-
- if (finished)
- folio_end_read(folio, uptodate);
-}
-
-static void iomap_read_end_io(struct bio *bio)
-{
- int error = blk_status_to_errno(bio->bi_status);
- struct folio_iter fi;
-
- bio_for_each_folio_all(fi, bio)
- iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
- bio_put(bio);
-}
-
-struct iomap_readpage_ctx {
- struct folio *cur_folio;
- bool cur_folio_in_bio;
- struct bio *bio;
- struct readahead_control *rac;
-};
-
/**
* iomap_read_inline_data - copy inline data into the page cache
* @iter: iteration structure
@@ -371,7 +314,6 @@ static int iomap_readpage_iter(struct iomap_iter *iter,
struct folio *folio = ctx->cur_folio;
struct iomap_folio_state *ifs;
size_t poff, plen;
- sector_t sector;
int ret;
if (iomap->type == IOMAP_INLINE) {
@@ -394,43 +336,17 @@ static int iomap_readpage_iter(struct iomap_iter *iter,
}
ctx->cur_folio_in_bio = true;
+
+ ret = iomap_bio_readpage(iomap, pos, ctx, poff, plen, length);
+ if (ret)
+ return ret;
+
if (ifs) {
spin_lock_irq(&ifs->state_lock);
ifs->read_bytes_pending += plen;
spin_unlock_irq(&ifs->state_lock);
}
- sector = iomap_sector(iomap, pos);
- if (!ctx->bio ||
- bio_end_sector(ctx->bio) != sector ||
- !bio_add_folio(ctx->bio, folio, plen, poff)) {
- gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
- gfp_t orig_gfp = gfp;
- unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
-
- if (ctx->bio)
- submit_bio(ctx->bio);
-
- if (ctx->rac) /* same as readahead_gfp_mask */
- gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
- REQ_OP_READ, gfp);
- /*
- * If the bio_alloc fails, try it again for a single page to
- * avoid having to deal with partial page reads. This emulates
- * what do_mpage_read_folio does.
- */
- if (!ctx->bio) {
- ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
- orig_gfp);
- }
- if (ctx->rac)
- ctx->bio->bi_opf |= REQ_RAHEAD;
- ctx->bio->bi_iter.bi_sector = sector;
- ctx->bio->bi_end_io = iomap_read_end_io;
- bio_add_folio_nofail(ctx->bio, folio, plen, poff);
- }
-
done:
/*
* Move the caller beyond our range so that it keeps making progress.
@@ -474,7 +390,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
iter.status = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
- submit_bio(ctx.bio);
+ iomap_submit_bio(ctx.bio);
WARN_ON_ONCE(!ctx.cur_folio_in_bio);
} else {
WARN_ON_ONCE(ctx.cur_folio_in_bio);
@@ -546,7 +462,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
iter.status = iomap_readahead_iter(&iter, &ctx);
if (ctx.bio)
- submit_bio(ctx.bio);
+ iomap_submit_bio(ctx.bio);
if (ctx.cur_folio) {
if (!ctx.cur_folio_in_bio)
folio_unlock(ctx.cur_folio);
@@ -670,13 +586,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
size_t poff, size_t plen, const struct iomap *iomap)
{
- struct bio_vec bvec;
- struct bio bio;
-
- bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_add_folio_nofail(&bio, folio, plen, poff);
- return submit_bio_wait(&bio);
+ return iomap_bio_read_folio_sync(block_start, folio, poff, plen, iomap);
}
static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
@@ -1519,58 +1429,6 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
- size_t len)
-{
- struct iomap_folio_state *ifs = folio->private;
-
- WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
-
- if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
- folio_end_writeback(folio);
-}
-
-/*
- * We're now finished for good with this ioend structure. Update the page
- * state, release holds on bios, and finally free up memory. Do not use the
- * ioend after this.
- */
-u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
-{
- struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_bio;
- struct folio_iter fi;
- u32 folio_count = 0;
-
- if (ioend->io_error) {
- mapping_set_error(inode->i_mapping, ioend->io_error);
- if (!bio_flagged(bio, BIO_QUIET)) {
- pr_err_ratelimited(
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
- inode->i_sb->s_id, inode->i_ino,
- ioend->io_offset, ioend->io_sector);
- }
- }
-
- /* walk all folios in bio, ending page IO on them */
- bio_for_each_folio_all(fi, bio) {
- iomap_finish_folio_write(inode, fi.folio, fi.length);
- folio_count++;
- }
-
- bio_put(bio); /* frees the ioend */
- return folio_count;
-}
-
-static void iomap_writepage_end_bio(struct bio *bio)
-{
- struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
-
- ioend->io_error = blk_status_to_errno(bio->bi_status);
- iomap_finish_ioend_buffered(ioend);
-}
-
/*
* Submit an ioend.
*
@@ -1580,7 +1438,7 @@ static void iomap_writepage_end_bio(struct bio *bio)
* with the error status here to run the normal I/O completion handler to clear
* the writeback bit and let the file system process the errors.
*/
-static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
+int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
if (!wpc->ioend)
return error;
@@ -1597,151 +1455,16 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
error = -EIO;
if (!error)
- submit_bio(&wpc->ioend->io_bio);
+ iomap_submit_bio(&wpc->ioend->io_bio);
}
- if (error) {
- wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
- bio_endio(&wpc->ioend->io_bio);
- }
+ if (error)
+ iomap_bio_ioend_error(wpc, error);
wpc->ioend = NULL;
return error;
}
-static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos,
- u16 ioend_flags)
-{
- struct bio *bio;
-
- bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
- REQ_OP_WRITE | wbc_to_write_flags(wbc),
- GFP_NOFS, &iomap_ioend_bioset);
- bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
- bio->bi_end_io = iomap_writepage_end_bio;
- bio->bi_write_hint = inode->i_write_hint;
- wbc_init_bio(wbc, bio);
- wpc->nr_folios = 0;
- return iomap_init_ioend(inode, bio, pos, ioend_flags);
-}
-
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
- u16 ioend_flags)
-{
- if (ioend_flags & IOMAP_IOEND_BOUNDARY)
- return false;
- if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
- (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
- return false;
- if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
- return false;
- if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
- iomap_sector(&wpc->iomap, pos) !=
- bio_end_sector(&wpc->ioend->io_bio))
- return false;
- /*
- * Limit ioend bio chain lengths to minimise IO completion latency. This
- * also prevents long tight loops ending page writeback on all the
- * folios in the ioend.
- */
- if (wpc->nr_folios >= IOEND_BATCH_SIZE)
- return false;
- return true;
-}
-
-/*
- * Test to see if we have an existing ioend structure that we could append to
- * first; otherwise finish off the current ioend and start another.
- *
- * If a new ioend is created and cached, the old ioend is submitted to the block
- * layer instantly. Batching optimisations are provided by higher level block
- * plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, loff_t pos, loff_t end_pos,
- unsigned len)
-{
- struct iomap_folio_state *ifs = folio->private;
- size_t poff = offset_in_folio(folio, pos);
- unsigned int ioend_flags = 0;
- int error;
-
- if (wpc->iomap.type == IOMAP_UNWRITTEN)
- ioend_flags |= IOMAP_IOEND_UNWRITTEN;
- if (wpc->iomap.flags & IOMAP_F_SHARED)
- ioend_flags |= IOMAP_IOEND_SHARED;
- if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
- ioend_flags |= IOMAP_IOEND_BOUNDARY;
-
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
-new_ioend:
- error = iomap_submit_ioend(wpc, 0);
- if (error)
- return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
- ioend_flags);
- }
-
- if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
- goto new_ioend;
-
- if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
-
- /*
- * Clamp io_offset and io_size to the incore EOF so that ondisk
- * file size updates in the ioend completion are byte-accurate.
- * This avoids recovering files with zeroed tail regions when
- * writeback races with appending writes:
- *
- * Thread 1: Thread 2:
- * ------------ -----------
- * write [A, A+B]
- * update inode size to A+B
- * submit I/O [A, A+BS]
- * write [A+B, A+B+C]
- * update inode size to A+B+C
- * <I/O completes, updates disk size to min(A+B+C, A+BS)>
- * <power failure>
- *
- * After reboot:
- * 1) with A+B+C < A+BS, the file has zero padding in range
- * [A+B, A+B+C]
- *
- * |< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|
- * ^ ^ ^
- * A A+B A+B+C
- * (EOF)
- *
- * 2) with A+B+C > A+BS, the file has zero padding in range
- * [A+B, A+BS]
- *
- * |< Block Size (BS) >|< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
- * ^ ^ ^ ^
- * A A+B A+BS A+B+C
- * (EOF)
- *
- * D = Valid Data
- * 0 = Zero Padding
- *
- * Note that this defeats the ability to chain the ioends of
- * appending writes.
- */
- wpc->ioend->io_size += len;
- if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
- wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
-
- wbc_account_cgroup_owner(wbc, folio, len);
- return 0;
-}
-
static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct folio *folio,
struct inode *inode, u64 pos, u64 end_pos,
@@ -1769,8 +1492,8 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
case IOMAP_HOLE:
break;
default:
- error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
- end_pos, map_len);
+ error = iomap_bio_add_to_ioend(wpc, wbc, folio, inode,
+ pos, end_pos, map_len);
if (!error)
(*count)++;
break;
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
index f6992a3bf66a..8e59950ad8d6 100644
--- a/fs/iomap/internal.h
+++ b/fs/iomap/internal.h
@@ -4,7 +4,53 @@
#define IOEND_BATCH_SIZE 4096
+/*
+ * Structure allocated for each folio to track per-block uptodate, dirty state
+ * and I/O completions.
+ */
+struct iomap_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_bytes_pending;
+ atomic_t write_bytes_pending;
+
+ /*
+ * Each block has two bits in this bitmap:
+ * Bits [0..blocks_per_folio) has the uptodate status.
+ * Bits [b_p_f...(2*b_p_f)) has the dirty status.
+ */
+ unsigned long state[];
+};
+
+struct iomap_readpage_ctx {
+ struct folio *cur_folio;
+ bool cur_folio_in_bio;
+ struct bio *bio;
+ struct readahead_control *rac;
+};
+
u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+bool ifs_set_range_uptodate(struct folio *folio, struct iomap_folio_state *ifs,
+ size_t off, size_t len);
+int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error);
+
+#ifdef CONFIG_BLOCK
+int iomap_bio_readpage(const struct iomap *iomap, loff_t pos,
+ struct iomap_readpage_ctx *ctx, size_t poff,
+ size_t plen, loff_t length);
+void iomap_submit_bio(struct bio *bio);
+int iomap_bio_read_folio_sync(loff_t block_start, struct folio *folio, size_t poff,
+ size_t plen, const struct iomap *iomap);
+void iomap_bio_ioend_error(struct iomap_writepage_ctx *wpc, int error);
+int iomap_bio_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc,
+ struct folio *folio, struct inode *inode, loff_t pos,
+ loff_t end_pos, unsigned len);
+#else
+#define iomap_bio_readpage(...) (-ENOSYS)
+#define iomap_submit_bio(...) ((void)0)
+#define iomap_bio_read_folio_sync(...) (-ENOSYS)
+#define iomap_bio_ioend_error(...) ((void)0)
+#define iomap_bio_add_to_ioend(...) (-ENOSYS)
+#endif /* CONFIG_BLOCK */
#endif /* _IOMAP_INTERNAL_H */
--
2.47.1