* [PATCH v1 1/8] iomap: move buffered io bio logic into separate file
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
@ 2025-06-06 23:37 ` Joanne Koong
2025-06-08 19:17 ` Anuj gupta
2025-06-09 4:44 ` Christoph Hellwig
2025-06-06 23:37 ` [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type Joanne Koong
` (9 subsequent siblings)
10 siblings, 2 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-06 23:37 UTC (permalink / raw)
To: miklos
Cc: djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
Move buffered io bio logic into a separate file, buffered-io-bio.c, so
that callers that do not have CONFIG_BLOCK set and do not depend on bios
can also use iomap and hook into some of its internal features such as
granular dirty tracking for large folios.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/iomap/Makefile | 5 +-
fs/iomap/buffered-io-bio.c | 291 ++++++++++++++++++++++++++++++++++
fs/iomap/buffered-io.c | 309 ++-----------------------------------
fs/iomap/internal.h | 46 ++++++
4 files changed, 356 insertions(+), 295 deletions(-)
create mode 100644 fs/iomap/buffered-io-bio.c
diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile
index 69e8ebb41302..477d78c58807 100644
--- a/fs/iomap/Makefile
+++ b/fs/iomap/Makefile
@@ -9,8 +9,9 @@ ccflags-y += -I $(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o
iomap-y += trace.o \
- iter.o
-iomap-$(CONFIG_BLOCK) += buffered-io.o \
+ iter.o \
+ buffered-io.o
+iomap-$(CONFIG_BLOCK) += buffered-io-bio.o \
direct-io.o \
ioend.o \
fiemap.o \
diff --git a/fs/iomap/buffered-io-bio.c b/fs/iomap/buffered-io-bio.c
new file mode 100644
index 000000000000..03841bed72e7
--- /dev/null
+++ b/fs/iomap/buffered-io-bio.c
@@ -0,0 +1,291 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
+#include <linux/iomap.h>
+#include <linux/writeback.h>
+
+#include "internal.h"
+
+static void iomap_finish_folio_read(struct folio *folio, size_t off,
+ size_t len, int error)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ bool uptodate = !error;
+ bool finished = true;
+
+ if (ifs) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ifs->state_lock, flags);
+ if (!error)
+ uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+ ifs->read_bytes_pending -= len;
+ finished = !ifs->read_bytes_pending;
+ spin_unlock_irqrestore(&ifs->state_lock, flags);
+ }
+
+ if (finished)
+ folio_end_read(folio, uptodate);
+}
+
+static void iomap_read_end_io(struct bio *bio)
+{
+ int error = blk_status_to_errno(bio->bi_status);
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio)
+ iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
+ bio_put(bio);
+}
+
+int iomap_bio_readpage(const struct iomap *iomap, loff_t pos,
+ struct iomap_readpage_ctx *ctx,
+ size_t poff, size_t plen,
+ loff_t length)
+{
+ struct folio *folio = ctx->cur_folio;
+ sector_t sector;
+
+ sector = iomap_sector(iomap, pos);
+ if (!ctx->bio ||
+ bio_end_sector(ctx->bio) != sector ||
+ !bio_add_folio(ctx->bio, folio, plen, poff)) {
+ gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
+ gfp_t orig_gfp = gfp;
+ unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
+
+ if (ctx->bio)
+ submit_bio(ctx->bio);
+
+ if (ctx->rac) /* same as readahead_gfp_mask */
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
+ REQ_OP_READ, gfp);
+ /*
+ * If the bio_alloc fails, try it again for a single page to
+ * avoid having to deal with partial page reads. This emulates
+ * what do_mpage_read_folio does.
+ */
+ if (!ctx->bio) {
+ ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
+ orig_gfp);
+ }
+ if (ctx->rac)
+ ctx->bio->bi_opf |= REQ_RAHEAD;
+ ctx->bio->bi_iter.bi_sector = sector;
+ ctx->bio->bi_end_io = iomap_read_end_io;
+ bio_add_folio_nofail(ctx->bio, folio, plen, poff);
+ }
+ return 0;
+}
+
+void iomap_submit_bio(struct bio *bio)
+{
+ submit_bio(bio);
+}
+
+int iomap_bio_read_folio_sync(loff_t block_start, struct folio *folio, size_t poff,
+ size_t plen, const struct iomap *iomap) {
+ struct bio_vec bvec;
+ struct bio bio;
+
+ bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
+ bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
+ bio_add_folio_nofail(&bio, folio, plen, poff);
+ return submit_bio_wait(&bio);
+}
+
+static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
+ size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
+
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+ folio_end_writeback(folio);
+}
+
+/*
+ * We're now finished for good with this ioend structure. Update the page
+ * state, release holds on bios, and finally free up memory. Do not use the
+ * ioend after this.
+ */
+u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
+{
+ struct inode *inode = ioend->io_inode;
+ struct bio *bio = &ioend->io_bio;
+ struct folio_iter fi;
+ u32 folio_count = 0;
+
+ if (ioend->io_error) {
+ mapping_set_error(inode->i_mapping, ioend->io_error);
+ if (!bio_flagged(bio, BIO_QUIET)) {
+ pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino,
+ ioend->io_offset, ioend->io_sector);
+ }
+ }
+
+ /* walk all folios in bio, ending page IO on them */
+ bio_for_each_folio_all(fi, bio) {
+ iomap_finish_folio_write(inode, fi.folio, fi.length);
+ folio_count++;
+ }
+
+ bio_put(bio); /* frees the ioend */
+ return folio_count;
+}
+
+static void iomap_writepage_end_bio(struct bio *bio)
+{
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
+
+ ioend->io_error = blk_status_to_errno(bio->bi_status);
+ iomap_finish_ioend_buffered(ioend);
+}
+
+void iomap_bio_ioend_error(struct iomap_writepage_ctx *wpc, int error)
+{
+ wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&wpc->ioend->io_bio);
+}
+
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc,
+ struct inode *inode, loff_t pos,
+ u16 ioend_flags)
+{
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
+ REQ_OP_WRITE | wbc_to_write_flags(wbc),
+ GFP_NOFS, &iomap_ioend_bioset);
+ bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+ bio->bi_end_io = iomap_writepage_end_bio;
+ bio->bi_write_hint = inode->i_write_hint;
+ wbc_init_bio(wbc, bio);
+ wpc->nr_folios = 0;
+ return iomap_init_ioend(inode, bio, pos, ioend_flags);
+}
+
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
+ u16 ioend_flags)
+{
+ if (ioend_flags & IOMAP_IOEND_BOUNDARY)
+ return false;
+ if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
+ (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
+ return false;
+ if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
+ return false;
+ if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
+ iomap_sector(&wpc->iomap, pos) !=
+ bio_end_sector(&wpc->ioend->io_bio))
+ return false;
+ /*
+ * Limit ioend bio chain lengths to minimise IO completion latency. This
+ * also prevents long tight loops ending page writeback on all the
+ * folios in the ioend.
+ */
+ if (wpc->nr_folios >= IOEND_BATCH_SIZE)
+ return false;
+ return true;
+}
+
+/*
+ * Test to see if we have an existing ioend structure that we could append to
+ * first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly. Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
+ */
+int iomap_bio_add_to_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, loff_t pos, loff_t end_pos,
+ unsigned len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ size_t poff = offset_in_folio(folio, pos);
+ unsigned int ioend_flags = 0;
+ int error;
+
+ if (wpc->iomap.type == IOMAP_UNWRITTEN)
+ ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+ if (wpc->iomap.flags & IOMAP_F_SHARED)
+ ioend_flags |= IOMAP_IOEND_SHARED;
+ if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
+ ioend_flags |= IOMAP_IOEND_BOUNDARY;
+
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
+new_ioend:
+ error = iomap_submit_ioend(wpc, 0);
+ if (error)
+ return error;
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
+ ioend_flags);
+ if (!wpc->ioend)
+ return -ENOTSUPP;
+ }
+
+ if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+ goto new_ioend;
+
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
+
+ /*
+ * Clamp io_offset and io_size to the incore EOF so that ondisk
+ * file size updates in the ioend completion are byte-accurate.
+ * This avoids recovering files with zeroed tail regions when
+ * writeback races with appending writes:
+ *
+ * Thread 1: Thread 2:
+ * ------------ -----------
+ * write [A, A+B]
+ * update inode size to A+B
+ * submit I/O [A, A+BS]
+ * write [A+B, A+B+C]
+ * update inode size to A+B+C
+ * <I/O completes, updates disk size to min(A+B+C, A+BS)>
+ * <power failure>
+ *
+ * After reboot:
+ * 1) with A+B+C < A+BS, the file has zero padding in range
+ * [A+B, A+B+C]
+ *
+ * |< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|
+ * ^ ^ ^
+ * A A+B A+B+C
+ * (EOF)
+ *
+ * 2) with A+B+C > A+BS, the file has zero padding in range
+ * [A+B, A+BS]
+ *
+ * |< Block Size (BS) >|< Block Size (BS) >|
+ * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
+ * ^ ^ ^ ^
+ * A A+B A+BS A+B+C
+ * (EOF)
+ *
+ * D = Valid Data
+ * 0 = Zero Padding
+ *
+ * Note that this defeats the ability to chain the ioends of
+ * appending writes.
+ */
+ wpc->ioend->io_size += len;
+ if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
+ wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
+
+ wbc_account_cgroup_owner(wbc, folio, len);
+ return 0;
+}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 31553372b33a..1caeb4921035 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -13,7 +13,6 @@
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/swap.h>
-#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "internal.h"
@@ -21,23 +20,6 @@
#include "../internal.h"
-/*
- * Structure allocated for each folio to track per-block uptodate, dirty state
- * and I/O completions.
- */
-struct iomap_folio_state {
- spinlock_t state_lock;
- unsigned int read_bytes_pending;
- atomic_t write_bytes_pending;
-
- /*
- * Each block has two bits in this bitmap:
- * Bits [0..blocks_per_folio) has the uptodate status.
- * Bits [b_p_f...(2*b_p_f)) has the dirty status.
- */
- unsigned long state[];
-};
-
static inline bool ifs_is_fully_uptodate(struct folio *folio,
struct iomap_folio_state *ifs)
{
@@ -52,8 +34,8 @@ static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
return test_bit(block, ifs->state);
}
-static bool ifs_set_range_uptodate(struct folio *folio,
- struct iomap_folio_state *ifs, size_t off, size_t len)
+bool ifs_set_range_uptodate(struct folio *folio, struct iomap_folio_state *ifs,
+ size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
unsigned int first_blk = off >> inode->i_blkbits;
@@ -284,45 +266,6 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
*lenp = plen;
}
-static void iomap_finish_folio_read(struct folio *folio, size_t off,
- size_t len, int error)
-{
- struct iomap_folio_state *ifs = folio->private;
- bool uptodate = !error;
- bool finished = true;
-
- if (ifs) {
- unsigned long flags;
-
- spin_lock_irqsave(&ifs->state_lock, flags);
- if (!error)
- uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
- ifs->read_bytes_pending -= len;
- finished = !ifs->read_bytes_pending;
- spin_unlock_irqrestore(&ifs->state_lock, flags);
- }
-
- if (finished)
- folio_end_read(folio, uptodate);
-}
-
-static void iomap_read_end_io(struct bio *bio)
-{
- int error = blk_status_to_errno(bio->bi_status);
- struct folio_iter fi;
-
- bio_for_each_folio_all(fi, bio)
- iomap_finish_folio_read(fi.folio, fi.offset, fi.length, error);
- bio_put(bio);
-}
-
-struct iomap_readpage_ctx {
- struct folio *cur_folio;
- bool cur_folio_in_bio;
- struct bio *bio;
- struct readahead_control *rac;
-};
-
/**
* iomap_read_inline_data - copy inline data into the page cache
* @iter: iteration structure
@@ -371,7 +314,6 @@ static int iomap_readpage_iter(struct iomap_iter *iter,
struct folio *folio = ctx->cur_folio;
struct iomap_folio_state *ifs;
size_t poff, plen;
- sector_t sector;
int ret;
if (iomap->type == IOMAP_INLINE) {
@@ -394,43 +336,17 @@ static int iomap_readpage_iter(struct iomap_iter *iter,
}
ctx->cur_folio_in_bio = true;
+
+ ret = iomap_bio_readpage(iomap, pos, ctx, poff, plen, length);
+ if (ret)
+ return ret;
+
if (ifs) {
spin_lock_irq(&ifs->state_lock);
ifs->read_bytes_pending += plen;
spin_unlock_irq(&ifs->state_lock);
}
- sector = iomap_sector(iomap, pos);
- if (!ctx->bio ||
- bio_end_sector(ctx->bio) != sector ||
- !bio_add_folio(ctx->bio, folio, plen, poff)) {
- gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
- gfp_t orig_gfp = gfp;
- unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE);
-
- if (ctx->bio)
- submit_bio(ctx->bio);
-
- if (ctx->rac) /* same as readahead_gfp_mask */
- gfp |= __GFP_NORETRY | __GFP_NOWARN;
- ctx->bio = bio_alloc(iomap->bdev, bio_max_segs(nr_vecs),
- REQ_OP_READ, gfp);
- /*
- * If the bio_alloc fails, try it again for a single page to
- * avoid having to deal with partial page reads. This emulates
- * what do_mpage_read_folio does.
- */
- if (!ctx->bio) {
- ctx->bio = bio_alloc(iomap->bdev, 1, REQ_OP_READ,
- orig_gfp);
- }
- if (ctx->rac)
- ctx->bio->bi_opf |= REQ_RAHEAD;
- ctx->bio->bi_iter.bi_sector = sector;
- ctx->bio->bi_end_io = iomap_read_end_io;
- bio_add_folio_nofail(ctx->bio, folio, plen, poff);
- }
-
done:
/*
* Move the caller beyond our range so that it keeps making progress.
@@ -474,7 +390,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops)
iter.status = iomap_read_folio_iter(&iter, &ctx);
if (ctx.bio) {
- submit_bio(ctx.bio);
+ iomap_submit_bio(ctx.bio);
WARN_ON_ONCE(!ctx.cur_folio_in_bio);
} else {
WARN_ON_ONCE(ctx.cur_folio_in_bio);
@@ -546,7 +462,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
iter.status = iomap_readahead_iter(&iter, &ctx);
if (ctx.bio)
- submit_bio(ctx.bio);
+ iomap_submit_bio(ctx.bio);
if (ctx.cur_folio) {
if (!ctx.cur_folio_in_bio)
folio_unlock(ctx.cur_folio);
@@ -670,13 +586,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
size_t poff, size_t plen, const struct iomap *iomap)
{
- struct bio_vec bvec;
- struct bio bio;
-
- bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_add_folio_nofail(&bio, folio, plen, poff);
- return submit_bio_wait(&bio);
+ return iomap_bio_read_folio_sync(block_start, folio, poff, plen, iomap);
}
static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
@@ -1519,58 +1429,6 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
- size_t len)
-{
- struct iomap_folio_state *ifs = folio->private;
-
- WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
-
- if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
- folio_end_writeback(folio);
-}
-
-/*
- * We're now finished for good with this ioend structure. Update the page
- * state, release holds on bios, and finally free up memory. Do not use the
- * ioend after this.
- */
-u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend)
-{
- struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_bio;
- struct folio_iter fi;
- u32 folio_count = 0;
-
- if (ioend->io_error) {
- mapping_set_error(inode->i_mapping, ioend->io_error);
- if (!bio_flagged(bio, BIO_QUIET)) {
- pr_err_ratelimited(
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
- inode->i_sb->s_id, inode->i_ino,
- ioend->io_offset, ioend->io_sector);
- }
- }
-
- /* walk all folios in bio, ending page IO on them */
- bio_for_each_folio_all(fi, bio) {
- iomap_finish_folio_write(inode, fi.folio, fi.length);
- folio_count++;
- }
-
- bio_put(bio); /* frees the ioend */
- return folio_count;
-}
-
-static void iomap_writepage_end_bio(struct bio *bio)
-{
- struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
-
- ioend->io_error = blk_status_to_errno(bio->bi_status);
- iomap_finish_ioend_buffered(ioend);
-}
-
/*
* Submit an ioend.
*
@@ -1580,7 +1438,7 @@ static void iomap_writepage_end_bio(struct bio *bio)
* with the error status here to run the normal I/O completion handler to clear
* the writeback bit and let the file system process the errors.
*/
-static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
+int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
if (!wpc->ioend)
return error;
@@ -1597,151 +1455,16 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE))
error = -EIO;
if (!error)
- submit_bio(&wpc->ioend->io_bio);
+ iomap_submit_bio(&wpc->ioend->io_bio);
}
- if (error) {
- wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
- bio_endio(&wpc->ioend->io_bio);
- }
+ if (error)
+ iomap_bio_ioend_error(wpc, error);
wpc->ioend = NULL;
return error;
}
-static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode, loff_t pos,
- u16 ioend_flags)
-{
- struct bio *bio;
-
- bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
- REQ_OP_WRITE | wbc_to_write_flags(wbc),
- GFP_NOFS, &iomap_ioend_bioset);
- bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
- bio->bi_end_io = iomap_writepage_end_bio;
- bio->bi_write_hint = inode->i_write_hint;
- wbc_init_bio(wbc, bio);
- wpc->nr_folios = 0;
- return iomap_init_ioend(inode, bio, pos, ioend_flags);
-}
-
-static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos,
- u16 ioend_flags)
-{
- if (ioend_flags & IOMAP_IOEND_BOUNDARY)
- return false;
- if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) !=
- (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS))
- return false;
- if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
- return false;
- if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) &&
- iomap_sector(&wpc->iomap, pos) !=
- bio_end_sector(&wpc->ioend->io_bio))
- return false;
- /*
- * Limit ioend bio chain lengths to minimise IO completion latency. This
- * also prevents long tight loops ending page writeback on all the
- * folios in the ioend.
- */
- if (wpc->nr_folios >= IOEND_BATCH_SIZE)
- return false;
- return true;
-}
-
-/*
- * Test to see if we have an existing ioend structure that we could append to
- * first; otherwise finish off the current ioend and start another.
- *
- * If a new ioend is created and cached, the old ioend is submitted to the block
- * layer instantly. Batching optimisations are provided by higher level block
- * plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, loff_t pos, loff_t end_pos,
- unsigned len)
-{
- struct iomap_folio_state *ifs = folio->private;
- size_t poff = offset_in_folio(folio, pos);
- unsigned int ioend_flags = 0;
- int error;
-
- if (wpc->iomap.type == IOMAP_UNWRITTEN)
- ioend_flags |= IOMAP_IOEND_UNWRITTEN;
- if (wpc->iomap.flags & IOMAP_F_SHARED)
- ioend_flags |= IOMAP_IOEND_SHARED;
- if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY))
- ioend_flags |= IOMAP_IOEND_BOUNDARY;
-
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) {
-new_ioend:
- error = iomap_submit_ioend(wpc, 0);
- if (error)
- return error;
- wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos,
- ioend_flags);
- }
-
- if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
- goto new_ioend;
-
- if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
-
- /*
- * Clamp io_offset and io_size to the incore EOF so that ondisk
- * file size updates in the ioend completion are byte-accurate.
- * This avoids recovering files with zeroed tail regions when
- * writeback races with appending writes:
- *
- * Thread 1: Thread 2:
- * ------------ -----------
- * write [A, A+B]
- * update inode size to A+B
- * submit I/O [A, A+BS]
- * write [A+B, A+B+C]
- * update inode size to A+B+C
- * <I/O completes, updates disk size to min(A+B+C, A+BS)>
- * <power failure>
- *
- * After reboot:
- * 1) with A+B+C < A+BS, the file has zero padding in range
- * [A+B, A+B+C]
- *
- * |< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|
- * ^ ^ ^
- * A A+B A+B+C
- * (EOF)
- *
- * 2) with A+B+C > A+BS, the file has zero padding in range
- * [A+B, A+BS]
- *
- * |< Block Size (BS) >|< Block Size (BS) >|
- * |DDDDDDDDDDDD0000000000000|00000000000000000000000000|
- * ^ ^ ^ ^
- * A A+B A+BS A+B+C
- * (EOF)
- *
- * D = Valid Data
- * 0 = Zero Padding
- *
- * Note that this defeats the ability to chain the ioends of
- * appending writes.
- */
- wpc->ioend->io_size += len;
- if (wpc->ioend->io_offset + wpc->ioend->io_size > end_pos)
- wpc->ioend->io_size = end_pos - wpc->ioend->io_offset;
-
- wbc_account_cgroup_owner(wbc, folio, len);
- return 0;
-}
-
static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct folio *folio,
struct inode *inode, u64 pos, u64 end_pos,
@@ -1769,8 +1492,8 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
case IOMAP_HOLE:
break;
default:
- error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
- end_pos, map_len);
+ error = iomap_bio_add_to_ioend(wpc, wbc, folio, inode,
+ pos, end_pos, map_len);
if (!error)
(*count)++;
break;
diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h
index f6992a3bf66a..8e59950ad8d6 100644
--- a/fs/iomap/internal.h
+++ b/fs/iomap/internal.h
@@ -4,7 +4,53 @@
#define IOEND_BATCH_SIZE 4096
+/*
+ * Structure allocated for each folio to track per-block uptodate, dirty state
+ * and I/O completions.
+ */
+struct iomap_folio_state {
+ spinlock_t state_lock;
+ unsigned int read_bytes_pending;
+ atomic_t write_bytes_pending;
+
+ /*
+ * Each block has two bits in this bitmap:
+ * Bits [0..blocks_per_folio) has the uptodate status.
+ * Bits [b_p_f...(2*b_p_f)) has the dirty status.
+ */
+ unsigned long state[];
+};
+
+struct iomap_readpage_ctx {
+ struct folio *cur_folio;
+ bool cur_folio_in_bio;
+ struct bio *bio;
+ struct readahead_control *rac;
+};
+
u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend);
u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend);
+bool ifs_set_range_uptodate(struct folio *folio, struct iomap_folio_state *ifs,
+ size_t off, size_t len);
+int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error);
+
+#ifdef CONFIG_BLOCK
+int iomap_bio_readpage(const struct iomap *iomap, loff_t pos,
+ struct iomap_readpage_ctx *ctx, size_t poff,
+ size_t plen, loff_t length);
+void iomap_submit_bio(struct bio *bio);
+int iomap_bio_read_folio_sync(loff_t block_start, struct folio *folio, size_t poff,
+ size_t plen, const struct iomap *iomap);
+void iomap_bio_ioend_error(struct iomap_writepage_ctx *wpc, int error);
+int iomap_bio_add_to_ioend(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc,
+ struct folio *folio, struct inode *inode, loff_t pos,
+ loff_t end_pos, unsigned len);
+#else
+#define iomap_bio_readpage(...) (-ENOSYS)
+#define iomap_submit_bio(...) ((void)0)
+#define iomap_bio_read_folio_sync(...) (-ENOSYS)
+#define iomap_bio_ioend_error(...) ((void)0)
+#define iomap_bio_add_to_ioend(...) (-ENOSYS)
+#endif /* CONFIG_BLOCK */
#endif /* _IOMAP_INTERNAL_H */
--
2.47.1
^ permalink raw reply related [flat|nested] 70+ messages in thread
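For readers skimming the diff: the !CONFIG_BLOCK fallback in the
internal.h hunk works purely by macro substitution. A minimal
illustration, assuming the stubs exactly as posted (the call site is
taken from the patch; the expansion comments are annotation, not code
from the series):

/* With CONFIG_BLOCK set, this calls into buffered-io-bio.c: */
ret = iomap_bio_readpage(iomap, pos, ctx, poff, plen, length);
if (ret)
	return ret;
/*
 * With CONFIG_BLOCK unset, the variadic stub
 *	#define iomap_bio_readpage(...)	(-ENOSYS)
 * swallows its arguments, so the same call preprocesses to
 *	ret = (-ENOSYS);
 * and no bio symbols are referenced at all.
 */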
* Re: [PATCH v1 1/8] iomap: move buffered io bio logic into separate file
2025-06-06 23:37 ` [PATCH v1 1/8] iomap: move buffered io bio logic into separate file Joanne Koong
@ 2025-06-08 19:17 ` Anuj gupta
2025-06-09 4:44 ` Christoph Hellwig
1 sibling, 0 replies; 70+ messages in thread
From: Anuj gupta @ 2025-06-08 19:17 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team, Anuj Gupta
> +
> +int iomap_bio_read_folio_sync(loff_t block_start, struct folio *folio, size_t poff,
> + size_t plen, const struct iomap *iomap) {
Nit: the { should go on the next line.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 1/8] iomap: move buffered io bio logic into separate file
2025-06-06 23:37 ` [PATCH v1 1/8] iomap: move buffered io bio logic into separate file Joanne Koong
2025-06-08 19:17 ` Anuj gupta
@ 2025-06-09 4:44 ` Christoph Hellwig
2025-06-09 20:01 ` Joanne Koong
1 sibling, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-09 4:44 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
> +++ b/fs/iomap/buffered-io-bio.c
> @@ -0,0 +1,291 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/bio.h>
Please keep the copyrights from the original file.
From a quick look the split looks a bit odd. But I'll wait for
a version that I can apply and look at the result before judging
it.
> +void iomap_submit_bio(struct bio *bio)
> +{
> + submit_bio(bio);
> +}
This is an entirely new function and not just a code movement.
Please add new abstractions independent of code movement.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 1/8] iomap: move buffered io bio logic into separate file
2025-06-09 4:44 ` Christoph Hellwig
@ 2025-06-09 20:01 ` Joanne Koong
0 siblings, 0 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 20:01 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Sun, Jun 8, 2025 at 9:44 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> > +++ b/fs/iomap/buffered-io-bio.c
> > @@ -0,0 +1,291 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +#include <linux/bio.h>
>
> Please keep the copyrights from the original file.
>
> From a quick look the split looks a bit odd. But I'll wait for
> a version that I can apply and look at the result before judging
> it.
>
> > +void iomap_submit_bio(struct bio *bio)
> > +{
> > + submit_bio(bio);
> > +}
>
> This is an entirely new function and not just a code movement.
>
> Please add new abstractions independent of code movement.
>
I'll fix both of these in v2.
^ permalink raw reply [flat|nested] 70+ messages in thread
* [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
2025-06-06 23:37 ` [PATCH v1 1/8] iomap: move buffered io bio logic into separate file Joanne Koong
@ 2025-06-06 23:37 ` Joanne Koong
2025-06-09 4:45 ` Christoph Hellwig
2025-06-09 16:24 ` Darrick J. Wong
2025-06-06 23:37 ` [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps Joanne Koong
` (8 subsequent siblings)
10 siblings, 2 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-06 23:37 UTC (permalink / raw)
To: miklos
Cc: djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
Add a new iomap type, IOMAP_IN_MEM, that represents data that resides in
memory and does not map to or depend on the block layer and is not
embedded inline in an inode. This will be used for example by filesystems
such as FUSE where the data is in memory or needs to be fetched from a
server and is not coupled with the block layer. This lets these
filesystems use some of the internal features in iomaps such as
granular dirty tracking for large folios.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
include/linux/iomap.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 68416b135151..dbbf217eb03f 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -30,6 +30,7 @@ struct vm_fault;
#define IOMAP_MAPPED 2 /* blocks allocated at @addr */
#define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */
#define IOMAP_INLINE 4 /* data inline in the inode */
+#define IOMAP_IN_MEM 5 /* data in memory, does not map to blocks */
/*
* Flags reported by the file system from iomap_begin:
--
2.47.1
^ permalink raw reply related [flat|nested] 70+ messages in thread
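To make the new type concrete, here is a minimal sketch of a non-block
filesystem reporting an IOMAP_IN_MEM mapping from ->iomap_begin.
Everything except the IOMAP_IN_MEM value itself (the function name, the
ops instance, the field choices) is an illustrative assumption, not
code from this series:

#include <linux/iomap.h>

static int example_in_mem_iomap_begin(struct inode *inode, loff_t pos,
		loff_t length, unsigned flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	/* Data lives in memory (or on a server); nothing maps to a bdev. */
	iomap->type = IOMAP_IN_MEM;
	iomap->offset = pos;
	iomap->length = length;
	iomap->addr = IOMAP_NULL_ADDR;	/* no block address */
	iomap->bdev = NULL;
	return 0;
}

static const struct iomap_ops example_in_mem_iomap_ops = {
	.iomap_begin	= example_in_mem_iomap_begin,
};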
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-06 23:37 ` [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type Joanne Koong
@ 2025-06-09 4:45 ` Christoph Hellwig
2025-06-09 21:45 ` Joanne Koong
2025-06-09 16:24 ` Darrick J. Wong
1 sibling, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-09 4:45 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Fri, Jun 06, 2025 at 04:37:57PM -0700, Joanne Koong wrote:
> Add a new iomap type, IOMAP_IN_MEM, that represents data that resides in
> memory and does not map to or depend on the block layer and is not
> embedded inline in an inode. This will be used for example by filesystems
> such as FUSE where the data is in memory or needs to be fetched from a
> server and is not coupled with the block layer. This lets these
> filesystems use some of the internal features in iomaps such as
> granular dirty tracking for large folios.
How does this differ from the naming of the existing flags?
Nothing here explains why we need a new type vs reusing the existing
ones.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-09 4:45 ` Christoph Hellwig
@ 2025-06-09 21:45 ` Joanne Koong
2025-06-10 3:39 ` Christoph Hellwig
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 21:45 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Sun, Jun 8, 2025 at 9:45 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Fri, Jun 06, 2025 at 04:37:57PM -0700, Joanne Koong wrote:
> > Add a new iomap type, IOMAP_IN_MEM, that represents data that resides in
> > memory and does not map to or depend on the block layer and is not
> > embedded inline in an inode. This will be used for example by filesystems
> > such as FUSE where the data is in memory or needs to be fetched from a
> > server and is not coupled with the block layer. This lets these
> > filesystems use some of the internal features in iomaps such as
> > granular dirty tracking for large folios.
>
> How does this differ from the naming of the existing flags?
>
> Nothing here explains why we need a new type vs reusing the existing
> ones.
I'll update this commit message in v2.
IOMAP_INLINE is the closest in idea to what I'm looking for in that
it's completely independent from block io, but it falls short in a few
ways (described in the reply to Darrick in [1]). In terms of
implementation logic, IOMAP_MAPPED fits great but that's tied in idea
to mapped blocks and we'd be unable to make certain assertions (eg if
the iomap type doesn't use bios, the caller must provide
->read_folio_sync() and ->writeback_folio() callbacks).
[1] https://lore.kernel.org/linux-fsdevel/CAJnrk1aXtMcUsmZPCjre1u2=mDPhk5W5Jzp8HOS+ASxkby1+Lw@mail.gmail.com/T/#ma71d1befb676b948c34f170aa687738133f5907a
>
^ permalink raw reply [flat|nested] 70+ messages in thread
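A rough sketch of the bio-less callbacks mentioned above, for
concreteness only. The struct shape and both method names follow the
wording in the mail but are assumptions about a possible future
interface, not something in this series:

struct inode;
struct folio;

/* Hypothetical ops a non-bio iomap user might have to supply. */
struct iomap_mem_folio_ops {
	/* synchronously read [off, off + len) of @folio into the cache */
	int (*read_folio_sync)(struct inode *inode, struct folio *folio,
			       size_t off, size_t len);
	/* write back @len bytes of @folio starting at file offset @pos */
	int (*writeback_folio)(struct inode *inode, struct folio *folio,
			       loff_t pos, size_t len);
};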
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-09 21:45 ` Joanne Koong
@ 2025-06-10 3:39 ` Christoph Hellwig
2025-06-10 13:27 ` Christoph Hellwig
0 siblings, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 3:39 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Mon, Jun 09, 2025 at 02:45:34PM -0700, Joanne Koong wrote:
> IOMAP_INLINE is the closest in idea to what I'm looking for in that
> it's completely independent from block io, but it falls short in a few
> ways (described in the reply to Darrick in [1]). In terms of
> implementation logic, IOMAP_MAPPED fits great but that's tied in idea
> to mapped blocks and we'd be unable to make certain assertions (eg if
> the iomap type doesn't use bios, the caller must provide
> ->read_folio_sync() and ->writeback_folio() callbacks).
Well, IFF we end up with both my proposals, those two are the clean
abstractions between the generic and block-level code, and everyone
is using them. Let's see how that plays out. I'd rather have a strong
abstraction like that if we can and not add new types for what is
essentially the same.
Btw, for zoned XFS, IOMAP_MAPPED also doesn't have a block number
assigned yet and only actually does the I/O in the submit_ioend method.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-10 3:39 ` Christoph Hellwig
@ 2025-06-10 13:27 ` Christoph Hellwig
2025-06-10 20:13 ` Joanne Koong
0 siblings, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 13:27 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
So I looked into something else: what if we just use ->read_folio
despite it not seeming ideal initially? After going through with
it I think it's actually less bad than I thought. This passes
-g auto on xfs with 4k blocks, and has three regressions with 1k
blocks: two look like the seek hole testers being upset that we can't
easily create detectable sub-block holes now, and one is generic/563
thinking the cgroup accounting is off, probably because we read more
data now or something like that.
---
From c5d3cf651c815d3327199c74eac43149fc958098 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 10 Jun 2025 09:39:57 +0200
Subject: iomap: use ->read_folio instead of iomap_read_folio_sync
iomap_file_buffered_write has its own private read path for reading
in folios that are only partially overwritten, which not only adds
extra code, but also extra problems when e.g. we want reads to go
through a file system method to support checksums or RAID, or even
support non-block based file systems.
Switch to using ->read_folio instead, which has a few up- and downsides.
->read_folio always reads the entire folio and not just the start and
the tail that are not being overwritten. Historically this was seen as a
downside as it reads more data than needed. But with modern file systems
and modern storage devices this is probably a benefit. If the folio is
stored contiguously on disk, the single read will be more efficient than
two small reads on almost all current hardware. If the folio is backed by
two blocks, at least we pipeline the two reads instead of doing two
synchronous ones. And if the file system fragmented the folio so badly
that we'll now need to do more than two reads we're still at least
pipelining it, although that should basically never happen with modern
file systems.
->read_folio unlocks the folio on completion. This adds extra atomics
to the write fast path, but the actual signaling by doing a lock_page
after ->read_folio is not any slower than the completion wakeup. We
just have to recheck the mapping in this case to lock out truncates
and other mapping manipulations.
->read_folio starts another, nested, iomap iteration, with an extra
lookup of the extent at the current file position. For in-place update
file systems this is extra work, although if they use a good data
structure like the xfs iext btree there is very little overhead in
another lookup. For file system that write out of place this actually
implements the desired semantics as they don't care about the existing
data for the write iteration at all, although untangling this and
removing the srcmap member in the iomap_iter will require additional
work to turn the block zeroing and unshare helpers upside down.
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
fs/iomap/buffered-io.c | 116 ++++++++++++++++-------------------------
1 file changed, 45 insertions(+), 71 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3729391a18f3..52b4040208dd 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -667,30 +667,34 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
pos + len - 1);
}
-static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
- size_t poff, size_t plen, const struct iomap *iomap)
+/*
+ * Now that we have a locked folio, check that the iomap we have cached is not
+ * stale before we do anything.
+ *
+ * The extent mapping can change due to concurrent IO in flight, e.g. the
+ * IOMAP_UNWRITTEN state can change and memory reclaim could have reclaimed a
+ * previously partially written page at this index after IO completion before
+ * this write reaches this file offset, and hence we could do the wrong thing
+ * here (zero a page range incorrectly or fail to zero) and corrupt data.
+ */
+static bool iomap_validate(struct iomap_iter *iter)
{
- struct bio_vec bvec;
- struct bio bio;
+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
- bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
- bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
- bio_add_folio_nofail(&bio, folio, plen, poff);
- return submit_bio_wait(&bio);
+ if (folio_ops && folio_ops->iomap_valid &&
+ !folio_ops->iomap_valid(iter->inode, &iter->iomap)) {
+ iter->iomap.flags |= IOMAP_F_STALE;
+ return false;
+ }
+
+ return true;
}
-static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
+static int __iomap_write_begin(struct iomap_iter *iter, size_t len,
struct folio *folio)
{
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
+ struct inode *inode = iter->inode;
struct iomap_folio_state *ifs;
- loff_t pos = iter->pos;
- loff_t block_size = i_blocksize(iter->inode);
- loff_t block_start = round_down(pos, block_size);
- loff_t block_end = round_up(pos + len, block_size);
- unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
- size_t from = offset_in_folio(folio, pos), to = from + len;
- size_t poff, plen;
/*
* If the write or zeroing completely overlaps the current folio, then
@@ -699,45 +703,29 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
* For the unshare case, we must read in the ondisk contents because we
* are not changing pagecache contents.
*/
- if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
- pos + len >= folio_pos(folio) + folio_size(folio))
+ if (!(iter->flags & IOMAP_UNSHARE) &&
+ iter->pos <= folio_pos(folio) &&
+ iter->pos + len >= folio_pos(folio) + folio_size(folio))
return 0;
- ifs = ifs_alloc(iter->inode, folio, iter->flags);
- if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
+ ifs = ifs_alloc(inode, folio, iter->flags);
+ if ((iter->flags & IOMAP_NOWAIT) && !ifs &&
+ i_blocks_per_folio(inode, folio) > 1)
return -EAGAIN;
- if (folio_test_uptodate(folio))
- return 0;
-
- do {
- iomap_adjust_read_range(iter->inode, folio, &block_start,
- block_end - block_start, &poff, &plen);
- if (plen == 0)
- break;
+ if (!folio_test_uptodate(folio)) {
+ inode->i_mapping->a_ops->read_folio(NULL, folio);
- if (!(iter->flags & IOMAP_UNSHARE) &&
- (from <= poff || from >= poff + plen) &&
- (to <= poff || to >= poff + plen))
- continue;
-
- if (iomap_block_needs_zeroing(iter, block_start)) {
- if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
- return -EIO;
- folio_zero_segments(folio, poff, from, to, poff + plen);
- } else {
- int status;
-
- if (iter->flags & IOMAP_NOWAIT)
- return -EAGAIN;
-
- status = iomap_read_folio_sync(block_start, folio,
- poff, plen, srcmap);
- if (status)
- return status;
- }
- iomap_set_range_uptodate(folio, poff, plen);
- } while ((block_start += plen) < block_end);
+ /*
+ * ->read_folio unlocks the folio. Relock and revalidate the
+ * folio.
+ */
+ folio_lock(folio);
+ if (unlikely(folio->mapping != inode->i_mapping))
+ return 1;
+ if (unlikely(!iomap_validate(iter)))
+ return 1;
+ }
return 0;
}
@@ -803,7 +791,6 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
size_t *poffset, u64 *plen)
{
- const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
loff_t pos = iter->pos;
u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
@@ -818,28 +805,14 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
if (fatal_signal_pending(current))
return -EINTR;
+lookup_again:
folio = __iomap_get_folio(iter, len);
if (IS_ERR(folio))
return PTR_ERR(folio);
- /*
- * Now we have a locked folio, before we do anything with it we need to
- * check that the iomap we have cached is not stale. The inode extent
- * mapping can change due to concurrent IO in flight (e.g.
- * IOMAP_UNWRITTEN state can change and memory reclaim could have
- * reclaimed a previously partially written page at this index after IO
- * completion before this write reaches this file offset) and hence we
- * could do the wrong thing here (zero a page range incorrectly or fail
- * to zero) and corrupt data.
- */
- if (folio_ops && folio_ops->iomap_valid) {
- bool iomap_valid = folio_ops->iomap_valid(iter->inode,
- &iter->iomap);
- if (!iomap_valid) {
- iter->iomap.flags |= IOMAP_F_STALE;
- status = 0;
- goto out_unlock;
- }
+ if (unlikely(!iomap_validate(iter))) {
+ status = 0;
+ goto out_unlock;
}
pos = iomap_trim_folio_range(iter, folio, poffset, &len);
@@ -860,7 +833,8 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
out_unlock:
__iomap_put_folio(iter, 0, folio);
-
+ if (status == 1)
+ goto lookup_again;
return status;
}
--
2.47.2
^ permalink raw reply related [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-10 13:27 ` Christoph Hellwig
@ 2025-06-10 20:13 ` Joanne Koong
2025-06-11 4:04 ` Christoph Hellwig
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-10 20:13 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Tue, Jun 10, 2025 at 6:27 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> So I looked into something else, what if we just use ->read_folio
> despite it not seeming ideal initially? After going through with
> it I think it's actually less bad than I thought. This passes
> -g auto on xfs with 4k blocks, and has three regressions with 1k
> blocks: two look like the seek hole testers being upset that we can't
> easily create detectable sub-block holes now, and one is generic/563
> thinking the cgroup accounting is off, probably because we read more
> data now or something like that.
>
> ---
> From c5d3cf651c815d3327199c74eac43149fc958098 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Tue, 10 Jun 2025 09:39:57 +0200
> Subject: iomap: use ->read_folio instead of iomap_read_folio_sync
>
> iomap_file_buffered_write has its own private read path for reading
> in folios that are only partially overwritten, which not only adds
> extra code, but also extra problems when e.g. we want reads to go
> through a file system method to support checksums or RAID, or even
> support non-block based file systems.
>
> Switch to using ->read_folio instead, which has a few up- and downsides.
>
> ->read_folio always reads the entire folio and not just the start and
> the tail that are not being overwritten. Historically this was seen as a
> downside as it reads more data than needed. But with modern file systems
> and modern storage devices this is probably a benefit. If the folio is
> stored contiguously on disk, the single read will be more efficient than
> two small reads on almost all current hardware. If the folio is backed by
> two blocks, at least we pipeline the two reads instead of doing two
> synchronous ones. And if the file system fragmented the folio so badly
> that we'll now need to do more than two reads we're still at least
> pipelining it, although that should basically never happen with modern
> file systems.
If the filesystem wants granular folio reads, it can also just do that
itself by calling an iomap helper (eg what iomap_adjust_read_range()
is doing right now) in its ->read_folio() implementation, correct?
For fuse at least, we definitely want granular reads, since reads may
be extremely expensive (eg it may be a network fetch) and there's
non-trivial memcpy overhead incurred with fuse needing to memcpy read
buffer data from userspace back to the kernel.
>
> ->read_folio unlocks the folio on completion. This adds extra atomics
> to the write fast path, but the actual signaling by doing a lock_page
> after ->read_folio is not any slower than the completion wakeup. We
> just have to recheck the mapping in this case to lock out truncates
> and other mapping manipulations.
>
> ->read_folio starts another, nested, iomap iteration, with an extra
> lookup of the extent at the current file position. For in-place update
> file systems this is extra work, although if they use a good data
> structure like the xfs iext btree there is very little overhead in
> another lookup. For file systems that write out of place this actually
> implements the desired semantics as they don't care about the existing
> data for the write iteration at all, although untangling this and
> removing the srcmap member in the iomap_iter will require additional
> work to turn the block zeroing and unshare helpers upside down.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> fs/iomap/buffered-io.c | 116 ++++++++++++++++-------------------------
> 1 file changed, 45 insertions(+), 71 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 3729391a18f3..52b4040208dd 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -667,30 +667,34 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
> pos + len - 1);
> }
>
> -static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
> - size_t poff, size_t plen, const struct iomap *iomap)
> +/*
> + * Now that we have a locked folio, check that the iomap we have cached is not
> + * stale before we do anything.
> + *
> + * The extent mapping can change due to concurrent IO in flight, e.g. the
> + * IOMAP_UNWRITTEN state can change and memory reclaim could have reclaimed a
> + * previously partially written page at this index after IO completion before
> + * this write reaches this file offset, and hence we could do the wrong thing
> + * here (zero a page range incorrectly or fail to zero) and corrupt data.
> + */
> +static bool iomap_validate(struct iomap_iter *iter)
> {
> - struct bio_vec bvec;
> - struct bio bio;
> + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
>
> - bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ);
> - bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
> - bio_add_folio_nofail(&bio, folio, plen, poff);
> - return submit_bio_wait(&bio);
> + if (folio_ops && folio_ops->iomap_valid &&
> + !folio_ops->iomap_valid(iter->inode, &iter->iomap)) {
> + iter->iomap.flags |= IOMAP_F_STALE;
> + return false;
> + }
> +
> + return true;
> }
>
> -static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
> +static int __iomap_write_begin(struct iomap_iter *iter, size_t len,
> struct folio *folio)
> {
> - const struct iomap *srcmap = iomap_iter_srcmap(iter);
> + struct inode *inode = iter->inode;
> struct iomap_folio_state *ifs;
> - loff_t pos = iter->pos;
> - loff_t block_size = i_blocksize(iter->inode);
> - loff_t block_start = round_down(pos, block_size);
> - loff_t block_end = round_up(pos + len, block_size);
> - unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
> - size_t from = offset_in_folio(folio, pos), to = from + len;
> - size_t poff, plen;
>
> /*
> * If the write or zeroing completely overlaps the current folio, then
> @@ -699,45 +703,29 @@ static int __iomap_write_begin(const struct iomap_iter *iter, size_t len,
> * For the unshare case, we must read in the ondisk contents because we
> * are not changing pagecache contents.
> */
> - if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) &&
> - pos + len >= folio_pos(folio) + folio_size(folio))
> + if (!(iter->flags & IOMAP_UNSHARE) &&
> + iter->pos <= folio_pos(folio) &&
> + iter->pos + len >= folio_pos(folio) + folio_size(folio))
> return 0;
>
> - ifs = ifs_alloc(iter->inode, folio, iter->flags);
> - if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1)
> + ifs = ifs_alloc(inode, folio, iter->flags);
> + if ((iter->flags & IOMAP_NOWAIT) && !ifs &&
> + i_blocks_per_folio(inode, folio) > 1)
> return -EAGAIN;
>
> - if (folio_test_uptodate(folio))
> - return 0;
> -
> - do {
> - iomap_adjust_read_range(iter->inode, folio, &block_start,
> - block_end - block_start, &poff, &plen);
> - if (plen == 0)
> - break;
> + if (!folio_test_uptodate(folio)) {
> + inode->i_mapping->a_ops->read_folio(NULL, folio);
>
> - if (!(iter->flags & IOMAP_UNSHARE) &&
> - (from <= poff || from >= poff + plen) &&
> - (to <= poff || to >= poff + plen))
> - continue;
> -
> - if (iomap_block_needs_zeroing(iter, block_start)) {
> - if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
> - return -EIO;
> - folio_zero_segments(folio, poff, from, to, poff + plen);
> - } else {
> - int status;
> -
> - if (iter->flags & IOMAP_NOWAIT)
> - return -EAGAIN;
> -
> - status = iomap_read_folio_sync(block_start, folio,
> - poff, plen, srcmap);
> - if (status)
> - return status;
> - }
> - iomap_set_range_uptodate(folio, poff, plen);
> - } while ((block_start += plen) < block_end);
> + /*
> + * ->read_folio unlocks the folio. Relock and revalidate the
> + * folio.
> + */
> + folio_lock(folio);
> + if (unlikely(folio->mapping != inode->i_mapping))
> + return 1;
> + if (unlikely(!iomap_validate(iter)))
> + return 1;
Does this now basically mean that every caller that uses iomap for
writes will have to implement ->iomap_valid and up the sequence
counter anytime there's a write or truncate, in case the folio changes
during the lock drop? Or were we already supposed to be doing this?
> + }
>
> return 0;
> }
> @@ -803,7 +791,6 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
> static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
> size_t *poffset, u64 *plen)
> {
> - const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
> const struct iomap *srcmap = iomap_iter_srcmap(iter);
> loff_t pos = iter->pos;
> u64 len = min_t(u64, SIZE_MAX, iomap_length(iter));
> @@ -818,28 +805,14 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
> if (fatal_signal_pending(current))
> return -EINTR;
>
> +lookup_again:
> folio = __iomap_get_folio(iter, len);
> if (IS_ERR(folio))
> return PTR_ERR(folio);
>
> - /*
> - * Now we have a locked folio, before we do anything with it we need to
> - * check that the iomap we have cached is not stale. The inode extent
> - * mapping can change due to concurrent IO in flight (e.g.
> - * IOMAP_UNWRITTEN state can change and memory reclaim could have
> - * reclaimed a previously partially written page at this index after IO
> - * completion before this write reaches this file offset) and hence we
> - * could do the wrong thing here (zero a page range incorrectly or fail
> - * to zero) and corrupt data.
> - */
> - if (folio_ops && folio_ops->iomap_valid) {
> - bool iomap_valid = folio_ops->iomap_valid(iter->inode,
> - &iter->iomap);
> - if (!iomap_valid) {
> - iter->iomap.flags |= IOMAP_F_STALE;
> - status = 0;
> - goto out_unlock;
> - }
> + if (unlikely(!iomap_validate(iter))) {
> + status = 0;
> + goto out_unlock;
> }
>
> pos = iomap_trim_folio_range(iter, folio, poffset, &len);
> @@ -860,7 +833,8 @@ static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop,
>
> out_unlock:
> __iomap_put_folio(iter, 0, folio);
> -
> + if (status == 1)
> + goto lookup_again;
> return status;
> }
>
> --
> 2.47.2
>
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-10 20:13 ` Joanne Koong
@ 2025-06-11 4:04 ` Christoph Hellwig
2025-06-11 6:00 ` Joanne Koong
0 siblings, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-11 4:04 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Tue, Jun 10, 2025 at 01:13:09PM -0700, Joanne Koong wrote:
> > synchronous ones. And if the file system fragmented the folio so badly
> > that we'll now need to do more than two reads we're still at least
> > pipelining it, although that should basically never happen with modern
> > file systems.
>
> If the filesystem wants granular folio reads, it can also just do that
> itself by calling an iomap helper (eg what iomap_adjust_read_range()
> is doing right now) in its ->read_folio() implementation, correct?
Well, nothing tells ->read_folio how much to read. But having a new
variant of read_folio that allows partial reads might still be nicer
than a iomap_folio_op. Let me draft that and see if willy or other mm
folks choke on it :)
> For fuse at least, we definitely want granular reads, since reads may
> be extremely expensive (eg it may be a network fetch) and there's
> non-trivial mempcy overhead incurred with fuse needing to memcpy read
> buffer data from userspace back to the kernel.
Ok, with that the plain ->read_folio variant is not going to fly.
> > + folio_lock(folio);
> > + if (unlikely(folio->mapping != inode->i_mapping))
> > + return 1;
> > + if (unlikely(!iomap_validate(iter)))
> > + return 1;
>
> Does this now basically mean that every caller that uses iomap for
> writes will have to implement ->iomap_valid and up the sequence
> counter anytime there's a write or truncate, in case the folio changes
> during the lock drop? Or were we already supposed to be doing this?
Not any more than before. It is still optional, but you still
very much want it to protect against races updating the mapping.
^ permalink raw reply [flat|nested] 70+ messages in thread
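For reference, the "partial read" variant of ->read_folio floated
above might look roughly like this; the name and signature are purely
illustrative assumptions, and no such address_space operation exists
at this point:

struct file;
struct folio;

/* Hypothetical aops variant allowing partial folio reads. */
struct example_partial_read_aops {
	/*
	 * Like ->read_folio, but only [offset, offset + len) must be
	 * brought uptodate; the filesystem may read more if that is
	 * cheaper for it.
	 */
	int (*read_folio_range)(struct file *file, struct folio *folio,
				size_t offset, size_t len);
};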
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-11 4:04 ` Christoph Hellwig
@ 2025-06-11 6:00 ` Joanne Koong
2025-06-11 6:08 ` Christoph Hellwig
2025-06-11 18:33 ` Joanne Koong
0 siblings, 2 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-11 6:00 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Tue, Jun 10, 2025 at 9:04 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Tue, Jun 10, 2025 at 01:13:09PM -0700, Joanne Koong wrote:
> > > synchronous ones. And if the file system fragmented the folio so badly
> > > that we'll now need to do more than two reads we're still at least
> > > pipelining it, although that should basically never happen with modern
> > > file systems.
> >
> > If the filesystem wants granular folio reads, it can also just do that
> > itself by calling an iomap helper (eg what iomap_adjust_read_range()
> > is doing right now) in its ->read_folio() implementation, correct?
>
> Well, nothing tells ->read_folio how much to read. But having a new
Not a great idea, but theoretically we could stash that info (offset
and len) in the folio->private iomap_folio_state struct. I don't think
that runs into synchronization issues since it would be set and
cleared while the file lock is held for that read.
But regardless I think we still need a new variant of read_folio
because if a non block-io iomap wants to use iomap_read_folio() /
iomap_readahead() for the granular uptodate parsing logic that's in
there, it'll need to provide a method for reading a partial folio. I
initially wasn't planning to have fuse use iomap_read_folio() /
iomap_readahead() but I realized there are some cases where fuse will
find it useful, so I'm planning to add that in.
> variant of read_folio that allows partial reads might still be nicer
> than a iomap_folio_op. Let me draft that and see if willy or other mm
> folks choke on it :)
writeback_folio() is also a VM level concept so under that same logic,
should writeback_folio() also be an address space operation?
A more general question I've been trying to figure out is whether the
vision is that iomap is going to be the de facto generic library that
all/most filesystems will be using in the future? If so then it makes
sense to me to add this to the address space operations but if not
then I don't think I see the hate for having the folio callbacks be
embedded in iomap_folio_op.
>
> > For fuse at least, we definitely want granular reads, since reads may
> > be extremely expensive (eg it may be a network fetch) and there's
> non-trivial memcpy overhead incurred with fuse needing to memcpy read
> > buffer data from userspace back to the kernel.
>
> Ok, with that the plain ->read_folio variant is not going to fly.
>
> > > + folio_lock(folio);
> > > + if (unlikely(folio->mapping != inode->i_mapping))
> > > + return 1;
> > > + if (unlikely(!iomap_validate(iter)))
> > > + return 1;
> >
> > Does this now basically mean that every caller that uses iomap for
> > writes will have to implement ->iomap_valid and up the sequence
> > counter anytime there's a write or truncate, in case the folio changes
> > during the lock drop? Or were we already supposed to be doing this?
>
> Not any more than before. It's still optional, but you still
> very much want it to protect against races updating the mapping.
>
Okay thanks, I think I'll need to add this in for fuse then. I'll look
at this some more.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-11 6:00 ` Joanne Koong
@ 2025-06-11 6:08 ` Christoph Hellwig
2025-06-11 18:33 ` Joanne Koong
1 sibling, 0 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-11 6:08 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Tue, Jun 10, 2025 at 11:00:51PM -0700, Joanne Koong wrote:
> Not a great idea, but theoretically we could stash that info (offset
> and len) in the folio->private iomap_folio_state struct. I don't think
> that runs into synchronization issues since it would be set and
> cleared while the file lock is held for that read.
Yeah, I thought about it, but there's no simple hole, so we'd bloat
a long-lived structure for short-term argument passing.
So I've been thinking of maybe sticking to something similar to
your version, but with a few tweaks. Let me cook up a patch for
review.
> But regardless I think we still need a new variant of read_folio
> because if a non block-io iomap wants to use iomap_read_folio() /
> iomap_readahead() for the granular uptodate parsing logic that's in
> there, it'll need to provide a method for reading a partial folio. I
> initially wasn't planning to have fuse use iomap_read_folio() /
> iomap_readahead() but I realized there are some cases where fuse will
> find it useful, so I'm planning to add that in.
Heh. How much of the iomap code can you reuse vs just using another
variant that shares the uptodate/dirty bitmaps?
> > variant of read_folio that allows partial reads might still be nicer
> > than a iomap_folio_op. Let me draft that and see if willy or other mm
> > folks choke on it :)
>
> writeback_folio() is also a VM level concept so under that same logic,
> should writeback_folio() also be an address space operation?
Not really. Yes, writing back a part of a folio is high level in
concept, but you really need quite a lot of iomap-specific code in practice.
>
> A more general question I've been trying to figure out is whether the
> vision is that iomap is going to be the de facto generic library
> all/most filesystems will be using in the future?
That's always been my plan. But it's not been overly successful
so far.
> If so then it makes
> sense to me to add this to the address space operations but if not
> then I don't think I see the hate for having the folio callbacks be
> embedded in iomap_folio_op.
I'd really avoid sticking something that is just a callback into the
address_space operations. We've done that for write_begin/write_end
and it has turned out to be a huge mess. Something in the aops
must be callable standalone and not depend on a very specific caller
context.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-11 6:00 ` Joanne Koong
2025-06-11 6:08 ` Christoph Hellwig
@ 2025-06-11 18:33 ` Joanne Koong
2025-06-11 18:50 ` Darrick J. Wong
1 sibling, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-11 18:33 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Tue, Jun 10, 2025 at 11:00 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Tue, Jun 10, 2025 at 9:04 PM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > On Tue, Jun 10, 2025 at 01:13:09PM -0700, Joanne Koong wrote:
> >
> > > For fuse at least, we definitely want granular reads, since reads may
> > > be extremely expensive (eg it may be a network fetch) and there's
> > non-trivial memcpy overhead incurred with fuse needing to memcpy read
> > > buffer data from userspace back to the kernel.
> >
> > Ok, with that the plain ->read_folio variant is not going to fly.
> >
> > > > + folio_lock(folio);
> > > > + if (unlikely(folio->mapping != inode->i_mapping))
> > > > + return 1;
> > > > + if (unlikely(!iomap_validate(iter)))
> > > > + return 1;
> > >
> > > Does this now basically mean that every caller that uses iomap for
> > > writes will have to implement ->iomap_valid and up the sequence
> > > counter anytime there's a write or truncate, in case the folio changes
> > > during the lock drop? Or were we already supposed to be doing this?
> >
> > Not any more than before. It's still optional, but you still
> > very much want it to protect against races updating the mapping.
> >
> Okay thanks, I think I'll need to add this in for fuse then. I'll look
> at this some more.
I read some of the thread in [1] and I don't think fuse needs this
after all. The iomap mapping won't be changing state and concurrent
writes are already protected by the file lock (if we don't use the
plain ->read_folio variant).
[1] https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-11 18:33 ` Joanne Koong
@ 2025-06-11 18:50 ` Darrick J. Wong
2025-06-11 23:08 ` Joanne Koong
0 siblings, 1 reply; 70+ messages in thread
From: Darrick J. Wong @ 2025-06-11 18:50 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, brauner, linux-fsdevel, linux-xfs,
bernd.schubert, kernel-team
On Wed, Jun 11, 2025 at 11:33:40AM -0700, Joanne Koong wrote:
> On Tue, Jun 10, 2025 at 11:00 PM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > On Tue, Jun 10, 2025 at 9:04 PM Christoph Hellwig <hch@infradead.org> wrote:
> > >
> > > On Tue, Jun 10, 2025 at 01:13:09PM -0700, Joanne Koong wrote:
> > >
> > > > For fuse at least, we definitely want granular reads, since reads may
> > > > be extremely expensive (eg it may be a network fetch) and there's
> > > non-trivial memcpy overhead incurred with fuse needing to memcpy read
> > > > buffer data from userspace back to the kernel.
> > >
> > > Ok, with that the plain ->read_folio variant is not going to fly.
> > >
> > > > > + folio_lock(folio);
> > > > > + if (unlikely(folio->mapping != inode->i_mapping))
> > > > > + return 1;
> > > > > + if (unlikely(!iomap_validate(iter)))
> > > > > + return 1;
> > > >
> > > > Does this now basically mean that every caller that uses iomap for
> > > > writes will have to implement ->iomap_valid and up the sequence
> > > > counter anytime there's a write or truncate, in case the folio changes
> > > > during the lock drop? Or were we already supposed to be doing this?
> > >
> > > Not any more than before. It's still optional, but you still
> > > very much want it to protect against races updating the mapping.
> > >
> > Okay thanks, I think I'll need to add this in for fuse then. I'll look
> > at this some more.
>
> I read some of the thread in [1] and I don't think fuse needs this
> after all. The iomap mapping won't be changing state and concurrent
> writes are already protected by the file lock (if we don't use the
> plain ->read_folio variant).
>
> [1] https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
<nod> If the mapping types don't change between read/write (which take
i_rwsem in exclusive mode) and writeback (which doesn't take it at all)
then I don't think there's a need to revalidate the mapping after
grabbing a folio. I think the other ways to avoid those races are (a)
avoid unaligned zeroing if you can guarantee that the folios are always
fully uptodate; and (b) don't do things that change the out-of-place
write status of pagecache (e.g. reflink).
--D
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-11 18:50 ` Darrick J. Wong
@ 2025-06-11 23:08 ` Joanne Koong
2025-06-12 4:42 ` Christoph Hellwig
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-11 23:08 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, miklos, brauner, linux-fsdevel, linux-xfs,
bernd.schubert, kernel-team
On Wed, Jun 11, 2025 at 11:50 AM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Wed, Jun 11, 2025 at 11:33:40AM -0700, Joanne Koong wrote:
> > On Tue, Jun 10, 2025 at 11:00 PM Joanne Koong <joannelkoong@gmail.com> wrote:
> > >
> > > On Tue, Jun 10, 2025 at 9:04 PM Christoph Hellwig <hch@infradead.org> wrote:
> > > >
> > > > On Tue, Jun 10, 2025 at 01:13:09PM -0700, Joanne Koong wrote:
> > > >
> > > > > For fuse at least, we definitely want granular reads, since reads may
> > > > > be extremely expensive (eg it may be a network fetch) and there's
> > > > non-trivial memcpy overhead incurred with fuse needing to memcpy read
> > > > > buffer data from userspace back to the kernel.
> > > >
> > > > Ok, with that the plain ->read_folio variant is not going to fly.
> > > >
> > > > > > + folio_lock(folio);
> > > > > > + if (unlikely(folio->mapping != inode->i_mapping))
> > > > > > + return 1;
> > > > > > + if (unlikely(!iomap_validate(iter)))
> > > > > > + return 1;
> > > > >
> > > > > Does this now basically mean that every caller that uses iomap for
> > > > > writes will have to implement ->iomap_valid and up the sequence
> > > > > counter anytime there's a write or truncate, in case the folio changes
> > > > > during the lock drop? Or were we already supposed to be doing this?
> > > >
> > > > Not any more than before. It's still optional, but you still
> > > > very much want it to protect against races updating the mapping.
> > > >
> > > Okay thanks, I think I'll need to add this in for fuse then. I'll look
> > > at this some more.
> >
> > I read some of the thread in [1] and I don't think fuse needs this
> > after all. The iomap mapping won't be changing state and concurrent
> > writes are already protected by the file lock (if we don't use the
> > plain ->read_folio variant).
> >
> > [1] https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
>
> <nod> If the mapping types don't change between read/write (which take
> i_rwsem in exclusive mode) and writeback (which doesn't take it at all)
> then I don't think there's a need to revalidate the mapping after
> grabbing a folio. I think the other ways to avoid those races are (a)
> avoid unaligned zeroing if you can guarantee that the folios are always
> fully uptodate; and (b) don't do things that change the out-of-place
> write status of pagecache (e.g. reflink).
>
Awesome, thanks for verifying.
I'll submit v2 rebased on top of the linux tree (if fuse is still
behind mainline) after Christoph sends out his patch.
> --D
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-11 23:08 ` Joanne Koong
@ 2025-06-12 4:42 ` Christoph Hellwig
0 siblings, 0 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-12 4:42 UTC (permalink / raw)
To: Joanne Koong
Cc: Darrick J. Wong, Christoph Hellwig, miklos, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team
On Wed, Jun 11, 2025 at 04:08:42PM -0700, Joanne Koong wrote:
> Awesome, thanks for verifying.
>
> I'll submit v2 rebased on top of the linux tree (if fuse is still
> behind mainline) after Christoph sends out his patch.
Where 'my patch' is the one trying to come up with a good idea for the
read-modify-write in write_begin? Still thinking hard about this,
so maybe just resend with your current approach for now.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-06 23:37 ` [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type Joanne Koong
2025-06-09 4:45 ` Christoph Hellwig
@ 2025-06-09 16:24 ` Darrick J. Wong
2025-06-09 21:28 ` Joanne Koong
1 sibling, 1 reply; 70+ messages in thread
From: Darrick J. Wong @ 2025-06-09 16:24 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Fri, Jun 06, 2025 at 04:37:57PM -0700, Joanne Koong wrote:
> Add a new iomap type, IOMAP_IN_MEM, that represents data that resides in
> memory and does not map to or depend on the block layer and is not
> embedded inline in an inode. This will be used for example by filesystems
> such as FUSE where the data is in memory or needs to be fetched from a
> server and is not coupled with the block layer. This lets these
> filesystems use some of the internal features in iomaps such as
> granular dirty tracking for large folios.
How does this differ from using IOMAP_INLINE and setting
iomap::inline_data = kmap_local_folio(...)? Is the situation here that
FUSE already /has/ a folio from the mapping, so all you really need
iomap to do is manage the folio's uptodate/dirty state?
--D
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
> include/linux/iomap.h | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 68416b135151..dbbf217eb03f 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -30,6 +30,7 @@ struct vm_fault;
> #define IOMAP_MAPPED 2 /* blocks allocated at @addr */
> #define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */
> #define IOMAP_INLINE 4 /* data inline in the inode */
> +#define IOMAP_IN_MEM 5 /* data in memory, does not map to blocks */
>
> /*
> * Flags reported by the file system from iomap_begin:
> --
> 2.47.1
>
>
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-09 16:24 ` Darrick J. Wong
@ 2025-06-09 21:28 ` Joanne Koong
2025-06-12 3:53 ` Darrick J. Wong
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 21:28 UTC (permalink / raw)
To: Darrick J. Wong
Cc: miklos, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Mon, Jun 9, 2025 at 9:24 AM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Fri, Jun 06, 2025 at 04:37:57PM -0700, Joanne Koong wrote:
> > Add a new iomap type, IOMAP_IN_MEM, that represents data that resides in
> > memory and does not map to or depend on the block layer and is not
> > embedded inline in an inode. This will be used for example by filesystems
> > such as FUSE where the data is in memory or needs to be fetched from a
> > server and is not coupled with the block layer. This lets these
> > filesystems use some of the internal features in iomaps such as
> > granular dirty tracking for large folios.
>
> How does this differ from using IOMAP_INLINE and setting
> iomap::inline_data = kmap_local_folio(...)? Is the situation here that
> FUSE already /has/ a folio from the mapping, so all you really need
> iomap to do is manage the folio's uptodate/dirty state?
>
I had looked into whether IOMAP_INLINE could be used but there are a few issues:
a) no granular uptodate reading of the folio if the folio needs to be
read into the page cache
If fuse uses IOMAP_INLINE then it'll need to read in the entire folio
being written to, because an IOMAP_INLINE mapping points to one
contiguous memory region, not different chunks. For
example if there's a 2 MB file and position 0 to 1 MB of the file is
represented by a 1 MB folio, and a client issues a write from position
1 to 1048575, we'll need to read in the entire folio instead of just
the first and last chunks.
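To make the arithmetic concrete (illustrative numbers, assuming 4k
blocks and treating 1048575 as an exclusive end):

        write of [1, 1048575) into the 1 MB folio
        -> with per-block uptodate tracking, two sub-block reads suffice:
             head block: bytes 0..4095          (byte 0 is not written)
             tail block: bytes 1044480..1048575 (byte 1048575 is not written)
        -> with IOMAP_INLINE, one contiguous read of all 1048576 bytes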
b) an extra memcpy is incurred if the folio needs to be read in (extra
read comes from reading inline data into folio) and an extra memcpy is
incurred after the write (extra write comes from writing from folio ->
inline data)
IOMAP_INLINE copies the inline data into the folio
(iomap_write_begin_inline() -> iomap_read_inline_data() ->
folio_fill_tail()) but for fuse, the folio would already have had to
be fetched from the server in fuse's ->iomap_begin callback (and
similarly, the folio tail zeroing and dcache flush will be
unnecessary work here too). When the write is finished, there's an
extra memcpy incurred from iomap_write_end_inline() copying data from
the folio back to inline data (for fuse, inline data is already the
folio).
I guess we could add some flag that the filesystem can set in
->iomap_begin() to indicate that it's an IOMAP_INLINE type where the
mem is the folio being written, but that still doesn't help with the
issue in a).
c) IOMAP_INLINE isn't supported for writepages. From what I see, this
was added in commit 3e19e6f3e ("iomap: warn on inline maps in
iomap_writepage_map"). Maybe it's as simple as now allowing inline
maps to be used in writepages but it also seems to suggest that inline
maps is meant for something different than what fuse is trying to do
with it.
> --D
>
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
> > include/linux/iomap.h | 1 +
> > 1 file changed, 1 insertion(+)
> >
> > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > index 68416b135151..dbbf217eb03f 100644
> > --- a/include/linux/iomap.h
> > +++ b/include/linux/iomap.h
> > @@ -30,6 +30,7 @@ struct vm_fault;
> > #define IOMAP_MAPPED 2 /* blocks allocated at @addr */
> > #define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */
> > #define IOMAP_INLINE 4 /* data inline in the inode */
> > +#define IOMAP_IN_MEM 5 /* data in memory, does not map to blocks */
> >
> > /*
> > * Flags reported by the file system from iomap_begin:
> > --
> > 2.47.1
> >
> >
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type
2025-06-09 21:28 ` Joanne Koong
@ 2025-06-12 3:53 ` Darrick J. Wong
0 siblings, 0 replies; 70+ messages in thread
From: Darrick J. Wong @ 2025-06-12 3:53 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Mon, Jun 09, 2025 at 02:28:52PM -0700, Joanne Koong wrote:
> On Mon, Jun 9, 2025 at 9:24 AM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Fri, Jun 06, 2025 at 04:37:57PM -0700, Joanne Koong wrote:
> > > Add a new iomap type, IOMAP_IN_MEM, that represents data that resides in
> > > memory and does not map to or depend on the block layer and is not
> > > embedded inline in an inode. This will be used for example by filesystems
> > > such as FUSE where the data is in memory or needs to be fetched from a
> > > server and is not coupled with the block layer. This lets these
> > > filesystems use some of the internal features in iomaps such as
> > > granular dirty tracking for large folios.
> >
> > How does this differ from using IOMAP_INLINE and setting
> > iomap::inline_data = kmap_local_folio(...)? Is the situation here that
> > FUSE already /has/ a folio from the mapping, so all you really need
> > iomap to do is manage the folio's uptodate/dirty state?
> >
>
> I had looked into whether IOMAP_INLINE could be used but there are a few issues:
>
> a) no granular uptodate reading of the folio if the folio needs to be
> read into the page cache
> If fuse uses IOMAP_INLINE then it'll need to read in the entire folio
> being written to, because an IOMAP_INLINE mapping points to one
> contiguous memory region, not different chunks. For
> example if there's a 2 MB file and position 0 to 1 MB of the file is
> represented by a 1 MB folio, and a client issues a write from position
> 1 to 1048575, we'll need to read in the entire folio instead of just
> the first and last chunks.
Well we could modify the IOMAP_INLINE code to handle iomap::offset > 0
so you could keep feeding the pagecache inline mappings as packets of
data become available. But that statement is missing the point, since I
think you already /have/ the folios populated and stuffed in i_mapping;
you just need iomap for the sub-folio state tracking when things get
dirty.
> b) an extra memcpy is incurred if the folio needs to be read in (extra
> read comes from reading inline data into folio) and an extra memcpy is
> incurred after the write (extra write comes from writing from folio ->
> inline data)
> IOMAP_INLINE copies the inline data into the folio
> (iomap_write_begin_inline() -> iomap_read_inline_data() ->
> folio_fill_tail()) but for fuse, the folio would already have had to
> be fetched from the server in fuse's ->iomap_begin callback (and
> similarly, the folio tail zeroing and dcache flush will be
> unnecessary work here too). When the write is finished, there's an
> extra memcpy incurred from iomap_write_end_inline() copying data from
> the folio back to inline data (for fuse, inline data is already the
> folio).
>
> I guess we could add some flag that the filesystem can set in
> ->iomap_begin() to indicate that it's an IOMAP_INLINE type where the
> mem is the folio being written, but that still doesn't help with the
> issue in a).
I think we already did something like that for fsdax.
> c) IOMAP_INLINE isn't supported for writepages. From what I see, this
> was added in commit 3e19e6f3e ("iomap: warn on inline maps in
> iomap_writepage_map"). Maybe it's as simple as now allowing inline
> maps to be used in writepages but it also seems to suggest that inline
> maps are meant for something different than what fuse is trying to do
> with it.
Yeah -- the sole user (gfs2) stores the inline data near the inode, so
->iomap_begin initiates a transaction and locks the inode and returns.
iomap copies data between the pagecache and iomap::inline_data, and calls
->iomap_end, which commits the transaction, unlocks the inode, and
cleans the page. That's why writeback doesn't support IOMAP_INLINE;
there are no users for it.
If ext4 ever gets to handling inline data via iomap, I think they'd do a
similar dance.
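Roughly, the sequence is (paraphrasing the above, not actual gfs2
code):

        ->iomap_begin():  lock inode, start transaction,
                          iomap->type = IOMAP_INLINE,
                          iomap->inline_data = <in-inode buffer>
        iomap core:       memcpy between the pagecache folio and
                          iomap::inline_data
        ->iomap_end():    commit transaction, unlock inode, clean the page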
> > --D
> >
> > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > ---
> > > include/linux/iomap.h | 1 +
> > > 1 file changed, 1 insertion(+)
> > >
> > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > > index 68416b135151..dbbf217eb03f 100644
> > > --- a/include/linux/iomap.h
> > > +++ b/include/linux/iomap.h
> > > @@ -30,6 +30,7 @@ struct vm_fault;
> > > #define IOMAP_MAPPED 2 /* blocks allocated at @addr */
> > > #define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */
> > > #define IOMAP_INLINE 4 /* data inline in the inode */
> > > +#define IOMAP_IN_MEM 5 /* data in memory, does not map to blocks */
> > >
> > > /*
> > > * Flags reported by the file system from iomap_begin:
> > > --
> > > 2.47.1
> > >
> > >
>
^ permalink raw reply [flat|nested] 70+ messages in thread
* [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
2025-06-06 23:37 ` [PATCH v1 1/8] iomap: move buffered io bio logic into separate file Joanne Koong
2025-06-06 23:37 ` [PATCH v1 2/8] iomap: add IOMAP_IN_MEM iomap type Joanne Koong
@ 2025-06-06 23:37 ` Joanne Koong
2025-06-09 4:56 ` Christoph Hellwig
2025-06-09 16:38 ` Darrick J. Wong
2025-06-06 23:37 ` [PATCH v1 4/8] iomap: add writepages " Joanne Koong
` (7 subsequent siblings)
10 siblings, 2 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-06 23:37 UTC (permalink / raw)
To: miklos
Cc: djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
Add buffered write support for IOMAP_IN_MEM iomaps. This lets
IOMAP_IN_MEM iomaps use some of the internal features in iomaps
such as granular dirty tracking for large folios.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/iomap/buffered-io.c | 24 +++++++++++++++++-------
include/linux/iomap.h | 10 ++++++++++
2 files changed, 27 insertions(+), 7 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1caeb4921035..fd2ea1306d88 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -300,7 +300,7 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
{
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- return srcmap->type != IOMAP_MAPPED ||
+ return (srcmap->type != IOMAP_MAPPED && srcmap->type != IOMAP_IN_MEM) ||
(srcmap->flags & IOMAP_F_NEW) ||
pos >= i_size_read(iter->inode);
}
@@ -583,16 +583,26 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
pos + len - 1);
}
-static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
- size_t poff, size_t plen, const struct iomap *iomap)
+static int iomap_read_folio_sync(const struct iomap_iter *iter, loff_t block_start,
+ struct folio *folio, size_t poff, size_t plen)
{
- return iomap_bio_read_folio_sync(block_start, folio, poff, plen, iomap);
+ const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
+ const struct iomap *srcmap = iomap_iter_srcmap(iter);
+
+ if (folio_ops && folio_ops->read_folio_sync)
+ return folio_ops->read_folio_sync(block_start, folio,
+ poff, plen, srcmap,
+ iter->private);
+
+ /* IOMAP_IN_MEM iomaps must always handle ->read_folio_sync() */
+ WARN_ON_ONCE(iter->iomap.type == IOMAP_IN_MEM);
+
+ return iomap_bio_read_folio_sync(block_start, folio, poff, plen, srcmap);
}
static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio)
{
- const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct iomap_folio_state *ifs;
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
@@ -640,8 +650,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
if (iter->flags & IOMAP_NOWAIT)
return -EAGAIN;
- status = iomap_read_folio_sync(block_start, folio,
- poff, plen, srcmap);
+ status = iomap_read_folio_sync(iter, block_start, folio,
+ poff, plen);
if (status)
return status;
}
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index dbbf217eb03f..e748aeebe1a5 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -175,6 +175,16 @@ struct iomap_folio_ops {
* locked by the iomap code.
*/
bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
+
+ /*
+ * Required for IOMAP_IN_MEM iomaps. Otherwise optional if the caller
+ * wishes to handle reading in a folio.
+ *
+ * The read must be done synchronously.
+ */
+ int (*read_folio_sync)(loff_t block_start, struct folio *folio,
+ size_t off, size_t len, const struct iomap *iomap,
+ void *private);
};
/*
--
2.47.1
^ permalink raw reply related [flat|nested] 70+ messages in thread
* Re: [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps
2025-06-06 23:37 ` [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps Joanne Koong
@ 2025-06-09 4:56 ` Christoph Hellwig
2025-06-09 22:45 ` Joanne Koong
2025-06-09 16:38 ` Darrick J. Wong
1 sibling, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-09 4:56 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
> -static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
> - size_t poff, size_t plen, const struct iomap *iomap)
> +static int iomap_read_folio_sync(const struct iomap_iter *iter, loff_t block_start,
> + struct folio *folio, size_t poff, size_t plen)
> {
> - return iomap_bio_read_folio_sync(block_start, folio, poff, plen, iomap);
> + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
> + const struct iomap *srcmap = iomap_iter_srcmap(iter);
> +
> + if (folio_ops && folio_ops->read_folio_sync)
> + return folio_ops->read_folio_sync(block_start, folio,
> + poff, plen, srcmap,
> + iter->private);
> +
> + /* IOMAP_IN_MEM iomaps must always handle ->read_folio_sync() */
> + WARN_ON_ONCE(iter->iomap.type == IOMAP_IN_MEM);
> +
> + return iomap_bio_read_folio_sync(block_start, folio, poff, plen, srcmap);
I just ran into this for another project and I hated my plumbing for
this. I hate yours very slightly less but I still don't like it.
This is really more of a VM level concept, so I wonder if we should
instead:
- add a new read_folio_sync method to the address space operations that
reads a folio without unlocking it.
- figure out if just reading the head/tail really is as much of an
optimization, and if it is, pass arguments to it to just read the
head/tail, and if not skip it.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps
2025-06-09 4:56 ` Christoph Hellwig
@ 2025-06-09 22:45 ` Joanne Koong
2025-06-10 3:44 ` Christoph Hellwig
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 22:45 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Sun, Jun 8, 2025 at 9:56 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> > -static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
> > - size_t poff, size_t plen, const struct iomap *iomap)
> > +static int iomap_read_folio_sync(const struct iomap_iter *iter, loff_t block_start,
> > + struct folio *folio, size_t poff, size_t plen)
> > {
> > - return iomap_bio_read_folio_sync(block_start, folio, poff, plen, iomap);
> > + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
> > + const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > +
> > + if (folio_ops && folio_ops->read_folio_sync)
> > + return folio_ops->read_folio_sync(block_start, folio,
> > + poff, plen, srcmap,
> > + iter->private);
> > +
> > + /* IOMAP_IN_MEM iomaps must always handle ->read_folio_sync() */
> > + WARN_ON_ONCE(iter->iomap.type == IOMAP_IN_MEM);
> > +
> > + return iomap_bio_read_folio_sync(block_start, folio, poff, plen, srcmap);
>
> I just ran into this for another project and I hated my plumbing for
> this. I hate yours very slightly less but I still don't like it.
>
> This is really more of a VM level concept, so I wonder if we should
> instead:
>
> - add a new read_folio_sync method to the address space operations that
> reads a folio without unlocking it.
imo I hate this more. AFAIU, basically fuse will be the only one
actually needing/using this?
Though it's a more intensive change, what about just expanding the
existing address space operations ->read_folio() callback to take in
an offset, length, and a boolean for whether the folio should be
unlocked after the read?
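i.e. something like this (signature invented just to make the idea
concrete):

        int (*read_folio)(struct file *file, struct folio *folio,
                          size_t offset, size_t len, bool unlock);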
> - figure out if just reading the head/tail really is as much of an
> optimization, and if it is, pass arguments to it to just read the
> head/tail, and if not skip it.
>
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps
2025-06-09 22:45 ` Joanne Koong
@ 2025-06-10 3:44 ` Christoph Hellwig
0 siblings, 0 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 3:44 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Mon, Jun 09, 2025 at 03:45:05PM -0700, Joanne Koong wrote:
> > I just ran into this for another project and I hated my plumbing for
> > this. I hate yours very slightly less but I still don't like it.
> >
> > This is really more of a VM level concept, so I wonder if we should
> > instead:
> >
> > - add a new read_folio_sync method to the address space operations that
> > reads a folio without unlocking it.
>
> imo I hate this more. AFAIU, basically fuse will be the only one
> actually needing/using this?
No, not at all. We have a few projects that need a submit_io hook
for iomap buffered reads - one is the btrfs conversion for their checksum
and raid, the other is my series to support T10 PI (and maybe checksums
in non-PI metadata as a follow-on). For ->read_folio and ->readahead
we just have patches from Goldwyn and me to pass a new ops vector for
that. But I found it really ugly to do this for the write path, even
if it works (my current version). Also this path is the only reason
why the srcmap in the iomap_iter exists - so that file systems that
do out of place writes can read partial folios from one place, but
return a different mapping to write it back to. If we split it into
a separate aops we could kill all that, something I've been wanting
to do for a long time.
> Though it's a more intensive change, what about just expanding the
> existing address space operations ->read_folio() callback to take in
> an offset, length, and a boolean for whether the folio should be
> unlocked after the read?
The boolean for locking behavior is not a pattern liked much in
the kernel, for good reason - it makes verification really hard.
Now maybe always unlocking the folio in the caller might make sense,
but I'll need to consult at least willy on that before looking into
that.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps
2025-06-06 23:37 ` [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps Joanne Koong
2025-06-09 4:56 ` Christoph Hellwig
@ 2025-06-09 16:38 ` Darrick J. Wong
2025-06-09 22:03 ` Joanne Koong
1 sibling, 1 reply; 70+ messages in thread
From: Darrick J. Wong @ 2025-06-09 16:38 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Fri, Jun 06, 2025 at 04:37:58PM -0700, Joanne Koong wrote:
> Add buffered write support for IOMAP_IN_MEM iomaps. This lets
> IOMAP_IN_MEM iomaps use some of the internal features in iomaps
> such as granular dirty tracking for large folios.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> ---
> fs/iomap/buffered-io.c | 24 +++++++++++++++++-------
> include/linux/iomap.h | 10 ++++++++++
> 2 files changed, 27 insertions(+), 7 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 1caeb4921035..fd2ea1306d88 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -300,7 +300,7 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
> {
> const struct iomap *srcmap = iomap_iter_srcmap(iter);
>
> - return srcmap->type != IOMAP_MAPPED ||
> + return (srcmap->type != IOMAP_MAPPED && srcmap->type != IOMAP_IN_MEM) ||
> (srcmap->flags & IOMAP_F_NEW) ||
> pos >= i_size_read(iter->inode);
> }
> @@ -583,16 +583,26 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
> pos + len - 1);
> }
>
> -static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
> - size_t poff, size_t plen, const struct iomap *iomap)
> +static int iomap_read_folio_sync(const struct iomap_iter *iter, loff_t block_start,
> + struct folio *folio, size_t poff, size_t plen)
> {
> - return iomap_bio_read_folio_sync(block_start, folio, poff, plen, iomap);
> + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
> + const struct iomap *srcmap = iomap_iter_srcmap(iter);
> +
> + if (folio_ops && folio_ops->read_folio_sync)
> + return folio_ops->read_folio_sync(block_start, folio,
> + poff, plen, srcmap,
> + iter->private);
Hmm, patch 6 hooks this up to fuse_do_readfolio, which means that iomap
provides the folios and manages their uptodate/dirty state. You still
want fuse to handle the folio contents (aka poke the fuse server via
FUSE_READ/FUSE_WRITE), but this avoids the memcpy that IOMAP_INLINE
performs.
So I think you're effectively adding another IO path to buffered-io.c,
which explains why you moved the bio code to a separate file. I wonder
if you could hook up this new IO path by checking for a non-NULL
->read_folio_sync function and calling it regardless of iomap::type?
--D
> +
> + /* IOMAP_IN_MEM iomaps must always handle ->read_folio_sync() */
> + WARN_ON_ONCE(iter->iomap.type == IOMAP_IN_MEM);
> +
> + return iomap_bio_read_folio_sync(block_start, folio, poff, plen, srcmap);
> }
>
> static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
> size_t len, struct folio *folio)
> {
> - const struct iomap *srcmap = iomap_iter_srcmap(iter);
> struct iomap_folio_state *ifs;
> loff_t block_size = i_blocksize(iter->inode);
> loff_t block_start = round_down(pos, block_size);
> @@ -640,8 +650,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
> if (iter->flags & IOMAP_NOWAIT)
> return -EAGAIN;
>
> - status = iomap_read_folio_sync(block_start, folio,
> - poff, plen, srcmap);
> + status = iomap_read_folio_sync(iter, block_start, folio,
> + poff, plen);
> if (status)
> return status;
> }
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index dbbf217eb03f..e748aeebe1a5 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -175,6 +175,16 @@ struct iomap_folio_ops {
> * locked by the iomap code.
> */
> bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
> +
> + /*
> + * Required for IOMAP_IN_MEM iomaps. Otherwise optional if the caller
> + * wishes to handle reading in a folio.
> + *
> + * The read must be done synchronously.
> + */
> + int (*read_folio_sync)(loff_t block_start, struct folio *folio,
> + size_t off, size_t len, const struct iomap *iomap,
> + void *private);
> };
>
> /*
> --
> 2.47.1
>
>
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps
2025-06-09 16:38 ` Darrick J. Wong
@ 2025-06-09 22:03 ` Joanne Koong
2025-06-12 3:54 ` Darrick J. Wong
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 22:03 UTC (permalink / raw)
To: Darrick J. Wong
Cc: miklos, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Mon, Jun 9, 2025 at 9:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Fri, Jun 06, 2025 at 04:37:58PM -0700, Joanne Koong wrote:
> > Add buffered write support for IOMAP_IN_MEM iomaps. This lets
> > IOMAP_IN_MEM iomaps use some of the internal features in iomaps
> > such as granular dirty tracking for large folios.
> >
> > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > ---
> > fs/iomap/buffered-io.c | 24 +++++++++++++++++-------
> > include/linux/iomap.h | 10 ++++++++++
> > 2 files changed, 27 insertions(+), 7 deletions(-)
> >
> > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > index 1caeb4921035..fd2ea1306d88 100644
> > --- a/fs/iomap/buffered-io.c
> > +++ b/fs/iomap/buffered-io.c
> > @@ -300,7 +300,7 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
> > {
> > const struct iomap *srcmap = iomap_iter_srcmap(iter);
> >
> > - return srcmap->type != IOMAP_MAPPED ||
> > + return (srcmap->type != IOMAP_MAPPED && srcmap->type != IOMAP_IN_MEM) ||
> > (srcmap->flags & IOMAP_F_NEW) ||
> > pos >= i_size_read(iter->inode);
> > }
> > @@ -583,16 +583,26 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
> > pos + len - 1);
> > }
> >
> > -static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
> > - size_t poff, size_t plen, const struct iomap *iomap)
> > +static int iomap_read_folio_sync(const struct iomap_iter *iter, loff_t block_start,
> > + struct folio *folio, size_t poff, size_t plen)
> > {
> > - return iomap_bio_read_folio_sync(block_start, folio, poff, plen, iomap);
> > + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
> > + const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > +
> > + if (folio_ops && folio_ops->read_folio_sync)
> > + return folio_ops->read_folio_sync(block_start, folio,
> > + poff, plen, srcmap,
> > + iter->private);
>
> Hmm, patch 6 hooks this up to fuse_do_readfolio, which means that iomap
> provides the folios and manages their uptodate/dirty state. You still
> want fuse to handle the folio contents (aka poke the fuse server via
> FUSE_READ/FUSE_WRITE), but this avoids the memcpy that IOMAP_INLINE
> performs.
>
> So I think you're effectively adding another IO path to buffered-io.c,
> which explains why you moved the bio code to a separate file. I wonder
The bio code needed to be moved to its own separate file because it
depends on CONFIG_BLOCK whereas fuse should still compile/run even if
CONFIG_BLOCK is not set.
Btw, I think you will need this too for your fuse server iomap patchset.
> if you could hook up this new IO path by checking for a non-NULL
> ->read_folio_sync function and calling it regardless of iomap::type?
I think this is already doing that? It will call ->read_folio_sync()
if the callback was provided, regardless of what the iomap type is.
>
> --D
>
> > +
> > + /* IOMAP_IN_MEM iomaps must always handle ->read_folio_sync() */
> > + WARN_ON_ONCE(iter->iomap.type == IOMAP_IN_MEM);
> > +
> > + return iomap_bio_read_folio_sync(block_start, folio, poff, plen, srcmap);
> > }
> >
> > static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
> > size_t len, struct folio *folio)
> > {
> > - const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > struct iomap_folio_state *ifs;
> > loff_t block_size = i_blocksize(iter->inode);
> > loff_t block_start = round_down(pos, block_size);
> > @@ -640,8 +650,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
> > if (iter->flags & IOMAP_NOWAIT)
> > return -EAGAIN;
> >
> > - status = iomap_read_folio_sync(block_start, folio,
> > - poff, plen, srcmap);
> > + status = iomap_read_folio_sync(iter, block_start, folio,
> > + poff, plen);
> > if (status)
> > return status;
> > }
> > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > index dbbf217eb03f..e748aeebe1a5 100644
> > --- a/include/linux/iomap.h
> > +++ b/include/linux/iomap.h
> > @@ -175,6 +175,16 @@ struct iomap_folio_ops {
> > * locked by the iomap code.
> > */
> > bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
> > +
> > + /*
> > + * Required for IOMAP_IN_MEM iomaps. Otherwise optional if the caller
> > + * wishes to handle reading in a folio.
> > + *
> > + * The read must be done synchronously.
> > + */
> > + int (*read_folio_sync)(loff_t block_start, struct folio *folio,
> > + size_t off, size_t len, const struct iomap *iomap,
> > + void *private);
> > };
> >
> > /*
> > --
> > 2.47.1
> >
> >
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps
2025-06-09 22:03 ` Joanne Koong
@ 2025-06-12 3:54 ` Darrick J. Wong
0 siblings, 0 replies; 70+ messages in thread
From: Darrick J. Wong @ 2025-06-12 3:54 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Mon, Jun 09, 2025 at 03:03:05PM -0700, Joanne Koong wrote:
> On Mon, Jun 9, 2025 at 9:38 AM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Fri, Jun 06, 2025 at 04:37:58PM -0700, Joanne Koong wrote:
> > > Add buffered write support for IOMAP_IN_MEM iomaps. This lets
> > > IOMAP_IN_MEM iomaps use some of the internal features in iomaps
> > > such as granular dirty tracking for large folios.
> > >
> > > Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> > > ---
> > > fs/iomap/buffered-io.c | 24 +++++++++++++++++-------
> > > include/linux/iomap.h | 10 ++++++++++
> > > 2 files changed, 27 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> > > index 1caeb4921035..fd2ea1306d88 100644
> > > --- a/fs/iomap/buffered-io.c
> > > +++ b/fs/iomap/buffered-io.c
> > > @@ -300,7 +300,7 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter,
> > > {
> > > const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > >
> > > - return srcmap->type != IOMAP_MAPPED ||
> > > + return (srcmap->type != IOMAP_MAPPED && srcmap->type != IOMAP_IN_MEM) ||
> > > (srcmap->flags & IOMAP_F_NEW) ||
> > > pos >= i_size_read(iter->inode);
> > > }
> > > @@ -583,16 +583,26 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
> > > pos + len - 1);
> > > }
> > >
> > > -static int iomap_read_folio_sync(loff_t block_start, struct folio *folio,
> > > - size_t poff, size_t plen, const struct iomap *iomap)
> > > +static int iomap_read_folio_sync(const struct iomap_iter *iter, loff_t block_start,
> > > + struct folio *folio, size_t poff, size_t plen)
> > > {
> > > - return iomap_bio_read_folio_sync(block_start, folio, poff, plen, iomap);
> > > + const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops;
> > > + const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > > +
> > > + if (folio_ops && folio_ops->read_folio_sync)
> > > + return folio_ops->read_folio_sync(block_start, folio,
> > > + poff, plen, srcmap,
> > > + iter->private);
> >
> > Hmm, patch 6 hooks this up to fuse_do_readfolio, which means that iomap
> > provides the folios and manages their uptodate/dirty state. You still
> > want fuse to handle the folio contents (aka poke the fuse server via
> > FUSE_READ/FUSE_WRITE), but this avoids the memcpy that IOMAP_INLINE
> > performs.
> >
> > So I think you're effectively adding another IO path to buffered-io.c,
> > which explains why you moved the bio code to a separate file. I wonder
>
> The bio code needed to be moved to its own separate file because it
> depends on CONFIG_BLOCK whereas fuse should still compile/run even if
> CONFIG_BLOCK is not set.
>
> Btw, I think you will need this too for your fuse server iomap patchset.
Yeah, I added it, thank you. :)
> > if you could hook up this new IO path by checking for a non-NULL
> > ->read_folio_sync function and calling it regardless of iomap::type?
>
> I think this is already doing that? It will call ->read_folio_sync()
> if the callback was provided, regardless of what the iomap type is.
Err, yes it does, my bad.
--D
> >
> > --D
> >
> > > +
> > > + /* IOMAP_IN_MEM iomaps must always handle ->read_folio_sync() */
> > > + WARN_ON_ONCE(iter->iomap.type == IOMAP_IN_MEM);
> > > +
> > > + return iomap_bio_read_folio_sync(block_start, folio, poff, plen, srcmap);
> > > }
> > >
> > > static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
> > > size_t len, struct folio *folio)
> > > {
> > > - const struct iomap *srcmap = iomap_iter_srcmap(iter);
> > > struct iomap_folio_state *ifs;
> > > loff_t block_size = i_blocksize(iter->inode);
> > > loff_t block_start = round_down(pos, block_size);
> > > @@ -640,8 +650,8 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
> > > if (iter->flags & IOMAP_NOWAIT)
> > > return -EAGAIN;
> > >
> > > - status = iomap_read_folio_sync(block_start, folio,
> > > - poff, plen, srcmap);
> > > + status = iomap_read_folio_sync(iter, block_start, folio,
> > > + poff, plen);
> > > if (status)
> > > return status;
> > > }
> > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > > index dbbf217eb03f..e748aeebe1a5 100644
> > > --- a/include/linux/iomap.h
> > > +++ b/include/linux/iomap.h
> > > @@ -175,6 +175,16 @@ struct iomap_folio_ops {
> > > * locked by the iomap code.
> > > */
> > > bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
> > > +
> > > + /*
> > > + * Required for IOMAP_IN_MEM iomaps. Otherwise optional if the caller
> > > + * wishes to handle reading in a folio.
> > > + *
> > > + * The read must be done synchronously.
> > > + */
> > > + int (*read_folio_sync)(loff_t block_start, struct folio *folio,
> > > + size_t off, size_t len, const struct iomap *iomap,
> > > + void *private);
> > > };
> > >
> > > /*
> > > --
> > > 2.47.1
> > >
> > >
>
^ permalink raw reply [flat|nested] 70+ messages in thread
* [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
` (2 preceding siblings ...)
2025-06-06 23:37 ` [PATCH v1 3/8] iomap: add buffered write support for IOMAP_IN_MEM iomaps Joanne Koong
@ 2025-06-06 23:37 ` Joanne Koong
2025-06-09 5:32 ` Christoph Hellwig
2025-06-06 23:38 ` [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio() Joanne Koong
` (6 subsequent siblings)
10 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-06 23:37 UTC (permalink / raw)
To: miklos
Cc: djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
This allows IOMAP_IN_MEM iomaps to use iomap_writepages() for handling
writeback. This lets IOMAP_IN_MEM iomaps use some of the internal
features in iomaps such as granular dirty tracking for large folios.
This introduces a new iomap_writeback_ops callback, writeback_folio(),
which hands folio writeback logic off to the caller so it can write
back dirty pages itself instead of relying on mapping blocks.
This exposes two APIs, iomap_start_folio_write() and
iomap_finish_folio_write(), which callers may find useful in their
writeback_folio() callback implementation.
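As a rough illustration (sketch only; foo_send_write() is a made-up
stand-in for however the filesystem issues the actual write), a
writeback_folio() implementation could look like:

        static int foo_writeback_folio(struct iomap_writepage_ctx *wpc,
                                       struct folio *folio,
                                       struct inode *inode,
                                       loff_t offset, unsigned len)
        {
                int error;

                /* account this range so writeback stays pending */
                iomap_start_folio_write(inode, folio, len);

                error = foo_send_write(folio, offset, len, wpc->private);

                /*
                 * Drops write_bytes_pending for this range and ends
                 * writeback on the folio once nothing is pending.
                 */
                iomap_finish_folio_write(inode, folio, len);
                return error;
        }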
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/iomap/buffered-io-bio.c | 12 --------
fs/iomap/buffered-io.c | 60 ++++++++++++++++++++++++++++++++------
include/linux/iomap.h | 18 ++++++++++--
3 files changed, 67 insertions(+), 23 deletions(-)
diff --git a/fs/iomap/buffered-io-bio.c b/fs/iomap/buffered-io-bio.c
index 03841bed72e7..b7bc838cf477 100644
--- a/fs/iomap/buffered-io-bio.c
+++ b/fs/iomap/buffered-io-bio.c
@@ -96,18 +96,6 @@ int iomap_bio_read_folio_sync(loff_t block_start, struct folio *folio, size_t po
return submit_bio_wait(&bio);
}
-static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
- size_t len)
-{
- struct iomap_folio_state *ifs = folio->private;
-
- WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
-
- if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
- folio_end_writeback(folio);
-}
-
/*
* We're now finished for good with this ioend structure. Update the page
* state, release holds on bios, and finally free up memory. Do not use the
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index fd2ea1306d88..92f08b316d47 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1441,15 +1441,15 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
/*
* Submit an ioend.
- *
- * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback.
- * We cannot cancel ioend directly in that case, so call the bio end I/O handler
- * with the error status here to run the normal I/O completion handler to clear
- * the writeback bit and let the file system proess the errors.
*/
int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
+ if (wpc->iomap.type == IOMAP_IN_MEM) {
+ if (wpc->ops->submit_ioend)
+ error = wpc->ops->submit_ioend(wpc, error);
+ return error;
+ }
+
if (!wpc->ioend)
return error;
@@ -1468,6 +1468,13 @@ int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
iomap_submit_bio(&wpc->ioend->io_bio);
}
+ /*
+ * If error is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel ioend directly in that case, so call the bio end I/O handler
+ * with the error status here to run the normal I/O completion handler to clear
+ * the writeback bit and let the file system process the errors.
+ */
if (error)
iomap_bio_ioend_error(wpc, error);
@@ -1635,8 +1642,17 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
*/
end_aligned = round_up(end_pos, i_blocksize(inode));
while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
- error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
- pos, end_pos, rlen, &count);
+ if (wpc->ops->writeback_folio) {
+ WARN_ON_ONCE(wpc->ops->map_blocks);
+ error = wpc->ops->writeback_folio(wpc, folio, inode,
+ offset_in_folio(folio, pos),
+ rlen);
+ } else {
+ WARN_ON_ONCE(wpc->iomap.type == IOMAP_IN_MEM);
+ error = iomap_writepage_map_blocks(wpc, wbc, folio,
+ inode, pos, end_pos,
+ rlen, &count);
+ }
if (error)
break;
pos += rlen;
@@ -1664,7 +1680,11 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
if (atomic_dec_and_test(&ifs->write_bytes_pending))
folio_end_writeback(folio);
} else {
- if (!count)
+ /*
+ * If wpc->ops->writeback_folio is set, then it is responsible
+ * for ending the writeback itself.
+ */
+ if (!count && !wpc->ops->writeback_folio)
folio_end_writeback(folio);
}
mapping_set_error(inode->i_mapping, error);
@@ -1693,3 +1713,25 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
+
+void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+ if (ifs)
+ atomic_add(len, &ifs->write_bytes_pending);
+}
+EXPORT_SYMBOL_GPL(iomap_start_folio_write);
+
+void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
+ WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
+
+ if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
+ folio_end_writeback(folio);
+}
+EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e748aeebe1a5..4b5e083fa802 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -424,8 +424,8 @@ static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio)
struct iomap_writeback_ops {
/*
- * Required, maps the blocks so that writeback can be performed on
- * the range starting at offset.
+ * Required if ->writeback_folio is not set. Maps the blocks so that
+ * writeback can be performed on the range starting at offset.
*
* Can return arbitrarily large regions, but we need to call into it at
* least once per folio to allow the file systems to synchronize with
@@ -436,6 +436,16 @@ struct iomap_writeback_ops {
*/
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
loff_t offset, unsigned len);
+ /*
+ * Forwards the folio writeback logic to the caller.
+ *
+ * Required for IOMAP_IN_MEM iomaps or if ->map_blocks is not set.
+ *
+ * The caller is responsible for ending writeback on the folio after
+ * it's fully done processing it.
+ */
+ int (*writeback_folio)(struct iomap_writepage_ctx *wpc, struct folio *folio,
+ struct inode *inode, loff_t offset, unsigned len);
/*
* Optional, allows the file systems to hook into bio submission,
@@ -459,6 +469,7 @@ struct iomap_writepage_ctx {
struct iomap_ioend *ioend;
const struct iomap_writeback_ops *ops;
u32 nr_folios; /* folios added to the ioend */
+ void *private;
};
struct iomap_ioend *iomap_init_ioend(struct inode *inode, struct bio *bio,
@@ -538,4 +549,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis,
extern struct bio_set iomap_ioend_bioset;
+void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len);
+void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len);
+
#endif /* LINUX_IOMAP_H */
--
2.47.1
^ permalink raw reply related [flat|nested] 70+ messages in thread
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-06 23:37 ` [PATCH v1 4/8] iomap: add writepages " Joanne Koong
@ 2025-06-09 5:32 ` Christoph Hellwig
2025-06-09 16:57 ` Darrick J. Wong
2025-06-09 23:15 ` Joanne Koong
0 siblings, 2 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-09 5:32 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Fri, Jun 06, 2025 at 04:37:59PM -0700, Joanne Koong wrote:
> This allows IOMAP_IN_MEM iomaps to use iomap_writepages() for handling
> writeback. This lets IOMAP_IN_MEM iomaps use some of the internal
> features in iomaps such as granular dirty tracking for large folios.
>
> This introduces a new iomap_writeback_ops callback, writeback_folio(),
> which callers may pass in to hand folio writeback logic off to the
> caller for writing back dirty pages instead of relying on mapping blocks.
>
> This exposes two APIs, iomap_start_folio_write() and
> iomap_finish_folio_write(), which callers may find useful in their
> writeback_folio() callback implementation.
It might also be worth stating what you don't use. One big thing
that springs to mind is ioends. Which are really useful if you
need more than one request to handle a folio, something that is
pretty common in network file systems. I guess you don't need
that for fuse?
> + if (wpc->iomap.type == IOMAP_IN_MEM) {
> + if (wpc->ops->submit_ioend)
> + error = wpc->ops->submit_ioend(wpc, error);
> + return error;
> + }
Given that the patch that moved things around already wrapped the
error propagation to the bio into a helper, how does this differ
from the main path in the function now?
> + /*
> + * If error is non-zero, it means that we have a situation where some part of
> + * the submission process has failed after we've marked pages for writeback.
> + * We cannot cancel ioend directly in that case, so call the bio end I/O handler
> + * with the error status here to run the normal I/O completion handler to clear
> + * the writeback bit and let the file system process the errors.
> + */
Please add the comment in a separate preparation patch.
> + if (wpc->ops->writeback_folio) {
> + WARN_ON_ONCE(wpc->ops->map_blocks);
> + error = wpc->ops->writeback_folio(wpc, folio, inode,
> + offset_in_folio(folio, pos),
> + rlen);
> + } else {
> + WARN_ON_ONCE(wpc->iomap.type == IOMAP_IN_MEM);
> + error = iomap_writepage_map_blocks(wpc, wbc, folio,
> + inode, pos, end_pos,
> + rlen, &count);
> + }
So instead of having two entirely different methods, can we
refactor the block based code to also use
->writeback_folio?
Basically move all of the code inside the do { } while loop after
the call into ->map_blocks into a helper, and then let the caller
loop and also directly discard the folio if needed. I can give that
a spin if you want.
Note that writeback_folio is misnamed, as it doesn't write back an
entire folio, but just a dirty range.
> } else {
> - if (!count)
> + /*
> + * If wpc->ops->writeback_folio is set, then it is responsible
> + * for ending the writeback itself.
> + */
> + if (!count && !wpc->ops->writeback_folio)
> folio_end_writeback(folio);
This fails to explain why writeback_folio does the unlocking itself.
I also don't see how that would work in case of multiple dirty ranges.
> }
> mapping_set_error(inode->i_mapping, error);
> @@ -1693,3 +1713,25 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
> return iomap_submit_ioend(wpc, error);
> }
> EXPORT_SYMBOL_GPL(iomap_writepages);
> +
> +void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len)
> +{
> + struct iomap_folio_state *ifs = folio->private;
> +
> + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
> + if (ifs)
> + atomic_add(len, &ifs->write_bytes_pending);
> +}
> +EXPORT_SYMBOL_GPL(iomap_start_folio_write);
> +
> +void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len)
> +{
> + struct iomap_folio_state *ifs = folio->private;
> +
> + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
> + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
> +
> + if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
> + folio_end_writeback(folio);
Please also use these helpers in the block based code.
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-09 5:32 ` Christoph Hellwig
@ 2025-06-09 16:57 ` Darrick J. Wong
2025-06-10 3:49 ` Christoph Hellwig
2025-06-09 23:15 ` Joanne Koong
1 sibling, 1 reply; 70+ messages in thread
From: Darrick J. Wong @ 2025-06-09 16:57 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Joanne Koong, miklos, brauner, linux-fsdevel, linux-xfs,
bernd.schubert, kernel-team
On Sun, Jun 08, 2025 at 10:32:20PM -0700, Christoph Hellwig wrote:
> On Fri, Jun 06, 2025 at 04:37:59PM -0700, Joanne Koong wrote:
> > This allows IOMAP_IN_MEM iomaps to use iomap_writepages() for handling
> > writeback. This lets IOMAP_IN_MEM iomaps use some of the internal
> > features in iomaps such as granular dirty tracking for large folios.
> >
> > This introduces a new iomap_writeback_ops callback, writeback_folio(),
> > which callers may pass in to hand folio writeback logic off to the
> > caller for writing back dirty pages instead of relying on mapping blocks.
> >
> > This exposes two APIs, iomap_start_folio_write() and
> > iomap_finish_folio_write(), which callers may find useful in their
> > writeback_folio() callback implementation.
>
> It might also be worth stating what you don't use. One big thing
> that springs to mind is ioends. Which are really useful if you
> need more than one request to handle a folio, something that is
> pretty common in network file systems. I guess you don't need
> that for fuse?
My initial thought was "I wonder if Joanne would be better off with a
totally separate iomap_writepage_map_blocks implementation" since I
*think* fuse just needs a callback from iomap to initiate FUSE_WRITE
calls on the dirty range(s) of a folio, and then fuse can call
mapping_set_error and iomap_finish_folio_write when those FUSE_WRITE
calls complete. There are no bios, so I don't see much point in using
the ioend machinery.
--D
> > + if (wpc->iomap.type == IOMAP_IN_MEM) {
> > + if (wpc->ops->submit_ioend)
> > + error = wpc->ops->submit_ioend(wpc, error);
> > + return error;
> > + }
>
> Given that the patch that moved things around already wrapped the
> error propagation to the bio into a helper, how does this differ
> from the main path in the function now?
>
> > + /*
> > + * If error is non-zero, it means that we have a situation where some part of
> > + * the submission process has failed after we've marked pages for writeback.
> > + * We cannot cancel ioend directly in that case, so call the bio end I/O handler
> > + * with the error status here to run the normal I/O completion handler to clear
> > + * the writeback bit and let the file system process the errors.
> > + */
>
> Please add the comment in a separate preparation patch.
>
> > + if (wpc->ops->writeback_folio) {
> > + WARN_ON_ONCE(wpc->ops->map_blocks);
> > + error = wpc->ops->writeback_folio(wpc, folio, inode,
> > + offset_in_folio(folio, pos),
> > + rlen);
> > + } else {
> > + WARN_ON_ONCE(wpc->iomap.type == IOMAP_IN_MEM);
> > + error = iomap_writepage_map_blocks(wpc, wbc, folio,
> > + inode, pos, end_pos,
> > + rlen, &count);
> > + }
>
> So instead of having two entirely different methods, can we
> refactor the block based code to also use
> ->writeback_folio?
>
> Basically move all of the code inside the do { } while loop after
> the call into ->map_blocks into a helper, and then let the caller
> loop and also directly discard the folio if needed. I can give that
> a spin if you want.
>
> Note that writeback_folio is misnamed, as it doesn't write back an
> entire folio, but just a dirty range.
>
> > } else {
> > - if (!count)
> > + /*
> > + * If wpc->ops->writeback_folio is set, then it is responsible
> > + * for ending the writeback itself.
> > + */
> > + if (!count && !wpc->ops->writeback_folio)
> > folio_end_writeback(folio);
>
> This fails to explain why writeback_folio does the unlocking itself.
> I also don't see how that would work in case of multiple dirty ranges.
>
> > }
> > mapping_set_error(inode->i_mapping, error);
> > @@ -1693,3 +1713,25 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
> > return iomap_submit_ioend(wpc, error);
> > }
> > EXPORT_SYMBOL_GPL(iomap_writepages);
> > +
> > +void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len)
> > +{
> > + struct iomap_folio_state *ifs = folio->private;
> > +
> > + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
> > + if (ifs)
> > + atomic_add(len, &ifs->write_bytes_pending);
> > +}
> > +EXPORT_SYMBOL_GPL(iomap_start_folio_write);
> > +
> > +void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len)
> > +{
> > + struct iomap_folio_state *ifs = folio->private;
> > +
> > + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
> > + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
> > +
> > + if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
> > + folio_end_writeback(folio);
>
> Please also use these helpers in the block based code.
>
>
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-09 16:57 ` Darrick J. Wong
@ 2025-06-10 3:49 ` Christoph Hellwig
2025-06-12 3:56 ` Darrick J. Wong
0 siblings, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 3:49 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Joanne Koong, miklos, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Mon, Jun 09, 2025 at 09:57:41AM -0700, Darrick J. Wong wrote:
> > It might also be worth stating what you don't use. One big thing
> > that springs to mind is ioends. Which are really useful if you
> > need more than one request to handle a folio, something that is
> > pretty common in network file systems. I guess you don't need
> > that for fuse?
>
> My initial thought was "I wonder if Joanne would be better off with a
> totally separate iomap_writepage_map_blocks implementation"
I think that's basically what the patches do, right?
> since I
> *think* fuse just needs a callback from iomap to initiate FUSE_WRITE
> calls on the dirty range(s) of a folio, and then fuse can call
> mapping_set_error and iomap_finish_folio_write when those FUSE_WRITE
> calls complete. There are no bios, so I don't see much point in using
> the ioend machinery.
Note that the mapping_set_error in iomap_writepage_map is only for
synchronous errors during mapping setup; all the actual I/O errors
are handled asynchronously. Similarly, clearing the writeback
bit only happens for synchronous errors, or the rare case of (almost)
synchronous I/O.
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-10 3:49 ` Christoph Hellwig
@ 2025-06-12 3:56 ` Darrick J. Wong
0 siblings, 0 replies; 70+ messages in thread
From: Darrick J. Wong @ 2025-06-12 3:56 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Joanne Koong, miklos, brauner, linux-fsdevel, linux-xfs,
bernd.schubert, kernel-team
On Mon, Jun 09, 2025 at 08:49:25PM -0700, Christoph Hellwig wrote:
> On Mon, Jun 09, 2025 at 09:57:41AM -0700, Darrick J. Wong wrote:
> > > It might also be worth stating what you don't use. One big thing
> > > that springs to mind is ioends. Which are really useful if you
> > > need more than one request to handle a folio, something that is
> > > pretty common in network file systems. I guess you don't need
> > > that for fuse?
> >
> > My initial thought was "I wonder if Joanne would be better off with a
> > totally separate iomap_writepage_map_blocks implementation"
>
> I think that's basically what the patches do, right?
Yes.
> > since I
> > *think* fuse just needs a callback from iomap to initiate FUSE_WRITE
> > calls on the dirty range(s) of a folio, and then fuse can call
> > mapping_set_error and iomap_finish_folio_write when those FUSE_WRITE
> > calls complete. There are no bios, so I don't see much point in using
> > the ioend machinery.
>
> Note that the mapping_set_error in iomap_writepage_map is only for
> synchronous errors during mapping setup; all the actual I/O errors
> are handled asynchronously. Similarly, clearing the writeback
> bit only happens for synchronous errors, or the rare case of (almost)
> synchronous I/O.
Heheh, my brain has gotten fuzzy on all the pagecache error handling
changes over the years.
--D
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-09 5:32 ` Christoph Hellwig
2025-06-09 16:57 ` Darrick J. Wong
@ 2025-06-09 23:15 ` Joanne Koong
2025-06-10 3:58 ` Christoph Hellwig
1 sibling, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 23:15 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Sun, Jun 8, 2025 at 10:32 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Fri, Jun 06, 2025 at 04:37:59PM -0700, Joanne Koong wrote:
> > This allows IOMAP_IN_MEM iomaps to use iomap_writepages() for handling
> > writeback. This lets IOMAP_IN_MEM iomaps use some of the internal
> > features in iomaps such as granular dirty tracking for large folios.
> >
> > This introduces a new iomap_writeback_ops callback, writeback_folio(),
> > which callers may pass in to hand folio writeback logic off to the
> > caller for writing back dirty pages instead of relying on mapping blocks.
> >
> > This exposes two APIs, iomap_start_folio_write() and
> > iomap_finish_folio_write(), which callers may find useful in their
> > writeback_folio() callback implementation.
>
> It might also be worth stating what you don't use. One big thing
> that springs to mind is ioends. Which are really useful if you
> need more than one request to handle a folio, something that is
> pretty common in network file systems. I guess you don't need
> that for fuse?
ioends are used in fuse for cleaning up state. fuse implements a
->submit_ioend() callback in fuse_iomap_submit_ioend() (in patch 7/8).
For fuse, there can be multiple requests handling a folio. I'm using
ifs->write_bytes_pending to keep track of how many requests there are
for the folio, so that we know when to end writeback after the last
request on that folio has completed.
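For illustration, a rough sketch of what a ->writeback_folio() implementation
pairing these two helpers could look like (fuse_send_writepage_async() is a
made-up stand-in for queueing an asynchronous FUSE_WRITE request; this is not
the actual code from patch 7/8):

static int fuse_iomap_writeback_folio(struct iomap_writepage_ctx *wpc,
                struct folio *folio, struct inode *inode, loff_t offset,
                unsigned len)
{
        int error;

        /* account this range so writeback only ends after the last reply */
        iomap_start_folio_write(inode, folio, len);

        /* hypothetical helper: queues an async FUSE_WRITE for the range */
        error = fuse_send_writepage_async(inode, folio, offset, len);
        if (error)
                /* nothing was queued, drop the accounting again */
                iomap_finish_folio_write(inode, folio, len);
        return error;
}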
>
> > + if (wpc->iomap.type == IOMAP_IN_MEM) {
> > + if (wpc->ops->submit_ioend)
> > + error = wpc->ops->submit_ioend(wpc, error);
> > + return error;
> > + }
>
> Given that the patch that moved things around already wrapped the
> error propagation to the bio into a helper, how does this differ
> from the main path in the function now?
>
If we don't add this special casing for IOMAP_IN_MEM here, then in
this function it'll hit the "if (!wpc->ioend)" case right in the
beginning and return error without calling the ->submit_ioend()
callback. For non IOMAP_IN_MEM types, ->submit_ioend() should only get
called if wpc->ioend is set.
> > + /*
> > + * If error is non-zero, it means that we have a situation where some part of
> > + * the submission process has failed after we've marked pages for writeback.
> > + * We cannot cancel ioend directly in that case, so call the bio end I/O handler
> > + * with the error status here to run the normal I/O completion handler to clear
> > + * the writeback bit and let the file system process the errors.
> > + */
>
> Please add the comment in a separate preparation patch.
This isn't a new comment; it's just moved from the function
description to here. I'll put this comment move into the patch that
moves all the bio stuff.
>
> > + if (wpc->ops->writeback_folio) {
> > + WARN_ON_ONCE(wpc->ops->map_blocks);
> > + error = wpc->ops->writeback_folio(wpc, folio, inode,
> > + offset_in_folio(folio, pos),
> > + rlen);
> > + } else {
> > + WARN_ON_ONCE(wpc->iomap.type == IOMAP_IN_MEM);
> > + error = iomap_writepage_map_blocks(wpc, wbc, folio,
> > + inode, pos, end_pos,
> > + rlen, &count);
> > + }
>
> So instead of having two entirely different methods, can we
> refactor the block based code to also use
> ->writeback_folio?
>
> Basically move all of the code inside the do { } while loop after
> the call into ->map_blocks into a helper, and then let the caller
> loop and also directly discard the folio if needed. I can give that
> a spin if you want.
>
Sounds great, I like this idea a lot. I'll make this change for v2.
> Note that writeback_folio is misnamed, as it doesn't write back an
> entire folio, but just a dirty range.
I'll rename this to writeback_folio_dirty_range().
>
> > } else {
> > - if (!count)
> > + /*
> > + * If wpc->ops->writeback_folio is set, then it is responsible
> > + * for ending the writeback itself.
> > + */
> > + if (!count && !wpc->ops->writeback_folio)
> > folio_end_writeback(folio);
>
> This fails to explain why writeback_folio does the unlocking itself.
writeback_folio needs to do the unlocking itself because the writeback
may be done asynchronously (as in the case for fuse). I'll add that as
a comment in v2.
> I also don't see how that would work in case of multiple dirty ranges.
For multiple dirty ranges in the same folio, the caller uses
ifs->write_bytes_pending (through the
iomap_{start/finish}_folio_write() APIs) to track when writeback is
finally finished on the folio and folio_end_writeback() may be called.
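As a sketch of the completion side (the function and its call site here are
hypothetical), the reply handler for each request would then drop its share of
the pending count, and the last one ends writeback on the folio:

/* called once per completed FUSE_WRITE reply covering [offset, offset + len) */
static void fuse_writepage_done(struct inode *inode, struct folio *folio,
                size_t len, int error)
{
        if (error)
                mapping_set_error(inode->i_mapping, error);

        /*
         * Drops write_bytes_pending by len; folio_end_writeback() runs
         * inside this helper once the count for the folio hits zero.
         */
        iomap_finish_folio_write(inode, folio, len);
}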
>
> > }
> > mapping_set_error(inode->i_mapping, error);
> > @@ -1693,3 +1713,25 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
> > return iomap_submit_ioend(wpc, error);
> > }
> > EXPORT_SYMBOL_GPL(iomap_writepages);
> > +
> > +void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len)
> > +{
> > + struct iomap_folio_state *ifs = folio->private;
> > +
> > + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
> > + if (ifs)
> > + atomic_add(len, &ifs->write_bytes_pending);
> > +}
> > +EXPORT_SYMBOL_GPL(iomap_start_folio_write);
> > +
> > +void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len)
> > +{
> > + struct iomap_folio_state *ifs = folio->private;
> > +
> > + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
> > + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
> > +
> > + if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending))
> > + folio_end_writeback(folio);
>
> Please also use these helpers in the block based code.
Will do. Thanks for your reviews on this.
>
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-09 23:15 ` Joanne Koong
@ 2025-06-10 3:58 ` Christoph Hellwig
2025-06-10 18:23 ` Joanne Koong
0 siblings, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 3:58 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Mon, Jun 09, 2025 at 04:15:27PM -0700, Joanne Koong wrote:
> ioends are used in fuse for cleaning up state. fuse implements a
> ->submit_ioend() callback in fuse_iomap_submit_ioend() (in patch 7/8).
But do you use struct iomap_ioend at all? (Sorry, still don't have a
whole tree with the patches applied in front of me).
> > Given that the patch that moved things around already wrapped the
> > error propagation to the bio into a helper, how does this differ
> > from the main path in the function now?
> >
>
> If we don't add this special casing for IOMAP_IN_MEM here, then in
> this function it'll hit the "if (!wpc->ioend)" case right in the
> beginning and return error without calling the ->submit_ioend()
So this suggests you don't use struct iomap_ioend at all. Given that
you add a private field to iomap_writepage_ctx I guess that is where
you chain the fuse requests?
Either way I think we should clean this up one way or another. If the
non-block iomap writeback code doesn't use ioends we should probably move
the ioend chain into the private field, and hide everything using it (or
even the ioend name) in proper abstraction. In this case this means
finding another way to check for a non-empty wpc. One way would be to
check nr_folios, as any non-empty wpc must have a number of folios
attached to it; the other would be to move the check into the
->submit_ioend method, including the block fallback. For a proper split
the method should probably be renamed, and we'd probably want to require
every user to provide the submit method, even if the trivial block
users set it to the default one provided.
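As a rough sketch of the first option (iomap_default_submit_ioend() is a
hypothetical name for the existing bio submission path, not a real function),
keying the emptiness check off nr_folios might look like:

static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
        /* nothing was added to this wpc, so there is nothing to submit */
        if (!wpc->nr_folios)
                return error;

        if (wpc->ops->submit_ioend)
                error = wpc->ops->submit_ioend(wpc, error);
        else
                /* hypothetical default covering the block/bio case */
                error = iomap_default_submit_ioend(wpc, error);
        return error;
}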
> > > - if (!count)
> > > + /*
> > > + * If wpc->ops->writeback_folio is set, then it is responsible
> > > + * for ending the writeback itself.
> > > + */
> > > + if (!count && !wpc->ops->writeback_folio)
> > > folio_end_writeback(folio);
> >
> > This fails to explain why writeback_folio does the unlocking itself.
>
> writeback_folio needs to do the unlocking itself because the writeback
> may be done asynchronously (as in the case for fuse). I'll add that as
> a comment in v2.
So how do you end up with a zero count here and still want
the fuse code to unlock?
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-10 3:58 ` Christoph Hellwig
@ 2025-06-10 18:23 ` Joanne Koong
2025-06-10 18:58 ` Joanne Koong
2025-06-11 4:01 ` Christoph Hellwig
0 siblings, 2 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-10 18:23 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Mon, Jun 9, 2025 at 8:58 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Mon, Jun 09, 2025 at 04:15:27PM -0700, Joanne Koong wrote:
> > ioends are used in fuse for cleaning up state. fuse implements a
> > ->submit_ioend() callback in fuse_iomap_submit_ioend() (in patch 7/8).
>
> But do you use struct iomap_ioend at all? (Sorry, still don't have a
> whole tree with the patches applied in front of me).
I don't use struct iomap_ioend at all and I'm realizing now that I
should just have fuse manually do the end-of-io submission after it
calls iomap_writepages() / iomap_writeback_dirty_folio() instead of
defining a ->submit_ioend(). Then, we can get rid of this
int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
+ if (wpc->iomap.type == IOMAP_IN_MEM) {
+ if (wpc->ops->submit_ioend)
+ error = wpc->ops->submit_ioend(wpc, error);
+ return error;
+ }
that was added and leave the iomap_submit_ioend() logic untouched.
>
> > > Given that the patch that moved things around already wrapped the
> > > error propagation to the bio into a helper, how does this differ
> > > from the main path in the function now?
> > >
> >
> > If we don't add this special casing for IOMAP_IN_MEM here, then in
> > this function it'll hit the "if (!wpc->ioend)" case right in the
> > beginning and return error without calling the ->submit_ioend()
>
> So this suggests you don't use struct iomap_ioend at all. Given that
> you add a private field to iomap_writepage_ctx I guess that is where
> you chain the fuse requests?
>
> Either way I think we should clean this up one way or another. If the
> non-block iomap writeback code doesn't use ioends we should probably move
> the ioend chain into the private field, and hide everything using it (or
> even the ioend name) in proper abstraction. In this case this means
> finding another way to check for a non-empty wpc. One way would be to
> check nr_folios, as any non-empty wpc must have a number of folios
> attached to it; the other would be to move the check into the
> ->submit_ioend method, including the block fallback. For a proper split
> the method should probably be renamed, and we'd probably want to require
> every user to provide the submit method, even if the trivial block
> users set it to the default one provided.
>
> > > > - if (!count)
> > > > + /*
> > > > + * If wpc->ops->writeback_folio is set, then it is responsible
> > > > + * for ending the writeback itself.
> > > > + */
> > > > + if (!count && !wpc->ops->writeback_folio)
> > > > folio_end_writeback(folio);
> > >
> > > This fails to explain why writeback_folio does the unlocking itself.
> >
> > writeback_folio needs to do the unlocking itself because the writeback
> > may be done asynchronously (as in the case for fuse). I'll add that as
> > a comment in v2.
>
> So how do you end up with a zero count here and still want
> the fuse code to unlock?
>
count is unused by ->writeback_folio(), so it's always just zero. But
I see what you're saying now. I should just increment count after
doing the ->writeback_folio() call and then we could just leave the
"if (!count)" check untouched. I like this suggestion a lot, i'll make
this change in v2.
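Concretely, the v2 shape being discussed might look roughly like this inside
iomap_writepage_map() (a sketch only, reusing the variable names from the
quoted hunk):

        if (wpc->ops->writeback_folio) {
                error = wpc->ops->writeback_folio(wpc, folio, inode,
                                offset_in_folio(folio, pos), rlen);
                if (!error)
                        count++;        /* a range was handed off */
        } else {
                error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
                                pos, end_pos, rlen, &count);
        }

        /* later in the function, now unchanged from mainline: */
        if (!count)
                folio_end_writeback(folio);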
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-10 18:23 ` Joanne Koong
@ 2025-06-10 18:58 ` Joanne Koong
2025-06-11 4:01 ` Christoph Hellwig
1 sibling, 0 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-10 18:58 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Tue, Jun 10, 2025 at 11:23 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Mon, Jun 9, 2025 at 8:58 PM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > On Mon, Jun 09, 2025 at 04:15:27PM -0700, Joanne Koong wrote:
> > > ioends are used in fuse for cleaning up state. fuse implements a
> > > ->submit_ioend() callback in fuse_iomap_submit_ioend() (in patch 7/8).
> >
> > But do you use struct iomap_ioend at all? (Sorry, still don't have a
> > whole tree with the patches applied in front of me).
>
> I don't use struct iomap_ioend at all and I'm realizing now that I
> should just have fuse manually do the end-of-io submission after it
> calls iomap_writepages() / iomap_writeback_dirty_folio() instead of
> defining a ->submit_ioend(). Then, we can get rid of this
>
> int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
> {
> + if (wpc->iomap.type == IOMAP_IN_MEM) {
> + if (wpc->ops->submit_ioend)
> + error = wpc->ops->submit_ioend(wpc, error);
> + return error;
> + }
>
> that was added and leave the iomap_submit_ioend() logic untouched.
Actually, nvm. You're right, it's cleaner to just have ioend stuff be totally
abstracted away like you suggested below. I'll make that change for
v2.
>
> >
> > > > Given that the patch that moved things around already wrapped the
> > > > error propagation to the bio into a helper, how does this differ
> > > > from the main path in the function now?
> > > >
> > >
> > > If we don't add this special casing for IOMAP_IN_MEM here, then in
> > > this function it'll hit the "if (!wpc->ioend)" case right in the
> > > beginning and return error without calling the ->submit_ioend()
> >
> > So this suggests you don't use struct iomap_ioend at all. Given that
> > you add a private field to iomap_writepage_ctx I guess that is where
> > you chain the fuse requests?
> >
> > Either way I think we should clean this up one way or another. If the
> > non-block iomap writeback code doesn't use ioends we should probably move
> > the ioend chain into the private field, and hide everything using it (or
> > even the ioend name) in proper abstraction. In this case this means
> > finding another way to check for a non-empty wpc. One way would be to
> > check nr_folios, as any non-empty wpc must have a number of folios
> > attached to it; the other would be to move the check into the
> > ->submit_ioend method, including the block fallback. For a proper split
> > the method should probably be renamed, and we'd probably want to require
> > every user to provide the submit method, even if the trivial block
> > users set it to the default one provided.
> >
> > > > > - if (!count)
> > > > > + /*
> > > > > + * If wpc->ops->writeback_folio is set, then it is responsible
> > > > > + * for ending the writeback itself.
> > > > > + */
> > > > > + if (!count && !wpc->ops->writeback_folio)
> > > > > folio_end_writeback(folio);
> > > >
> > > > This fails to explain why writeback_folio does the unlocking itself.
> > >
> > > writeback_folio needs to do the unlocking itself because the writeback
> > > may be done asynchronously (as in the case for fuse). I'll add that as
> > > a comment in v2.
> >
> > So how do you end up with a zero count here and still want
> > the fuse code to unlock?
> >
>
> count is unused by ->writeback_folio(), so it's always just zero. But
> I see what you're saying now. I should just increment count after
> doing the ->writeback_folio() call and then we could just leave the
> "if (!count)" check untouched. I like this suggestion a lot, i'll make
> this change in v2.
* Re: [PATCH v1 4/8] iomap: add writepages support for IOMAP_IN_MEM iomaps
2025-06-10 18:23 ` Joanne Koong
2025-06-10 18:58 ` Joanne Koong
@ 2025-06-11 4:01 ` Christoph Hellwig
1 sibling, 0 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-11 4:01 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Tue, Jun 10, 2025 at 11:23:27AM -0700, Joanne Koong wrote:
> count is unused by ->writeback_folio(), so it's always just zero. But
> I see what you're saying now. I should just increment count after
> doing the ->writeback_folio() call and then we could just leave the
> "if (!count)" check untouched. I like this suggestion a lot, i'll make
> this change in v2.
I think you'll need to add count. Not only to support the block
mappings through the common interface, but also to deal with the
case where due to races with e.g. truncate we end up not kicking
off any writeback at all. In that case the folio_end_writeback
here needs to happen.
* [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
` (3 preceding siblings ...)
2025-06-06 23:37 ` [PATCH v1 4/8] iomap: add writepages " Joanne Koong
@ 2025-06-06 23:38 ` Joanne Koong
2025-06-09 4:51 ` Christoph Hellwig
2025-06-06 23:38 ` [PATCH v1 6/8] fuse: use iomap for buffered writes Joanne Koong
` (5 subsequent siblings)
10 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-06 23:38 UTC (permalink / raw)
To: miklos
Cc: djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
Add iomap_writeback_dirty_folio() for writing back a dirty folio.
One use case of this is for folio laundering.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/iomap/buffered-io.c | 23 +++++++++++++++++++----
include/linux/iomap.h | 3 +++
2 files changed, 22 insertions(+), 4 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 92f08b316d47..766a6673f79f 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1592,7 +1592,8 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
}
static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct folio *folio)
+ struct writeback_control *wbc, struct folio *folio,
+ bool unlock_folio)
{
struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
@@ -1610,7 +1611,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
trace_iomap_writepage(inode, pos, folio_size(folio));
if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
- folio_unlock(folio);
+ if (unlock_folio)
+ folio_unlock(folio);
return 0;
}
WARN_ON_ONCE(end_pos <= pos);
@@ -1675,7 +1677,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
* already at this point. In that case we need to clear the writeback
* bit ourselves right after unlocking the page.
*/
- folio_unlock(folio);
+ if (unlock_folio)
+ folio_unlock(folio);
if (ifs) {
if (atomic_dec_and_test(&ifs->write_bytes_pending))
folio_end_writeback(folio);
@@ -1709,11 +1712,23 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
wpc->ops = ops;
while ((folio = writeback_iter(mapping, wbc, folio, &error)))
- error = iomap_writepage_map(wpc, wbc, folio);
+ error = iomap_writepage_map(wpc, wbc, folio, true);
return iomap_submit_ioend(wpc, error);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
+int iomap_writeback_dirty_folio(struct folio *folio, struct writeback_control *wbc,
+ struct iomap_writepage_ctx *wpc,
+ const struct iomap_writeback_ops *ops)
+{
+ int error;
+
+ wpc->ops = ops;
+ error = iomap_writepage_map(wpc, wbc, folio, false);
+ return iomap_submit_ioend(wpc, error);
+}
+EXPORT_SYMBOL_GPL(iomap_writeback_dirty_folio);
+
void iomap_start_folio_write(struct inode *inode, struct folio *folio, size_t len)
{
struct iomap_folio_state *ifs = folio->private;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 4b5e083fa802..a2b50b5489da 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -483,6 +483,9 @@ void iomap_sort_ioends(struct list_head *ioend_list);
int iomap_writepages(struct address_space *mapping,
struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops);
+int iomap_writeback_dirty_folio(struct folio *folio, struct writeback_control *wbc,
+ struct iomap_writepage_ctx *wpc,
+ const struct iomap_writeback_ops *ops);
/*
* Flags for direct I/O ->end_io:
--
2.47.1
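For context, a hypothetical ->launder_folio implementation built on this new
export might look like the following (fuse_iomap_writeback_ops is a made-up
name; the real wiring lands later in the series):

static int fuse_launder_folio(struct folio *folio)
{
        struct iomap_writepage_ctx wpc = { };
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = 1,
        };

        /* the folio is already locked here, hence the no-unlock path above */
        return iomap_writeback_dirty_folio(folio, &wbc, &wpc,
                        &fuse_iomap_writeback_ops);
}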
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-06 23:38 ` [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio() Joanne Koong
@ 2025-06-09 4:51 ` Christoph Hellwig
2025-06-09 17:14 ` Darrick J. Wong
2025-06-09 23:30 ` Joanne Koong
0 siblings, 2 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-09 4:51 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Fri, Jun 06, 2025 at 04:38:00PM -0700, Joanne Koong wrote:
> Add iomap_writeback_dirty_folio() for writing back a dirty folio.
> One use case of this is for folio laundering.
Where "folio laundering" means calling ->launder_folio, right?
> @@ -1675,7 +1677,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> * already at this point. In that case we need to clear the writeback
> * bit ourselves right after unlocking the page.
> */
> - folio_unlock(folio);
> + if (unlock_folio)
> + folio_unlock(folio);
> if (ifs) {
> if (atomic_dec_and_test(&ifs->write_bytes_pending))
> folio_end_writeback(folio);
When writing this code I was under the impression that
folio_end_writeback needs to be called after unlocking the page.
If that is not actually the case we can just move the unlocking into the
caller and make things a lot cleaner than the conditional locking
argument.
> +int iomap_writeback_dirty_folio(struct folio *folio, struct writeback_control *wbc,
> + struct iomap_writepage_ctx *wpc,
> + const struct iomap_writeback_ops *ops)
Please stick to the usual iomap coding style: 80 character lines,
two-tab indent for multiline function declarations. (Also in a few
other places).
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-09 4:51 ` Christoph Hellwig
@ 2025-06-09 17:14 ` Darrick J. Wong
2025-06-09 23:54 ` Joanne Koong
2025-06-10 3:59 ` Christoph Hellwig
2025-06-09 23:30 ` Joanne Koong
1 sibling, 2 replies; 70+ messages in thread
From: Darrick J. Wong @ 2025-06-09 17:14 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Joanne Koong, miklos, brauner, linux-fsdevel, linux-xfs,
bernd.schubert, kernel-team
On Sun, Jun 08, 2025 at 09:51:54PM -0700, Christoph Hellwig wrote:
> On Fri, Jun 06, 2025 at 04:38:00PM -0700, Joanne Koong wrote:
> > Add iomap_writeback_dirty_folio() for writing back a dirty folio.
> > One use case of this is for folio laundering.
>
> Where "folio laundering" means calling ->launder_folio, right?
What does fuse use folio laundering for, anyway? It looks to me like
the primary users are invalidate_inode_pages*. Either the caller cares
about flushing dirty data and has called filemap_write_and_wait_range;
or it doesn't and wants to tear down the pagecache ahead of some other
operation that's going to change the file contents and doesn't care.
I suppose it could be useful as a last-chance operation on a dirty folio
that was dirtied after a filemap_write_and_wait_range but before
invalidate_inode_pages*? Though for xfs we just return EBUSY and let
the caller try again (or not). Is there a subtlety to fuse here that I
don't know about?
(Both of those questions are directed at hch or joanne or anyone else
who knows ;))
--D
> > @@ -1675,7 +1677,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > * already at this point. In that case we need to clear the writeback
> > * bit ourselves right after unlocking the page.
> > */
> > - folio_unlock(folio);
> > + if (unlock_folio)
> > + folio_unlock(folio);
> > if (ifs) {
> > if (atomic_dec_and_test(&ifs->write_bytes_pending))
> > folio_end_writeback(folio);
>
> When writing this code I was under the impression that
> folio_end_writeback needs to be called after unlocking the page.
>
> If that is not actually the case we can just move the unlocking into the
> caller and make things a lot cleaner than the conditional locking
> argument.
>
> > +int iomap_writeback_dirty_folio(struct folio *folio, struct writeback_control *wbc,
> > + struct iomap_writepage_ctx *wpc,
> > + const struct iomap_writeback_ops *ops)
>
> Please stick to the usual iomap coding style: 80 character lines,
> two-tab indent for multiline function declarations. (Also in a few
> other places).
>
>
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-09 17:14 ` Darrick J. Wong
@ 2025-06-09 23:54 ` Joanne Koong
2025-06-10 3:59 ` Christoph Hellwig
1 sibling, 0 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 23:54 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, miklos, brauner, linux-fsdevel, linux-xfs,
bernd.schubert, kernel-team
On Mon, Jun 9, 2025 at 10:14 AM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Sun, Jun 08, 2025 at 09:51:54PM -0700, Christoph Hellwig wrote:
> > On Fri, Jun 06, 2025 at 04:38:00PM -0700, Joanne Koong wrote:
> > > Add iomap_writeback_dirty_folio() for writing back a dirty folio.
> > > One use case of this is for folio laundering.
> >
> > Where "folio laundering" means calling ->launder_folio, right?
>
> What does fuse use folio laundering for, anyway? It looks to me like
> the primary users are invalidate_inode_pages*. Either the caller cares
> about flushing dirty data and has called filemap_write_and_wait_range;
> or it doesn't and wants to tear down the pagecache ahead of some other
> operation that's going to change the file contents and doesn't care.
>
> I suppose it could be useful as a last-chance operation on a dirty folio
> that was dirtied after a filemap_write_and_wait_range but before
> invalidate_inode_pages*? Though for xfs we just return EBUSY and let
> the caller try again (or not). Is there a subtlety to fuse here that I
> don't know about?
>
This is something I've been confused about too: why there even is a
launder_folio(), and why some filesystems (e.g. orangefs, nfs, btrfs,
fuse) implement it while others don't.
It was added in commit e3db7691 ("[PATCH] NFS: Fix race in
nfs_release_page()") but I fail to see how that commit fixes the race
condition described in the commit message.
My theory is that it's only needed by remote/network filesystems,
maybe because they need stronger data consistency guarantees? Out of
the ones that implement launder_folio(), btrfs is the anomaly, but when
I checked with the person who added it to btrfs, they said it was a
hack for something else because it got called at the right time.
Looking forward to being enlightened on this as well.
> (Both of those questions are directed at hch or joanne or anyone else
> who knows ;))
>
> --D
>
> > > @@ -1675,7 +1677,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > > * already at this point. In that case we need to clear the writeback
> > > * bit ourselves right after unlocking the page.
> > > */
> > > - folio_unlock(folio);
> > > + if (unlock_folio)
> > > + folio_unlock(folio);
> > > if (ifs) {
> > > if (atomic_dec_and_test(&ifs->write_bytes_pending))
> > > folio_end_writeback(folio);
> >
> > When writing this code I was under the impression that
> > folio_end_writeback needs to be called after unlocking the page.
> >
> > If that is not actually the case we can just move the unlocking into the
> > caller and make things a lot cleaner than the conditional locking
> > argument.
> >
> > > +int iomap_writeback_dirty_folio(struct folio *folio, struct writeback_control *wbc,
> > > + struct iomap_writepage_ctx *wpc,
> > > + const struct iomap_writeback_ops *ops)
> >
> > Please stick to the usual iomap coding style: 80 character lines,
> > two-tab indent for multiline function declarations. (Also in a few
> > other places).
> >
> >
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-09 17:14 ` Darrick J. Wong
2025-06-09 23:54 ` Joanne Koong
@ 2025-06-10 3:59 ` Christoph Hellwig
2025-06-11 4:34 ` Matthew Wilcox
1 sibling, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 3:59 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Joanne Koong, miklos, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team, willy, linux-mm,
linux-nfs
On Mon, Jun 09, 2025 at 10:14:44AM -0700, Darrick J. Wong wrote:
> > Where "folio laundering" means calling ->launder_folio, right?
>
> What does fuse use folio laundering for, anyway? It looks to me like
> the primary users are invalidate_inode_pages*. Either the caller cares
> about flushing dirty data and has called filemap_write_and_wait_range;
> or it doesn't and wants to tear down the pagecache ahead of some other
> operation that's going to change the file contents and doesn't care.
>
> I suppose it could be useful as a last-chance operation on a dirty folio
> that was dirtied after a filemap_write_and_wait_range but before
> invalidate_inode_pages*? Though for xfs we just return EBUSY and let
> the caller try again (or not). Is there a subtlety to fuse here that I
> don't know about?
My memory might be betraying me, but I think willy once launched an
attempt to see if we can kill launder_folio. Adding him, and the
mm and nfs lists to check if I have a point :)
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-10 3:59 ` Christoph Hellwig
@ 2025-06-11 4:34 ` Matthew Wilcox
2025-06-18 4:47 ` does fuse need ->launder_folios, was: " Christoph Hellwig
2025-06-18 12:17 ` Jeff Layton
0 siblings, 2 replies; 70+ messages in thread
From: Matthew Wilcox @ 2025-06-11 4:34 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Darrick J. Wong, Joanne Koong, miklos, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team, linux-mm, linux-nfs
On Mon, Jun 09, 2025 at 08:59:53PM -0700, Christoph Hellwig wrote:
> On Mon, Jun 09, 2025 at 10:14:44AM -0700, Darrick J. Wong wrote:
> > > Where "folio laundering" means calling ->launder_folio, right?
> >
> > What does fuse use folio laundering for, anyway? It looks to me like
> > the primary users are invalidate_inode_pages*. Either the caller cares
> > about flushing dirty data and has called filemap_write_and_wait_range;
> > or it doesn't and wants to tear down the pagecache ahead of some other
> > operation that's going to change the file contents and doesn't care.
> >
> > I suppose it could be useful as a last-chance operation on a dirty folio
> > that was dirtied after a filemap_write_and_wait_range but before
> > invalidate_inode_pages*? Though for xfs we just return EBUSY and let
> > the caller try again (or not). Is there a subtlety to fuse here that I
> > don't know about?
>
> My memory might be betraying me, but I think willy once launched an
> attempt to see if we can kill launder_folio. Adding him, and the
> mm and nfs lists to check if I have a point :)
I ... got distracted with everything else.
Looking at the original addition of ->launder_page (e3db7691e9f3), I
don't understand why we need it. invalidate_inode_pages2() isn't
supposed to invalidate dirty pages, so I don't understand why nfs
found it necessary to do writeback from ->releasepage() instead
of just returning false like iomap does.
There's now a new question of what the hell btrfs is up to with
->launder_folio, which they just added recently.
* does fuse need ->launder_folios, was: Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-11 4:34 ` Matthew Wilcox
@ 2025-06-18 4:47 ` Christoph Hellwig
2025-06-18 12:17 ` Jeff Layton
1 sibling, 0 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-18 4:47 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Christoph Hellwig, Darrick J. Wong, Joanne Koong, miklos, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team, linux-mm,
linux-nfs
On Wed, Jun 11, 2025 at 05:34:50AM +0100, Matthew Wilcox wrote:
> > My memory might be betraying me, but I think willy once launched an
> > attempt to see if we can kill launder_folio. Adding him, and the
> > mm and nfs lists to check if I have a point :)
>
> I ... got distracted with everything else.
>
> Looking at the original addition of ->launder_page (e3db7691e9f3), I
> don't understand why we need it. invalidate_inode_pages2() isn't
> supposed to invalidate dirty pages, so I don't understand why nfs
> found it necessary to do writeback from ->releasepage() instead
> of just returning false like iomap does.
Yeah. Miklos (and other fuse folks), can you help figure out
if fuse really wants ->launder_folio? Because it would be really good
to settle this question before we have to add iomap infrastructure for
it.
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-11 4:34 ` Matthew Wilcox
2025-06-18 4:47 ` does fuse need ->launder_folios, was: " Christoph Hellwig
@ 2025-06-18 12:17 ` Jeff Layton
2025-06-20 18:15 ` Matthew Wilcox
1 sibling, 1 reply; 70+ messages in thread
From: Jeff Layton @ 2025-06-18 12:17 UTC (permalink / raw)
To: Matthew Wilcox, Christoph Hellwig
Cc: Darrick J. Wong, Joanne Koong, miklos, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team, linux-mm, linux-nfs
On Wed, 2025-06-11 at 05:34 +0100, Matthew Wilcox wrote:
> On Mon, Jun 09, 2025 at 08:59:53PM -0700, Christoph Hellwig wrote:
> > On Mon, Jun 09, 2025 at 10:14:44AM -0700, Darrick J. Wong wrote:
> > > > Where "folio laundering" means calling ->launder_folio, right?
> > >
> > > What does fuse use folio laundering for, anyway? It looks to me like
> > > the primary users are invalidate_inode_pages*. Either the caller cares
> > > about flushing dirty data and has called filemap_write_and_wait_range;
> > > or it doesn't and wants to tear down the pagecache ahead of some other
> > > operation that's going to change the file contents and doesn't care.
> > >
> > > I suppose it could be useful as a last-chance operation on a dirty folio
> > > that was dirtied after a filemap_write_and_wait_range but before
> > > invalidate_inode_pages*? Though for xfs we just return EBUSY and let
> > > the caller try again (or not). Is there a subtlety to fuse here that I
> > > don't know about?
> >
> > My memory might be betraying me, but I think willy once launched an
> > attempt to see if we can kill launder_folio. Adding him, and the
> > mm and nfs lists to check if I have a point :)
>
> I ... got distracted with everything else.
>
> Looking at the original addition of ->launder_page (e3db7691e9f3), I
> don't understand why we need it. invalidate_inode_pages2() isn't
> supposed to invalidate dirty pages, so I don't understand why nfs
> found it necessary to do writeback from ->releasepage() instead
> of just returning false like iomap does.
>
> There's now a new question of what the hell btrfs is up to with
> ->launder_folio, which they just added recently.
IIRC...
The problem was a race where a task could dirty a page in an
mmap'ed file after it had been written back but before it was unmapped
from the pagecache.
Bear in mind that the NFS client may need to write back and then
invalidate the pagecache for a file that is still in use if it
discovers that the inode's attributes have changed on the server.
Trond's solution was to write the page out while holding the page lock
in this situation. I think we'd all welcome a way to avoid this race
that didn't require launder_folio().
--
Jeff Layton <jlayton@kernel.org>
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-18 12:17 ` Jeff Layton
@ 2025-06-20 18:15 ` Matthew Wilcox
2025-06-25 5:26 ` Joanne Koong
0 siblings, 1 reply; 70+ messages in thread
From: Matthew Wilcox @ 2025-06-20 18:15 UTC (permalink / raw)
To: Jeff Layton
Cc: Christoph Hellwig, Darrick J. Wong, Joanne Koong, miklos, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team, linux-mm,
linux-nfs
On Wed, Jun 18, 2025 at 08:17:03AM -0400, Jeff Layton wrote:
> On Wed, 2025-06-11 at 05:34 +0100, Matthew Wilcox wrote:
> > On Mon, Jun 09, 2025 at 08:59:53PM -0700, Christoph Hellwig wrote:
> > > On Mon, Jun 09, 2025 at 10:14:44AM -0700, Darrick J. Wong wrote:
> > > > > Where "folio laundering" means calling ->launder_folio, right?
> > > >
> > > > What does fuse use folio laundering for, anyway? It looks to me like
> > > > the primary users are invalidate_inode_pages*. Either the caller cares
> > > > about flushing dirty data and has called filemap_write_and_wait_range;
> > > > or it doesn't and wants to tear down the pagecache ahead of some other
> > > > operation that's going to change the file contents and doesn't care.
> > > >
> > > > I suppose it could be useful as a last-chance operation on a dirty folio
> > > > that was dirtied after a filemap_write_and_wait_range but before
> > > > invalidate_inode_pages*? Though for xfs we just return EBUSY and let
> > > > the caller try again (or not). Is there a subtlety to fuse here that I
> > > > don't know about?
> > >
> > > My memory might be betraying me, but I think willy once launched an
> > > attempt to see if we can kill launder_folio. Adding him, and the
> > > mm and nfs lists to check if I have a point :)
> >
> > I ... got distracted with everything else.
> >
> > Looking at the original addition of ->launder_page (e3db7691e9f3), I
> > don't understand why we need it. invalidate_inode_pages2() isn't
> > supposed to invalidate dirty pages, so I don't understand why nfs
> > found it necessary to do writeback from ->releasepage() instead
> > of just returning false like iomap does.
> >
> > There's now a new question of what the hell btrfs is up to with
> > ->launder_folio, which they just added recently.
>
> IIRC...
>
> The problem was a race where a task could dirty a page in an
> mmap'ed file after it had been written back but before it was unmapped
> from the pagecache.
>
> Bear in mind that the NFS client may need to write back and then
> invalidate the pagecache for a file that is still in use if it
> discovers that the inode's attributes have changed on the server.
>
> Trond's solution was to write the page out while holding the page lock
> in this situation. I think we'd all welcome a way to avoid this race
> that didn't require launder_folio().
I think the problem is that we've left it up to the filesystem to handle
"what do we do if we've dirtied a page^W folio between, say, calling
filemap_write_and_wait_range() and calling filemap_release_folio()".
Just to make sure we're all on the same page here, this is the sample
path I'm looking at:
__iomap_dio_rw
  kiocb_invalidate_pages
    filemap_invalidate_pages
      filemap_write_and_wait_range
      invalidate_inode_pages2_range
        unmap_mapping_pages
        folio_lock
        folio_wait_writeback
        folio_unmap_invalidate
          unmap_mapping_folio
          folio_launder
          filemap_release_folio
            if (folio_test_dirty(folio))
              return -EBUSY;
So some filesystems opt to write back the folio which has been dirtied
(by implementing ->launder_folio) and others opt to fail (and fall back to
buffered I/O when the user has requested direct I/O). iomap filesystems
all just "return false" for dirty folios, so it's clearly an acceptable
outcome as far as xfstests go.
The question is whether this is acceptable for all the filesystems
which implement ->launder_folio today. Because we could just move the
folio_test_dirty() to after the folio_lock() and remove all the testing
of folio dirtiness from individual filesystems.
Or have I missed something and picked the wrong sample path for
analysing why we do/don't need to write back folios in
invalidate_inode_pages2_range()?
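For illustration, the generic change being floated here might look roughly
like the following inside folio_unmap_invalidate() (a sketch against an
approximation of that function, not actual mm code):

        /* folio_lock() and folio_wait_writeback() have already run */
        if (folio_test_dirty(folio))
                return -EBUSY;  /* fail like iomap filesystems already do */

        /* ->launder_folio (folio_launder()) would no longer be called */
        if (!filemap_release_folio(folio, gfp))
                return -EBUSY;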
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-20 18:15 ` Matthew Wilcox
@ 2025-06-25 5:26 ` Joanne Koong
2025-06-25 6:26 ` Christoph Hellwig
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-25 5:26 UTC (permalink / raw)
To: Matthew Wilcox
Cc: Jeff Layton, Christoph Hellwig, Darrick J. Wong, miklos, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team, linux-mm,
linux-nfs
On Fri, Jun 20, 2025 at 11:15 AM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Jun 18, 2025 at 08:17:03AM -0400, Jeff Layton wrote:
> > On Wed, 2025-06-11 at 05:34 +0100, Matthew Wilcox wrote:
> > > On Mon, Jun 09, 2025 at 08:59:53PM -0700, Christoph Hellwig wrote:
> > > > On Mon, Jun 09, 2025 at 10:14:44AM -0700, Darrick J. Wong wrote:
> > > > > > Where "folio laundering" means calling ->launder_folio, right?
> > > > >
> > > > > What does fuse use folio laundering for, anyway? It looks to me like
> > > > > the primary users are invalidate_inode_pages*. Either the caller cares
> > > > > about flushing dirty data and has called filemap_write_and_wait_range;
> > > > > or it doesn't and wants to tear down the pagecache ahead of some other
> > > > > operation that's going to change the file contents and doesn't care.
> > > > >
> > > > > I suppose it could be useful as a last-chance operation on a dirty folio
> > > > > that was dirtied after a filemap_write_and_wait_range but before
> > > > > invalidate_inode_pages*? Though for xfs we just return EBUSY and let
> > > > > the caller try again (or not). Is there a subtlety to fuse here that I
> > > > > don't know about?
> > > >
> > > > My memory might be betraying me, but I think willy once launched an
> > > > attempt to see if we can kill launder_folio. Adding him, and the
> > > > mm and nfs lists to check if I have a point :)
> > >
> > > I ... got distracted with everything else.
> > >
> > > Looking at the original addition of ->launder_page (e3db7691e9f3), I
> > > don't understand why we need it. invalidate_inode_pages2() isn't
> > > supposed to invalidate dirty pages, so I don't understand why nfs
> > > found it necessary to do writeback from ->releasepage() instead
> > > of just returning false like iomap does.
> > >
> > > There's now a new question of what the hell btrfs is up to with
> > > ->launder_folio, which they just added recently.
> >
> > IIRC...
> >
> > The problem was a race where a task could dirty a page in an
> > mmap'ed file after it had been written back but before it was unmapped
> > from the pagecache.
> >
> > Bear in mind that the NFS client may need to write back and then
> > invalidate the pagecache for a file that is still in use if it
> > discovers that the inode's attributes have changed on the server.
> >
> > Trond's solution was to write the page out while holding the page lock
> > in this situation. I think we'd all welcome a way to avoid this race
> > that didn't require launder_folio().
>
> I think the problem is that we've left it up to the filesystem to handle
> "what do we do if we've dirtied a page^W folio between, say, calling
> filemap_write_and_wait_range() and calling filemap_release_folio()".
> Just to make sure we're all on the same page here, this is the sample
> path I'm looking at:
>
> __iomap_dio_rw
>   kiocb_invalidate_pages
>     filemap_invalidate_pages
>       filemap_write_and_wait_range
>       invalidate_inode_pages2_range
>         unmap_mapping_pages
>         folio_lock
>         folio_wait_writeback
>         folio_unmap_invalidate
>           unmap_mapping_folio
>           folio_launder
>           filemap_release_folio
>             if (folio_test_dirty(folio))
>               return -EBUSY;
>
> So some filesystems opt to write back the folio which has been dirtied
> (by implementing ->launder_folio) and others opt to fail (and fall back to
> buffered I/O when the user has requested direct I/O). iomap filesystems
> all just "return false" for dirty folios, so it's clearly an acceptable
> outcome as far as xfstests go.
>
> The question is whether this is acceptable for all the filesystems
> which implement ->launder_folio today. Because we could just move the
> folio_test_dirty() to after the folio_lock() and remove all the testing
> of folio dirtiness from individual filesystems.
Or could the filesystems that implement ->launder_folio (from what I
see, there's only 4: fuse, nfs, btrfs, and orangefs) just move that
logic into their .release_folio implementation? I don't see why not.
In folio_unmap_invalidate(), we call:
	ret = folio_launder(mapping, folio);
	if (ret)
		return ret;
	if (folio->mapping != mapping)
		return -EBUSY;
	if (!filemap_release_folio(folio, gfp))
		return -EBUSY;
If fuse, nfs, btrfs, and orangefs absolutely need to do whatever logic
they're doing in .launder_folio, could they not just move it into
.release_folio?
>
> Or have I missed something and picked the wrong sample path for
> analysing why we do/don't need to writeback folios in
> invalidate_inode_pages2_range()?
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-25 5:26 ` Joanne Koong
@ 2025-06-25 6:26 ` Christoph Hellwig
2025-06-25 16:44 ` Joanne Koong
0 siblings, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-25 6:26 UTC (permalink / raw)
To: Joanne Koong
Cc: Matthew Wilcox, Jeff Layton, Christoph Hellwig, Darrick J. Wong,
miklos, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team, linux-mm, linux-nfs
On Tue, Jun 24, 2025 at 10:26:01PM -0700, Joanne Koong wrote:
> > The question is whether this is acceptable for all the filesystems
> > which implement ->launder_folio today. Because we could just move the
> > folio_test_dirty() to after the folio_lock() and remove all the testing
> > of folio dirtiness from individual filesystems.
>
> Or could the filesystems that implement ->launder_folio (from what I
> see, there's only 4: fuse, nfs, btrfs, and orangefs) just move that
> logic into their .release_folio implementation? I don't see why not.
> In folio_unmap_invalidate(), we call:
Without even looking into the details, from the iomap POV that
basically doesn't matter. You'd still need the
write-back-a-single-locked-folio interface, which adds API surface,
and because it only writes a single folio at a time it is rather
inefficient. Not a deal breaker because the current version looks
ok, but it would still be preferable to not have an extra magic
interface for it.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-25 6:26 ` Christoph Hellwig
@ 2025-06-25 16:44 ` Joanne Koong
2025-07-01 5:41 ` Darrick J. Wong
2025-07-01 6:23 ` Miklos Szeredi
0 siblings, 2 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-25 16:44 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Matthew Wilcox, Jeff Layton, Darrick J. Wong, miklos, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team, linux-mm,
linux-nfs, linux-btrfs
On Tue, Jun 24, 2025 at 11:26 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Tue, Jun 24, 2025 at 10:26:01PM -0700, Joanne Koong wrote:
> > > The question is whether this is acceptable for all the filesystems
> > > which implement ->launder_folio today. Because we could just move the
> > > folio_test_dirty() to after the folio_lock() and remove all the testing
> > > of folio dirtiness from individual filesystems.
> >
> > Or could the filesystems that implement ->launder_folio (from what I
> > see, there's only 4: fuse, nfs, btrfs, and orangefs) just move that
> > logic into their .release_folio implementation? I don't see why not.
> > In folio_unmap_invalidate(), we call:
>
> Without even looking into the details, from the iomap POV that
> basically doesn't matter. You'd still need the
> write-back-a-single-locked-folio interface, which adds API surface,
> and because it only writes a single folio at a time it is rather
> inefficient. Not a deal breaker because the current version looks
> ok, but it would still be preferable to not have an extra magic
> interface for it.
>
Yes, but as I understand it, the focus right now is on getting rid of
->launder_folio as an API. The iomap POV is imo a separate issue:
determining whether fuse in particular needs to write back the dirty
page before releasing, or whether it should just fail.
btrfs uses ->launder_folio() to free some previously allocated
reservation (added in commit 872617a "btrfs: implement launder_folio
for clearing dirty page reserve") so at the very least, that logic
would need to be moved to .release_folio() (if that suffices? Adding
the btrfs group to cc). It's still vague to me whether
fuse/nfs/orangefs need to write back the dirty page, but it seems fine
to me not to - as I understand it, the worst that can happen (and
please correct me if I'm wrong here, Matthew) from just failing it
with -EBUSY is that the folio lingers longer in the page cache until
it eventually gets written back and cleared out, and that only happens
if the file is mapped and written to in that window between
filemap_write_and_wait_range() and unmap_mapping_folio(). afaics, if
fuse/nfs/orangefs do need to write back the dirty folio instead of
failing w/ -EBUSY, they could just do that logic in .release_folio.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-25 16:44 ` Joanne Koong
@ 2025-07-01 5:41 ` Darrick J. Wong
2025-07-02 21:36 ` Joanne Koong
2025-07-01 6:23 ` Miklos Szeredi
1 sibling, 1 reply; 70+ messages in thread
From: Darrick J. Wong @ 2025-07-01 5:41 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, Matthew Wilcox, Jeff Layton, miklos, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team, linux-mm,
linux-nfs, linux-btrfs
On Wed, Jun 25, 2025 at 09:44:31AM -0700, Joanne Koong wrote:
> On Tue, Jun 24, 2025 at 11:26 PM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > On Tue, Jun 24, 2025 at 10:26:01PM -0700, Joanne Koong wrote:
> > > > The question is whether this is acceptable for all the filesystems
> > > > which implement ->launder_folio today. Because we could just move the
> > > > folio_test_dirty() to after the folio_lock() and remove all the testing
> > > > of folio dirtiness from individual filesystems.
> > >
> > > Or could the filesystems that implement ->launder_folio (from what I
> > > see, there's only 4: fuse, nfs, btrfs, and orangefs) just move that
> > > logic into their .release_folio implementation? I don't see why not.
> > > In folio_unmap_invalidate(), we call:
> >
> > Without even looking into the details, from the iomap POV that
> > basically doesn't matter. You'd still need the
> > write-back-a-single-locked-folio interface, which adds API surface,
> > and because it only writes a single folio at a time it is rather
> > inefficient. Not a deal breaker because the current version looks
> > ok, but it would still be preferable to not have an extra magic
> > interface for it.
> >
>
> Yes, but as I understand it, the focus right now is on getting rid of
> ->launder_folio as an API. The iomap POV is imo a separate issue:
> determining whether fuse in particular needs to write back the dirty
> page before releasing, or whether it should just fail.
This might not help for Joanne's case, but so far the lack of a
launder_folio in my fuse+iomap prototype hasn't hindered it at all.
From what I can tell it's ok to bounce EBUSY back to dio callers...
> btrfs uses ->launder_folio() to free some previously allocated
> reservation (added in commit 872617a "btrfs: implement launder_folio
> for clearing dirty page reserve") so at the very least, that logic
> would need to be moved to .release_folio() (if that suffices? Adding
> the btrfs group to cc). It's still vague to me whether
> fuse/nfs/orangefs need to write back the dirty page, but it seems fine
...but only because a retry will initiate another writeback so
eventually we can make some forward progress. But it helps a lot that
fuse+iomap is handing the entire IO stack over to iomap.
> to me not to - as I understand it, the worst that can happen (and
> please correct me if I'm wrong here, Matthew) from just failing it
> with -EBUSY is that the folio lingers longer in the page cache until
> it eventually gets written back and cleared out, and that only happens
> if the file is mapped and written to in that window between
> filemap_write_and_wait_range() and unmap_mapping_folio(). afaics, if
> fuse/nfs/orangefs do need to write back the dirty folio instead of
> failing w/ -EBUSY, they could just do that logic in .release_folio.
What do you do in ->release_folio if writeback fails? Redirty it and
return false?
--D
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-07-01 5:41 ` Darrick J. Wong
@ 2025-07-02 21:36 ` Joanne Koong
2025-07-02 21:47 ` Joanne Koong
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-07-02 21:36 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Matthew Wilcox, Jeff Layton, miklos, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team, linux-mm,
linux-nfs, linux-btrfs
On Mon, Jun 30, 2025 at 10:41 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Wed, Jun 25, 2025 at 09:44:31AM -0700, Joanne Koong wrote:
> > On Tue, Jun 24, 2025 at 11:26 PM Christoph Hellwig <hch@infradead.org> wrote:
> > >
> > > On Tue, Jun 24, 2025 at 10:26:01PM -0700, Joanne Koong wrote:
> > > > > The question is whether this is acceptable for all the filesystems
> > > > > which implement ->launder_folio today. Because we could just move the
> > > > > folio_test_dirty() to after the folio_lock() and remove all the testing
> > > > > of folio dirtiness from individual filesystems.
> > > >
> > > > Or could the filesystems that implement ->launder_folio (from what I
> > > > see, there's only 4: fuse, nfs, btrfs, and orangefs) just move that
> > > > logic into their .release_folio implementation? I don't see why not.
> > > > In folio_unmap_invalidate(), we call:
> > >
> > > Without even looking into the details, from the iomap POV that
> > > basically doesn't matter. You'd still need the
> > > write-back-a-single-locked-folio interface, which adds API surface,
> > > and because it only writes a single folio at a time it is rather
> > > inefficient. Not a deal breaker because the current version looks
> > > ok, but it would still be preferable to not have an extra magic
> > > interface for it.
> > >
> >
> > Yes, but as I understand it, the focus right now is on getting rid of
> > ->launder_folio as an API. The iomap POV is imo a separate issue:
> > determining whether fuse in particular needs to write back the dirty
> > page before releasing, or whether it should just fail.
>
> This might not help for Joanne's case, but so far the lack of a
> launder_folio in my fuse+iomap prototype hasn't hindered it at all.
> From what I can tell it's ok to bounce EBUSY back to dio callers...
>
> > btrfs uses ->launder_folio() to free some previously allocated
> > reservation (added in commit 872617a "btrfs: implement launder_folio
> > for clearing dirty page reserve") so at the very least, that logic
> > would need to be moved to .release_folio() (if that suffices? Adding
> > the btrfs group to cc). It's still vague to me whether
> > fuse/nfs/orangefs need to write back the dirty page, but it seems fine
>
> ...but only because a retry will initiate another writeback so
> eventually we can make some forward progress. But it helps a lot that
> fuse+iomap is handing the entire IO stack over to iomap.
>
> > to me not to - as I understand it, the worst that can happen (and
> > please correct me if I'm wrong here, Matthew) from just failing it
> > with -EBUSY is that the folio lingers longer in the page cache until
> > it eventually gets written back and cleared out, and that only happens
> > if the file is mapped and written to in that window between
> > filemap_write_and_wait_range() and unmap_mapping_folio(). afaics, if
> > fuse/nfs/orangefs do need to write back the dirty folio instead of
> > failing w/ -EBUSY, they could just do that logic in .release_folio.
>
> What do you do in ->release_folio if writeback fails? Redirty it and
> return false?
Yeah, I was thinking we just redirty it and return false. I don't
think that leads to any deviation from existing behavior (eg in
folio_unmap_invalidate(), a failed writeback will return -EBUSY
regardless of whether the writeback attempt happens from
->launder_folio() or ->release_folio()).
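As a rough sketch, that might look like the following, where
example_writeback_folio() is a hypothetical stand-in for whatever
single-folio writeback path the filesystem has (eg the
iomap_writeback_dirty_folio() added in this series):

	/*
	 * Hypothetical ->release_folio that subsumes laundering: write
	 * back a dirty folio, redirty it and refuse the release if that
	 * fails. example_writeback_folio() is a stand-in, not a real API.
	 */
	static bool example_release_folio(struct folio *folio, gfp_t gfp)
	{
		if (folio_clear_dirty_for_io(folio)) {
			if (example_writeback_folio(folio)) {
				/* writeback failed: keep the data */
				folio_mark_dirty(folio);
				return false;
			}
			folio_wait_writeback(folio);
		}
		/* drop any remaining private state as the real callback would */
		return true;
	}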
>
> --D
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-07-02 21:36 ` Joanne Koong
@ 2025-07-02 21:47 ` Joanne Koong
0 siblings, 0 replies; 70+ messages in thread
From: Joanne Koong @ 2025-07-02 21:47 UTC (permalink / raw)
To: Darrick J. Wong
Cc: Christoph Hellwig, Matthew Wilcox, Jeff Layton, miklos, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team, linux-mm,
linux-nfs, linux-btrfs
On Wed, Jul 2, 2025 at 2:36 PM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> On Mon, Jun 30, 2025 at 10:41 PM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Wed, Jun 25, 2025 at 09:44:31AM -0700, Joanne Koong wrote:
> > > On Tue, Jun 24, 2025 at 11:26 PM Christoph Hellwig <hch@infradead.org> wrote:
> > > >
> > > > On Tue, Jun 24, 2025 at 10:26:01PM -0700, Joanne Koong wrote:
> > > > > > The question is whether this is acceptable for all the filesystems
> > > > > > which implement ->launder_folio today. Because we could just move the
> > > > > > folio_test_dirty() to after the folio_lock() and remove all the testing
> > > > > > of folio dirtiness from individual filesystems.
> > > > >
> > > > > Or could the filesystems that implement ->launder_folio (from what I
> > > > > see, there's only 4: fuse, nfs, btrfs, and orangefs) just move that
> > > > > logic into their .release_folio implementation? I don't see why not.
> > > > > In folio_unmap_invalidate(), we call:
> > > >
> > > > Without even looking into the details, from the iomap POV that
> > > > basically doesn't matter. You'd still need the
> > > > write-back-a-single-locked-folio interface, which adds API surface,
> > > > and because it only writes a single folio at a time it is rather
> > > > inefficient. Not a deal breaker because the current version looks
> > > > ok, but it would still be preferable to not have an extra magic
> > > > interface for it.
> > > >
> > >
> > > Yes, but as I understand it, the focus right now is on getting rid of
> > > ->launder_folio as an API. The iomap POV is imo a separate issue:
> > > determining whether fuse in particular needs to write back the dirty
> > > page before releasing, or whether it should just fail.
> >
> > This might not help for Joanne's case, but so far the lack of a
> > launder_folio in my fuse+iomap prototype hasn't hindered it at all.
> > From what I can tell it's ok to bounce EBUSY back to dio callers...
> >
> > > btrfs uses ->launder_folio() to free some previously allocated
> > > reservation (added in commit 872617a "btrfs: implement launder_folio
> > > for clearing dirty page reserve") so at the very least, that logic
> > > would need to be moved to .release_folio() (if that suffices? Adding
> > > the btrfs group to cc). It's still vague to me whether
> > > fuse/nfs/orangefs need to write back the dirty page, but it seems fine
> >
> > ...but only because a retry will initiate another writeback so
> > eventually we can make some forward progress. But it helps a lot that
> > fuse+iomap is handing the entire IO stack over to iomap.
> >
> > > to me not to - as I understand it, the worst that can happen (and
> > > please correct me if I'm wrong here, Matthew) from just failing it
> > > with -EBUSY is that the folio lingers longer in the page cache until
> > > it eventually gets written back and cleared out, and that only happens
> > > if the file is mapped and written to in that window between
> > > filemap_write_and_wait_range() and unmap_mapping_folio(). afaics, if
> > > fuse/nfs/orangefs do need to write back the dirty folio instead of
> > > failing w/ -EBUSY, they could just do that logic in .release_folio.
> >
> > What do you do in ->release_folio if writeback fails? Redirty it and
> > return false?
>
> Yeah, I was thinking we just redirty it and return false. I don't
> think that leads to any deviation from existing behavior (eg in
> folio_unmap_invalidate(), a failed writeback will return -EBUSY
> regardless of whether the writeback attempt happens from
> ->launder_folio() or ->release_folio()).
Or actually, I guess the one deviation is that ->launder_folio() can
return a custom error code (eg -ENOMEM) to folio_unmap_invalidate(),
whereas release_folio() errors will default to -EBUSY; but the error
code propagated to folio->mapping will be the same.
> >
> > --D
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-25 16:44 ` Joanne Koong
2025-07-01 5:41 ` Darrick J. Wong
@ 2025-07-01 6:23 ` Miklos Szeredi
1 sibling, 0 replies; 70+ messages in thread
From: Miklos Szeredi @ 2025-07-01 6:23 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, Matthew Wilcox, Jeff Layton, Darrick J. Wong,
brauner, linux-fsdevel, linux-xfs, bernd.schubert, kernel-team,
linux-mm, linux-nfs, linux-btrfs
On Wed, 25 Jun 2025 at 18:44, Joanne Koong <joannelkoong@gmail.com> wrote:
> Yes but as I understand it, the focus right now is on getting rid of
> ->launder_folio as an API. The iomap pov imo is a separate issue with
> determining whether fuse in particular needs to write back the dirty
> page before releasing or should just fail.
Fuse calls invalidate_inode_pages2() not just for direct I/O:
- open without FOPEN_KEEP_CACHE
- FUSE_NOTIFY_INVAL_INODE
- mtime/size change with FUSE_AUTO_INVAL_DATA turned on /
  FUSE_EXPLICIT_INVAL_DATA turned off
- truncate
In most of these cases dirty pages do need to be written back.
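For reference, these paths generally follow a flush-then-invalidate
pattern, roughly like this (hypothetical helper, not actual fuse code):

	static int example_inval_inode_data(struct inode *inode)
	{
		int err;

		/* flush dirty data first ... */
		err = filemap_write_and_wait(inode->i_mapping);
		if (err)
			return err;
		/*
		 * ... then drop the cache. A folio dirtied via mmap in
		 * between is exactly the race ->launder_folio handles.
		 */
		return invalidate_inode_pages2(inode->i_mapping);
	}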
Thanks,
Miklos
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-09 4:51 ` Christoph Hellwig
2025-06-09 17:14 ` Darrick J. Wong
@ 2025-06-09 23:30 ` Joanne Koong
2025-06-10 4:03 ` Christoph Hellwig
1 sibling, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 23:30 UTC (permalink / raw)
To: Christoph Hellwig
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Sun, Jun 8, 2025 at 9:52 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Fri, Jun 06, 2025 at 04:38:00PM -0700, Joanne Koong wrote:
> > Add iomap_writeback_dirty_folio() for writing back a dirty folio.
> > One use case of this is for folio laundering.
>
> Where "folio laundering" means calling ->launder_folio, right?
Yes. I'll change the wording to "laundering folios" to make this clearer.
>
> > @@ -1675,7 +1677,8 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
> > * already at this point. In that case we need to clear the writeback
> > * bit ourselves right after unlocking the page.
> > */
> > - folio_unlock(folio);
> > + if (unlock_folio)
> > + folio_unlock(folio);
> > if (ifs) {
> > if (atomic_dec_and_test(&ifs->write_bytes_pending))
> > folio_end_writeback(folio);
>
> When writing this code I was under the impression that
> folio_end_writeback needs to be called after unlocking the page.
>
> If that is not actually the case we can just move the unlocking into the
> caller and make things a lot cleaner than the conditional locking
> argument.
I don't think we can move this into the caller of ->writeback_folio()
/ ->map_blocks(). iomap does some preliminary optimization checking
(eg skipping writing back truncated ranges) in
iomap_writepage_handle_eof(), which will unlock the folio if successful
and is called separately from those callbacks. As well, it's not clear
how the caller would know when the folio can be unlocked (eg whether
the range being written back / mapped is the last range in that folio).
>
> > +int iomap_writeback_dirty_folio(struct folio *folio, struct writeback_control *wbc,
> > + struct iomap_writepage_ctx *wpc,
> > + const struct iomap_writeback_ops *ops)
>
> Please stick to the usual iomap coding style: 80 character lines,
> two-tab indent for multiline function declarations. (Also in a few
> other places).
Will do.
>
>
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio()
2025-06-09 23:30 ` Joanne Koong
@ 2025-06-10 4:03 ` Christoph Hellwig
0 siblings, 0 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 4:03 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Mon, Jun 09, 2025 at 04:30:14PM -0700, Joanne Koong wrote:
> > When writing this code I was under the impression that
> > folio_end_writeback needs to be called after unlocking the page.
> >
> > If that is not actually the case we can just move the unlocking into the
> > caller and make things a lot cleaner than the conditional locking
> > argument.
>
> I don't think we can move this into the caller of ->writeback_folio()
> / ->map_blocks(). iomap does some preliminary optimization checking
> (eg skipping writing back truncated ranges) in
> iomap_writepage_handle_eof(), which will unlock the folio if successful
> and is called separately from those callbacks. As well, it's not clear
> how the caller would know when the folio can be unlocked (eg whether
> the range being written back / mapped is the last range in that folio).
Note that by "caller" I meant the immediate caller of
iomap_writepage_map, i.e. iomap_writepages for the existing code,
which needs the unlock, and your new iomap_writeback_dirty_folio,
which doesn't.
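Spelled out, that would look roughly like this (illustrative sketch
only; the prototypes are approximations, not the real iomap ones):

	/* ->writepages path: the loop owns the folio lock and drops it. */
	static int example_writepages_one(struct iomap_writepage_ctx *wpc,
			struct writeback_control *wbc, struct folio *folio)
	{
		int error = iomap_writepage_map(wpc, wbc, folio);

		folio_unlock(folio);
		return error;
	}

	/* Laundering path: must return with the folio still locked. */
	static int example_launder_one(struct iomap_writepage_ctx *wpc,
			struct writeback_control *wbc, struct folio *folio)
	{
		return iomap_writepage_map(wpc, wbc, folio);
	}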
^ permalink raw reply [flat|nested] 70+ messages in thread
* [PATCH v1 6/8] fuse: use iomap for buffered writes
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
` (4 preceding siblings ...)
2025-06-06 23:38 ` [PATCH v1 5/8] iomap: add iomap_writeback_dirty_folio() Joanne Koong
@ 2025-06-06 23:38 ` Joanne Koong
2025-06-06 23:38 ` [PATCH v1 7/8] fuse: use iomap for writeback Joanne Koong
` (4 subsequent siblings)
10 siblings, 0 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-06 23:38 UTC (permalink / raw)
To: miklos
Cc: djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
Have buffered writes go through iomap so that large folios in fuse can
have granular dirty tracking. This has no effect on functionality until
fuse's writepages callback also calls into iomap.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/Kconfig | 1 +
fs/fuse/file.c | 141 ++++++++++++++++++------------------------------
2 files changed, 53 insertions(+), 89 deletions(-)
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index ca215a3cba3e..a774166264de 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -2,6 +2,7 @@
config FUSE_FS
tristate "FUSE (Filesystem in Userspace) support"
select FS_POSIX_ACL
+ select FS_IOMAP
help
With FUSE it is possible to implement a fully functional filesystem
in a userspace program.
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 3d0b33be3824..a0118b501880 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -21,6 +21,7 @@
#include <linux/filelock.h>
#include <linux/splice.h>
#include <linux/task_io_accounting_ops.h>
+#include <linux/iomap.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -788,12 +789,16 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
}
}
-static int fuse_do_readfolio(struct file *file, struct folio *folio)
+static int fuse_do_readfolio(struct file *file, struct folio *folio,
+ size_t off, size_t len)
{
struct inode *inode = folio->mapping->host;
struct fuse_mount *fm = get_fuse_mount(inode);
- loff_t pos = folio_pos(folio);
- struct fuse_folio_desc desc = { .length = folio_size(folio) };
+ loff_t pos = folio_pos(folio) + off;
+ struct fuse_folio_desc desc = {
+ .offset = off,
+ .length = len,
+ };
struct fuse_io_args ia = {
.ap.args.page_zeroing = true,
.ap.args.out_pages = true,
@@ -820,8 +825,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio)
if (res < desc.length)
fuse_short_read(inode, attr_ver, res, &ia.ap);
- folio_mark_uptodate(folio);
-
return 0;
}
@@ -834,13 +837,25 @@ static int fuse_read_folio(struct file *file, struct folio *folio)
if (fuse_is_bad(inode))
goto out;
- err = fuse_do_readfolio(file, folio);
+ err = fuse_do_readfolio(file, folio, 0, folio_size(folio));
+ if (!err)
+ folio_mark_uptodate(folio);
+
fuse_invalidate_atime(inode);
out:
folio_unlock(folio);
return err;
}
+static int fuse_iomap_read_folio_sync(loff_t block_start, struct folio *folio,
+ size_t off, size_t len, const struct iomap *iomap,
+ void *private)
+{
+ struct file *file = private;
+
+ return fuse_do_readfolio(file, folio, off, len);
+}
+
static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
int err)
{
@@ -1375,6 +1390,25 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
}
}
+static const struct iomap_folio_ops fuse_iomap_folio_ops = {
+ .read_folio_sync = fuse_iomap_read_folio_sync,
+};
+
+static int fuse_write_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ iomap->type = IOMAP_IN_MEM;
+ iomap->folio_ops = &fuse_iomap_folio_ops;
+ iomap->length = length;
+ iomap->offset = offset;
+ return 0;
+}
+
+static const struct iomap_ops fuse_write_iomap_ops = {
+ .iomap_begin = fuse_write_iomap_begin,
+};
+
static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -1384,6 +1418,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = mapping->host;
ssize_t err, count;
struct fuse_conn *fc = get_fuse_conn(inode);
+ bool writeback = false;
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1397,8 +1432,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
file_inode(file))) {
goto writethrough;
}
-
- return generic_file_write_iter(iocb, from);
+ writeback = true;
}
writethrough:
@@ -1420,6 +1454,13 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
written = direct_write_fallback(iocb, from, written,
fuse_perform_write(iocb, from));
+ } else if (writeback) {
+ /*
+ * Use iomap so that we get granular dirty tracking for
+ * writing back large folios.
+ */
+ written = iomap_file_buffered_write(iocb, from,
+ &fuse_write_iomap_ops, file);
} else {
written = fuse_perform_write(iocb, from);
}
@@ -2209,84 +2250,6 @@ static int fuse_writepages(struct address_space *mapping,
return err;
}
-/*
- * It's worthy to make sure that space is reserved on disk for the write,
- * but how to implement it without killing performance need more thinking.
- */
-static int fuse_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, struct folio **foliop, void **fsdata)
-{
- pgoff_t index = pos >> PAGE_SHIFT;
- struct fuse_conn *fc = get_fuse_conn(file_inode(file));
- struct folio *folio;
- loff_t fsize;
- int err = -ENOMEM;
-
- WARN_ON(!fc->writeback_cache);
-
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
- if (IS_ERR(folio))
- goto error;
-
- if (folio_test_uptodate(folio) || len >= folio_size(folio))
- goto success;
- /*
- * Check if the start of this folio comes after the end of file,
- * in which case the readpage can be optimized away.
- */
- fsize = i_size_read(mapping->host);
- if (fsize <= folio_pos(folio)) {
- size_t off = offset_in_folio(folio, pos);
- if (off)
- folio_zero_segment(folio, 0, off);
- goto success;
- }
- err = fuse_do_readfolio(file, folio);
- if (err)
- goto cleanup;
-success:
- *foliop = folio;
- return 0;
-
-cleanup:
- folio_unlock(folio);
- folio_put(folio);
-error:
- return err;
-}
-
-static int fuse_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct folio *folio, void *fsdata)
-{
- struct inode *inode = folio->mapping->host;
-
- /* Haven't copied anything? Skip zeroing, size extending, dirtying. */
- if (!copied)
- goto unlock;
-
- pos += copied;
- if (!folio_test_uptodate(folio)) {
- /* Zero any unwritten bytes at the end of the page */
- size_t endoff = pos & ~PAGE_MASK;
- if (endoff)
- folio_zero_segment(folio, endoff, PAGE_SIZE);
- folio_mark_uptodate(folio);
- }
-
- if (pos > inode->i_size)
- i_size_write(inode, pos);
-
- folio_mark_dirty(folio);
-
-unlock:
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
-}
-
static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
@@ -3144,12 +3107,12 @@ static const struct address_space_operations fuse_file_aops = {
.readahead = fuse_readahead,
.writepages = fuse_writepages,
.launder_folio = fuse_launder_folio,
- .dirty_folio = filemap_dirty_folio,
+ .dirty_folio = iomap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
.bmap = fuse_bmap,
.direct_IO = fuse_direct_IO,
- .write_begin = fuse_write_begin,
- .write_end = fuse_write_end,
};
void fuse_init_file_inode(struct inode *inode, unsigned int flags)
--
2.47.1
^ permalink raw reply related [flat|nested] 70+ messages in thread
* [PATCH v1 7/8] fuse: use iomap for writeback
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
` (5 preceding siblings ...)
2025-06-06 23:38 ` [PATCH v1 6/8] fuse: use iomap for buffered writes Joanne Koong
@ 2025-06-06 23:38 ` Joanne Koong
2025-06-08 19:20 ` Anuj gupta
2025-06-06 23:38 ` [PATCH v1 8/8] fuse: use iomap for folio laundering Joanne Koong
` (3 subsequent siblings)
10 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-06 23:38 UTC (permalink / raw)
To: miklos
Cc: djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
Use iomap for dirty folio writeback in ->writepages().
This allows for granular dirty writeback of large folios.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/file.c | 114 ++++++++++++++++++++++++++++---------------------
1 file changed, 66 insertions(+), 48 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a0118b501880..31842ee1ce0e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1835,7 +1835,7 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
- folio_end_writeback(ap->folios[i]);
+ iomap_finish_folio_write(inode, ap->folios[i], 1);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
wb_writeout_inc(&bdi->wb);
}
@@ -2022,19 +2022,20 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
}
static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio,
- uint32_t folio_index)
+ uint32_t folio_index, loff_t offset, unsigned len)
{
struct inode *inode = folio->mapping->host;
struct fuse_args_pages *ap = &wpa->ia.ap;
ap->folios[folio_index] = folio;
- ap->descs[folio_index].offset = 0;
- ap->descs[folio_index].length = folio_size(folio);
+ ap->descs[folio_index].offset = offset;
+ ap->descs[folio_index].length = len;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
}
static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio,
+ size_t offset,
struct fuse_file *ff)
{
struct inode *inode = folio->mapping->host;
@@ -2047,7 +2048,7 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio
return NULL;
fuse_writepage_add_to_bucket(fc, wpa);
- fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0);
+ fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0);
wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
wpa->inode = inode;
wpa->ia.ff = ff;
@@ -2103,7 +2104,7 @@ struct fuse_fill_wb_data {
struct fuse_file *ff;
struct inode *inode;
unsigned int max_folios;
- unsigned int nr_pages;
+ unsigned int nr_bytes;
};
static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
@@ -2145,21 +2146,28 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
}
static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
+ loff_t offset, unsigned len,
struct fuse_args_pages *ap,
struct fuse_fill_wb_data *data)
{
+ struct folio *prev_folio;
+ struct fuse_folio_desc prev_desc;
+
WARN_ON(!ap->num_folios);
/* Reached max pages */
- if (data->nr_pages + folio_nr_pages(folio) > fc->max_pages)
+ if ((data->nr_bytes + len) / PAGE_SIZE > fc->max_pages)
return true;
/* Reached max write bytes */
- if ((data->nr_pages * PAGE_SIZE) + folio_size(folio) > fc->max_write)
+ if (data->nr_bytes + len > fc->max_write)
return true;
/* Discontinuity */
- if (folio_next_index(ap->folios[ap->num_folios - 1]) != folio_index(folio))
+ prev_folio = ap->folios[ap->num_folios - 1];
+ prev_desc = ap->descs[ap->num_folios - 1];
+ if ((folio_pos(prev_folio) + prev_desc.offset + prev_desc.length) !=
+ folio_pos(folio) + offset)
return true;
/* Need to grow the pages array? If so, did the expansion fail? */
@@ -2169,85 +2177,95 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio,
return false;
}
-static int fuse_writepages_fill(struct folio *folio,
- struct writeback_control *wbc, void *_data)
+static int fuse_iomap_writeback_folio(struct iomap_writepage_ctx *wpc,
+ struct folio *folio, struct inode *inode,
+ loff_t offset, unsigned len)
{
- struct fuse_fill_wb_data *data = _data;
+ struct fuse_fill_wb_data *data = wpc->private;
struct fuse_writepage_args *wpa = data->wpa;
struct fuse_args_pages *ap = &wpa->ia.ap;
- struct inode *inode = data->inode;
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
- int err;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ /* len will always be page aligned */
+ WARN_ON_ONCE(len & (PAGE_SIZE - 1));
if (!data->ff) {
- err = -EIO;
data->ff = fuse_write_file_get(fi);
if (!data->ff)
- goto out_unlock;
+ return -EIO;
}
- if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) {
+ iomap_start_folio_write(inode, folio, 1);
+
+ if (wpa && fuse_writepage_need_send(fc, folio, offset, len, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
- data->nr_pages = 0;
+ data->nr_bytes = 0;
}
if (data->wpa == NULL) {
- err = -ENOMEM;
- wpa = fuse_writepage_args_setup(folio, data->ff);
+ wpa = fuse_writepage_args_setup(folio, offset, data->ff);
if (!wpa)
- goto out_unlock;
+ return -ENOMEM;
fuse_file_get(wpa->ia.ff);
data->max_folios = 1;
ap = &wpa->ia.ap;
}
- folio_start_writeback(folio);
- fuse_writepage_args_page_fill(wpa, folio, ap->num_folios);
- data->nr_pages += folio_nr_pages(folio);
+ fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
+ offset, len);
+ data->nr_bytes += len;
- err = 0;
ap->num_folios++;
if (!data->wpa)
data->wpa = wpa;
-out_unlock:
- folio_unlock(folio);
- return err;
+ return 0;
+}
+
+static int fuse_iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
+{
+ struct fuse_fill_wb_data *data = wpc->private;
+ WARN_ON_ONCE(!data);
+
+ if (data->wpa) {
+ WARN_ON(!data->wpa->ia.ap.num_folios);
+ fuse_writepages_send(data);
+ }
+
+ if (data->ff)
+ fuse_file_put(data->ff, false);
+
+ return error;
}
+static const struct iomap_writeback_ops fuse_writeback_ops = {
+ .writeback_folio = fuse_iomap_writeback_folio,
+ .submit_ioend = fuse_iomap_submit_ioend,
+};
+
static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
- struct fuse_fill_wb_data data;
- int err;
+ struct fuse_fill_wb_data data = {
+ .inode = inode,
+ };
+ struct iomap_writepage_ctx wpc = {
+ .iomap.type = IOMAP_IN_MEM,
+ .private = &data,
+ };
- err = -EIO;
if (fuse_is_bad(inode))
- goto out;
+ return -EIO;
if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
return 0;
- data.inode = inode;
- data.wpa = NULL;
- data.ff = NULL;
- data.nr_pages = 0;
-
- err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
- if (data.wpa) {
- WARN_ON(!data.wpa->ia.ap.num_folios);
- fuse_writepages_send(&data);
- }
- if (data.ff)
- fuse_file_put(data.ff, false);
-
-out:
- return err;
+ return iomap_writepages(mapping, wbc, &wpc, &fuse_writeback_ops);
}
static int fuse_launder_folio(struct folio *folio)
--
2.47.1
^ permalink raw reply related [flat|nested] 70+ messages in thread
* [PATCH v1 8/8] fuse: use iomap for folio laundering
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
` (6 preceding siblings ...)
2025-06-06 23:38 ` [PATCH v1 7/8] fuse: use iomap for writeback Joanne Koong
@ 2025-06-06 23:38 ` Joanne Koong
2025-06-08 19:12 ` [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Anuj gupta
` (2 subsequent siblings)
10 siblings, 0 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-06 23:38 UTC (permalink / raw)
To: miklos
Cc: djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
Use iomap for folio laundering, which will do granular dirty
writeback when laundering a large folio.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/file.c | 49 +++++++++----------------------------------------
1 file changed, 9 insertions(+), 40 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 31842ee1ce0e..f88603c35354 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2060,45 +2060,6 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio
return wpa;
}
-static int fuse_writepage_locked(struct folio *folio)
-{
- struct address_space *mapping = folio->mapping;
- struct inode *inode = mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
- struct fuse_writepage_args *wpa;
- struct fuse_args_pages *ap;
- struct fuse_file *ff;
- int error = -EIO;
-
- ff = fuse_write_file_get(fi);
- if (!ff)
- goto err;
-
- wpa = fuse_writepage_args_setup(folio, ff);
- error = -ENOMEM;
- if (!wpa)
- goto err_writepage_args;
-
- ap = &wpa->ia.ap;
- ap->num_folios = 1;
-
- folio_start_writeback(folio);
- fuse_writepage_args_page_fill(wpa, folio, 0);
-
- spin_lock(&fi->lock);
- list_add_tail(&wpa->queue_entry, &fi->queued_writes);
- fuse_flush_writepages(inode);
- spin_unlock(&fi->lock);
-
- return 0;
-
-err_writepage_args:
- fuse_file_put(ff, false);
-err:
- mapping_set_error(folio->mapping, error);
- return error;
-}
-
struct fuse_fill_wb_data {
struct fuse_writepage_args *wpa;
struct fuse_file *ff;
@@ -2271,8 +2232,16 @@ static int fuse_writepages(struct address_space *mapping,
static int fuse_launder_folio(struct folio *folio)
{
int err = 0;
+ struct fuse_fill_wb_data data = {
+ .inode = folio->mapping->host,
+ };
+ struct iomap_writepage_ctx wpc = {
+ .iomap.type = IOMAP_IN_MEM,
+ .private = &data,
+ };
+
if (folio_clear_dirty_for_io(folio)) {
- err = fuse_writepage_locked(folio);
+ err = iomap_writeback_dirty_folio(folio, NULL, &wpc, &fuse_writeback_ops);
if (!err)
folio_wait_writeback(folio);
}
--
2.47.1
^ permalink raw reply related [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
` (7 preceding siblings ...)
2025-06-06 23:38 ` [PATCH v1 8/8] fuse: use iomap for folio laundering Joanne Koong
@ 2025-06-08 19:12 ` Anuj gupta
2025-06-09 19:59 ` Joanne Koong
2025-06-09 4:40 ` Christoph Hellwig
2025-06-10 0:47 ` Dave Chinner
10 siblings, 1 reply; 70+ messages in thread
From: Anuj gupta @ 2025-06-08 19:12 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team, Anuj Gupta
On Sat, Jun 7, 2025 at 5:12 AM Joanne Koong <joannelkoong@gmail.com> wrote:
>
> This series adds fuse iomap support for buffered writes and dirty folio
> writeback. This is needed so that granular dirty tracking can be used in
> fuse when large folios are enabled so that if only a few bytes in a large
> folio are dirty, only a smaller portion is written out instead of the entire
> folio.
>
> In order to do so, a new iomap type, IOMAP_IN_MEM, is added that is more
> generic and does not depend on the block layer. The parts of iomap buffered
> io that depend on bios and CONFIG_BLOCK are moved to a separate file,
> buffered-io-bio.c, in order to allow filesystems that do not have CONFIG_BLOCK
> set to use IOMAP_IN_MEM buffered io.
>
> This series was run through fstests with large folios enabled and through
> some quick sanity checks on passthrough_hp with a) writing 1 GB in 1 MB chunks
> and then going back and dirtying a few bytes in each chunk and b) writing 50 MB
> in 1 MB chunks and going back and dirtying the entire chunk for several runs.
> a) showed about a 40% speedup with iomap support added and b) showed
> roughly the same performance.
>
> This patchset does not enable large folios yet. That will be sent out in a
> separate future patchset.
>
>
> Thanks,
> Joanne
Hi Joanne,
I tried experimenting with your patch series to evaluate its impact. To
measure the improvement, I enabled large folios for FUSE. In my setup,
I observed a ~43% reduction in writeback time.
Here’s the script[1] I used to benchmark FUSE writeback performance
based on the details you shared: It formats and mounts an XFS volume,
runs the passthrough_hp FUSE daemon, writes 1MB chunks to populate the
file, and then issues 4-byte overwrites to test fine-grained writeback
behavior.
If I’ve missed anything or there’s a better way to evaluate this, I’d
really appreciate your input — I’m still getting up to speed with FUSE
internals.
[1]
#!/bin/bash
set -e
DEVICE="/dev/nvme0n1"
BACKING_MNT="/mnt"
FUSE_MNT="/tmp/fusefs"
CHUNK_MB=1
TOTAL_MB=1024
DIRTY_BYTES=4
REPEATS=5
LOGFILE="fuse_test_results.csv"
DIR=$(date +"%H-%M-%S-%d-%m-%y")
mkdir $DIR
echo "$DIR created"
mkdir -p "$BACKING_MNT" "$FUSE_MNT"
echo "run,duration_seconds" > "$LOGFILE"
for run in $(seq 1 $REPEATS); do
echo "[Run $run] Formatting $DEVICE with XFS..."
mkfs.xfs -f "$DEVICE"
echo "[Run $run] Mounting XFS to $BACKING_MNT..."
mount "$DEVICE" "$BACKING_MNT"
echo "[Run $run] Starting passthrough_hp on $FUSE_MNT..."
./passthrough_hp --nopassthrough "$BACKING_MNT" "$FUSE_MNT" &
sleep 2
echo "[Run $run] Dropping caches and syncing..."
sync
echo 3 > /proc/sys/vm/drop_caches
TEST_FILE="$FUSE_MNT/testfile_run${run}"
for ((i=0; i<$TOTAL_MB; i++)); do
dd if=/dev/urandom bs=1M count=1 oflag=direct seek=$i
of=$TEST_FILE status=none
done
START=$(date +%s.%N)
for ((i=0; i<$TOTAL_MB; i++)); do
offset=$((i * 1048576 + 1048572))
#offset=$((i * 1048576))
dd if=/dev/urandom bs=1 count=$DIRTY_BYTES of=$TEST_FILE
seek=$offset status=none
done
fusermount -u "$FUSE_MNT"
umount "$BACKING_MNT"
END=$(date +%s.%N)
DURATION=$(echo "$END - $START" | bc)
echo "$run,$DURATION" >> $DIR/"$LOGFILE"
echo "[Run $run] Duration: ${DURATION}s"
done
echo "All runs complete. Results saved to $DIR/$LOGFILE."
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-08 19:12 ` [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Anuj gupta
@ 2025-06-09 19:59 ` Joanne Koong
2025-06-14 14:22 ` Anuj gupta
0 siblings, 1 reply; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 19:59 UTC (permalink / raw)
To: Anuj gupta
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team, Anuj Gupta
On Sun, Jun 8, 2025 at 12:13 PM Anuj gupta <anuj1072538@gmail.com> wrote:
>
> On Sat, Jun 7, 2025 at 5:12 AM Joanne Koong <joannelkoong@gmail.com> wrote:
> >
> > This series adds fuse iomap support for buffered writes and dirty folio
> > writeback. This is needed so that granular dirty tracking can be used in
> > fuse when large folios are enabled so that if only a few bytes in a large
> > folio are dirty, only a smaller portion is written out instead of the entire
> > folio.
> >
> > In order to do so, a new iomap type, IOMAP_IN_MEM, is added that is more
> > generic and does not depend on the block layer. The parts of iomap buffered
> > io that depend on bios and CONFIG_BLOCK are moved to a separate file,
> > buffered-io-bio.c, in order to allow filesystems that do not have CONFIG_BLOCK
> > set to use IOMAP_IN_MEM buffered io.
> >
> > This series was run through fstests with large folios enabled and through
> > some quick sanity checks on passthrough_hp with a) writing 1 GB in 1 MB chunks
> > and then going back and dirtying a few bytes in each chunk and b) writing 50 MB
> > in 1 MB chunks and going back and dirtying the entire chunk for several runs.
> > a) showed about a 40% speedup with iomap support added and b) showed
> > roughly the same performance.
> >
> > This patchset does not enable large folios yet. That will be sent out in a
> > separate future patchset.
> >
> >
> > Thanks,
> > Joanne
>
> Hi Joanne,
>
> I tried experimenting with your patch series to evaluate its impact. To
> measure the improvement, I enabled large folios for FUSE. In my setup,
> I observed a ~43% reduction in writeback time.
>
> Here’s the script[1] I used to benchmark FUSE writeback performance
> based on the details you shared: It formats and mounts an XFS volume,
> runs the passthrough_hp FUSE daemon, writes 1MB chunks to populate the
> file, and then issues 4-byte overwrites to test fine-grained writeback
> behavior.
>
> If I’ve missed anything or there’s a better way to evaluate this, I’d
> really appreciate your input — I’m still getting up to speed with FUSE
> internals.
Hi Anuj,
Thanks for testing it out locally and sharing the benchmarks. Your
test is pretty much the same as mine (except the underlying filesystem
I used for passthrough is ext4). I saw roughly a 40% speedup as well
for buffered writes.
My main concern was whether iomap overhead ends up slowing down writes
that don't need granular dirty tracking. I didn't see any noticeable
difference in performance, though, when I tested it out by writing out
entire chunks.
>
> [1]
>
> #!/bin/bash
> set -e
>
> DEVICE="/dev/nvme0n1"
> BACKING_MNT="/mnt"
> FUSE_MNT="/tmp/fusefs"
> CHUNK_MB=1
> TOTAL_MB=1024
> DIRTY_BYTES=4
> REPEATS=5
> LOGFILE="fuse_test_results.csv"
> DIR=$(date +"%H-%M-%S-%d-%m-%y")
>
> mkdir $DIR
> echo "$DIR created"
>
> mkdir -p "$BACKING_MNT" "$FUSE_MNT"
>
> echo "run,duration_seconds" > "$LOGFILE"
>
> for run in $(seq 1 $REPEATS); do
> echo "[Run $run] Formatting $DEVICE with XFS..."
> mkfs.xfs -f "$DEVICE"
>
> echo "[Run $run] Mounting XFS to $BACKING_MNT..."
> mount "$DEVICE" "$BACKING_MNT"
>
> echo "[Run $run] Starting passthrough_hp on $FUSE_MNT..."
> ./passthrough_hp --nopassthrough "$BACKING_MNT" "$FUSE_MNT" &
> sleep 2
>
> echo "[Run $run] Dropping caches and syncing..."
> sync
> echo 3 > /proc/sys/vm/drop_caches
>
> TEST_FILE="$FUSE_MNT/testfile_run${run}"
>
> for ((i=0; i<$TOTAL_MB; i++)); do
> dd if=/dev/urandom bs=1M count=1 oflag=direct seek=$i
> of=$TEST_FILE status=none
> done
>
> START=$(date +%s.%N)
> for ((i=0; i<$TOTAL_MB; i++)); do
> offset=$((i * 1048576 + 1048572))
> #offset=$((i * 1048576))
> dd if=/dev/urandom bs=1 count=$DIRTY_BYTES of=$TEST_FILE
> seek=$offset status=none
> done
>
> fusermount -u "$FUSE_MNT"
> umount "$BACKING_MNT"
>
> END=$(date +%s.%N)
> DURATION=$(echo "$END - $START" | bc)
> echo "$run,$DURATION" >> $DIR/"$LOGFILE"
> echo "[Run $run] Duration: ${DURATION}s"
> done
>
> echo "All runs complete. Results saved to $DIR/$LOGFILE."
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-09 19:59 ` Joanne Koong
@ 2025-06-14 14:22 ` Anuj gupta
0 siblings, 0 replies; 70+ messages in thread
From: Anuj gupta @ 2025-06-14 14:22 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team, Anuj Gupta
> My main concern was whether iomap overhead ends up slowing down writes
> that don't need granular dirty tracking. I didn't see any noticeable
> difference in performance, though, when I tested it out by writing out
> entire chunks.
>
Hi Joanne,
Sorry I couldn't get to this any earlier. But FWIW, I tried this
scenario as well (writes that don’t require fine-grained dirty
tracking). Like you mentioned, I also didn’t observe any performance
drop with the patchset applied.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
` (8 preceding siblings ...)
2025-06-08 19:12 ` [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Anuj gupta
@ 2025-06-09 4:40 ` Christoph Hellwig
2025-06-09 12:38 ` Anuj gupta
2025-06-10 0:47 ` Dave Chinner
10 siblings, 1 reply; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-09 4:40 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
Can you also point to a branch or at least tell the baseline?
The patches won't apply against Linus' 6.16-rc tree.
As for the concept: I've always wanted the highlevel iomap code to
support more than just block I/O, so using it for fuse seems fine.
A lot of the details don't make me entirely happy; I hope I can
come up with good ideas for improvement.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-09 4:40 ` Christoph Hellwig
@ 2025-06-09 12:38 ` Anuj gupta
2025-06-09 19:47 ` Joanne Koong
2025-06-10 4:04 ` Christoph Hellwig
0 siblings, 2 replies; 70+ messages in thread
From: Anuj gupta @ 2025-06-09 12:38 UTC (permalink / raw)
To: Christoph Hellwig
Cc: Joanne Koong, miklos, djwong, brauner, linux-fsdevel, linux-xfs,
bernd.schubert, kernel-team
On Mon, Jun 9, 2025 at 10:10 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> Can you also point to a branch or at least tell the baseline?
> The patches won't apply against Linus' 6.16-rc tree.
>
Yes, I had a hard time figuring that out too. FWIW, it applies fine on
top of this branch [1]. It would be great if the base commit could be
shared in the next iterations.
[1] https://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git/log/?h=for-next
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-09 12:38 ` Anuj gupta
@ 2025-06-09 19:47 ` Joanne Koong
2025-06-10 4:04 ` Christoph Hellwig
1 sibling, 0 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-09 19:47 UTC (permalink / raw)
To: Anuj gupta
Cc: Christoph Hellwig, miklos, djwong, brauner, linux-fsdevel,
linux-xfs, bernd.schubert, kernel-team
On Mon, Jun 9, 2025 at 5:39 AM Anuj gupta <anuj1072538@gmail.com> wrote:
>
> On Mon, Jun 9, 2025 at 10:10 AM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > Can you also point to a branch or at least tell the baseline?
> > The patches won't apply against Linus' 6.16-rc tree.
> >
> Yes, I had a hard time figuring that out too. FWIW, it applies fine on
> top of this branch [1]. It would be great if the base commit could be
> shared in the next iterations.
Thank you both for looking at this patchset.
The commit I'm on top of is dabb903910 ("fuse: increase readdir
buffer size"), which is the head of for-next in Miklos's fuse tree (the
[1] Anuj linked to).
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git/log/?h=for-next
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-09 12:38 ` Anuj gupta
2025-06-09 19:47 ` Joanne Koong
@ 2025-06-10 4:04 ` Christoph Hellwig
1 sibling, 0 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 4:04 UTC (permalink / raw)
To: Anuj gupta
Cc: Christoph Hellwig, Joanne Koong, miklos, djwong, brauner,
linux-fsdevel, linux-xfs, bernd.schubert, kernel-team
On Mon, Jun 09, 2025 at 06:08:30PM +0530, Anuj gupta wrote:
> On Mon, Jun 9, 2025 at 10:10 AM Christoph Hellwig <hch@infradead.org> wrote:
> >
> > Can you also point to a branch or at least tell the baseline?
> > The patches won't apply against Linus' 6.16-rc tree.
> >
> Yes, I had a hard time figuring that out too. FWIW, it applies fine on
> top of this branch [1]. It would be great if the base commit could be
> shared in the next iterations.
>
> [1] https://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git/log/?h=for-next
Looks like this is still behind mainline, as even applying to
linux-next doesn't work. Hopefully the merge window will have settled
a bit more before the next version.
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-06 23:37 [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback Joanne Koong
` (9 preceding siblings ...)
2025-06-09 4:40 ` Christoph Hellwig
@ 2025-06-10 0:47 ` Dave Chinner
2025-06-10 4:06 ` Christoph Hellwig
2025-06-10 20:33 ` Joanne Koong
10 siblings, 2 replies; 70+ messages in thread
From: Dave Chinner @ 2025-06-10 0:47 UTC (permalink / raw)
To: Joanne Koong
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Fri, Jun 06, 2025 at 04:37:55PM -0700, Joanne Koong wrote:
> This series adds fuse iomap support for buffered writes and dirty folio
> writeback. This is needed so that granular dirty tracking can be used in
> fuse when large folios are enabled so that if only a few bytes in a large
> folio are dirty, only a smaller portion is written out instead of the entire
> folio.
>
> In order to do so, a new iomap type, IOMAP_IN_MEM, is added that is more
> generic and does not depend on the block layer. The parts of iomap buffered
> io that depend on bios and CONFIG_BLOCK are moved to a separate file,
> buffered-io-bio.c, in order to allow filesystems that do not have CONFIG_BLOCK
> set to use IOMAP_IN_MEM buffered io.
>
> This series was run through fstests with large folios enabled and through
> some quick sanity checks on passthrough_hp with a) writing 1 GB in 1 MB chunks
> and then going back and dirtying a few bytes in each chunk and b) writing 50 MB
> in 1 MB chunks and going back and dirtying the entire chunk for several runs.
> a) showed about a 40% speedup with iomap support added and b) showed
> roughly the same performance.
>
> This patchset does not enable large folios yet. That will be sent out in a
> separate future patchset.
>
>
> Thanks,
> Joanne
>
> Joanne Koong (8):
> iomap: move buffered io bio logic into separate file
> iomap: add IOMAP_IN_MEM iomap type
> iomap: add buffered write support for IOMAP_IN_MEM iomaps
> iomap: add writepages support for IOMAP_IN_MEM iomaps
AFAICT, this is just adding synchronous "read folio" and "write
folio" hooks into iomap that bypass the existing "map and pack"
bio-based infrastructure. i.e. there is no actual "iomapping" being
done; it's adding special case IO hooks into the IO back end
alongside the iomap bio interfaces.
Is that a fair summary of what this is doing?
If so, given that FUSE is actually a request/response protocol,
why wasn't netfs chosen as the back end infrastructure to support
large folios in the FUSE pagecache?
It's specifically designed for request/response IO interfaces that
are not block IO based, and it has infrastructure such as local file
caching built into it for optimising performance on high latency/low
bandwidth network based filesystems.
Hence it seems like this patchset is trying to duplicate
functionality that netfs already provides to request/response
protocol-based filesystems, but with much less generic functionality
than netfs offers...
Hence I'm not seeing why this IO path was chosen for FUSE. Was
netfs considered as a candidate infrastructure for large folio support
in FUSE? If so, why was iomap chosen over netfs? If not, would FUSE
be better suited to netfs integration than hacking fuse-specific "no
block mapping" IO paths into infrastructure specifically optimised
for block-based filesystems?
-Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 70+ messages in thread
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-10 0:47 ` Dave Chinner
@ 2025-06-10 4:06 ` Christoph Hellwig
2025-06-10 20:33 ` Joanne Koong
1 sibling, 0 replies; 70+ messages in thread
From: Christoph Hellwig @ 2025-06-10 4:06 UTC (permalink / raw)
To: Dave Chinner
Cc: Joanne Koong, miklos, djwong, brauner, linux-fsdevel, linux-xfs,
bernd.schubert, kernel-team
On Tue, Jun 10, 2025 at 10:47:39AM +1000, Dave Chinner wrote:
> AFAICT, this is just adding synchronous "read folio" and "write
> folio" hooks into iomap that bypass the existing "map and pack"
> bio-based infrastructure. i.e. there is no actual "iomapping" being
> done; it's adding special-case IO hooks into the iomap bio
> interfaces at the IO back end.
>
> Is that a fair summary of what this is doing?
>
> If so, given that FUSE is actually a request/response protocol,
> why wasn't netfs chosen as the back end infrastructure to support
> large folios in the FUSE pagecache?
>
> It's specifically designed for request/response IO interfaces that
> are not block IO based, and it has infrastructure such as local file
> caching built into it for optimising performance on high latency/low
> bandwidth network based filesystems.
Has it? Or was that part of netfs simply written because Dave
didn't listen when I explained to him that the iterate-over-file-range
part of iomap is perfectly fine for network file systems? I'd still
like to see that part of network file systems moved over to iomap,
and I think it will have to happen if netfs wants to add nfs support.
Because with nfs we can suddenly end up in the block I/O path for
a nominal network file system in the block layout, and that could
reuse all the existing iomap code...
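(For reference, the iterate-over-file-range pattern being referred to
looks roughly like this; the ->processed convention has been reworked
across kernel versions, and process_one_extent() is a hypothetical
stand-in for the per-extent work:)

#include <linux/iomap.h>

/*
 * Rough shape of the iomap iteration loop (see fs/iomap/iter.c).
 * The filesystem's ->iomap_begin() maps one extent at a time; the
 * caller processes the mapped range and reports back how much of
 * it was handled.
 */
static int example_op_range(struct inode *inode, loff_t pos, loff_t len,
			    const struct iomap_ops *ops)
{
	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
		.flags	= IOMAP_WRITE,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = process_one_extent(&iter);

	return ret;
}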
* Re: [PATCH v1 0/8] fuse: use iomap for buffered writes + writeback
2025-06-10 0:47 ` Dave Chinner
2025-06-10 4:06 ` Christoph Hellwig
@ 2025-06-10 20:33 ` Joanne Koong
1 sibling, 0 replies; 70+ messages in thread
From: Joanne Koong @ 2025-06-10 20:33 UTC (permalink / raw)
To: Dave Chinner
Cc: miklos, djwong, brauner, linux-fsdevel, linux-xfs, bernd.schubert,
kernel-team
On Mon, Jun 9, 2025 at 5:47 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Fri, Jun 06, 2025 at 04:37:55PM -0700, Joanne Koong wrote:
> > This series adds fuse iomap support for buffered writes and dirty folio
> > writeback. This is needed so that granular dirty tracking can be used in
> > fuse when large folios are enabled: if only a few bytes in a large
> > folio are dirty, only that smaller portion is written out instead of the
> > entire folio.
> >
> > In order to do so, a new iomap type, IOMAP_IN_MEM, is added that is more
> > generic and does not depend on the block layer. The parts of iomap buffered
> > io that depend on bios and CONFIG_BLOCK are moved to a separate file,
> > buffered-io-bio.c, in order to allow filesystems that do not have CONFIG_BLOCK
> > set to use IOMAP_IN_MEM buffered io.
> >
> > This series was run through fstests with large folios enabled and through
> > some quick sanity checks on passthrough_hp with a) writing 1 GB in 1 MB chunks
> > and then going back and dirtying a few bytes in each chunk and b) writing 50 MB
> > in 1 MB chunks and dirtying the entire chunk over several runs.
> > a) showed about a 40% speedup with iomap support added and b) showed
> > roughly the same performance.
> >
> > This patchset does not enable large folios yet. That will be sent out in a
> > separate future patchset.
> >
> >
> > Thanks,
> > Joanne
> >
> > Joanne Koong (8):
> > iomap: move buffered io bio logic into separate file
> > iomap: add IOMAP_IN_MEM iomap type
> > iomap: add buffered write support for IOMAP_IN_MEM iomaps
> > iomap: add writepages support for IOMAP_IN_MEM iomaps
>
> AFAICT, this is just adding synchronous "read folio" and "write
> folio" hooks into iomap that bypass the existing "map and pack"
> bio-based infrastructure. i.e. there is no actual "iomapping" being
> done; it's adding special-case IO hooks into the iomap bio
> interfaces at the IO back end.
>
> Is that a fair summary of what this is doing?
>
> If so, given that FUSE is actually a request/response protocol,
> why wasn't netfs chosen as the back end infrastructure to support
> large folios in the FUSE pagecache?
>
> It's specifically designed for request/response IO interfaces that
> are not block IO based, and it has infrastructure such as local file
> caching built into it for optimising performance on high latency/low
> bandwidth network based filesystems.
>
> Hence it seems like this patchset is trying to duplicate
> functionality that netfs already provides to request/response
> protocol-based filesystems, but with much less generic functionality
> than netfs already provides....
>
> Hence I'm not seeing why this IO path was chosen for FUSE. Was
> netfs considered as a candidate infrastructure for large folio
> support for FUSE? If so, why was iomap chosen over netfs? If not,
> would FUSE be better suited to netfs integration than hacking
> fuse-specific "no block mapping" IO paths into infrastructure
> specifically optimised for block based filesystems?
Hi Dave,
The main reason I chose iomap was because it has granular dirty and
uptodate tracking for large folios, which I didn't see in netfs
(yet?). That's the main thing fuse needs. When I took a look at it, it
didn't seem to me that fuse mapped that neatly onto netfs - for example
it wouldn't use netfs_io_requests or netfs_io_streams or rolling
buffers. Netfs seemed to have a bunch of extra stuff that would have
made it messier for fuse to be integrated into, whereas the iomap
library (except for the bio dependency) seemed more
generic/minimalistic.
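(For context, the iomap side of that tracking is the per-folio state
hung off folio->private; simplified from fs/iomap/buffered-io.c:)

#include <linux/atomic.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/*
 * Simplified from fs/iomap/buffered-io.c.  When the filesystem block
 * size is smaller than the folio size, state[] holds one uptodate bit
 * per block in its first half and one dirty bit per block in its
 * second half.  The dirty bits are what let writeback send out only
 * the dirty sub-ranges of a large folio instead of the whole thing.
 */
struct iomap_folio_state {
	spinlock_t	state_lock;
	unsigned int	read_bytes_pending;
	atomic_t	write_bytes_pending;
	unsigned long	state[];
};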
>
> -Dave.
> --
> Dave Chinner
> david@fromorbit.com