* [PATCH v1 1/9] iomap: account for unaligned end offsets when truncating read range
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:00 ` Christoph Hellwig
2025-10-09 22:56 ` [PATCH v1 2/9] docs: document iomap writeback's iomap_finish_folio_write() requirement Joanne Koong
` (7 subsequent siblings)
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
The end position to start truncating from may be at an offset into a
block, which under the current logic would result in overtruncation.
Adjust the calculation to account for unaligned end offsets.
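For illustration, here is a minimal userspace model of the corrected
calculation (an editor's sketch mirroring the new helper below, not part
of the patch; the block size and values are made up):

#include <assert.h>

/* Mirrors iomap_bytes_to_truncate() from this patch. */
static unsigned bytes_to_truncate(unsigned long long end_pos,
				  unsigned block_bits,
				  unsigned blocks_truncated)
{
	unsigned block_size = 1u << block_bits;
	unsigned block_offset = end_pos & (block_size - 1);

	if (!block_offset)
		return blocks_truncated << block_bits;
	/* The last (partial) block contributes only block_offset bytes. */
	return ((blocks_truncated - 1) << block_bits) + block_offset;
}

int main(void)
{
	/* end_pos 100 bytes into a 4096-byte block, 3 blocks truncated:
	 * 2 full blocks + 100 bytes rather than 3 full blocks. */
	assert(bytes_to_truncate(2 * 4096 + 100, 12, 3) == 2 * 4096 + 100);
	/* A block-aligned end_pos truncates whole blocks, as before. */
	assert(bytes_to_truncate(2 * 4096, 12, 2) == 2 * 4096);
	return 0;
}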
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/iomap/buffered-io.c | 22 ++++++++++++++++++++--
1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f9ae72713f74..1c6575b7e583 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -218,6 +218,22 @@ static void ifs_free(struct folio *folio)
kfree(ifs);
}
+/*
+ * Calculate how many bytes to truncate based off the number of blocks to
+ * truncate and the end position to start truncating from.
+ */
+static size_t iomap_bytes_to_truncate(loff_t end_pos, unsigned block_bits,
+ unsigned blocks_truncated)
+{
+ unsigned block_size = 1 << block_bits;
+ unsigned block_offset = end_pos & (block_size - 1);
+
+ if (!block_offset)
+ return blocks_truncated << block_bits;
+
+ return ((blocks_truncated - 1) << block_bits) + block_offset;
+}
+
/*
* Calculate the range inside the folio that we actually need to read.
*/
@@ -263,7 +279,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
/* truncate len if we find any trailing uptodate block(s) */
while (++i <= last) {
if (ifs_block_is_uptodate(ifs, i)) {
- plen -= (last - i + 1) * block_size;
+ plen -= iomap_bytes_to_truncate(*pos + plen,
+ block_bits, last - i + 1);
last = i - 1;
break;
}
@@ -279,7 +296,8 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
unsigned end = offset_in_folio(folio, isize - 1) >> block_bits;
if (first <= end && last > end)
- plen -= (last - end) * block_size;
+ plen -= iomap_bytes_to_truncate(*pos + plen, block_bits,
+ last - end);
}
*offp = poff;
--
2.47.3
* Re: [PATCH v1 1/9] iomap: account for unaligned end offsets when truncating read range
2025-10-09 22:56 ` [PATCH v1 1/9] iomap: account for unaligned end offsets when truncating read range Joanne Koong
@ 2025-10-13 3:00 ` Christoph Hellwig
0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-10-13 3:00 UTC (permalink / raw)
To: Joanne Koong; +Cc: brauner, djwong, hch, bfoster, linux-fsdevel, kernel-team
On Thu, Oct 09, 2025 at 03:56:03PM -0700, Joanne Koong wrote:
> The end position to start truncating from may be at an offset into a
> block, which under the current logic would result in overtruncation.
>
> Adjust the calculation to account for unaligned end offsets.
Should this get a Fixes tag?
Otherwise looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
* [PATCH v1 2/9] docs: document iomap writeback's iomap_finish_folio_write() requirement
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
2025-10-09 22:56 ` [PATCH v1 1/9] iomap: account for unaligned end offsets when truncating read range Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:01 ` Christoph Hellwig
2025-10-09 22:56 ` [PATCH v1 3/9] iomap: optimize pending async writeback accounting Joanne Koong
` (6 subsequent siblings)
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
Document that iomap_finish_folio_write() must be called after writeback
on the range completes.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
Documentation/filesystems/iomap/operations.rst | 3 +++
include/linux/iomap.h | 4 ++++
2 files changed, 7 insertions(+)
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index cef3c3e76e9e..018cfd13b9fa 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -361,6 +361,9 @@ The fields are as follows:
delalloc reservations to avoid having delalloc reservations for
clean pagecache.
This function must be supplied by the filesystem.
+ If this succeeds, iomap_finish_folio_write() must be called once writeback
+ completes for the range, regardless of whether the writeback succeeded or
+ failed.
- ``writeback_submit``: Submit the previous built writeback context.
Block based file systems should use the iomap_ioend_writeback_submit
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 6d864b446b6e..e6fa812229dc 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -431,6 +431,10 @@ struct iomap_writeback_ops {
* An existing mapping from a previous call to this method can be reused
* by the file system if it is still valid.
*
+ * If this succeeds, iomap_finish_folio_write() must be called once
+ * writeback completes for the range, regardless of whether the
+ * writeback succeeded or failed.
+ *
* Returns the number of bytes processed or a negative errno.
*/
ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc,
--
2.47.3
* [PATCH v1 3/9] iomap: optimize pending async writeback accounting
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
2025-10-09 22:56 ` [PATCH v1 1/9] iomap: account for unaligned end offsets when truncating read range Joanne Koong
2025-10-09 22:56 ` [PATCH v1 2/9] docs: document iomap writeback's iomap_finish_folio_write() requirement Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:04 ` Christoph Hellwig
2025-10-09 22:56 ` [PATCH v1 4/9] iomap: simplify ->read_folio_range() error handling for reads Joanne Koong
` (5 subsequent siblings)
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
Pending writebacks must be accounted for to determine when all requests
have completed and writeback on the folio should be ended. Currently
this is done by atomically incrementing ifs->write_bytes_pending for
every range to be written back.
Instead, the number of atomic operations can be minimized by setting
ifs->write_bytes_pending to the folio size, internally tracking how many
bytes are written back asynchronously, and then after sending off all
the requests, decrementing ifs->write_bytes_pending by the number of
bytes not written back asynchronously. Now, for N ranges written back,
only N + 2 atomic operations are required instead of 2N + 2.
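A rough userspace model of the new accounting (an editor's sketch, not
the kernel code; the folio and range sizes are made up):

#include <assert.h>
#include <stdatomic.h>

int main(void)
{
	atomic_uint pending;
	const unsigned folio_size = 16384;
	const unsigned range_len[] = { 4096, 4096, 2048 };	/* N = 3 */
	unsigned i, async_bytes = 0;

	/* Submission: one atomic_set replaces the old +1 bias plus one
	 * atomic_add per range. */
	atomic_store(&pending, folio_size);
	for (i = 0; i < 3; i++)
		async_bytes += range_len[i];	/* plain adds, no atomics */

	/* Completion: one atomic sub per range, same as before. */
	for (i = 0; i < 3; i++)
		atomic_fetch_sub(&pending, range_len[i]);

	/* The submitter drops the bytes that were never written back
	 * asynchronously; writeback ends when pending reaches zero.
	 * Total: 1 + N + 1 = N + 2 atomics instead of 2N + 2. */
	atomic_fetch_sub(&pending, folio_size - async_bytes);
	assert(atomic_load(&pending) == 0);
	return 0;
}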
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/file.c | 4 ++--
fs/iomap/buffered-io.c | 45 +++++++++++++++++-------------------------
fs/iomap/ioend.c | 2 --
include/linux/iomap.h | 2 --
4 files changed, 20 insertions(+), 33 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7c9c00784e33..01d378f8de18 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1883,7 +1883,8 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa)
* scope of the fi->lock alleviates xarray lock
* contention and noticeably improves performance.
*/
- iomap_finish_folio_write(inode, ap->folios[i], 1);
+ iomap_finish_folio_write(inode, ap->folios[i],
+ ap->descs[i].length);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
wb_writeout_inc(&bdi->wb);
}
@@ -2225,7 +2226,6 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
ap = &wpa->ia.ap;
}
- iomap_start_folio_write(inode, folio, 1);
fuse_writepage_args_page_fill(wpa, folio, ap->num_folios,
offset, len);
data->nr_bytes += len;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 1c6575b7e583..7f914d5ac25d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1552,16 +1552,16 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops,
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
- size_t len)
+static void iomap_writeback_init(struct inode *inode, struct folio *folio)
{
struct iomap_folio_state *ifs = folio->private;
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
- if (ifs)
- atomic_add(len, &ifs->write_bytes_pending);
+ if (ifs) {
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+ atomic_set(&ifs->write_bytes_pending, folio_size(folio));
+ }
}
-EXPORT_SYMBOL_GPL(iomap_start_folio_write);
void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len)
@@ -1578,7 +1578,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
- bool *wb_pending)
+ unsigned *wb_bytes_pending)
{
do {
ssize_t ret;
@@ -1591,12 +1591,11 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
rlen -= ret;
pos += ret;
- /*
- * Holes are not be written back by ->writeback_range, so track
+ /* Holes are not written back by ->writeback_range, so track
* if we did handle anything that is not a hole here.
*/
if (wpc->iomap.type != IOMAP_HOLE)
- *wb_pending = true;
+ *wb_bytes_pending += ret;
} while (rlen);
return 0;
@@ -1667,7 +1666,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
u64 pos = folio_pos(folio);
u64 end_pos = pos + folio_size(folio);
u64 end_aligned = 0;
- bool wb_pending = false;
+ unsigned wb_bytes_pending = 0;
int error = 0;
u32 rlen;
@@ -1687,14 +1686,7 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
iomap_set_range_dirty(folio, 0, end_pos - pos);
}
- /*
- * Keep the I/O completion handler from clearing the writeback
- * bit until we have submitted all blocks by adding a bias to
- * ifs->write_bytes_pending, which is dropped after submitting
- * all blocks.
- */
- WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
- iomap_start_folio_write(inode, folio, 1);
+ iomap_writeback_init(inode, folio);
}
/*
@@ -1709,13 +1701,13 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
end_aligned = round_up(end_pos, i_blocksize(inode));
while ((rlen = iomap_find_dirty_range(folio, &pos, end_aligned))) {
error = iomap_writeback_range(wpc, folio, pos, rlen, end_pos,
- &wb_pending);
+ &wb_bytes_pending);
if (error)
break;
pos += rlen;
}
- if (wb_pending)
+ if (wb_bytes_pending)
wpc->nr_folios++;
/*
@@ -1732,13 +1724,12 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
* already at this point. In that case we need to clear the writeback
* bit ourselves right after unlocking the page.
*/
- if (ifs) {
- if (atomic_dec_and_test(&ifs->write_bytes_pending))
- folio_end_writeback(folio);
- } else {
- if (!wb_pending)
- folio_end_writeback(folio);
- }
+ if (ifs)
+ iomap_finish_folio_write(inode, folio,
+ folio_size(folio) - wb_bytes_pending);
+ else if (!wb_bytes_pending)
+ folio_end_writeback(folio);
+
mapping_set_error(inode->i_mapping, error);
return error;
}
diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c
index b49fa75eab26..86f44922ed3b 100644
--- a/fs/iomap/ioend.c
+++ b/fs/iomap/ioend.c
@@ -194,8 +194,6 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
if (!bio_add_folio(&ioend->io_bio, folio, map_len, poff))
goto new_ioend;
- iomap_start_folio_write(wpc->inode, folio, map_len);
-
/*
* Clamp io_offset and io_size to the incore EOF so that ondisk
* file size updates in the ioend completion are byte-accurate.
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e6fa812229dc..a156a9964938 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -474,8 +474,6 @@ int iomap_ioend_writeback_submit(struct iomap_writepage_ctx *wpc, int error);
void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len,
int error);
-void iomap_start_folio_write(struct inode *inode, struct folio *folio,
- size_t len);
void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
size_t len);
--
2.47.3
* Re: [PATCH v1 3/9] iomap: optimize pending async writeback accounting
2025-10-09 22:56 ` [PATCH v1 3/9] iomap: optimize pending async writeback accounting Joanne Koong
@ 2025-10-13 3:04 ` Christoph Hellwig
0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-10-13 3:04 UTC (permalink / raw)
To: Joanne Koong; +Cc: brauner, djwong, hch, bfoster, linux-fsdevel, kernel-team
On Thu, Oct 09, 2025 at 03:56:05PM -0700, Joanne Koong wrote:
>
> static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
> struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
> - bool *wb_pending)
> + unsigned *wb_bytes_pending)
Maybe shorten the variable name to bytes_pending or wb_bytes?
The current name feels a bit too verbose (not really a major issue,
just thinking out loud).
> - /*
> - * Holes are not be written back by ->writeback_range, so track
> + /* Holes are not written back by ->writeback_range, so track
Please stick to the normal kernel comment style that was used here
previously.
Otherwise this looks good to me.
* [PATCH v1 4/9] iomap: simplify ->read_folio_range() error handling for reads
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
` (2 preceding siblings ...)
2025-10-09 22:56 ` [PATCH v1 3/9] iomap: optimize pending async writeback accounting Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:06 ` Christoph Hellwig
2025-10-09 22:56 ` [PATCH v1 5/9] iomap: simplify when reads can be skipped for writes Joanne Koong
` (4 subsequent siblings)
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
Instead of requiring that the caller call iomap_finish_folio_read()
even if the ->read_folio_range() callback returns an error, account for
this internally in iomap. This makes the interface simpler and matches
writeback's ->writeback_range() error handling expectations.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
.../filesystems/iomap/operations.rst | 7 +++--
fs/fuse/file.c | 10 ++-----
fs/iomap/buffered-io.c | 26 +++++++++----------
include/linux/iomap.h | 5 ++--
4 files changed, 19 insertions(+), 29 deletions(-)
diff --git a/Documentation/filesystems/iomap/operations.rst b/Documentation/filesystems/iomap/operations.rst
index 018cfd13b9fa..dd05d95ebb3e 100644
--- a/Documentation/filesystems/iomap/operations.rst
+++ b/Documentation/filesystems/iomap/operations.rst
@@ -149,10 +149,9 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:
iomap calls these functions:
- ``read_folio_range``: Called to read in the range. This must be provided
- by the caller. The caller is responsible for calling
- iomap_finish_folio_read() after reading in the folio range. This should be
- done even if an error is encountered during the read. This returns 0 on
- success or a negative error on failure.
+ by the caller. If this succeeds, iomap_finish_folio_read() must be called
+ after the range is read in, regardless of whether the read succeeded or
+ failed.
- ``submit_read``: Submit any pending read requests. This function is
optional.
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 01d378f8de18..591789adb00b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -916,13 +916,6 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
if (ctx->rac) {
ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len);
- /*
- * If fuse_handle_readahead was successful, fuse_readpages_end
- * will do the iomap_finish_folio_read, else we need to call it
- * here
- */
- if (ret)
- iomap_finish_folio_read(folio, off, len, ret);
} else {
/*
* for non-readahead read requests, do reads synchronously
@@ -930,7 +923,8 @@ static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
* out-of-order reads
*/
ret = fuse_do_readfolio(file, folio, off, len);
- iomap_finish_folio_read(folio, off, len, ret);
+ if (!ret)
+ iomap_finish_folio_read(folio, off, len, ret);
}
return ret;
}
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 7f914d5ac25d..dc05ed647ba5 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -377,26 +377,16 @@ static void iomap_read_init(struct folio *folio)
size_t len = folio_size(folio);
spin_lock_irq(&ifs->state_lock);
- ifs->read_bytes_pending += len;
+ WARN_ON_ONCE(ifs->read_bytes_pending != 0);
+ ifs->read_bytes_pending = len;
spin_unlock_irq(&ifs->state_lock);
}
}
static void iomap_read_end(struct folio *folio, size_t bytes_pending)
{
- struct iomap_folio_state *ifs;
-
- /*
- * If there are no bytes pending, this means we are responsible for
- * unlocking the folio here, since no IO helper has taken ownership of
- * it.
- */
- if (!bytes_pending) {
- folio_unlock(folio);
- return;
- }
+ struct iomap_folio_state *ifs = folio->private;
- ifs = folio->private;
if (ifs) {
bool end_read, uptodate;
size_t bytes_accounted = folio_size(folio) - bytes_pending;
@@ -415,6 +405,14 @@ static void iomap_read_end(struct folio *folio, size_t bytes_pending)
spin_unlock_irq(&ifs->state_lock);
if (end_read)
folio_end_read(folio, uptodate);
+ } else if (!bytes_pending) {
+ /*
+ * If there are no bytes pending, this means we are responsible
+ * for unlocking the folio here, since no IO helper has taken
+ * ownership of it. If there are bytes pending, then the IO
+ * helper will end the read via iomap_finish_folio_read().
+ */
+ folio_unlock(folio);
}
}
@@ -462,10 +460,10 @@ static int iomap_read_folio_iter(struct iomap_iter *iter,
} else {
if (!*bytes_pending)
iomap_read_init(folio);
- *bytes_pending += plen;
ret = ctx->ops->read_folio_range(iter, ctx, plen);
if (ret)
return ret;
+ *bytes_pending += plen;
}
ret = iomap_iter_advance(iter, plen);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index a156a9964938..c417bb8718e3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -491,9 +491,8 @@ struct iomap_read_ops {
/*
* Read in a folio range.
*
- * The caller is responsible for calling iomap_finish_folio_read() after
- * reading in the folio range. This should be done even if an error is
- * encountered during the read.
+ * If this succeeds, iomap_finish_folio_read() must be called after the
+ * range is read in, regardless of whether the read succeeded or failed.
*
* Returns 0 on success or a negative error on failure.
*/
--
2.47.3
* [PATCH v1 5/9] iomap: simplify when reads can be skipped for writes
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
` (3 preceding siblings ...)
2025-10-09 22:56 ` [PATCH v1 4/9] iomap: simplify ->read_folio_range() error handling for reads Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:06 ` Christoph Hellwig
2025-10-09 22:56 ` [PATCH v1 6/9] iomap: optimize reads for non-block-aligned writes Joanne Koong
` (3 subsequent siblings)
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
Currently, the logic for skipping the read range for a write is
if (!(iter->flags & IOMAP_UNSHARE) &&
(from <= poff || from >= poff + plen) &&
(to <= poff || to >= poff + plen))
which breaks down to skipping the read if any of these are true:
a) from <= poff && to <= poff
b) from <= poff && to >= poff + plen
c) from >= poff + plen && to <= poff
d) from >= poff + plen && to >= poff + plen
This can be simplified to
if (!(iter->flags & IOMAP_UNSHARE) &&
(from <= poff && to >= poff + plen))
from the following reasoning:
a) from <= poff && to <= poff
This reduces to 'to <= poff' since it is guaranteed that 'from <= to'
(since to = from + len). It is not possible for 'to <= poff' to be true
here because we only reach here if plen > 0 (thanks to the preceding 'if
(plen == 0)' check that would break us out of the loop). If 'to <=
poff', plen would have to be 0 since poff and plen get adjusted in
lockstep for uptodate blocks. This means we can eliminate this check.
c) from >= poff + plen && to <= poff
This is not possible since 'from <= to' and 'plen > 0'. We can eliminate
this check.
d) from >= poff + plen && to >= poff + plen
This reduces to 'from >= poff + plen' since 'from <= to'.
It is not possible for 'from >= poff + plen' to be true here. We only
reach here if plen > 0 and for writes, poff and plen will always be
block-aligned, which means poff <= from < poff + plen. We can eliminate
this check.
The only valid check is b) from <= poff && to >= poff + plen.
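The equivalence can also be brute-forced over a toy folio (an editor's
sketch; the invariants argued above appear as loop bounds and one
explicit skip):

#include <assert.h>

int main(void)
{
	const unsigned bs = 4, fsz = 32;	/* toy block/folio size */
	unsigned poff, plen, from, to;

	for (poff = 0; poff < fsz; poff += bs)
	for (plen = bs; poff + plen <= fsz; plen += bs)
	for (from = 0; from < poff + plen; from++)	/* case d) */
	for (to = from + 1; to <= fsz; to++) {		/* from < to */
		int old_skip, new_skip;

		if (to <= poff)		/* impossible when plen > 0 */
			continue;
		old_skip = (from <= poff || from >= poff + plen) &&
			   (to <= poff || to >= poff + plen);
		new_skip = from <= poff && to >= poff + plen;
		assert(old_skip == new_skip);
	}
	return 0;
}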
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/iomap/buffered-io.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index dc05ed647ba5..0ad8c8a218f3 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -714,9 +714,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
if (plen == 0)
break;
+ /*
+ * If the read range will be entirely overwritten by the write,
+ * we can skip having to zero/read it in.
+ */
if (!(iter->flags & IOMAP_UNSHARE) &&
- (from <= poff || from >= poff + plen) &&
- (to <= poff || to >= poff + plen))
+ (from <= poff && to >= poff + plen))
continue;
if (iomap_block_needs_zeroing(iter, block_start)) {
--
2.47.3
* [PATCH v1 6/9] iomap: optimize reads for non-block-aligned writes
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
` (4 preceding siblings ...)
2025-10-09 22:56 ` [PATCH v1 5/9] iomap: simplify when reads can be skipped for writes Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:08 ` Christoph Hellwig
2025-10-09 22:56 ` [PATCH v1 7/9] iomap: use loff_t for file positions and offsets in writeback code Joanne Koong
` (2 subsequent siblings)
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
If a write is block-aligned (i.e., write offset and length are both
block-aligned), no reads should be necessary.
If the write starts or ends at a non-block-aligned offset, the write
should only need to read in at most two blocks, the starting block and
the ending block. Any intermediary blocks should be skipped since they
will be completely overwritten.
Currently for non-block-aligned writes, the entire range gets read in
including intermediary blocks.
Optimize the logic to read in only the necessary blocks.
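A worked example (an editor's sketch, assuming 4096-byte blocks): a
three-block write starting 100 bytes into block 0 touches blocks 0-3,
but only the partially overwritten blocks 0 and 3 need to be read.

#include <stdio.h>

int main(void)
{
	const unsigned long long bs = 4096;
	const unsigned long long pos = 100, len = 3 * bs; /* [100, 12388) */
	const unsigned long long first = pos / bs;
	const unsigned long long last = (pos + len - 1) / bs;
	unsigned long long blk;

	for (blk = first; blk <= last; blk++) {
		/* A block needs a read only if the write leaves part of
		 * it intact. */
		int partial_head = blk == first && pos % bs;
		int partial_tail = blk == last && (pos + len) % bs;

		if (partial_head || partial_tail)
			printf("read block %llu\n", blk);
		else
			printf("skip block %llu (fully overwritten)\n", blk);
	}
	return 0;	/* prints: read 0, skip 1, skip 2, read 3 */
}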
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/iomap/buffered-io.c | 39 ++++++++++++++++++++++++++++++++-------
1 file changed, 32 insertions(+), 7 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 0ad8c8a218f3..372e14f7ab57 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -686,6 +686,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size);
+ unsigned int block_bits = iter->inode->i_blkbits;
unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen;
@@ -714,13 +715,37 @@ static int __iomap_write_begin(const struct iomap_iter *iter,
if (plen == 0)
break;
- /*
- * If the read range will be entirely overwritten by the write,
- * we can skip having to zero/read it in.
- */
- if (!(iter->flags & IOMAP_UNSHARE) &&
- (from <= poff && to >= poff + plen))
- continue;
+ if (!(iter->flags & IOMAP_UNSHARE)) {
+ /*
+ * If the read range will be entirely overwritten by the
+ * write, we can skip having to zero/read it in.
+ */
+ if (from <= poff && to >= poff + plen)
+ continue;
+
+ /*
+ * If the write starts at a non-block-aligned offset
+ * (from > poff), read only the first block. Any
+ * intermediate blocks will be skipped in the next
+ * iteration.
+ *
+ * Exception: skip this optimization if the write spans
+ * only two blocks and ends at a non-block-aligned
+ * offset.
+ */
+ if (from > poff) {
+ if ((plen >> block_bits) > 2 ||
+ to >= poff + plen)
+ plen = block_size;
+ } else if (to < poff + plen) {
+ /*
+ * Else if the write ends at an offset into the
+ * last block, read in just the last block.
+ */
+ poff = poff + plen - block_size;
+ plen = block_size;
+ }
+ }
if (iomap_block_needs_zeroing(iter, block_start)) {
if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE))
--
2.47.3
* Re: [PATCH v1 6/9] iomap: optimize reads for non-block-aligned writes
2025-10-09 22:56 ` [PATCH v1 6/9] iomap: optimize reads for non-block-aligned writes Joanne Koong
@ 2025-10-13 3:08 ` Christoph Hellwig
2025-10-14 0:04 ` Joanne Koong
0 siblings, 1 reply; 21+ messages in thread
From: Christoph Hellwig @ 2025-10-13 3:08 UTC (permalink / raw)
To: Joanne Koong; +Cc: brauner, djwong, hch, bfoster, linux-fsdevel, kernel-team
On Thu, Oct 09, 2025 at 03:56:08PM -0700, Joanne Koong wrote:
> If a write is block-aligned (i.e., write offset and length are both
> block-aligned), no reads should be necessary.
>
> If the write starts or ends at a non-block-aligned offset, the write
> should only need to read in at most two blocks, the starting block and
> the ending block. Any intermediary blocks should be skipped since they
> will be completely overwritten.
>
> Currently for non-block-aligned writes, the entire range gets read in
> including intermediary blocks.
>
> Optimize the logic to read in only the necessary blocks.
At least for block based file systems doing a single read in the typical
block size range is cheaper than two smaller reads. This is especially
true for hard drives with seeks, but even for SSDs I doubt the two
reads are more efficient.
* Re: [PATCH v1 6/9] iomap: optimize reads for non-block-aligned writes
2025-10-13 3:08 ` Christoph Hellwig
@ 2025-10-14 0:04 ` Joanne Koong
2025-10-14 4:14 ` Christoph Hellwig
0 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-14 0:04 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: brauner, djwong, bfoster, linux-fsdevel, kernel-team
On Sun, Oct 12, 2025 at 8:08 PM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Thu, Oct 09, 2025 at 03:56:08PM -0700, Joanne Koong wrote:
> > If a write is block-aligned (i.e., write offset and length are both
> > block-aligned), no reads should be necessary.
> >
> > If the write starts or ends at a non-block-aligned offset, the write
> > should only need to read in at most two blocks, the starting block and
> > the ending block. Any intermediary blocks should be skipped since they
> > will be completely overwritten.
> >
> > Currently for non-block-aligned writes, the entire range gets read in
> > including intermediary blocks.
> >
> > Optimize the logic to read in only the necessary blocks.
>
> At least for block based file systems doing a single read in the typical
> block size range is cheaper than two smaller reads. This is especially
> true for hard drives with seeks, but even for SSDs I doubt the two
> reads are more efficient.
>
Ahh okay, that makes sense. I'll drop this patch then.
Thanks for reviewing the series. I'll make your suggested edits for v2.
* Re: [PATCH v1 6/9] iomap: optimize reads for non-block-aligned writes
2025-10-14 0:04 ` Joanne Koong
@ 2025-10-14 4:14 ` Christoph Hellwig
0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-10-14 4:14 UTC (permalink / raw)
To: Joanne Koong
Cc: Christoph Hellwig, brauner, djwong, bfoster, linux-fsdevel,
kernel-team
On Mon, Oct 13, 2025 at 05:04:30PM -0700, Joanne Koong wrote:
> Ahh okay, that makes sense. I'll drop this patch then.
If this is a big enough win for fuse / network file systems we can
find a way to opt into this behavior.
* [PATCH v1 7/9] iomap: use loff_t for file positions and offsets in writeback code
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
` (5 preceding siblings ...)
2025-10-09 22:56 ` [PATCH v1 6/9] iomap: optimize reads for non-block-aligned writes Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:09 ` Christoph Hellwig
2025-10-09 22:56 ` [PATCH v1 8/9] iomap: use find_next_bit() for dirty bitmap scanning Joanne Koong
2025-10-09 22:56 ` [PATCH v1 9/9] iomap: use find_next_bit() for uptodate " Joanne Koong
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
Use loff_t instead of u64 for file positions and offsets, consistent
with kernel VFS conventions.
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
---
fs/fuse/file.c | 4 ++--
fs/iomap/buffered-io.c | 17 +++++++++--------
fs/xfs/xfs_aops.c | 8 ++++----
include/linux/iomap.h | 4 ++--
4 files changed, 17 insertions(+), 16 deletions(-)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 591789adb00b..c44c058feeb0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2172,8 +2172,8 @@ static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
}
static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
- struct folio *folio, u64 pos,
- unsigned len, u64 end_pos)
+ struct folio *folio, loff_t pos,
+ unsigned len, loff_t end_pos)
{
struct fuse_fill_wb_data *data = wpc->wb_ctx;
struct fuse_writepage_args *wpa = data->wpa;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 372e14f7ab57..66c47404787f 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -86,7 +86,8 @@ static inline bool ifs_block_is_dirty(struct folio *folio,
}
static unsigned ifs_find_dirty_range(struct folio *folio,
- struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
+ struct iomap_folio_state *ifs, loff_t *range_start,
+ loff_t range_end)
{
struct inode *inode = folio->mapping->host;
unsigned start_blk =
@@ -110,8 +111,8 @@ static unsigned ifs_find_dirty_range(struct folio *folio,
return nblks << inode->i_blkbits;
}
-static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
- u64 range_end)
+static unsigned iomap_find_dirty_range(struct folio *folio, loff_t *range_start,
+ loff_t range_end)
{
struct iomap_folio_state *ifs = folio->private;
@@ -1603,7 +1604,7 @@ void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
EXPORT_SYMBOL_GPL(iomap_finish_folio_write);
static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
- struct folio *folio, u64 pos, u32 rlen, u64 end_pos,
+ struct folio *folio, loff_t pos, u32 rlen, loff_t end_pos,
unsigned *wb_bytes_pending)
{
do {
@@ -1634,7 +1635,7 @@ static int iomap_writeback_range(struct iomap_writepage_ctx *wpc,
* i_size, adjust end_pos and zero all data beyond i_size.
*/
static bool iomap_writeback_handle_eof(struct folio *folio, struct inode *inode,
- u64 *end_pos)
+ loff_t *end_pos)
{
u64 isize = i_size_read(inode);
@@ -1689,9 +1690,9 @@ int iomap_writeback_folio(struct iomap_writepage_ctx *wpc, struct folio *folio)
{
struct iomap_folio_state *ifs = folio->private;
struct inode *inode = wpc->inode;
- u64 pos = folio_pos(folio);
- u64 end_pos = pos + folio_size(folio);
- u64 end_aligned = 0;
+ loff_t pos = folio_pos(folio);
+ loff_t end_pos = pos + folio_size(folio);
+ loff_t end_aligned = 0;
unsigned wb_bytes_pending = 0;
int error = 0;
u32 rlen;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 0c2ed00733f2..593a34832116 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -480,9 +480,9 @@ static ssize_t
xfs_writeback_range(
struct iomap_writepage_ctx *wpc,
struct folio *folio,
- u64 offset,
+ loff_t offset,
unsigned int len,
- u64 end_pos)
+ loff_t end_pos)
{
ssize_t ret;
@@ -630,9 +630,9 @@ static ssize_t
xfs_zoned_writeback_range(
struct iomap_writepage_ctx *wpc,
struct folio *folio,
- u64 offset,
+ loff_t offset,
unsigned int len,
- u64 end_pos)
+ loff_t end_pos)
{
ssize_t ret;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index c417bb8718e3..8045d4c430ae 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -438,8 +438,8 @@ struct iomap_writeback_ops {
* Returns the number of bytes processed or a negative errno.
*/
ssize_t (*writeback_range)(struct iomap_writepage_ctx *wpc,
- struct folio *folio, u64 pos, unsigned int len,
- u64 end_pos);
+ struct folio *folio, loff_t pos, unsigned int len,
+ loff_t end_pos);
/*
* Submit a writeback context previously build up by ->writeback_range.
--
2.47.3
* Re: [PATCH v1 7/9] iomap: use loff_t for file positions and offsets in writeback code
2025-10-09 22:56 ` [PATCH v1 7/9] iomap: use loff_t for file positions and offsets in writeback code Joanne Koong
@ 2025-10-13 3:09 ` Christoph Hellwig
0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-10-13 3:09 UTC (permalink / raw)
To: Joanne Koong; +Cc: brauner, djwong, hch, bfoster, linux-fsdevel, kernel-team
On Thu, Oct 09, 2025 at 03:56:09PM -0700, Joanne Koong wrote:
> Use loff_t instead of u64 for file positions and offsets, consistent
> with kernel VFS conventions.
Which for historical reasons makes this a signed value, but I can't
find anything where that would cause problems. But maybe mention
that in the commit log?
Otherwise looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
* [PATCH v1 8/9] iomap: use find_next_bit() for dirty bitmap scanning
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
` (6 preceding siblings ...)
2025-10-09 22:56 ` [PATCH v1 7/9] iomap: use loff_t for file positions and offsets in writeback code Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:13 ` Christoph Hellwig
2025-10-09 22:56 ` [PATCH v1 9/9] iomap: use find_next_bit() for uptodate " Joanne Koong
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
Use find_next_bit()/find_next_zero_bit() for iomap dirty bitmap
scanning. This uses __ffs() internally and is more efficient for
finding the next dirty or clean bit than manually iterating through the
bitmap range testing every bit.
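For reference, a userspace model of the lookup (an editor's sketch):
ifs->state keeps the uptodate bits in [0, blks) and the dirty bits in
[blks, 2 * blks), so the dirty search offsets by blks and yields
end_blk + 1 when nothing is found.

#include <assert.h>

/* Minimal single-word stand-in for find_next_bit(). */
static unsigned next_bit(unsigned long map, unsigned size, unsigned start)
{
	while (start < size && !(map & (1UL << start)))
		start++;
	return start;	/* == size if no set bit was found */
}

int main(void)
{
	const unsigned blks = 8;	/* blocks per folio */
	unsigned long state = 0;

	state |= 1UL << (blks + 3);	/* mark block 3 dirty */
	/* Searching dirty blocks [0, 7] finds block 3. */
	assert(next_bit(state, blks + 7 + 1, blks + 0) - blks == 3);
	/* No dirty block in [4, 7]: returns end_blk + 1. */
	assert(next_bit(state, blks + 7 + 1, blks + 4) - blks == 8);
	return 0;
}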
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Suggested-by: Christoph Hellwig <hch@infradead.org>
---
fs/iomap/buffered-io.c | 73 ++++++++++++++++++++++++++++++------------
1 file changed, 52 insertions(+), 21 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 66c47404787f..37d2b76ca230 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -76,15 +76,49 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
folio_mark_uptodate(folio);
}
-static inline bool ifs_block_is_dirty(struct folio *folio,
- struct iomap_folio_state *ifs, int block)
+/**
+* ifs_next_dirty_block - find the next dirty block in the folio
+* @folio: The folio
+* @start_blk: Block number to begin searching at
+* @end_blk: Last block number (inclusive) to search
+*
+* If no dirty block is found, this will return end_blk + 1.
+*/
+static unsigned ifs_next_dirty_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
{
+ struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
- unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
+ unsigned int blks = i_blocks_per_folio(inode, folio);
+
+ return find_next_bit(ifs->state, blks + end_blk + 1,
+ blks + start_blk) - blks;
+}
+
+/**
+* ifs_next_clean_block - find the next clean block in the folio
+* @folio: The folio
+* @start_blk: Block number to begin searching at
+* @end_blk: Last block number (inclusive) to search
+*
+* If no clean block is found, this will return end_blk + 1.
+*/
+static unsigned ifs_next_clean_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ struct inode *inode = folio->mapping->host;
+ unsigned int blks = i_blocks_per_folio(inode, folio);
- return test_bit(block + blks_per_folio, ifs->state);
+ return find_next_zero_bit(ifs->state, blks + end_blk + 1,
+ blks + start_blk) - blks;
}
+#define for_each_clean_block(folio, blk, last_blk) \
+ for ((blk) = ifs_next_clean_block((folio), (blk), (last_blk)); \
+ (blk) <= (last_blk); \
+ (blk) = ifs_next_clean_block((folio), (blk) + 1, (last_blk)))
+
static unsigned ifs_find_dirty_range(struct folio *folio,
struct iomap_folio_state *ifs, loff_t *range_start,
loff_t range_end)
@@ -94,18 +128,17 @@ static unsigned ifs_find_dirty_range(struct folio *folio,
offset_in_folio(folio, *range_start) >> inode->i_blkbits;
unsigned end_blk = min_not_zero(
offset_in_folio(folio, range_end) >> inode->i_blkbits,
- i_blocks_per_folio(inode, folio));
- unsigned nblks = 1;
+ i_blocks_per_folio(inode, folio)) - 1;
+ unsigned nblks;
- while (!ifs_block_is_dirty(folio, ifs, start_blk))
- if (++start_blk == end_blk)
- return 0;
-
- while (start_blk + nblks < end_blk) {
- if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
- break;
- nblks++;
- }
+ start_blk = ifs_next_dirty_block(folio, start_blk, end_blk);
+ if (start_blk > end_blk)
+ return 0;
+ else if (start_blk == end_blk)
+ nblks = 1;
+ else
+ nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk)
+ - start_blk;
*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
return nblks << inode->i_blkbits;
@@ -1102,7 +1135,7 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode,
struct folio *folio, loff_t start_byte, loff_t end_byte,
struct iomap *iomap, iomap_punch_t punch)
{
- unsigned int first_blk, last_blk, i;
+ unsigned int first_blk, last_blk;
loff_t last_byte;
u8 blkbits = inode->i_blkbits;
struct iomap_folio_state *ifs;
@@ -1121,11 +1154,9 @@ static void iomap_write_delalloc_ifs_punch(struct inode *inode,
folio_pos(folio) + folio_size(folio) - 1);
first_blk = offset_in_folio(folio, start_byte) >> blkbits;
last_blk = offset_in_folio(folio, last_byte) >> blkbits;
- for (i = first_blk; i <= last_blk; i++) {
- if (!ifs_block_is_dirty(folio, ifs, i))
- punch(inode, folio_pos(folio) + (i << blkbits),
- 1 << blkbits, iomap);
- }
+ for_each_clean_block(folio, first_blk, last_blk)
+ punch(inode, folio_pos(folio) + (first_blk << blkbits),
+ 1 << blkbits, iomap);
}
static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio,
--
2.47.3
* Re: [PATCH v1 8/9] iomap: use find_next_bit() for dirty bitmap scanning
2025-10-09 22:56 ` [PATCH v1 8/9] iomap: use find_next_bit() for dirty bitmap scanning Joanne Koong
@ 2025-10-13 3:13 ` Christoph Hellwig
0 siblings, 0 replies; 21+ messages in thread
From: Christoph Hellwig @ 2025-10-13 3:13 UTC (permalink / raw)
To: Joanne Koong; +Cc: brauner, djwong, hch, bfoster, linux-fsdevel, kernel-team
On Thu, Oct 09, 2025 at 03:56:10PM -0700, Joanne Koong wrote:
> Use find_next_bit()/find_next_zero_bit() for iomap dirty bitmap
> scanning. This uses __ffs() internally and is more efficient for
> finding the next dirty or clean bit than manually iterating through the
> bitmap range testing every bit.
>
> Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
> Suggested-by: Christoph Hellwig <hch@infradead.org>
> ---
> fs/iomap/buffered-io.c | 73 ++++++++++++++++++++++++++++++------------
> 1 file changed, 52 insertions(+), 21 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 66c47404787f..37d2b76ca230 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -76,15 +76,49 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
> folio_mark_uptodate(folio);
> }
>
> -static inline bool ifs_block_is_dirty(struct folio *folio,
> - struct iomap_folio_state *ifs, int block)
> +/**
> +* ifs_next_dirty_block - find the next dirty block in the folio
> +* @folio: The folio
> +* @start_blk: Block number to begin searching at
> +* @end_blk: Last block number (inclusive) to search
> +*
> +* If no dirty block is found, this will return end_blk + 1.
> +*/
I'm not a huge fan of using the very verbose kerneldoc comments for
static functions where this isn't even turned into generated
documentation. Can you shorten the comments into normal non-structured
ones?
> + if (start_blk > end_blk)
> + return 0;
> + else if (start_blk == end_blk)
No need for an else after a return.
> + nblks = ifs_next_clean_block(folio, start_blk + 1, end_blk)
> + - start_blk;
Kernel style keeps the operator before the line break.
> + for_each_clean_block(folio, first_blk, last_blk)
Given that this is the only user of this macro in the entire series,
what is the point of it? Just open coding the search would be simpler.
* [PATCH v1 9/9] iomap: use find_next_bit() for uptodate bitmap scanning
2025-10-09 22:56 [PATCH v1 0/9] iomap: buffered io changes Joanne Koong
` (7 preceding siblings ...)
2025-10-09 22:56 ` [PATCH v1 8/9] iomap: use find_next_bit() for dirty bitmap scanning Joanne Koong
@ 2025-10-09 22:56 ` Joanne Koong
2025-10-13 3:13 ` Christoph Hellwig
8 siblings, 1 reply; 21+ messages in thread
From: Joanne Koong @ 2025-10-09 22:56 UTC (permalink / raw)
To: brauner; +Cc: djwong, hch, bfoster, linux-fsdevel, kernel-team
Use find_next_bit()/find_next_zero_bit() for iomap uptodate bitmap
scanning. This uses __ffs() internally and is more efficient for
finding the next uptodate or non-uptodate bit than manually iterating
through the bitmap range testing every bit.
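The iomap_is_partially_uptodate() conversion below reduces the loop to
a single comparison; a userspace model (an editor's sketch):

#include <assert.h>

/* Minimal single-word stand-in for find_next_zero_bit(). */
static unsigned next_zero_bit(unsigned long map, unsigned size,
			      unsigned start)
{
	while (start < size && (map & (1UL << start)))
		start++;
	return start;	/* == size if no zero bit was found */
}

int main(void)
{
	const unsigned long uptodate = 0x0F;	/* blocks 0-3 uptodate */

	/* [first, last] is fully uptodate iff the next zero bit lies
	 * beyond last. */
	assert(next_zero_bit(uptodate, 3 + 1, 0) > 3);	  /* [0, 3]: yes */
	assert(!(next_zero_bit(uptodate, 5 + 1, 2) > 5)); /* [2, 5]: no */
	return 0;
}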
Signed-off-by: Joanne Koong <joannelkoong@gmail.com>
Suggested-by: Christoph Hellwig <hch@infradead.org>
---
fs/iomap/buffered-io.c | 62 ++++++++++++++++++++++++++++--------------
1 file changed, 41 insertions(+), 21 deletions(-)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 37d2b76ca230..043c10d22db9 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -38,10 +38,36 @@ static inline bool ifs_is_fully_uptodate(struct folio *folio,
return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio));
}
-static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs,
- unsigned int block)
+/**
+* ifs_next_uptodate_block - find the next uptodate block in the folio
+* @folio: The folio
+* @start_blk: Block number to begin searching at
+* @end_blk: Last block number (inclusive) to search
+*
+* If no uptodate block is found, this will return end_blk + 1.
+*/
+static unsigned ifs_next_uptodate_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
{
- return test_bit(block, ifs->state);
+ struct iomap_folio_state *ifs = folio->private;
+
+ return find_next_bit(ifs->state, end_blk + 1, start_blk);
+}
+
+/**
+* ifs_next_nonuptodate_block - find the next non-uptodate block in the folio
+* @folio: The folio
+* @start_blk: Block number to begin searching at
+* @end_blk: Last block number (inclusive) to search
+*
+* If no non-uptodate block is found, this will return end_blk + 1.
+*/
+static unsigned ifs_next_nonuptodate_block(struct folio *folio,
+ unsigned start_blk, unsigned end_blk)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ return find_next_zero_bit(ifs->state, end_blk + 1, start_blk);
}
static bool ifs_set_range_uptodate(struct folio *folio,
@@ -291,14 +317,11 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
* to avoid reading in already uptodate ranges.
*/
if (ifs) {
- unsigned int i, blocks_skipped;
+ unsigned int next, blocks_skipped;
- /* move forward for each leading block marked uptodate */
- for (i = first; i <= last; i++)
- if (!ifs_block_is_uptodate(ifs, i))
- break;
+ next = ifs_next_nonuptodate_block(folio, first, last);
+ blocks_skipped = next - first;
- blocks_skipped = i - first;
if (blocks_skipped) {
unsigned long block_offset = *pos & (block_size - 1);
unsigned bytes_skipped =
@@ -308,15 +331,15 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio,
poff += bytes_skipped;
plen -= bytes_skipped;
}
- first = i;
+ first = next;
- /* truncate len if we find any trailing uptodate block(s) */
- while (++i <= last) {
- if (ifs_block_is_uptodate(ifs, i)) {
+ /* find any trailing uptodate block(s) */
+ if (++next <= last) {
+ next = ifs_next_uptodate_block(folio, next, last);
+ if (next <= last) {
plen -= iomap_bytes_to_truncate(*pos + plen,
- block_bits, last - i + 1);
- last = i - 1;
- break;
+ block_bits, last - next + 1);
+ last = next - 1;
}
}
}
@@ -609,7 +632,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
struct iomap_folio_state *ifs = folio->private;
struct inode *inode = folio->mapping->host;
- unsigned first, last, i;
+ unsigned first, last;
if (!ifs)
return false;
@@ -621,10 +644,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
first = from >> inode->i_blkbits;
last = (from + count - 1) >> inode->i_blkbits;
- for (i = first; i <= last; i++)
- if (!ifs_block_is_uptodate(ifs, i))
- return false;
- return true;
+ return ifs_next_nonuptodate_block(folio, first, last) > last;
}
EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
--
2.47.3