Linux EXT4 FS development
 help / color / mirror / Atom feed
* [PATCH] ext4: avoid full buffer walks for large folio partial writes
@ 2026-06-03 13:48 Jia Zhu
  2026-06-03 18:11 ` Matthew Wilcox
  0 siblings, 1 reply; 2+ messages in thread
From: Jia Zhu @ 2026-06-03 13:48 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger
  Cc: Alexander Viro, Christian Brauner, Jan Kara, Baokun Li,
	Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, linux-ext4,
	linux-fsdevel, linux-kernel, Jia Zhu

Ext4 buffered writes into large folios still walk every buffer_head in the
folio in ext4_block_write_begin() and again in block_commit_write(). Before
regular files used large folios, this was cheap, but a large folio can
contain hundreds of buffer_heads. Small overwrites of an existing large
folio therefore pay work proportional to the folio size instead of the
write size.

This is visible when the page cache is first populated with large folios
and then a small range is overwritten. The numbers below come from a local
libMicro-based microbenchmark. Each round first drops caches, writes a
10 MiB file with dd to instantiate large page-cache folios, and then runs
libMicro's write, pwrite, or writev benchmark for a small buffered
overwrite. The writev cases use libMicro's default vector count of 10.

A representative pwrite round is:

	sync
	echo 3 > /proc/sys/vm/drop_caches
	dd if=/dev/zero of=$file bs=1024k count=10
	taskset -c 0 ./bin/pwrite -H -C 50 -D 3 -S -N pwrite_u1k \
		-s 1k -f $file

To avoid comparing this change with an older kernel, the benchmark uses two
kernels built from the same master tree: one with this change and one with
only this change reverted. With THP=always and 10 dd-prefill rounds, median
latencies were:

			nofix		patched		improvement
	write_u1k	1.418 usec	0.342 usec	75.9%
	write_u10k	1.887 usec	0.409 usec	78.3%
	write_u100k	4.114 usec	2.554 usec	37.9%
	pwrite_u1k	1.677 usec	0.335 usec	80.1%
	pwrite_u10k	1.903 usec	0.410 usec	78.5%
	pwrite_u100k	4.101 usec	2.563 usec	37.5%
	writev_u1k	2.285 usec	0.756 usec	66.9%
	writev_u10k	4.655 usec	3.025 usec	35.0%

Start the ext4 write_begin walk at the first buffer that overlaps the
write. For already-uptodate large folio overwrites, add a partial commit
path which marks only the written buffers uptodate and dirty. Leave
non-uptodate folios on the old full-buffer commit path so BH_New cleanup
and folio-uptodate discovery are preserved.

Partially uptodate large folios remain described by per-buffer state, which
is what block_is_partially_uptodate() and read_folio use for later reads.

Signed-off-by: Jia Zhu <zhujia.zj@bytedance.com>
---
 fs/buffer.c     | 51 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/inode.c | 21 ++++++++++----------
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index b0b3792b1496e..e0c5868b088be 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2092,6 +2092,44 @@ int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
 }
 EXPORT_SYMBOL(__block_write_begin);
 
+static struct buffer_head *folio_buffer_seek(struct buffer_head *head,
+					     unsigned int blocksize,
+					     size_t offset,
+					     size_t *block_start)
+{
+	size_t nr = offset / blocksize;	/* index of the buffer containing @offset */
+
+	*block_start = nr * blocksize;	/* @offset rounded down to a block boundary */
+	while (nr--)			/* follow nr b_this_page links from @head */
+		head = head->b_this_page;
+	return head;
+}
+
+static void block_commit_write_range(struct buffer_head *head,
+				     unsigned int blocksize, size_t from,
+				     size_t to)
+{
+	size_t block_start, block_end;
+	struct buffer_head *bh;
+
+	if (from == to)		/* empty range: nothing to commit */
+		return;
+	if (WARN_ON_ONCE(to > folio_size(head->b_folio)))
+		return;
+	/* Mark only the buffers overlapping [from, to) uptodate and dirty. */
+	bh = folio_buffer_seek(head, blocksize, from, &block_start);
+	do {
+		block_end = block_start + blocksize;
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+		/* Advance to the next block and its buffer_head. */
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (block_start < to && bh != head);	/* stop at @to or back at @head */
+}
+
 void block_commit_write(struct folio *folio, size_t from, size_t to)
 {
 	size_t block_start, block_end;
@@ -2104,6 +2142,19 @@ void block_commit_write(struct folio *folio, size_t from, size_t to)
 		return;
 	blocksize = bh->b_size;
 
+	/*
+	 * Large folios can carry hundreds of buffer_heads.  For partial writes,
+	 * keep commit work local to the written range; partially uptodate
+	 * reads remain governed by the buffer state.
+	 */
+	if (folio_test_large(folio) && from < to &&
+	    folio_test_uptodate(folio) &&
+	    to <= folio_size(folio) &&
+	    (from != 0 || to != folio_size(folio))) {
+		block_commit_write_range(head, blocksize, from, to);
+		return;
+	}
+
 	block_start = 0;
 	do {
 		block_end = block_start + blocksize;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d1..e58bba0289eba 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1180,7 +1180,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
 	unsigned int blocksize = i_blocksize(inode);
 	struct buffer_head *bh, *head, *wait[2];
 	int nr_wait = 0;
-	int i;
+	unsigned int i;
 	bool should_journal_data = ext4_should_journal_data(inode);
 
 	BUG_ON(!folio_test_locked(folio));
@@ -1191,17 +1191,18 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
 	head = folio_buffers(folio);
 	if (!head)
 		head = create_empty_buffers(folio, blocksize, 0);
-	block = EXT4_PG_TO_LBLK(inode, folio->index);
+	if (from == to)
+		return 0;
+	block_start = round_down(from, blocksize);
+	block = EXT4_PG_TO_LBLK(inode, folio->index) +
+		(block_start >> inode->i_blkbits);
+	bh = head;
+	for (i = 0; i < block_start; i += blocksize)
+		bh = bh->b_this_page;
 
-	for (bh = head, block_start = 0; bh != head || !block_start;
-	    block++, block_start = block_end, bh = bh->b_this_page) {
+	for (; block_start < to;
+	     block++, block_start = block_end, bh = bh->b_this_page) {
 		block_end = block_start + blocksize;
-		if (block_end <= from || block_start >= to) {
-			if (folio_test_uptodate(folio)) {
-				set_buffer_uptodate(bh);
-			}
-			continue;
-		}
 		if (WARN_ON_ONCE(buffer_new(bh)))
 			clear_buffer_new(bh);
 		if (!buffer_mapped(bh)) {

base-commit: e43ffb69e0438cddd72aaa30898b4dc446f664f8
-- 
2.39.5 (Apple Git-154)

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2026-06-03 18:12 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
2026-06-03 13:48 [PATCH] ext4: avoid full buffer walks for large folio partial writes Jia Zhu
2026-06-03 18:11 ` Matthew Wilcox

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox