public inbox for linux-ext4@vger.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH v2 0/4] remove buffer heads from ext2
@ 2025-03-26  1:49 Catherine Hoang
  2025-03-26  1:49 ` [RFC PATCH v2 1/4] ext2: remove buffer heads from superblock Catherine Hoang
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Catherine Hoang @ 2025-03-26  1:49 UTC (permalink / raw)
  To: linux-ext4; +Cc: djwong

Hi all,

This series is an effort to begin removing buffer heads from ext2. 
The first patch introduces the bulk of the new buffer cache code, while
the rest of the patches split up changes to each part of the fs for
easier testing.

This is still a work in progress, and there are a couple more things on
my todo list:
- finish removing buffer heads from xattrs, inode allocation, etc
- implement a buffer cache shrinker
- fix various locking issues

Comments and feedback appreciated!

Catherine Hoang (4):
  ext2: remove buffer heads from superblock
  ext2: remove buffer heads from group descriptors
  ext2: remove buffer heads from quota handling
  ext2: remove buffer heads from block bitmaps

 fs/ext2/Makefile |   2 +-
 fs/ext2/balloc.c | 132 ++++++++++-----------
 fs/ext2/cache.c  | 302 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/ext2.h   |  47 +++++++-
 fs/ext2/ialloc.c |  12 +-
 fs/ext2/super.c  |  96 ++++++++-------
 fs/ext2/xattr.c  |   2 +-
 7 files changed, 468 insertions(+), 125 deletions(-)
 create mode 100644 fs/ext2/cache.c

-- 
2.43.0


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [RFC PATCH v2 1/4] ext2: remove buffer heads from superblock
  2025-03-26  1:49 [RFC PATCH v2 0/4] remove buffer heads from ext2 Catherine Hoang
@ 2025-03-26  1:49 ` Catherine Hoang
  2025-03-28 18:24   ` Darrick J. Wong
  2025-03-26  1:49 ` [RFC PATCH v2 2/4] ext2: remove buffer heads from group descriptors Catherine Hoang
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Catherine Hoang @ 2025-03-26  1:49 UTC (permalink / raw)
  To: linux-ext4; +Cc: djwong

The superblock is stored in the buffer_head s_sbh in struct ext2_sb_info.
Replace this buffer head with the new ext2_buffer and update the buffer
functions accordingly. This patch also introduces new buffer cache code
needed for future patches.

Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/ext2/Makefile |   2 +-
 fs/ext2/cache.c  | 302 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext2/ext2.h   |  43 ++++++-
 fs/ext2/super.c  |  52 +++++---
 fs/ext2/xattr.c  |   2 +-
 5 files changed, 379 insertions(+), 22 deletions(-)
 create mode 100644 fs/ext2/cache.c

diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index 8860948ef9ca..e8b38243058f 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -5,7 +5,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
+ext2-y := balloc.o cache.o dir.o file.o ialloc.o inode.o \
 	  ioctl.o namei.o super.o symlink.o trace.o
 
 # For tracepoints to include our trace.h from tracepoint infrastructure
diff --git a/fs/ext2/cache.c b/fs/ext2/cache.c
new file mode 100644
index 000000000000..464c506ba1b6
--- /dev/null
+++ b/fs/ext2/cache.c
@@ -0,0 +1,302 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2025 Oracle. All rights reserved.
+ */
+
+#include "ext2.h"
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/rhashtable.h>
+#include <linux/mm.h>
+#include <linux/types.h>
+
+static const struct rhashtable_params buffer_cache_params = {
+	.key_len     = sizeof(sector_t),
+	.key_offset  = offsetof(struct ext2_buffer, b_block),
+	.head_offset = offsetof(struct ext2_buffer, b_rhash),
+	.automatic_shrinking = true,
+};
+
+void ext2_buffer_lock(struct ext2_buffer *buf)
+{
+	mutex_lock(&buf->b_lock);
+}
+
+void ext2_buffer_unlock(struct ext2_buffer *buf)
+{
+	mutex_unlock(&buf->b_lock);
+}
+
+void ext2_buffer_set_dirty(struct ext2_buffer *buf)
+{
+    set_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags);
+}
+
+static int ext2_buffer_uptodate(struct ext2_buffer *buf)
+{
+    return test_bit(EXT2_BUF_UPTODATE_BIT, &buf->b_flags);
+}
+
+void ext2_buffer_set_uptodate(struct ext2_buffer *buf)
+{
+    set_bit(EXT2_BUF_UPTODATE_BIT, &buf->b_flags);
+}
+
+void ext2_buffer_clear_uptodate(struct ext2_buffer *buf)
+{
+    clear_bit(EXT2_BUF_UPTODATE_BIT, &buf->b_flags);
+}
+
+int ext2_buffer_error(struct ext2_buffer *buf)
+{
+       return buf->b_error;
+}
+
+void ext2_buffer_clear_error(struct ext2_buffer *buf)
+{
+       buf->b_error = 0;
+}
+
+static struct ext2_buffer *ext2_insert_buffer_cache(struct super_block *sb, struct ext2_buffer *new_buf)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_buffer_cache *bc = &sbi->s_buffer_cache;
+	struct rhashtable *buffer_cache = &bc->bc_hash;
+	struct ext2_buffer *old_buf;
+
+	rcu_read_lock();
+	old_buf = rhashtable_lookup_get_insert_fast(buffer_cache,
+				&new_buf->b_rhash, buffer_cache_params);
+
+	if (old_buf) {
+		refcount_inc(&old_buf->b_refcount);
+		rcu_read_unlock();
+		return old_buf;
+	}
+
+	refcount_inc(&new_buf->b_refcount);
+	rcu_read_unlock();
+	return new_buf;
+}
+
+static void ext2_buf_write_end_io(struct bio *bio)
+{
+	struct ext2_buffer *buf = bio->bi_private;
+	int err = blk_status_to_errno(bio->bi_status);
+
+	buf->b_error = err;
+	complete(&buf->b_complete);
+	mutex_unlock(&buf->b_lock);
+	bio_put(bio);
+}
+
+static int ext2_submit_buffer_read(struct super_block *sb, struct ext2_buffer *buf)
+{
+	struct bio_vec bio_vec;
+	struct bio bio;
+	sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
+	int error;
+
+	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
+	bio.bi_iter.bi_sector = sector;
+
+	buf->b_size = sb->s_blocksize;
+	__bio_add_page(&bio, buf->b_page, buf->b_size, 0);
+
+	mutex_lock(&buf->b_lock);
+	error = submit_bio_wait(&bio);
+	ext2_buffer_set_uptodate(buf);
+	mutex_unlock(&buf->b_lock);
+
+	return error;
+}
+
+static void ext2_submit_buffer_write(struct super_block *sb, struct ext2_buffer *buf)
+{
+	struct bio *bio;
+	sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
+
+	bio = bio_alloc(sb->s_bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
+
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_end_io = ext2_buf_write_end_io;
+	bio->bi_private = buf;
+
+	__bio_add_page(bio, buf->b_page, buf->b_size, 0);
+
+	mutex_lock(&buf->b_lock);
+	submit_bio(bio);
+}
+
+static int ext2_sync_buffer_cache_wait(struct list_head *submit_list)
+{
+	struct ext2_buffer *buf, *n;
+	int error = 0, error2;
+
+	list_for_each_entry_safe(buf, n, submit_list, b_list) {
+		wait_for_completion(&buf->b_complete);
+		refcount_dec(&buf->b_refcount);
+		error2 = buf->b_error;
+		if (!error)
+			error = error2;
+	}
+
+	return error;
+}
+
+int ext2_sync_buffer_wait(struct super_block *sb, struct ext2_buffer *buf)
+{
+	if (test_and_clear_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags)) {
+		ext2_submit_buffer_write(sb, buf);
+		wait_for_completion(&buf->b_complete);
+		return buf->b_error;
+	}
+
+	return 0;
+}
+
+int ext2_sync_buffer_cache(struct super_block *sb)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_buffer_cache *bc = &sbi->s_buffer_cache;
+	struct rhashtable *buffer_cache = &bc->bc_hash;
+	struct rhashtable_iter iter;
+	struct ext2_buffer *buf, *n;
+	struct blk_plug plug;
+	LIST_HEAD(submit_list);
+
+	rhashtable_walk_enter(buffer_cache, &iter);
+	rhashtable_walk_start(&iter);
+	while ((buf = rhashtable_walk_next(&iter)) != NULL) {
+		if (IS_ERR(buf))
+			continue;
+		if (test_and_clear_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags)) {
+			refcount_inc(&buf->b_refcount);
+			list_add(&buf->b_list, &submit_list);
+		}
+	}
+	rhashtable_walk_stop(&iter);
+	rhashtable_walk_exit(&iter);
+
+	blk_start_plug(&plug);
+	list_for_each_entry_safe(buf, n, &submit_list, b_list) {
+		ext2_submit_buffer_write(sb, buf);
+	}
+	blk_finish_plug(&plug);
+
+	return ext2_sync_buffer_cache_wait(&submit_list);
+}
+
+static struct ext2_buffer *ext2_lookup_buffer_cache(struct super_block *sb, sector_t block)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+	struct ext2_buffer_cache *bc = &sbi->s_buffer_cache;
+	struct rhashtable *buffer_cache = &bc->bc_hash;
+	struct ext2_buffer *found = NULL;
+
+	rcu_read_lock();
+	found = rhashtable_lookup(buffer_cache, &block, buffer_cache_params);
+	if (found && !refcount_inc_not_zero(&found->b_refcount))
+		found = NULL;
+	rcu_read_unlock();
+
+	return found;
+}
+
+static struct ext2_buffer *ext2_init_buffer(struct super_block *sb, sector_t block, bool need_uptodate)
+{
+	struct ext2_buffer *buf;
+	gfp_t gfp = GFP_KERNEL;
+
+	buf = kmalloc(sizeof(struct ext2_buffer), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->b_block = block;
+	buf->b_size = sb->s_blocksize;
+	buf->b_flags = 0;
+	buf->b_error = 0;
+
+	mutex_init(&buf->b_lock);
+	refcount_set(&buf->b_refcount, 1);
+	init_completion(&buf->b_complete);
+
+	if (!need_uptodate)
+		gfp |= __GFP_ZERO;
+
+	buf->b_page = alloc_page(gfp);
+	if (!buf->b_page) {
+		kfree_rcu(buf, b_rcu);
+		return NULL;
+	}
+
+	buf->b_data = page_address(buf->b_page);
+
+	return buf;
+}
+
+static void ext2_destroy_buffer(void *ptr, void *arg)
+{
+	struct ext2_buffer *buf = ptr;
+
+	WARN_ON(test_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags));
+	__free_page(buf->b_page);
+	kfree(buf);
+}
+
+void ext2_put_buffer(struct super_block *sb, struct ext2_buffer *buf)
+{
+	if (!buf)
+		return;
+
+	WARN_ON(refcount_read(&buf->b_refcount) < 1);
+	refcount_dec(&buf->b_refcount);
+}
+
+
+static struct ext2_buffer *ext2_find_get_buffer(struct super_block *sb, sector_t block, bool need_uptodate)
+{
+	int err;
+	struct ext2_buffer *buf;
+	struct ext2_buffer *new_buf;
+
+	buf = ext2_lookup_buffer_cache(sb, block);
+
+	if (!buf) {
+		new_buf = ext2_init_buffer(sb, block, need_uptodate);
+		if (!new_buf)
+			return ERR_PTR(-ENOMEM);
+
+		buf = ext2_insert_buffer_cache(sb, new_buf);
+		if (IS_ERR(buf) || buf != new_buf)
+			ext2_destroy_buffer(new_buf, NULL);
+	}
+
+	if (need_uptodate && !ext2_buffer_uptodate(buf)) {
+		err = ext2_submit_buffer_read(sb, buf);
+		if (err)
+			return ERR_PTR(err);
+	}
+
+	return buf;
+}
+
+struct ext2_buffer *ext2_get_buffer(struct super_block *sb, sector_t bno)
+{
+	return ext2_find_get_buffer(sb, bno, false);
+}
+
+struct ext2_buffer *ext2_read_buffer(struct super_block *sb, sector_t bno)
+{
+	return ext2_find_get_buffer(sb, bno, true);
+}
+
+int ext2_init_buffer_cache(struct ext2_buffer_cache *bc)
+{
+	return rhashtable_init(&bc->bc_hash, &buffer_cache_params);
+}
+
+void ext2_destroy_buffer_cache(struct ext2_buffer_cache *bc)
+{
+	rhashtable_free_and_destroy(&bc->bc_hash, ext2_destroy_buffer, NULL);
+}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index f38bdd46e4f7..bfed70fd6430 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -18,6 +18,7 @@
 #include <linux/rbtree.h>
 #include <linux/mm.h>
 #include <linux/highmem.h>
+#include <linux/rhashtable.h>
 
 /* XXX Here for now... not interested in restructing headers JUST now */
 
@@ -61,6 +62,29 @@ struct ext2_block_alloc_info {
 	ext2_fsblk_t		last_alloc_physical_block;
 };
 
+struct ext2_buffer {
+	sector_t b_block;
+	struct rhash_head b_rhash;
+	struct rcu_head b_rcu;
+	struct page *b_page;
+	size_t b_size;
+	char *b_data;
+	unsigned long b_flags;
+	refcount_t b_refcount;
+	struct mutex b_lock;
+	struct completion b_complete;
+	struct list_head b_list;
+	int b_error;
+};
+
+/* ext2_buffer flags */
+#define EXT2_BUF_DIRTY_BIT 0
+#define EXT2_BUF_UPTODATE_BIT 1
+
+struct ext2_buffer_cache {
+	struct rhashtable bc_hash;
+};
+
 #define rsv_start rsv_window._rsv_start
 #define rsv_end rsv_window._rsv_end
 
@@ -79,7 +103,7 @@ struct ext2_sb_info {
 	unsigned long s_groups_count;	/* Number of groups in the fs */
 	unsigned long s_overhead_last;  /* Last calculated overhead */
 	unsigned long s_blocks_last;    /* Last seen block count */
-	struct buffer_head * s_sbh;	/* Buffer containing the super block */
+	struct ext2_buffer * s_sbuf;	/* Buffer containing the super block */
 	struct ext2_super_block * s_es;	/* Pointer to the super block in the buffer */
 	struct buffer_head ** s_group_desc;
 	unsigned long  s_mount_opt;
@@ -116,6 +140,7 @@ struct ext2_sb_info {
 	struct mb_cache *s_ea_block_cache;
 	struct dax_device *s_daxdev;
 	u64 s_dax_part_off;
+	struct ext2_buffer_cache s_buffer_cache;
 };
 
 static inline spinlock_t *
@@ -716,6 +741,22 @@ extern int ext2_should_retry_alloc(struct super_block *sb, int *retries);
 extern void ext2_init_block_alloc_info(struct inode *);
 extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv);
 
+/* cache.c */
+extern void ext2_buffer_lock(struct ext2_buffer *);
+extern void ext2_buffer_unlock(struct ext2_buffer *);
+extern int ext2_init_buffer_cache(struct ext2_buffer_cache *);
+extern void ext2_destroy_buffer_cache(struct ext2_buffer_cache *);
+extern int ext2_sync_buffer_wait(struct super_block *, struct ext2_buffer *);
+extern int ext2_sync_buffer_cache(struct super_block *);
+extern struct ext2_buffer *ext2_get_buffer(struct super_block *, sector_t);
+extern struct ext2_buffer *ext2_read_buffer(struct super_block *, sector_t);
+extern void ext2_put_buffer(struct super_block *, struct ext2_buffer *);
+extern void ext2_buffer_set_dirty(struct ext2_buffer *);
+extern void ext2_buffer_set_uptodate(struct ext2_buffer *);
+extern void ext2_buffer_clear_uptodate(struct ext2_buffer *);
+extern int ext2_buffer_error(struct ext2_buffer *);
+extern void ext2_buffer_clear_error(struct ext2_buffer *);
+
 /* dir.c */
 int ext2_add_link(struct dentry *, struct inode *);
 int ext2_inode_by_name(struct inode *dir,
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 37f7ce56adce..ac53f587d140 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -168,7 +168,8 @@ static void ext2_put_super (struct super_block * sb)
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
-	brelse (sbi->s_sbh);
+	ext2_put_buffer (sb, sbi->s_sbuf);
+	ext2_destroy_buffer_cache(&sbi->s_buffer_cache);
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	fs_put_dax(sbi->s_daxdev, NULL);
@@ -803,7 +804,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
 
 static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 {
-	struct buffer_head * bh;
+	struct ext2_buffer * buf;
 	struct ext2_sb_info * sbi;
 	struct ext2_super_block * es;
 	struct inode *root;
@@ -835,6 +836,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
 					   NULL, NULL);
 
+	ret = ext2_init_buffer_cache(&sbi->s_buffer_cache);
+	if (ret) {
+		ext2_msg(sb, KERN_ERR, "error: unable to create buffer cache");
+		goto failed_sbi;
+	}
+
 	spin_lock_init(&sbi->s_lock);
 	ret = -EINVAL;
 
@@ -862,7 +869,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 		logic_sb_block = sb_block;
 	}
 
-	if (!(bh = sb_bread(sb, logic_sb_block))) {
+	if (IS_ERR(buf = ext2_read_buffer(sb, logic_sb_block))) {
 		ext2_msg(sb, KERN_ERR, "error: unable to read superblock");
 		goto failed_sbi;
 	}
@@ -870,7 +877,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	 * Note: s_es must be initialized as soon as possible because
 	 *       some ext2 macro-instructions depend on its value
 	 */
-	es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
+	es = (struct ext2_super_block *) (((char *)buf->b_data) + offset);
 	sbi->s_es = es;
 	sb->s_magic = le16_to_cpu(es->s_magic);
 
@@ -966,7 +973,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* If the blocksize doesn't match, re-read the thing.. */
 	if (sb->s_blocksize != blocksize) {
-		brelse(bh);
+		ext2_buffer_clear_uptodate(buf);
+		ext2_put_buffer(sb, buf);
 
 		if (!sb_set_blocksize(sb, blocksize)) {
 			ext2_msg(sb, KERN_ERR,
@@ -976,13 +984,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
 		logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize;
 		offset = (sb_block*BLOCK_SIZE) % blocksize;
-		bh = sb_bread(sb, logic_sb_block);
-		if(!bh) {
+		buf = ext2_read_buffer(sb, logic_sb_block);
+		if(IS_ERR(buf)) {
 			ext2_msg(sb, KERN_ERR, "error: couldn't read"
 				"superblock on 2nd try");
 			goto failed_sbi;
 		}
-		es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
+		es = (struct ext2_super_block *) (((char *)buf->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
 			ext2_msg(sb, KERN_ERR, "error: magic mismatch");
@@ -1021,7 +1029,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 					sbi->s_inodes_per_block;
 	sbi->s_desc_per_block = sb->s_blocksize /
 					sizeof (struct ext2_group_desc);
-	sbi->s_sbh = bh;
+	sbi->s_sbuf = buf;
 	sbi->s_mount_state = le16_to_cpu(es->s_state);
 	sbi->s_addr_per_block_bits =
 		ilog2 (EXT2_ADDR_PER_BLOCK(sb));
@@ -1031,7 +1039,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	if (sb->s_magic != EXT2_SUPER_MAGIC)
 		goto cantfind_ext2;
 
-	if (sb->s_blocksize != bh->b_size) {
+	if (sb->s_blocksize != buf->b_size) {
 		if (!silent)
 			ext2_msg(sb, KERN_ERR, "error: unsupported blocksize");
 		goto failed_mount;
@@ -1213,7 +1221,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	kvfree(sbi->s_group_desc);
 	kfree(sbi->s_debts);
 failed_mount:
-	brelse(bh);
+	ext2_put_buffer(sb, buf);
 failed_sbi:
 	fs_put_dax(sbi->s_daxdev, NULL);
 	sb->s_fs_info = NULL;
@@ -1224,9 +1232,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 
 static void ext2_clear_super_error(struct super_block *sb)
 {
-	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
+	struct ext2_buffer *sbuf = EXT2_SB(sb)->s_sbuf;
 
-	if (buffer_write_io_error(sbh)) {
+	if (ext2_buffer_error(sbuf)) {
 		/*
 		 * Oh, dear.  A previous attempt to write the
 		 * superblock failed.  This could happen because the
@@ -1237,8 +1245,8 @@ static void ext2_clear_super_error(struct super_block *sb)
 		 */
 		ext2_msg(sb, KERN_ERR,
 		       "previous I/O error to superblock detected");
-		clear_buffer_write_io_error(sbh);
-		set_buffer_uptodate(sbh);
+		ext2_buffer_clear_error(sbuf);
+		ext2_buffer_set_uptodate(sbuf);
 	}
 }
 
@@ -1252,9 +1260,9 @@ void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
 	es->s_wtime = cpu_to_le32(ktime_get_real_seconds());
 	/* unlock before we do IO */
 	spin_unlock(&EXT2_SB(sb)->s_lock);
-	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
+	ext2_buffer_set_dirty(EXT2_SB(sb)->s_sbuf);
 	if (wait)
-		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
+		ext2_sync_buffer_wait(sb, EXT2_SB(sb)->s_sbuf);
 }
 
 /*
@@ -1271,13 +1279,19 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
 {
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+	int err = 0;
 
 	/*
 	 * Write quota structures to quota file, sync_blockdev() will write
 	 * them to disk later
 	 */
-	dquot_writeback_dquots(sb, -1);
+	err = dquot_writeback_dquots(sb, -1);
+	if (err)
+		goto out;
+
+	err = ext2_sync_buffer_cache(sb);
 
+out:
 	spin_lock(&sbi->s_lock);
 	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
 		ext2_debug("setting valid to 0\n");
@@ -1285,7 +1299,7 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
 	}
 	spin_unlock(&sbi->s_lock);
 	ext2_sync_super(sb, es, wait);
-	return 0;
+	return err;
 }
 
 static int ext2_freeze(struct super_block *sb)
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index c885dcc3bd0d..1eb4a8607f67 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -387,7 +387,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
 	ext2_update_dynamic_rev(sb);
 	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
 	spin_unlock(&EXT2_SB(sb)->s_lock);
-	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
+	ext2_buffer_set_dirty(EXT2_SB(sb)->s_sbuf);
 }
 
 /*
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH v2 2/4] ext2: remove buffer heads from group descriptors
  2025-03-26  1:49 [RFC PATCH v2 0/4] remove buffer heads from ext2 Catherine Hoang
  2025-03-26  1:49 ` [RFC PATCH v2 1/4] ext2: remove buffer heads from superblock Catherine Hoang
@ 2025-03-26  1:49 ` Catherine Hoang
  2025-03-28 18:29   ` Darrick J. Wong
  2025-03-26  1:49 ` [RFC PATCH v2 3/4] ext2: remove buffer heads from quota handling Catherine Hoang
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Catherine Hoang @ 2025-03-26  1:49 UTC (permalink / raw)
  To: linux-ext4; +Cc: djwong

The group descriptors are stored as an array of buffer_heads
s_group_desc in struct ext2_sb_info. Replace these buffer heads with the
new ext2_buffer and update the buffer functions accordingly.

Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/ext2/balloc.c | 24 ++++++++++++------------
 fs/ext2/ext2.h   |  4 ++--
 fs/ext2/ialloc.c | 12 ++++++------
 fs/ext2/super.c  | 10 +++++-----
 4 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index b8cfab8f98b9..21dafa9ae2ea 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -38,7 +38,7 @@
 
 struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
 					     unsigned int block_group,
-					     struct buffer_head ** bh)
+					     struct ext2_buffer ** buf)
 {
 	unsigned long group_desc;
 	unsigned long offset;
@@ -63,8 +63,8 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
 	}
 
 	desc = (struct ext2_group_desc *) sbi->s_group_desc[group_desc]->b_data;
-	if (bh)
-		*bh = sbi->s_group_desc[group_desc];
+	if (buf)
+		*buf = sbi->s_group_desc[group_desc];
 	return desc + offset;
 }
 
@@ -166,7 +166,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
 }
 
 static void group_adjust_blocks(struct super_block *sb, int group_no,
-	struct ext2_group_desc *desc, struct buffer_head *bh, int count)
+	struct ext2_group_desc *desc, struct ext2_buffer *buf, int count)
 {
 	if (count) {
 		struct ext2_sb_info *sbi = EXT2_SB(sb);
@@ -176,7 +176,7 @@ static void group_adjust_blocks(struct super_block *sb, int group_no,
 		free_blocks = le16_to_cpu(desc->bg_free_blocks_count);
 		desc->bg_free_blocks_count = cpu_to_le16(free_blocks + count);
 		spin_unlock(sb_bgl_lock(sbi, group_no));
-		mark_buffer_dirty(bh);
+		ext2_buffer_set_dirty(buf);
 	}
 }
 
@@ -483,7 +483,7 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
 		      unsigned long count)
 {
 	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head * bh2;
+	struct ext2_buffer * buf2;
 	unsigned long block_group;
 	unsigned long bit;
 	unsigned long i;
@@ -522,7 +522,7 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
 	if (!bitmap_bh)
 		goto error_return;
 
-	desc = ext2_get_group_desc (sb, block_group, &bh2);
+	desc = ext2_get_group_desc (sb, block_group, &buf2);
 	if (!desc)
 		goto error_return;
 
@@ -553,7 +553,7 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
 	if (sb->s_flags & SB_SYNCHRONOUS)
 		sync_dirty_buffer(bitmap_bh);
 
-	group_adjust_blocks(sb, block_group, desc, bh2, group_freed);
+	group_adjust_blocks(sb, block_group, desc, buf2, group_freed);
 	freed += group_freed;
 
 	if (overflow) {
@@ -1209,7 +1209,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 		    unsigned long *count, int *errp, unsigned int flags)
 {
 	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *gdp_bh;
+	struct ext2_buffer *gdp_buf;
 	int group_no;
 	int goal_group;
 	ext2_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
@@ -1274,7 +1274,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 			EXT2_BLOCKS_PER_GROUP(sb);
 	goal_group = group_no;
 retry_alloc:
-	gdp = ext2_get_group_desc(sb, group_no, &gdp_bh);
+	gdp = ext2_get_group_desc(sb, group_no, &gdp_buf);
 	if (!gdp)
 		goto io_error;
 
@@ -1319,7 +1319,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 		group_no++;
 		if (group_no >= ngroups)
 			group_no = 0;
-		gdp = ext2_get_group_desc(sb, group_no, &gdp_bh);
+		gdp = ext2_get_group_desc(sb, group_no, &gdp_buf);
 		if (!gdp)
 			goto io_error;
 
@@ -1403,7 +1403,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 		goto out;
 	}
 
-	group_adjust_blocks(sb, group_no, gdp, gdp_bh, -num);
+	group_adjust_blocks(sb, group_no, gdp, gdp_buf, -num);
 	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
 
 	mark_buffer_dirty(bitmap_bh);
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index bfed70fd6430..5857d5ce7641 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -105,7 +105,7 @@ struct ext2_sb_info {
 	unsigned long s_blocks_last;    /* Last seen block count */
 	struct ext2_buffer * s_sbuf;	/* Buffer containing the super block */
 	struct ext2_super_block * s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head ** s_group_desc;
+	struct ext2_buffer ** s_group_desc;
 	unsigned long  s_mount_opt;
 	unsigned long s_sb_block;
 	kuid_t s_resuid;
@@ -735,7 +735,7 @@ extern unsigned long ext2_count_free_blocks (struct super_block *);
 extern unsigned long ext2_count_dirs (struct super_block *);
 extern struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
 						    unsigned int block_group,
-						    struct buffer_head ** bh);
+						    struct ext2_buffer ** buf);
 extern void ext2_discard_reservation (struct inode *);
 extern int ext2_should_retry_alloc(struct super_block *sb, int *retries);
 extern void ext2_init_block_alloc_info(struct inode *);
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index fdf63e9c6e7c..36fe7975b2d6 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -66,9 +66,9 @@ read_inode_bitmap(struct super_block * sb, unsigned long block_group)
 static void ext2_release_inode(struct super_block *sb, int group, int dir)
 {
 	struct ext2_group_desc * desc;
-	struct buffer_head *bh;
+	struct ext2_buffer *buf;
 
-	desc = ext2_get_group_desc(sb, group, &bh);
+	desc = ext2_get_group_desc(sb, group, &buf);
 	if (!desc) {
 		ext2_error(sb, "ext2_release_inode",
 			"can't get descriptor for group %d", group);
@@ -83,7 +83,7 @@ static void ext2_release_inode(struct super_block *sb, int group, int dir)
 	percpu_counter_inc(&EXT2_SB(sb)->s_freeinodes_counter);
 	if (dir)
 		percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter);
-	mark_buffer_dirty(bh);
+	ext2_buffer_set_dirty(buf);
 }
 
 /*
@@ -421,7 +421,7 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
 {
 	struct super_block *sb;
 	struct buffer_head *bitmap_bh = NULL;
-	struct buffer_head *bh2;
+	struct ext2_buffer *buf2;
 	int group, i;
 	ino_t ino = 0;
 	struct inode * inode;
@@ -453,7 +453,7 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
 	}
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
-		gdp = ext2_get_group_desc(sb, group, &bh2);
+		gdp = ext2_get_group_desc(sb, group, &buf2);
 		if (!gdp) {
 			if (++group == sbi->s_groups_count)
 				group = 0;
@@ -536,7 +536,7 @@ struct inode *ext2_new_inode(struct inode *dir, umode_t mode,
 	}
 	spin_unlock(sb_bgl_lock(sbi, group));
 
-	mark_buffer_dirty(bh2);
+	ext2_buffer_set_dirty(buf2);
 	if (test_opt(sb, GRPID)) {
 		inode->i_mode = mode;
 		inode->i_uid = current_fsuid();
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ac53f587d140..4323448bf64b 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -162,7 +162,7 @@ static void ext2_put_super (struct super_block * sb)
 	}
 	db_count = sbi->s_gdb_count;
 	for (i = 0; i < db_count; i++)
-		brelse(sbi->s_group_desc[i]);
+		ext2_put_buffer(sb, sbi->s_group_desc[i]);
 	kvfree(sbi->s_group_desc);
 	kfree(sbi->s_debts);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
@@ -1093,7 +1093,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
 		   EXT2_DESC_PER_BLOCK(sb);
 	sbi->s_group_desc = kvmalloc_array(db_count,
-					   sizeof(struct buffer_head *),
+					   sizeof(struct ext2_buffer *),
 					   GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
 		ret = -ENOMEM;
@@ -1109,10 +1109,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	}
 	for (i = 0; i < db_count; i++) {
 		block = descriptor_loc(sb, logic_sb_block, i);
-		sbi->s_group_desc[i] = sb_bread(sb, block);
+		sbi->s_group_desc[i] = ext2_read_buffer(sb, block);
 		if (!sbi->s_group_desc[i]) {
 			for (j = 0; j < i; j++)
-				brelse (sbi->s_group_desc[j]);
+				ext2_put_buffer (sb, sbi->s_group_desc[j]);
 			ext2_msg(sb, KERN_ERR,
 				"error: unable to read group descriptors");
 			goto failed_mount_group_desc;
@@ -1216,7 +1216,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
-		brelse(sbi->s_group_desc[i]);
+		ext2_put_buffer(sb, sbi->s_group_desc[i]);
 failed_mount_group_desc:
 	kvfree(sbi->s_group_desc);
 	kfree(sbi->s_debts);
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH v2 3/4] ext2: remove buffer heads from quota handling
  2025-03-26  1:49 [RFC PATCH v2 0/4] remove buffer heads from ext2 Catherine Hoang
  2025-03-26  1:49 ` [RFC PATCH v2 1/4] ext2: remove buffer heads from superblock Catherine Hoang
  2025-03-26  1:49 ` [RFC PATCH v2 2/4] ext2: remove buffer heads from group descriptors Catherine Hoang
@ 2025-03-26  1:49 ` Catherine Hoang
  2025-03-26  1:49 ` [RFC PATCH v2 4/4] ext2: remove buffer heads from block bitmaps Catherine Hoang
  2025-03-27 10:32 ` [RFC PATCH v2 0/4] remove buffer heads from ext2 Christoph Hellwig
  4 siblings, 0 replies; 11+ messages in thread
From: Catherine Hoang @ 2025-03-26  1:49 UTC (permalink / raw)
  To: linux-ext4; +Cc: djwong

Currently, we read and write to quotafiles by storing blocks of data
in buffer_heads. Replace these buffer heads with the new ext2_buffer
and update the buffer functions accordingly.

Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/ext2/super.c | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 4323448bf64b..be6fff36bf23 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1506,7 +1506,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
 	int tocopy;
 	size_t toread;
 	struct buffer_head tmp_bh;
-	struct buffer_head *bh;
+	struct ext2_buffer *buf;
 	loff_t i_size = i_size_read(inode);
 
 	if (off > i_size)
@@ -1525,11 +1525,11 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data,
 		if (!buffer_mapped(&tmp_bh))	/* A hole? */
 			memset(data, 0, tocopy);
 		else {
-			bh = sb_bread(sb, tmp_bh.b_blocknr);
-			if (!bh)
-				return -EIO;
-			memcpy(data, bh->b_data+offset, tocopy);
-			brelse(bh);
+			buf = ext2_read_buffer(sb, tmp_bh.b_blocknr);
+			if (IS_ERR(buf))
+				return PTR_ERR(buf);
+			memcpy(data, buf->b_data+offset, tocopy);
+			ext2_put_buffer(sb, buf);
 		}
 		offset = 0;
 		toread -= tocopy;
@@ -1550,7 +1550,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 	int tocopy;
 	size_t towrite = len;
 	struct buffer_head tmp_bh;
-	struct buffer_head *bh;
+	struct ext2_buffer *buf;
 
 	while (towrite > 0) {
 		tocopy = min_t(size_t, sb->s_blocksize - offset, towrite);
@@ -1561,20 +1561,18 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 		if (err < 0)
 			goto out;
 		if (offset || tocopy != EXT2_BLOCK_SIZE(sb))
-			bh = sb_bread(sb, tmp_bh.b_blocknr);
+			buf = ext2_read_buffer(sb, tmp_bh.b_blocknr);
 		else
-			bh = sb_getblk(sb, tmp_bh.b_blocknr);
-		if (unlikely(!bh)) {
-			err = -EIO;
+			buf = ext2_get_buffer(sb, tmp_bh.b_blocknr);
+		if (unlikely(IS_ERR(buf))) {
+			err = PTR_ERR(buf);
 			goto out;
 		}
-		lock_buffer(bh);
-		memcpy(bh->b_data+offset, data, tocopy);
-		flush_dcache_page(bh->b_page);
-		set_buffer_uptodate(bh);
-		mark_buffer_dirty(bh);
-		unlock_buffer(bh);
-		brelse(bh);
+		ext2_buffer_lock(buf);
+		memcpy(buf->b_data+offset, data, tocopy);
+		ext2_buffer_set_dirty(buf);
+		ext2_buffer_unlock(buf);
+		ext2_put_buffer(sb, buf);
 		offset = 0;
 		towrite -= tocopy;
 		data += tocopy;
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [RFC PATCH v2 4/4] ext2: remove buffer heads from block bitmaps
  2025-03-26  1:49 [RFC PATCH v2 0/4] remove buffer heads from ext2 Catherine Hoang
                   ` (2 preceding siblings ...)
  2025-03-26  1:49 ` [RFC PATCH v2 3/4] ext2: remove buffer heads from quota handling Catherine Hoang
@ 2025-03-26  1:49 ` Catherine Hoang
  2025-03-27 10:32 ` [RFC PATCH v2 0/4] remove buffer heads from ext2 Christoph Hellwig
  4 siblings, 0 replies; 11+ messages in thread
From: Catherine Hoang @ 2025-03-26  1:49 UTC (permalink / raw)
  To: linux-ext4; +Cc: djwong

The block allocation code uses buffer_heads to store block bitmaps.
Replace these buffer heads with the new ext2_buffer and update the
buffer functions accordingly.

Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
---
 fs/ext2/balloc.c | 108 +++++++++++++++++++++--------------------------
 1 file changed, 48 insertions(+), 60 deletions(-)

diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index 21dafa9ae2ea..2195c6ddbc83 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -71,7 +71,7 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
 static int ext2_valid_block_bitmap(struct super_block *sb,
 					struct ext2_group_desc *desc,
 					unsigned int block_group,
-					struct buffer_head *bh)
+					struct ext2_buffer *buf)
 {
 	ext2_grpblk_t offset;
 	ext2_grpblk_t next_zero_bit;
@@ -86,7 +86,7 @@ static int ext2_valid_block_bitmap(struct super_block *sb,
 	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
 	offset = bitmap_blk - group_first_block;
 	if (offset < 0 || offset > max_bit ||
-	    !ext2_test_bit(offset, bh->b_data))
+	    !ext2_test_bit(offset, buf->b_data))
 		/* bad block bitmap */
 		goto err_out;
 
@@ -94,7 +94,7 @@ static int ext2_valid_block_bitmap(struct super_block *sb,
 	bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
 	offset = bitmap_blk - group_first_block;
 	if (offset < 0 || offset > max_bit ||
-	    !ext2_test_bit(offset, bh->b_data))
+	    !ext2_test_bit(offset, buf->b_data))
 		/* bad block bitmap */
 		goto err_out;
 
@@ -104,7 +104,7 @@ static int ext2_valid_block_bitmap(struct super_block *sb,
 	if (offset < 0 || offset > max_bit ||
 	    offset + EXT2_SB(sb)->s_itb_per_group - 1 > max_bit)
 		goto err_out;
-	next_zero_bit = ext2_find_next_zero_bit(bh->b_data,
+	next_zero_bit = ext2_find_next_zero_bit(buf->b_data,
 				offset + EXT2_SB(sb)->s_itb_per_group,
 				offset);
 	if (next_zero_bit >= offset + EXT2_SB(sb)->s_itb_per_group)
@@ -125,31 +125,19 @@ static int ext2_valid_block_bitmap(struct super_block *sb,
  *
  * Return buffer_head on success or NULL in case of failure.
  */
-static struct buffer_head *
+static struct ext2_buffer *
 read_block_bitmap(struct super_block *sb, unsigned int block_group)
 {
 	struct ext2_group_desc * desc;
-	struct buffer_head * bh = NULL;
+	struct ext2_buffer * buf = NULL;
 	ext2_fsblk_t bitmap_blk;
-	int ret;
 
 	desc = ext2_get_group_desc(sb, block_group, NULL);
 	if (!desc)
 		return NULL;
 	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
-	bh = sb_getblk(sb, bitmap_blk);
-	if (unlikely(!bh)) {
-		ext2_error(sb, __func__,
-			    "Cannot read block bitmap - "
-			    "block_group = %d, block_bitmap = %u",
-			    block_group, le32_to_cpu(desc->bg_block_bitmap));
-		return NULL;
-	}
-	ret = bh_read(bh, 0);
-	if (ret > 0)
-		return bh;
-	if (ret < 0) {
-		brelse(bh);
+	buf = ext2_read_buffer(sb, bitmap_blk);
+	if (unlikely(IS_ERR(buf))) {
 		ext2_error(sb, __func__,
 			    "Cannot read block bitmap - "
 			    "block_group = %d, block_bitmap = %u",
@@ -157,12 +145,12 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group)
 		return NULL;
 	}
 
-	ext2_valid_block_bitmap(sb, desc, block_group, bh);
+	ext2_valid_block_bitmap(sb, desc, block_group, buf);
 	/*
 	 * file system mounted not to panic on error, continue with corrupt
 	 * bitmap
 	 */
-	return bh;
+	return buf;
 }
 
 static void group_adjust_blocks(struct super_block *sb, int group_no,
@@ -482,7 +470,7 @@ void ext2_discard_reservation(struct inode *inode)
 void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
 		      unsigned long count)
 {
-	struct buffer_head *bitmap_bh = NULL;
+	struct ext2_buffer *bitmap_buf = NULL;
 	struct ext2_buffer * buf2;
 	unsigned long block_group;
 	unsigned long bit;
@@ -517,9 +505,9 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
 		overflow = bit + count - EXT2_BLOCKS_PER_GROUP(sb);
 		count -= overflow;
 	}
-	brelse(bitmap_bh);
-	bitmap_bh = read_block_bitmap(sb, block_group);
-	if (!bitmap_bh)
+	ext2_put_buffer(sb, bitmap_buf);
+	bitmap_buf = read_block_bitmap(sb, block_group);
+	if (IS_ERR(bitmap_buf))
 		goto error_return;
 
 	desc = ext2_get_group_desc (sb, block_group, &buf2);
@@ -541,7 +529,7 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
 
 	for (i = 0, group_freed = 0; i < count; i++) {
 		if (!ext2_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
-						bit + i, bitmap_bh->b_data)) {
+						bit + i, bitmap_buf->b_data)) {
 			ext2_error(sb, __func__,
 				"bit already cleared for block %lu", block + i);
 		} else {
@@ -549,9 +537,9 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
 		}
 	}
 
-	mark_buffer_dirty(bitmap_bh);
+	ext2_buffer_set_dirty(bitmap_buf);
 	if (sb->s_flags & SB_SYNCHRONOUS)
-		sync_dirty_buffer(bitmap_bh);
+		ext2_sync_buffer_wait(sb, bitmap_buf);
 
 	group_adjust_blocks(sb, block_group, desc, buf2, group_freed);
 	freed += group_freed;
@@ -562,7 +550,7 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
 		goto do_more;
 	}
 error_return:
-	brelse(bitmap_bh);
+	ext2_put_buffer(sb, bitmap_buf);
 	if (freed) {
 		percpu_counter_add(&sbi->s_freeblocks_counter, freed);
 		dquot_free_block_nodirty(inode, freed);
@@ -580,12 +568,12 @@ void ext2_free_blocks(struct inode * inode, ext2_fsblk_t block,
  * we find a bit free.
  */
 static ext2_grpblk_t
-bitmap_search_next_usable_block(ext2_grpblk_t start, struct buffer_head *bh,
+bitmap_search_next_usable_block(ext2_grpblk_t start, struct ext2_buffer *buf,
 					ext2_grpblk_t maxblocks)
 {
 	ext2_grpblk_t next;
 
-	next = ext2_find_next_zero_bit(bh->b_data, maxblocks, start);
+	next = ext2_find_next_zero_bit(buf->b_data, maxblocks, start);
 	if (next >= maxblocks)
 		return -1;
 	return next;
@@ -604,7 +592,7 @@ bitmap_search_next_usable_block(ext2_grpblk_t start, struct buffer_head *bh,
  * then for any free bit in the bitmap.
  */
 static ext2_grpblk_t
-find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
+find_next_usable_block(int start, struct ext2_buffer *buf, int maxblocks)
 {
 	ext2_grpblk_t here, next;
 	char *p, *r;
@@ -621,7 +609,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 		ext2_grpblk_t end_goal = (start + 63) & ~63;
 		if (end_goal > maxblocks)
 			end_goal = maxblocks;
-		here = ext2_find_next_zero_bit(bh->b_data, end_goal, start);
+		here = ext2_find_next_zero_bit(buf->b_data, end_goal, start);
 		if (here < end_goal)
 			return here;
 		ext2_debug("Bit not found near goal\n");
@@ -631,14 +619,14 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
 	if (here < 0)
 		here = 0;
 
-	p = ((char *)bh->b_data) + (here >> 3);
+	p = ((char *)buf->b_data) + (here >> 3);
 	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
-	next = (r - ((char *)bh->b_data)) << 3;
+	next = (r - ((char *)buf->b_data)) << 3;
 
 	if (next < maxblocks && next >= here)
 		return next;
 
-	here = bitmap_search_next_usable_block(here, bh, maxblocks);
+	here = bitmap_search_next_usable_block(here, buf, maxblocks);
 	return here;
 }
 
@@ -666,7 +654,7 @@ find_next_usable_block(int start, struct buffer_head *bh, int maxblocks)
  */
 static int
 ext2_try_to_allocate(struct super_block *sb, int group,
-			struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal,
+			struct ext2_buffer *bitmap_buf, ext2_grpblk_t grp_goal,
 			unsigned long *count,
 			struct ext2_reserve_window *my_rsv)
 {
@@ -689,7 +677,7 @@ ext2_try_to_allocate(struct super_block *sb, int group,
 	BUG_ON(start > EXT2_BLOCKS_PER_GROUP(sb));
 
 	if (grp_goal < 0) {
-		grp_goal = find_next_usable_block(start, bitmap_bh, end);
+		grp_goal = find_next_usable_block(start, bitmap_buf, end);
 		if (grp_goal < 0)
 			goto fail_access;
 		if (!my_rsv) {
@@ -697,7 +685,7 @@ ext2_try_to_allocate(struct super_block *sb, int group,
 
 			for (i = 0; i < 7 && grp_goal > start &&
 					!ext2_test_bit(grp_goal - 1,
-					     		bitmap_bh->b_data);
+					     		bitmap_buf->b_data);
 			     		i++, grp_goal--)
 				;
 		}
@@ -705,7 +693,7 @@ ext2_try_to_allocate(struct super_block *sb, int group,
 
 	for (; num < *count && grp_goal < end; grp_goal++) {
 		if (ext2_set_bit_atomic(sb_bgl_lock(EXT2_SB(sb), group),
-					grp_goal, bitmap_bh->b_data)) {
+					grp_goal, bitmap_buf->b_data)) {
 			if (num == 0)
 				continue;
 			break;
@@ -869,7 +857,7 @@ static int find_next_reservable_window(
  */
 static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv,
 		ext2_grpblk_t grp_goal, struct super_block *sb,
-		unsigned int group, struct buffer_head *bitmap_bh)
+		unsigned int group, struct ext2_buffer *bitmap_buf)
 {
 	struct ext2_reserve_window_node *search_head;
 	ext2_fsblk_t group_first_block, group_end_block, start_block;
@@ -960,7 +948,7 @@ static int alloc_new_reservation(struct ext2_reserve_window_node *my_rsv,
 	spin_unlock(rsv_lock);
 	first_free_block = bitmap_search_next_usable_block(
 			my_rsv->rsv_start - group_first_block,
-			bitmap_bh, group_end_block - group_first_block + 1);
+			bitmap_buf, group_end_block - group_first_block + 1);
 
 	if (first_free_block < 0) {
 		/*
@@ -1062,7 +1050,7 @@ static void try_to_extend_reservation(struct ext2_reserve_window_node *my_rsv,
  */
 static ext2_grpblk_t
 ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group,
-			struct buffer_head *bitmap_bh, ext2_grpblk_t grp_goal,
+			struct ext2_buffer *bitmap_buf, ext2_grpblk_t grp_goal,
 			struct ext2_reserve_window_node * my_rsv,
 			unsigned long *count)
 {
@@ -1077,7 +1065,7 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group,
 	 * or last attempt to allocate a block with reservation turned on failed
 	 */
 	if (my_rsv == NULL) {
-		return ext2_try_to_allocate(sb, group, bitmap_bh,
+		return ext2_try_to_allocate(sb, group, bitmap_buf,
 						grp_goal, count, NULL);
 	}
 	/*
@@ -1111,7 +1099,7 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group,
 			if (my_rsv->rsv_goal_size < *count)
 				my_rsv->rsv_goal_size = *count;
 			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
-							group, bitmap_bh);
+							group, bitmap_buf);
 			if (ret < 0)
 				break;			/* failed */
 
@@ -1137,7 +1125,7 @@ ext2_try_to_allocate_with_rsv(struct super_block *sb, unsigned int group,
 			rsv_window_dump(&EXT2_SB(sb)->s_rsv_window_root, 1);
 			return -1;
 		}
-		ret = ext2_try_to_allocate(sb, group, bitmap_bh, grp_goal,
+		ret = ext2_try_to_allocate(sb, group, bitmap_buf, grp_goal,
 					   &num, &my_rsv->rsv_window);
 		if (ret >= 0) {
 			my_rsv->rsv_alloc_hit += num;
@@ -1208,7 +1196,7 @@ int ext2_data_block_valid(struct ext2_sb_info *sbi, ext2_fsblk_t start_blk,
 ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 		    unsigned long *count, int *errp, unsigned int flags)
 {
-	struct buffer_head *bitmap_bh = NULL;
+	struct ext2_buffer *bitmap_buf = NULL;
 	struct ext2_buffer *gdp_buf;
 	int group_no;
 	int goal_group;
@@ -1297,12 +1285,12 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 		 * pointer and we have to release it before calling
 		 * read_block_bitmap().
 		 */
-		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
+		ext2_put_buffer(sb, bitmap_buf);
+		bitmap_buf = read_block_bitmap(sb, group_no);
+		if (IS_ERR(bitmap_buf))
 			goto io_error;
 		grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no,
-					bitmap_bh, grp_target_blk,
+					bitmap_buf, grp_target_blk,
 					my_rsv, &num);
 		if (grp_alloc_blk >= 0)
 			goto allocated;
@@ -1338,15 +1326,15 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 		if (my_rsv && (free_blocks <= (windowsz/2)))
 			continue;
 
-		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, group_no);
-		if (!bitmap_bh)
+		ext2_put_buffer(sb, bitmap_buf);
+		bitmap_buf = read_block_bitmap(sb, group_no);
+		if (IS_ERR(bitmap_buf))
 			goto io_error;
 		/*
 		 * try to allocate block(s) from this group, without a goal(-1).
 		 */
 		grp_alloc_blk = ext2_try_to_allocate_with_rsv(sb, group_no,
-					bitmap_bh, -1, my_rsv, &num);
+					bitmap_buf, -1, my_rsv, &num);
 		if (grp_alloc_blk >= 0)
 			goto allocated;
 	}
@@ -1406,12 +1394,12 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 	group_adjust_blocks(sb, group_no, gdp, gdp_buf, -num);
 	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
 
-	mark_buffer_dirty(bitmap_bh);
+	ext2_buffer_set_dirty(bitmap_buf);
 	if (sb->s_flags & SB_SYNCHRONOUS)
-		sync_dirty_buffer(bitmap_bh);
+		ext2_sync_buffer_wait(sb, bitmap_buf);
 
 	*errp = 0;
-	brelse(bitmap_bh);
+	ext2_put_buffer(sb, bitmap_buf);
 	if (num < *count) {
 		dquot_free_block_nodirty(inode, *count-num);
 		mark_inode_dirty(inode);
@@ -1429,7 +1417,7 @@ ext2_fsblk_t ext2_new_blocks(struct inode *inode, ext2_fsblk_t goal,
 		dquot_free_block_nodirty(inode, *count);
 		mark_inode_dirty(inode);
 	}
-	brelse(bitmap_bh);
+	ext2_put_buffer(sb, bitmap_buf);
 	return 0;
 }
 
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 0/4] remove buffer heads from ext2
  2025-03-26  1:49 [RFC PATCH v2 0/4] remove buffer heads from ext2 Catherine Hoang
                   ` (3 preceding siblings ...)
  2025-03-26  1:49 ` [RFC PATCH v2 4/4] ext2: remove buffer heads from block bitmaps Catherine Hoang
@ 2025-03-27 10:32 ` Christoph Hellwig
  2025-04-04 16:43   ` Darrick J. Wong
  4 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2025-03-27 10:32 UTC (permalink / raw)
  To: Catherine Hoang; +Cc: linux-ext4, djwong

On Tue, Mar 25, 2025 at 06:49:24PM -0700, Catherine Hoang wrote:
> Hi all,
> 
> This series is an effort to begin removing buffer heads from ext2. 

Why is that desirable?


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 1/4] ext2: remove buffer heads from superblock
  2025-03-26  1:49 ` [RFC PATCH v2 1/4] ext2: remove buffer heads from superblock Catherine Hoang
@ 2025-03-28 18:24   ` Darrick J. Wong
  0 siblings, 0 replies; 11+ messages in thread
From: Darrick J. Wong @ 2025-03-28 18:24 UTC (permalink / raw)
  To: Catherine Hoang; +Cc: linux-ext4

On Tue, Mar 25, 2025 at 06:49:25PM -0700, Catherine Hoang wrote:
> The superblock is stored in the buffer_head s_sbh in struct ext2_sb_info.
> Replace this buffer head with the new ext2_buffer and update the buffer
> functions accordingly. This patch also introduces new buffer cache code
> needed for future patches.
> 
> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
>  fs/ext2/Makefile |   2 +-
>  fs/ext2/cache.c  | 302 +++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext2/ext2.h   |  43 ++++++-
>  fs/ext2/super.c  |  52 +++++---
>  fs/ext2/xattr.c  |   2 +-
>  5 files changed, 379 insertions(+), 22 deletions(-)
>  create mode 100644 fs/ext2/cache.c
> 
> diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
> index 8860948ef9ca..e8b38243058f 100644
> --- a/fs/ext2/Makefile
> +++ b/fs/ext2/Makefile
> @@ -5,7 +5,7 @@
>  
>  obj-$(CONFIG_EXT2_FS) += ext2.o
>  
> -ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
> +ext2-y := balloc.o cache.o dir.o file.o ialloc.o inode.o \
>  	  ioctl.o namei.o super.o symlink.o trace.o
>  
>  # For tracepoints to include our trace.h from tracepoint infrastructure
> diff --git a/fs/ext2/cache.c b/fs/ext2/cache.c
> new file mode 100644
> index 000000000000..464c506ba1b6
> --- /dev/null
> +++ b/fs/ext2/cache.c
> @@ -0,0 +1,302 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright (c) 2025 Oracle. All rights reserved.
> + */
> +
> +#include "ext2.h"
> +#include <linux/bio.h>
> +#include <linux/blkdev.h>
> +#include <linux/rhashtable.h>
> +#include <linux/mm.h>
> +#include <linux/types.h>
> +
> +static const struct rhashtable_params buffer_cache_params = {
> +	.key_len     = sizeof(sector_t),
> +	.key_offset  = offsetof(struct ext2_buffer, b_block),
> +	.head_offset = offsetof(struct ext2_buffer, b_rhash),
> +	.automatic_shrinking = true,
> +};
> +
> +void ext2_buffer_lock(struct ext2_buffer *buf)
> +{
> +	mutex_lock(&buf->b_lock);
> +}
> +
> +void ext2_buffer_unlock(struct ext2_buffer *buf)
> +{
> +	mutex_unlock(&buf->b_lock);
> +}
> +
> +void ext2_buffer_set_dirty(struct ext2_buffer *buf)
> +{
> +    set_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags);
> +}
> +
> +static int ext2_buffer_uptodate(struct ext2_buffer *buf)
> +{
> +    return test_bit(EXT2_BUF_UPTODATE_BIT, &buf->b_flags);
> +}
> +
> +void ext2_buffer_set_uptodate(struct ext2_buffer *buf)
> +{
> +    set_bit(EXT2_BUF_UPTODATE_BIT, &buf->b_flags);
> +}
> +
> +void ext2_buffer_clear_uptodate(struct ext2_buffer *buf)
> +{
> +    clear_bit(EXT2_BUF_UPTODATE_BIT, &buf->b_flags);
> +}
> +
> +int ext2_buffer_error(struct ext2_buffer *buf)
> +{
> +       return buf->b_error;
> +}
> +
> +void ext2_buffer_clear_error(struct ext2_buffer *buf)
> +{
> +       buf->b_error = 0;
> +}

Indenting errors in these previous two functions.

> +static struct ext2_buffer *ext2_insert_buffer_cache(struct super_block *sb, struct ext2_buffer *new_buf)
> +{
> +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> +	struct ext2_buffer_cache *bc = &sbi->s_buffer_cache;
> +	struct rhashtable *buffer_cache = &bc->bc_hash;
> +	struct ext2_buffer *old_buf;
> +
> +	rcu_read_lock();
> +	old_buf = rhashtable_lookup_get_insert_fast(buffer_cache,
> +				&new_buf->b_rhash, buffer_cache_params);
> +
> +	if (old_buf) {
> +		refcount_inc(&old_buf->b_refcount);
> +		rcu_read_unlock();
> +		return old_buf;
> +	}
> +
> +	refcount_inc(&new_buf->b_refcount);

I think the b_refcount model is clearer this time around -- buffers are
created with refcount==1, the cache owns that refcount when it's added
to the rhashtable, and cache users get their own refcount.
sync_buffers* takes its own refcount while writes are in progress.

(Can this please be captured in the design documentation, please?)

> +	rcu_read_unlock();
> +	return new_buf;
> +}
> +
> +static void ext2_buf_write_end_io(struct bio *bio)
> +{
> +	struct ext2_buffer *buf = bio->bi_private;
> +	int err = blk_status_to_errno(bio->bi_status);
> +
> +	buf->b_error = err;

buf->b_error = blk_status_to_errno(bio->bi_status);

> +	complete(&buf->b_complete);
> +	mutex_unlock(&buf->b_lock);
> +	bio_put(bio);
> +}
> +
> +static int ext2_submit_buffer_read(struct super_block *sb, struct ext2_buffer *buf)
> +{
> +	struct bio_vec bio_vec;
> +	struct bio bio;
> +	sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
> +	int error;
> +
> +	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
> +	bio.bi_iter.bi_sector = sector;
> +
> +	buf->b_size = sb->s_blocksize;
> +	__bio_add_page(&bio, buf->b_page, buf->b_size, 0);
> +
> +	mutex_lock(&buf->b_lock);
> +	error = submit_bio_wait(&bio);
> +	ext2_buffer_set_uptodate(buf);
> +	mutex_unlock(&buf->b_lock);

Hmm.  So the cache itself holds b_lock whenever IO is in progress, and
the quota code locks the buffer whenever it's copying dquot data into
the ext2_buffer.  That makes sense, we want to avoid sync_buffer* racing
with something that's writing to b_data.  But why doesn't the existing
code do that for group descriptor or superblock bufferheads?

I think we don't need to lock the ext2_buffer to read its contents
because the get/read functions always return to us an uptodate buffer.

> +
> +	return error;
> +}
> +
> +static void ext2_submit_buffer_write(struct super_block *sb, struct ext2_buffer *buf)
> +{
> +	struct bio *bio;
> +	sector_t sector = buf->b_block * (sb->s_blocksize >> 9);
> +
> +	bio = bio_alloc(sb->s_bdev, 1, REQ_OP_WRITE, GFP_KERNEL);
> +
> +	bio->bi_iter.bi_sector = sector;
> +	bio->bi_end_io = ext2_buf_write_end_io;
> +	bio->bi_private = buf;
> +
> +	__bio_add_page(bio, buf->b_page, buf->b_size, 0);
> +
> +	mutex_lock(&buf->b_lock);
> +	submit_bio(bio);
> +}
> +
> +static int ext2_sync_buffer_cache_wait(struct list_head *submit_list)
> +{
> +	struct ext2_buffer *buf, *n;
> +	int error = 0, error2;
> +
> +	list_for_each_entry_safe(buf, n, submit_list, b_list) {
> +		wait_for_completion(&buf->b_complete);
> +		refcount_dec(&buf->b_refcount);
> +		error2 = buf->b_error;
> +		if (!error)
> +			error = error2;
> +	}
> +
> +	return error;
> +}
> +
> +int ext2_sync_buffer_wait(struct super_block *sb, struct ext2_buffer *buf)
> +{
> +	if (test_and_clear_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags)) {
> +		ext2_submit_buffer_write(sb, buf);
> +		wait_for_completion(&buf->b_complete);
> +		return buf->b_error;
> +	}
> +
> +	return 0;
> +}
> +
> +int ext2_sync_buffer_cache(struct super_block *sb)
> +{
> +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> +	struct ext2_buffer_cache *bc = &sbi->s_buffer_cache;
> +	struct rhashtable *buffer_cache = &bc->bc_hash;
> +	struct rhashtable_iter iter;
> +	struct ext2_buffer *buf, *n;
> +	struct blk_plug plug;
> +	LIST_HEAD(submit_list);
> +
> +	rhashtable_walk_enter(buffer_cache, &iter);
> +	rhashtable_walk_start(&iter);
> +	while ((buf = rhashtable_walk_next(&iter)) != NULL) {
> +		if (IS_ERR(buf))
> +			continue;
> +		if (test_and_clear_bit(EXT2_BUF_DIRTY_BIT, &buf->b_flags)) {
> +			refcount_inc(&buf->b_refcount);
> +			list_add(&buf->b_list, &submit_list);
> +		}
> +	}
> +	rhashtable_walk_stop(&iter);
> +	rhashtable_walk_exit(&iter);
> +
> +	blk_start_plug(&plug);
> +	list_for_each_entry_safe(buf, n, &submit_list, b_list) {
> +		ext2_submit_buffer_write(sb, buf);
> +	}
> +	blk_finish_plug(&plug);
> +
> +	return ext2_sync_buffer_cache_wait(&submit_list);
> +}
> +
> +static struct ext2_buffer *ext2_lookup_buffer_cache(struct super_block *sb, sector_t block)
> +{
> +	struct ext2_sb_info *sbi = EXT2_SB(sb);
> +	struct ext2_buffer_cache *bc = &sbi->s_buffer_cache;
> +	struct rhashtable *buffer_cache = &bc->bc_hash;
> +	struct ext2_buffer *found = NULL;
> +
> +	rcu_read_lock();
> +	found = rhashtable_lookup(buffer_cache, &block, buffer_cache_params);
> +	if (found && !refcount_inc_not_zero(&found->b_refcount))
> +		found = NULL;
> +	rcu_read_unlock();
> +
> +	return found;
> +}
> +
> +static struct ext2_buffer *ext2_init_buffer(struct super_block *sb, sector_t block, bool need_uptodate)
> +{
> +	struct ext2_buffer *buf;
> +	gfp_t gfp = GFP_KERNEL;
> +
> +	buf = kmalloc(sizeof(struct ext2_buffer), GFP_KERNEL);
> +	if (!buf)
> +		return NULL;
> +
> +	buf->b_block = block;
> +	buf->b_size = sb->s_blocksize;
> +	buf->b_flags = 0;
> +	buf->b_error = 0;
> +
> +	mutex_init(&buf->b_lock);
> +	refcount_set(&buf->b_refcount, 1);
> +	init_completion(&buf->b_complete);
> +
> +	if (!need_uptodate)
> +		gfp |= __GFP_ZERO;

Should this set uptodate if the allocation actually zeroed the buffer?

> +
> +	buf->b_page = alloc_page(gfp);

Note: This could be trimmed to slab allocations for blocksize < pagesize
filesystems; and folio_alloc for pagesize > blocksize allocations, like
hch's recent xfs_buf changes.

> +static struct ext2_buffer *ext2_find_get_buffer(struct super_block *sb, sector_t block, bool need_uptodate)
> +{
> +	int err;
> +	struct ext2_buffer *buf;
> +	struct ext2_buffer *new_buf;
> +
> +	buf = ext2_lookup_buffer_cache(sb, block);
> +
> +	if (!buf) {
> +		new_buf = ext2_init_buffer(sb, block, need_uptodate);
> +		if (!new_buf)
> +			return ERR_PTR(-ENOMEM);
> +
> +		buf = ext2_insert_buffer_cache(sb, new_buf);
> +		if (IS_ERR(buf) || buf != new_buf)
> +			ext2_destroy_buffer(new_buf, NULL);
> +	}
> +
> +	if (need_uptodate && !ext2_buffer_uptodate(buf)) {
> +		err = ext2_submit_buffer_read(sb, buf);
> +		if (err)
> +			return ERR_PTR(err);
> +	}
> +
> +	return buf;
> +}

Looking better now :)

> diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
> index f38bdd46e4f7..bfed70fd6430 100644
> --- a/fs/ext2/ext2.h
> +++ b/fs/ext2/ext2.h
> @@ -18,6 +18,7 @@
>  #include <linux/rbtree.h>
>  #include <linux/mm.h>
>  #include <linux/highmem.h>
> +#include <linux/rhashtable.h>
>  
>  /* XXX Here for now... not interested in restructing headers JUST now */
>  
> @@ -61,6 +62,29 @@ struct ext2_block_alloc_info {
>  	ext2_fsblk_t		last_alloc_physical_block;
>  };
>  
> +struct ext2_buffer {
> +	sector_t b_block;
> +	struct rhash_head b_rhash;
> +	struct rcu_head b_rcu;
> +	struct page *b_page;
> +	size_t b_size;
> +	char *b_data;
> +	unsigned long b_flags;
> +	refcount_t b_refcount;
> +	struct mutex b_lock;
> +	struct completion b_complete;
> +	struct list_head b_list;
> +	int b_error;
> +};
> +
> +/* ext2_buffer flags */
> +#define EXT2_BUF_DIRTY_BIT 0
> +#define EXT2_BUF_UPTODATE_BIT 1
> +
> +struct ext2_buffer_cache {
> +	struct rhashtable bc_hash;
> +};
> +
>  #define rsv_start rsv_window._rsv_start
>  #define rsv_end rsv_window._rsv_end
>  
> @@ -79,7 +103,7 @@ struct ext2_sb_info {
>  	unsigned long s_groups_count;	/* Number of groups in the fs */
>  	unsigned long s_overhead_last;  /* Last calculated overhead */
>  	unsigned long s_blocks_last;    /* Last seen block count */
> -	struct buffer_head * s_sbh;	/* Buffer containing the super block */
> +	struct ext2_buffer * s_sbuf;	/* Buffer containing the super block */
>  	struct ext2_super_block * s_es;	/* Pointer to the super block in the buffer */
>  	struct buffer_head ** s_group_desc;
>  	unsigned long  s_mount_opt;
> @@ -116,6 +140,7 @@ struct ext2_sb_info {
>  	struct mb_cache *s_ea_block_cache;
>  	struct dax_device *s_daxdev;
>  	u64 s_dax_part_off;
> +	struct ext2_buffer_cache s_buffer_cache;
>  };
>  
>  static inline spinlock_t *
> @@ -716,6 +741,22 @@ extern int ext2_should_retry_alloc(struct super_block *sb, int *retries);
>  extern void ext2_init_block_alloc_info(struct inode *);
>  extern void ext2_rsv_window_add(struct super_block *sb, struct ext2_reserve_window_node *rsv);
>  
> +/* cache.c */
> +extern void ext2_buffer_lock(struct ext2_buffer *);
> +extern void ext2_buffer_unlock(struct ext2_buffer *);
> +extern int ext2_init_buffer_cache(struct ext2_buffer_cache *);
> +extern void ext2_destroy_buffer_cache(struct ext2_buffer_cache *);
> +extern int ext2_sync_buffer_wait(struct super_block *, struct ext2_buffer *);
> +extern int ext2_sync_buffer_cache(struct super_block *);
> +extern struct ext2_buffer *ext2_get_buffer(struct super_block *, sector_t);
> +extern struct ext2_buffer *ext2_read_buffer(struct super_block *, sector_t);
> +extern void ext2_put_buffer(struct super_block *, struct ext2_buffer *);
> +extern void ext2_buffer_set_dirty(struct ext2_buffer *);
> +extern void ext2_buffer_set_uptodate(struct ext2_buffer *);
> +extern void ext2_buffer_clear_uptodate(struct ext2_buffer *);
> +extern int ext2_buffer_error(struct ext2_buffer *);
> +extern void ext2_buffer_clear_error(struct ext2_buffer *);

No need for externs on functions.

> +
>  /* dir.c */
>  int ext2_add_link(struct dentry *, struct inode *);
>  int ext2_inode_by_name(struct inode *dir,
> diff --git a/fs/ext2/super.c b/fs/ext2/super.c
> index 37f7ce56adce..ac53f587d140 100644
> --- a/fs/ext2/super.c
> +++ b/fs/ext2/super.c
> @@ -168,7 +168,8 @@ static void ext2_put_super (struct super_block * sb)
>  	percpu_counter_destroy(&sbi->s_freeblocks_counter);
>  	percpu_counter_destroy(&sbi->s_freeinodes_counter);
>  	percpu_counter_destroy(&sbi->s_dirs_counter);
> -	brelse (sbi->s_sbh);
> +	ext2_put_buffer (sb, sbi->s_sbuf);
> +	ext2_destroy_buffer_cache(&sbi->s_buffer_cache);
>  	sb->s_fs_info = NULL;
>  	kfree(sbi->s_blockgroup_lock);
>  	fs_put_dax(sbi->s_daxdev, NULL);
> @@ -803,7 +804,7 @@ static unsigned long descriptor_loc(struct super_block *sb,
>  
>  static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  {
> -	struct buffer_head * bh;
> +	struct ext2_buffer * buf;
>  	struct ext2_sb_info * sbi;
>  	struct ext2_super_block * es;
>  	struct inode *root;
> @@ -835,6 +836,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  	sbi->s_daxdev = fs_dax_get_by_bdev(sb->s_bdev, &sbi->s_dax_part_off,
>  					   NULL, NULL);
>  
> +	ret = ext2_init_buffer_cache(&sbi->s_buffer_cache);
> +	if (ret) {
> +		ext2_msg(sb, KERN_ERR, "error: unable to create buffer cache");
> +		goto failed_sbi;
> +	}
> +
>  	spin_lock_init(&sbi->s_lock);
>  	ret = -EINVAL;
>  
> @@ -862,7 +869,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  		logic_sb_block = sb_block;
>  	}
>  
> -	if (!(bh = sb_bread(sb, logic_sb_block))) {
> +	if (IS_ERR(buf = ext2_read_buffer(sb, logic_sb_block))) {
>  		ext2_msg(sb, KERN_ERR, "error: unable to read superblock");
>  		goto failed_sbi;

I think this would be better expanded to avoid assigning and comparing
in the same line.  Then you can even extract the real error code and
return it:

	buf = ext2_read_buffer(...);
	if (IS_ERR(buf)) {
		ext2_msg(...);
		ret = PTR_ERR(buf);
		goto failed_sbi;
	}

I also wonder if you could have decreased the diff size by retaining the
"bh" variable name even if it's no longer short for bufferhead.

>  	}
> @@ -870,7 +877,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  	 * Note: s_es must be initialized as soon as possible because
>  	 *       some ext2 macro-instructions depend on its value
>  	 */
> -	es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
> +	es = (struct ext2_super_block *) (((char *)buf->b_data) + offset);

Should b_data be declared (void *) and we can simplify all this casting?

>  	sbi->s_es = es;
>  	sb->s_magic = le16_to_cpu(es->s_magic);
>  
> @@ -966,7 +973,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  
>  	/* If the blocksize doesn't match, re-read the thing.. */
>  	if (sb->s_blocksize != blocksize) {
> -		brelse(bh);
> +		ext2_buffer_clear_uptodate(buf);
> +		ext2_put_buffer(sb, buf);

Should we instead have a way to purge a cached buffer instead of letting
it stick around (with the wrong size?) in the cache?

>  
>  		if (!sb_set_blocksize(sb, blocksize)) {
>  			ext2_msg(sb, KERN_ERR,
> @@ -976,13 +984,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  
>  		logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize;
>  		offset = (sb_block*BLOCK_SIZE) % blocksize;
> -		bh = sb_bread(sb, logic_sb_block);
> -		if(!bh) {
> +		buf = ext2_read_buffer(sb, logic_sb_block);
> +		if(IS_ERR(buf)) {
>  			ext2_msg(sb, KERN_ERR, "error: couldn't read"
>  				"superblock on 2nd try");
>  			goto failed_sbi;
>  		}
> -		es = (struct ext2_super_block *) (((char *)bh->b_data) + offset);
> +		es = (struct ext2_super_block *) (((char *)buf->b_data) + offset);
>  		sbi->s_es = es;
>  		if (es->s_magic != cpu_to_le16(EXT2_SUPER_MAGIC)) {
>  			ext2_msg(sb, KERN_ERR, "error: magic mismatch");

Future refactoring question: Do the ext* developers want to move all
this validation code into a per-buffer-type validators like XFS does?

> @@ -1021,7 +1029,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  					sbi->s_inodes_per_block;
>  	sbi->s_desc_per_block = sb->s_blocksize /
>  					sizeof (struct ext2_group_desc);
> -	sbi->s_sbh = bh;
> +	sbi->s_sbuf = buf;
>  	sbi->s_mount_state = le16_to_cpu(es->s_state);
>  	sbi->s_addr_per_block_bits =
>  		ilog2 (EXT2_ADDR_PER_BLOCK(sb));
> @@ -1031,7 +1039,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  	if (sb->s_magic != EXT2_SUPER_MAGIC)
>  		goto cantfind_ext2;
>  
> -	if (sb->s_blocksize != bh->b_size) {
> +	if (sb->s_blocksize != buf->b_size) {
>  		if (!silent)
>  			ext2_msg(sb, KERN_ERR, "error: unsupported blocksize");
>  		goto failed_mount;
> @@ -1213,7 +1221,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  	kvfree(sbi->s_group_desc);
>  	kfree(sbi->s_debts);
>  failed_mount:
> -	brelse(bh);
> +	ext2_put_buffer(sb, buf);
>  failed_sbi:
>  	fs_put_dax(sbi->s_daxdev, NULL);
>  	sb->s_fs_info = NULL;
> @@ -1224,9 +1232,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  
>  static void ext2_clear_super_error(struct super_block *sb)
>  {
> -	struct buffer_head *sbh = EXT2_SB(sb)->s_sbh;
> +	struct ext2_buffer *sbuf = EXT2_SB(sb)->s_sbuf;
>  
> -	if (buffer_write_io_error(sbh)) {
> +	if (ext2_buffer_error(sbuf)) {
>  		/*
>  		 * Oh, dear.  A previous attempt to write the
>  		 * superblock failed.  This could happen because the
> @@ -1237,8 +1245,8 @@ static void ext2_clear_super_error(struct super_block *sb)
>  		 */
>  		ext2_msg(sb, KERN_ERR,
>  		       "previous I/O error to superblock detected");
> -		clear_buffer_write_io_error(sbh);
> -		set_buffer_uptodate(sbh);
> +		ext2_buffer_clear_error(sbuf);
> +		ext2_buffer_set_uptodate(sbuf);

I wonder if setting uptodate should just clear b_error.

>  	}
>  }
>  
> @@ -1252,9 +1260,9 @@ void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es,
>  	es->s_wtime = cpu_to_le32(ktime_get_real_seconds());
>  	/* unlock before we do IO */
>  	spin_unlock(&EXT2_SB(sb)->s_lock);
> -	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
> +	ext2_buffer_set_dirty(EXT2_SB(sb)->s_sbuf);
>  	if (wait)
> -		sync_dirty_buffer(EXT2_SB(sb)->s_sbh);
> +		ext2_sync_buffer_wait(sb, EXT2_SB(sb)->s_sbuf);
>  }
>  
>  /*
> @@ -1271,13 +1279,19 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
>  {
>  	struct ext2_sb_info *sbi = EXT2_SB(sb);
>  	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
> +	int err = 0;
>  
>  	/*
>  	 * Write quota structures to quota file, sync_blockdev() will write
>  	 * them to disk later
>  	 */
> -	dquot_writeback_dquots(sb, -1);
> +	err = dquot_writeback_dquots(sb, -1);
> +	if (err)
> +		goto out;
> +
> +	err = ext2_sync_buffer_cache(sb);

Shouldn't we try to write the buffer cache to disk even if dquot writing
fails?

	err = dquot_writeback_dquots(...);

	err2 = ext2_sync_buffer_cache(...);
	if (err2 && !err)
		err = err2;
	...
	return err;

--D

>  
> +out:
>  	spin_lock(&sbi->s_lock);
>  	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
>  		ext2_debug("setting valid to 0\n");
> @@ -1285,7 +1299,7 @@ static int ext2_sync_fs(struct super_block *sb, int wait)
>  	}
>  	spin_unlock(&sbi->s_lock);
>  	ext2_sync_super(sb, es, wait);
> -	return 0;
> +	return err;
>  }
>  
>  static int ext2_freeze(struct super_block *sb)
> diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
> index c885dcc3bd0d..1eb4a8607f67 100644
> --- a/fs/ext2/xattr.c
> +++ b/fs/ext2/xattr.c
> @@ -387,7 +387,7 @@ static void ext2_xattr_update_super_block(struct super_block *sb)
>  	ext2_update_dynamic_rev(sb);
>  	EXT2_SET_COMPAT_FEATURE(sb, EXT2_FEATURE_COMPAT_EXT_ATTR);
>  	spin_unlock(&EXT2_SB(sb)->s_lock);
> -	mark_buffer_dirty(EXT2_SB(sb)->s_sbh);
> +	ext2_buffer_set_dirty(EXT2_SB(sb)->s_sbuf);
>  }
>  
>  /*
> -- 
> 2.43.0
> 
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 2/4] ext2: remove buffer heads from group descriptors
  2025-03-26  1:49 ` [RFC PATCH v2 2/4] ext2: remove buffer heads from group descriptors Catherine Hoang
@ 2025-03-28 18:29   ` Darrick J. Wong
  0 siblings, 0 replies; 11+ messages in thread
From: Darrick J. Wong @ 2025-03-28 18:29 UTC (permalink / raw)
  To: Catherine Hoang; +Cc: linux-ext4

On Tue, Mar 25, 2025 at 06:49:26PM -0700, Catherine Hoang wrote:
> The group descriptors are stored as an array of buffer_heads
> s_group_desc in struct ext2_sb_info. Replace these buffer heads with the
> new ext2_buffer and update the buffer functions accordingly.
> 
> Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
> ---
>  fs/ext2/balloc.c | 24 ++++++++++++------------
>  fs/ext2/ext2.h   |  4 ++--
>  fs/ext2/ialloc.c | 12 ++++++------
>  fs/ext2/super.c  | 10 +++++-----
>  4 files changed, 25 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
> index b8cfab8f98b9..21dafa9ae2ea 100644
> --- a/fs/ext2/balloc.c
> +++ b/fs/ext2/balloc.c
> @@ -38,7 +38,7 @@
>  
>  struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
>  					     unsigned int block_group,
> -					     struct buffer_head ** bh)
> +					     struct ext2_buffer ** buf)
>  {
>  	unsigned long group_desc;
>  	unsigned long offset;
> @@ -63,8 +63,8 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb,
>  	}
>  
>  	desc = (struct ext2_group_desc *) sbi->s_group_desc[group_desc]->b_data;
> -	if (bh)
> -		*bh = sbi->s_group_desc[group_desc];
> +	if (buf)
> +		*buf = sbi->s_group_desc[group_desc];

Yeah, these patches would be less long if you'd stuck with the "bh"
name.  I hate how this sounds like busy work, but please put it back to
make reviewing easier.

> @@ -1109,10 +1109,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
>  	}
>  	for (i = 0; i < db_count; i++) {
>  		block = descriptor_loc(sb, logic_sb_block, i);
> -		sbi->s_group_desc[i] = sb_bread(sb, block);
> +		sbi->s_group_desc[i] = ext2_read_buffer(sb, block);

ext2_read_buffer can return an ERR_PTR, you need to check for errors
here, not just null pointers.

--D

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 0/4] remove buffer heads from ext2
  2025-03-27 10:32 ` [RFC PATCH v2 0/4] remove buffer heads from ext2 Christoph Hellwig
@ 2025-04-04 16:43   ` Darrick J. Wong
  2025-04-07  6:25     ` Christoph Hellwig
  0 siblings, 1 reply; 11+ messages in thread
From: Darrick J. Wong @ 2025-04-04 16:43 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Catherine Hoang, linux-ext4

On Thu, Mar 27, 2025 at 03:32:42AM -0700, Christoph Hellwig wrote:
> On Tue, Mar 25, 2025 at 06:49:24PM -0700, Catherine Hoang wrote:
> > Hi all,
> > 
> > This series is an effort to begin removing buffer heads from ext2. 
> 
> Why is that desirable?

struct buffer_head is a mismash of things -- originally it was a landing
place for the old buffer cache, right?  So it has the necessary things
like a pointer to a memory page, the disk address, a length, buffer
state flags (uptodate/dirty), and some locks.  For filesystem metadata
blocks I think that's all that most filesystems really need.  Assuming
that filesystems /never/ want overlapping metadata buffers, I think it's
more efficient to look up buffer objects via an rhashtable instead of
walking the address_space xarray to find a folio, and then walking a
linked list from that folio to find the particular bh.

Unfortunately, it also has a bunch of file mapping state information
(e.g. BH_Delalloc) that aren't needed for caching metadata blocks.  All
the confusion that results from the incohesive mixing of these two
usecases goes away by separating out the metadata buffers into a
separate cache and (ha) leaving the filesystems to port the file IO
paths to iomap.

Separating filesystem metadata buffers into a private datastructure
instead of using the blockdev pagecache also closes off an entire class
of attack surface where evil userspace can wait for a filesystem to load
a metadata block into memory and validate it; and then scribble on the
pagecache block to cause the filesystem driver to make the wrong
decisions -- look at all the ext4 metadata_csum bugs where syzkaller
discovered that the decision to call the crc32c driver was gated on a
bit in a bufferhead, and setting that bit having not initialized the
crc32c driver would lead to a kernel crash.  Nowadays we have
CONFIG_BLK_DEV_WRITE_MOUNTED to shut that down, though it defaults to y
and I think that might actually break leased layout things like pnfs.

So the upsides are: faster lookups, a more cohesive data structure that
only tries to do one thing, and closing attack surfaces.

The downsides: this new buffer cache code still needs: an explicit hook
into the dirty pagecache timeout to start its own writeback; to provide
its own shrinker; and some sort of solution for file mapping metadata so
that fsync can flush just those blocks and not the whole cache.

--D

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 0/4] remove buffer heads from ext2
  2025-04-04 16:43   ` Darrick J. Wong
@ 2025-04-07  6:25     ` Christoph Hellwig
  2025-04-07 16:54       ` Darrick J. Wong
  0 siblings, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2025-04-07  6:25 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Christoph Hellwig, Catherine Hoang, linux-ext4

On Fri, Apr 04, 2025 at 09:43:22AM -0700, Darrick J. Wong wrote:
> On Thu, Mar 27, 2025 at 03:32:42AM -0700, Christoph Hellwig wrote:
> > On Tue, Mar 25, 2025 at 06:49:24PM -0700, Catherine Hoang wrote:
> > > Hi all,
> > > 
> > > This series is an effort to begin removing buffer heads from ext2. 
> > 
> > Why is that desirable?
> 
> struct buffer_head is a mismash of things -- originally it was a landing
> place for the old buffer cache, right?

Yes.

> So it has the necessary things
> like a pointer to a memory page, the disk address, a length, buffer
> state flags (uptodate/dirty), and some locks.

Yes. (although the page to folio migration is almost done).

> For filesystem metadata
> blocks I think that's all that most filesystems really need.

Yes.

> Assuming
> that filesystems /never/ want overlapping metadata buffers, I think it's
> more efficient to look up buffer objects via an rhashtable instead of
> walking the address_space xarray to find a folio, and then walking a
> linked list from that folio to find the particular bh.

Probably.  But does it make a practical difference for ext2?

> Unfortunately, it also has a bunch of file mapping state information
> (e.g. BH_Delalloc) that aren't needed for caching metadata blocks.  All
> the confusion that results from the incohesive mixing of these two
> usecases goes away by separating out the metadata buffers into a
> separate cache and (ha) leaving the filesystems to port the file IO
> paths to iomap.

Exactly.

> Separating filesystem metadata buffers into a private datastructure
> instead of using the blockdev pagecache also closes off an entire class
> of attack surface where evil userspace can wait for a filesystem to load
> a metadata block into memory and validate it;

It does close that window.  OTOH that behavior has traditionally been
expected very much for extN for use with some tools (IIRC tunefs), so
changing that would actually break existing userspace.

> and then scribble on the
> pagecache block to cause the filesystem driver to make the wrong
> decisions -- look at all the ext4 metadata_csum bugs where syzkaller
> discovered that the decision to call the crc32c driver was gated on a
> bit in a bufferhead, and setting that bit having not initialized the
> crc32c driver would lead to a kernel crash.  Nowadays we have
> CONFIG_BLK_DEV_WRITE_MOUNTED to shut that down, though it defaults to y
> and I think that might actually break leased layout things like pnfs.

pNFS scsi/nvme only works for remote nodes, so it's not affected by
this.  The block layout could theoretically work locally, but it's
a pretty broken protocol to start with.

> So the upsides are: faster lookups, a more cohesive data structure that
> only tries to do one thing, and closing attack surfaces.
> 
> The downsides: this new buffer cache code still needs: an explicit hook
> into the dirty pagecache timeout to start its own writeback; to provide
> its own shrinker; and some sort of solution for file mapping metadata so
> that fsync can flush just those blocks and not the whole cache.

The other major downside is instead of one maintained code base we now
have one per file system.  That's probably okay for well maintained
file systems with somewhat special needs (say btrfs or xfs), but I'd
rather not have that for every misc file system using buffer_heads
right now.

So if we want to reduce problems with buffer_heads I'd much rather see
all data path usage go away ASAP.  After that the weird entanglement
with jbd2 would be the next step (and maybe a private buffer cache for
ext4/gfs2 is the answer there).  But copy and pasting a new buffer cache
into simple and barely maintained file systems like ext2 feels at least
a bit questionable.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [RFC PATCH v2 0/4] remove buffer heads from ext2
  2025-04-07  6:25     ` Christoph Hellwig
@ 2025-04-07 16:54       ` Darrick J. Wong
  0 siblings, 0 replies; 11+ messages in thread
From: Darrick J. Wong @ 2025-04-07 16:54 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Catherine Hoang, linux-ext4

On Sun, Apr 06, 2025 at 11:25:09PM -0700, Christoph Hellwig wrote:
> On Fri, Apr 04, 2025 at 09:43:22AM -0700, Darrick J. Wong wrote:
> > On Thu, Mar 27, 2025 at 03:32:42AM -0700, Christoph Hellwig wrote:
> > > On Tue, Mar 25, 2025 at 06:49:24PM -0700, Catherine Hoang wrote:
> > > > Hi all,
> > > > 
> > > > This series is an effort to begin removing buffer heads from ext2. 
> > > 
> > > Why is that desirable?
> > 
> > struct buffer_head is a mismash of things -- originally it was a landing
> > place for the old buffer cache, right?
> 
> Yes.
> 
> > So it has the necessary things
> > like a pointer to a memory page, the disk address, a length, buffer
> > state flags (uptodate/dirty), and some locks.
> 
> Yes. (although the page to folio migration is almost done).

Yeah, now that your folio conversion has landed for xfs_buf, Catherine
should crib your implementation into this one.  Though at least ext2
doesn't have all the discontiguous multi-fsblock stuff to deal with.

> > For filesystem metadata
> > blocks I think that's all that most filesystems really need.
> 
> Yes.
> 
> > Assuming
> > that filesystems /never/ want overlapping metadata buffers, I think it's
> > more efficient to look up buffer objects via an rhashtable instead of
> > walking the address_space xarray to find a folio, and then walking a
> > linked list from that folio to find the particular bh.
> 
> Probably.  But does it make a practical difference for ext2?

Not really, ext2 is just the test vehicle for getting this going.

> > Unfortunately, it also has a bunch of file mapping state information
> > (e.g. BH_Delalloc) that aren't needed for caching metadata blocks.  All
> > the confusion that results from the incohesive mixing of these two
> > usecases goes away by separating out the metadata buffers into a
> > separate cache and (ha) leaving the filesystems to port the file IO
> > paths to iomap.
> 
> Exactly.
> 
> > Separating filesystem metadata buffers into a private datastructure
> > instead of using the blockdev pagecache also closes off an entire class
> > of attack surface where evil userspace can wait for a filesystem to load
> > a metadata block into memory and validate it;
> 
> It does close that window.  OTOH that behavior has traditionally been
> expected very much for extN for use with some tools (IIRC tunefs), so
> changing that would actually break existing userspace.

Ted at least would like to get rid of tune2fs' scribbling on the block
device.  It's really gross in ext4 since tune2fs can race with the
kernel to update sb fields and recalculate the checksum.

> > and then scribble on the
> > pagecache block to cause the filesystem driver to make the wrong
> > decisions -- look at all the ext4 metadata_csum bugs where syzkaller
> > discovered that the decision to call the crc32c driver was gated on a
> > bit in a bufferhead, and setting that bit having not initialized the
> > crc32c driver would lead to a kernel crash.  Nowadays we have
> > CONFIG_BLK_DEV_WRITE_MOUNTED to shut that down, though it defaults to y
> > and I think that might actually break leased layout things like pnfs.
> 
> pNFS scsi/nvme only works for remote nodes, so it's not affected by
> this.  The block layout could theoretically work locally, but it's
> a pretty broken protocol to start with.

Ah.

> > So the upsides are: faster lookups, a more cohesive data structure that
> > only tries to do one thing, and closing attack surfaces.
> > 
> > The downsides: this new buffer cache code still needs: an explicit hook
> > into the dirty pagecache timeout to start its own writeback; to provide
> > its own shrinker; and some sort of solution for file mapping metadata so
> > that fsync can flush just those blocks and not the whole cache.
> 
> The other major downside is instead of one maintained code base we now
> have one per file system.  That's probably okay for well maintained
> file systems with somewhat special needs (say btrfs or xfs), but I'd
> rather not have that for every misc file system using buffer_heads
> right now.
> 
> So if we want to reduce problems with buffer_heads I'd much rather see
> all data path usage go away ASAP.  After that the weird entanglement
> with jbd2 would be the next step (and maybe a private buffer cache for
> ext4/gfs2 is the answer there).  But copy and pasting a new buffer cache
> into simple and barely maintained file systems like ext2 feels at least
> a bit questionable.

The next step is to yank all that code up to fs/ and port another
filesystem (e.g. fat) to it.  But let's let Catherine get it working for
all the metadata types within the confines of ext2.

--D

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2025-04-07 16:54 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-03-26  1:49 [RFC PATCH v2 0/4] remove buffer heads from ext2 Catherine Hoang
2025-03-26  1:49 ` [RFC PATCH v2 1/4] ext2: remove buffer heads from superblock Catherine Hoang
2025-03-28 18:24   ` Darrick J. Wong
2025-03-26  1:49 ` [RFC PATCH v2 2/4] ext2: remove buffer heads from group descriptors Catherine Hoang
2025-03-28 18:29   ` Darrick J. Wong
2025-03-26  1:49 ` [RFC PATCH v2 3/4] ext2: remove buffer heads from quota handling Catherine Hoang
2025-03-26  1:49 ` [RFC PATCH v2 4/4] ext2: remove buffer heads from block bitmaps Catherine Hoang
2025-03-27 10:32 ` [RFC PATCH v2 0/4] remove buffer heads from ext2 Christoph Hellwig
2025-04-04 16:43   ` Darrick J. Wong
2025-04-07  6:25     ` Christoph Hellwig
2025-04-07 16:54       ` Darrick J. Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox