Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* [PATCH 3/4] libext2fs: update iblock when using ea_inode feature
From: Etienne AUJAMES @ 2026-05-29 11:58 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger, dongyangli

When a xattr is stored in an ea_inode, ext2fs_xattrs_* functions do
not update the inode block count.

This patch uses a cached inode to update the block count and quota
when writing xattrs.
It also fix an xattr remove case: the current ACL block was not
release by ext2fs_xattrs_write().

Add a helper function ext2fs_iblk_get() to get the inode block count
in cluster count unit.

Add ext2fs_xattrs_open_inode() to specify an optional cached inode to
use or update.
The function handle an optional quota context argument to update quota
accounting. It is hard to predict inode quota usage with ea_inode
deduplication.

For testing purposes, modify the debugfs "ea_set" command to handle
input file larger than the FS block size.

Add a regression test: d_xattr_ea_inode

Fixes: 50d0998cfe ("libext2fs: add ea_inode support to set xattr")
Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Change-Id: I34733255bb76ffe2386d8cd6c19ce4561be4da3a
Lustre-bug-id: https://jira.whamcloud.com/browse/LU-20049
---
 debugfs/xattrs.c              |  19 ++-
 e2fsck/pass1.c                |  12 +-
 lib/ext2fs/ext2fs.h           |   7 ++
 lib/ext2fs/ext_attr.c         | 227 ++++++++++++++++++++++------------
 lib/ext2fs/i_block.c          |  14 +++
 lib/support/quotaio.h         |   1 -
 tests/d_xattr_ea_inode/expect | 137 ++++++++++++++++++++
 tests/d_xattr_ea_inode/name   |   1 +
 tests/d_xattr_ea_inode/script |  85 +++++++++++++
 9 files changed, 415 insertions(+), 88 deletions(-)
 create mode 100644 tests/d_xattr_ea_inode/expect
 create mode 100644 tests/d_xattr_ea_inode/name
 create mode 100644 tests/d_xattr_ea_inode/script

diff --git a/debugfs/xattrs.c b/debugfs/xattrs.c
index b518941c9..8364281f4 100644
--- a/debugfs/xattrs.c
+++ b/debugfs/xattrs.c
@@ -14,6 +14,7 @@ extern int optind;
 extern char *optarg;
 #endif
 #include <ctype.h>
+#include <unistd.h>
 #include "support/cstring.h"
 
 #include "debugfs.h"
@@ -299,10 +300,24 @@ void do_set_xattr(int argc, ss_argv_t argv, int sci_idx EXT2FS_ATTR((unused)),
 		goto out;
 
 	if (fp) {
-		err = ext2fs_get_mem(current_fs->blocksize, &buf);
+		struct stat st;
+
+		if (fstat(fileno(fp), &st)) {
+			err = errno;
+			goto out;
+		}
+		if (st.st_size > sysconf(_SC_ARG_MAX)) {
+			err = EFBIG;
+			goto out;
+		}
+		err = ext2fs_get_mem(st.st_size, &buf);
 		if (err)
 			goto out;
-		buflen = fread(buf, 1, current_fs->blocksize, fp);
+		buflen = fread(buf, 1, st.st_size, fp);
+		if (ferror(fp)) {
+			err = errno;
+			goto out;
+		}
 	} else {
 		buf = argv[optind + 2];
 		buflen = parse_c_string(buf);
diff --git a/e2fsck/pass1.c b/e2fsck/pass1.c
index fdde76cc2..364128f4d 100644
--- a/e2fsck/pass1.c
+++ b/e2fsck/pass1.c
@@ -968,18 +968,18 @@ static void reserve_block_for_lnf_repair(e2fsck_t ctx)
 
 static errcode_t get_inline_data_ea_size(ext2_filsys fs, ext2_ino_t ino,
 					 struct ext2_inode *inode,
-					 size_t *sz)
+					 size_t inode_size, size_t *sz)
 {
 	void *p;
 	struct ext2_xattr_handle *handle;
 	errcode_t retval;
 
-	retval = ext2fs_xattrs_open(fs, ino, &handle);
+	retval = ext2fs_xattrs_open_inode(fs, ino, inode, inode_size, NULL,
+					  &handle);
 	if (retval)
 		return retval;
 
-	retval = ext2fs_xattrs_read_inode(handle,
-					  (struct ext2_inode_large *)inode);
+	retval = ext2fs_xattrs_read(handle);
 	if (retval)
 		goto err;
 
@@ -1580,6 +1580,7 @@ void e2fsck_pass1(e2fsck_t ctx)
 			size_t size = 0;
 
 			pctx.errcode = get_inline_data_ea_size(fs, ino, inode,
+							       inode_size,
 							       &size);
 			if (!pctx.errcode &&
 			    fix_problem(ctx, PR_1_INLINE_DATA_FEATURE, &pctx)) {
@@ -1603,7 +1604,8 @@ void e2fsck_pass1(e2fsck_t ctx)
 			flags = fs->flags;
 			if (failed_csum)
 				fs->flags |= EXT2_FLAG_IGNORE_CSUM_ERRORS;
-			err = get_inline_data_ea_size(fs, ino, inode, &size);
+			err = get_inline_data_ea_size(fs, ino, inode,
+						      inode_size, &size);
 			fs->flags = (flags & EXT2_FLAG_IGNORE_CSUM_ERRORS) |
 				    (fs->flags & ~EXT2_FLAG_IGNORE_CSUM_ERRORS);
 
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index c4fcb10be..56de5ea50 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -94,6 +94,8 @@ typedef __s64 __bitwise		ext2_off64_t;
 typedef __s64 __bitwise		e2_blkcnt_t;
 typedef __u32 __bitwise		ext2_dirhash_t;
 
+typedef struct quota_ctx *quota_ctx_t;
+
 #if EXT2_FLAT_INCLUDES
 #include "com_err.h"
 #include "ext2_io.h"
@@ -1408,6 +1410,10 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *handle,
 errcode_t ext2fs_xattr_remove(struct ext2_xattr_handle *handle,
 			      const char *key);
 errcode_t ext2fs_xattr_remove_all(struct ext2_xattr_handle *handle);
+errcode_t ext2fs_xattrs_open_inode(ext2_filsys fs, ext2_ino_t ino,
+				   struct ext2_inode *inode, size_t inode_size,
+				   quota_ctx_t qctx,
+				   struct ext2_xattr_handle **handle);
 errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
 			     struct ext2_xattr_handle **handle);
 errcode_t ext2fs_xattrs_close(struct ext2_xattr_handle **handle);
@@ -1607,6 +1613,7 @@ errcode_t ext2fs_iblk_add_blocks(ext2_filsys fs, struct ext2_inode *inode,
 errcode_t ext2fs_iblk_sub_blocks(ext2_filsys fs, struct ext2_inode *inode,
 				 blk64_t num_blocks);
 errcode_t ext2fs_iblk_set(ext2_filsys fs, struct ext2_inode *inode, blk64_t b);
+blk64_t ext2fs_iblk_get(ext2_filsys fs, struct ext2_inode *inode);
 
 /* imager.c */
 extern errcode_t ext2fs_image_inode_write(ext2_filsys fs, int fd, int flags);
diff --git a/lib/ext2fs/ext_attr.c b/lib/ext2fs/ext_attr.c
index 7723d0f91..3b90b70bb 100644
--- a/lib/ext2fs/ext_attr.c
+++ b/lib/ext2fs/ext_attr.c
@@ -22,6 +22,7 @@
 #include "ext2_fs.h"
 #include "ext2_ext_attr.h"
 #include "ext4_acl.h"
+#include "support/quotaio.h"
 
 #include "ext2fsP.h"
 
@@ -361,8 +362,14 @@ struct ext2_xattr_handle {
 	int capacity;
 	int count;
 	int ibody_count;
+	struct ext2_inode *in_inode;
+	size_t in_inode_size;
+	struct ext2_inode_large *alloc_inode;
+	struct ext2_inode_large *inode;
+	size_t inode_size;
 	ext2_ino_t ino;
 	unsigned int flags;
+	quota_ctx_t qctx;
 };
 
 static errcode_t ext2fs_xattrs_expand(struct ext2_xattr_handle *h,
@@ -499,9 +506,11 @@ out:
 	return err;
 }
 
-static errcode_t prep_ea_block_for_write(ext2_filsys fs, ext2_ino_t ino,
-					 struct ext2_inode_large *inode)
+static errcode_t prep_ea_block_for_write(ext2_filsys fs,
+					 struct ext2_xattr_handle *handle)
 {
+	struct ext2_inode_large *inode = handle->inode;
+	ext2_ino_t ino = handle->ino;
 	struct ext2_ext_attr_header *header;
 	void *block_buf = NULL;
 	blk64_t blk, goal;
@@ -541,11 +550,15 @@ static errcode_t prep_ea_block_for_write(ext2_filsys fs, ext2_ino_t ino,
 		if (err)
 			goto out2;
 	} else {
-		/* No block, we must increment i_blocks */
+		/* No block, we must increment i_blocks and quota */
 		err = ext2fs_iblk_add_blocks(fs, (struct ext2_inode *)inode,
 					     1);
 		if (err)
 			goto out;
+
+		if (handle->qctx)
+			quota_data_add(handle->qctx, handle->inode, handle->ino,
+				       EXT2FS_C2B(fs, 1) * fs->blocksize);
 	}
 
 	/* Allocate a block */
@@ -744,8 +757,7 @@ write_xattrs_to_buffer(ext2_filsys fs, struct ext2_xattr *attrs, int count,
 errcode_t ext2fs_xattrs_write(struct ext2_xattr_handle *handle)
 {
 	ext2_filsys fs = handle->fs;
-	const unsigned int inode_size = EXT2_INODE_SIZE(fs->super);
-	struct ext2_inode_large *inode;
+	struct ext2_inode_large *inode = handle->inode;
 	char *start, *block_buf = NULL;
 	struct ext2_ext_attr_header *header;
 	__u32 ea_inode_magic;
@@ -755,21 +767,12 @@ errcode_t ext2fs_xattrs_write(struct ext2_xattr_handle *handle)
 	errcode_t err;
 
 	EXT2_CHECK_MAGIC(handle, EXT2_ET_MAGIC_EA_HANDLE);
-	i = inode_size;
-	if (i < sizeof(*inode))
-		i = sizeof(*inode);
-	err = ext2fs_get_memzero(i, &inode);
-	if (err)
-		return err;
-
-	err = ext2fs_read_inode_full(fs, handle->ino, EXT2_INODE(inode),
-				     inode_size);
-	if (err)
-		goto out;
+	if (!inode)
+		return EINVAL;
 
 	/* If extra_isize isn't set, we need to set it now */
 	if (inode->i_extra_isize == 0 &&
-	    inode_size > EXT2_GOOD_OLD_INODE_SIZE) {
+	    handle->inode_size > EXT2_GOOD_OLD_INODE_SIZE) {
 		char *p = (char *)inode;
 		size_t extra = fs->super->s_want_extra_isize;
 
@@ -778,22 +781,20 @@ errcode_t ext2fs_xattrs_write(struct ext2_xattr_handle *handle)
 		memset(p + EXT2_GOOD_OLD_INODE_SIZE, 0, extra);
 		inode->i_extra_isize = extra;
 	}
-	if (inode->i_extra_isize & 3) {
-		err = EXT2_ET_INODE_CORRUPTED;
-		goto out;
-	}
+	if (inode->i_extra_isize & 3)
+		return EXT2_ET_INODE_CORRUPTED;
 
 	/* Does the inode have space for EA? */
 	if (inode->i_extra_isize < sizeof(inode->i_extra_isize) ||
-	    inode_size <= EXT2_GOOD_OLD_INODE_SIZE + inode->i_extra_isize +
-								sizeof(__u32))
+	    handle->inode_size <= EXT2_GOOD_OLD_INODE_SIZE +
+	    inode->i_extra_isize + sizeof(__u32))
 		goto write_ea_block;
 
 	/* Write the inode EA */
 	ea_inode_magic = EXT2_EXT_ATTR_MAGIC;
 	memcpy(((char *) inode) + EXT2_GOOD_OLD_INODE_SIZE +
 	       inode->i_extra_isize, &ea_inode_magic, sizeof(__u32));
-	storage_size = inode_size - EXT2_GOOD_OLD_INODE_SIZE -
+	storage_size = handle->inode_size - EXT2_GOOD_OLD_INODE_SIZE -
 				inode->i_extra_isize - sizeof(__u32);
 	start = ((char *) inode) + EXT2_GOOD_OLD_INODE_SIZE +
 				inode->i_extra_isize + sizeof(__u32);
@@ -801,17 +802,16 @@ errcode_t ext2fs_xattrs_write(struct ext2_xattr_handle *handle)
 	err = write_xattrs_to_buffer(fs, handle->attrs, handle->ibody_count,
 				     start, storage_size, 0, 0);
 	if (err)
-		goto out;
+		return err;
 write_ea_block:
 	/* Are we done? */
-	if (handle->ibody_count == handle->count &&
-	    !ext2fs_file_acl_block(fs, EXT2_INODE(inode)))
+	if (handle->ibody_count == handle->count)
 		goto skip_ea_block;
 
 	/* Write the EA block */
 	err = ext2fs_get_memzero(fs->blocksize, &block_buf);
 	if (err)
-		goto out;
+		return err;
 
 	storage_size = fs->blocksize - sizeof(struct ext2_ext_attr_header);
 	start = block_buf + sizeof(struct ext2_ext_attr_header);
@@ -820,7 +820,7 @@ write_ea_block:
 				     handle->count - handle->ibody_count, start,
 				     storage_size, start - block_buf, 1);
 	if (err)
-		goto out2;
+		goto out;
 
 	/* Write a header on the EA block */
 	header = (struct ext2_ext_attr_header *) block_buf;
@@ -829,15 +829,15 @@ write_ea_block:
 	header->h_blocks = 1;
 
 	/* Get a new block for writing */
-	err = prep_ea_block_for_write(fs, handle->ino, inode);
+	err = prep_ea_block_for_write(fs, handle);
 	if (err)
-		goto out2;
+		goto out;
 
 	/* Finally, write the new EA block */
 	blk = ext2fs_file_acl_block(fs, EXT2_INODE(inode));
 	err = ext2fs_write_ext_attr3(fs, blk, block_buf, handle->ino);
 	if (err)
-		goto out2;
+		goto out;
 
 skip_ea_block:
 	blk = ext2fs_file_acl_block(fs, (struct ext2_inode *)inode);
@@ -845,19 +845,26 @@ skip_ea_block:
 		/* xattrs shrunk, free the block */
 		err = ext2fs_free_ext_attr(fs, handle->ino, inode);
 		if (err)
-			goto out;
+			return err;
+		if (handle->qctx)
+			quota_data_sub(handle->qctx, inode, handle->ino,
+				       EXT2FS_C2B(fs, 1) * fs->blocksize);
 	}
 
 	/* Write the inode */
 	err = ext2fs_write_inode_full(fs, handle->ino, EXT2_INODE(inode),
-				      inode_size);
+				      handle->inode_size);
 	if (err)
-		goto out2;
+		goto out;
+
+	/* Update the caller cached inode if provided */
+	if (handle->in_inode && handle->in_inode != EXT2_INODE(handle->inode))
+		memcpy(handle->in_inode, EXT2_INODE(handle->inode),
+		       handle->in_inode_size);
 
-out2:
-	ext2fs_free_mem(&block_buf);
 out:
-	ext2fs_free_mem(&inode);
+	ext2fs_free_mem(&block_buf);
+
 	return err;
 }
 
@@ -1130,29 +1137,51 @@ out:
 	return err;
 }
 
-errcode_t ext2fs_xattrs_read(struct ext2_xattr_handle *handle)
+errcode_t ext2fs_xattrs_read(struct ext2_xattr_handle *h)
 {
-	struct ext2_inode_large *inode;
-	size_t inode_size = EXT2_INODE_SIZE(handle->fs->super);
 	errcode_t err;
 
-	EXT2_CHECK_MAGIC(handle, EXT2_ET_MAGIC_EA_HANDLE);
+	EXT2_CHECK_MAGIC(h, EXT2_ET_MAGIC_EA_HANDLE);
+
+	h->inode_size = EXT2_INODE_SIZE(h->fs->super);
+	if (h->inode_size < sizeof(*h->inode))
+		h->inode_size = sizeof(*h->inode);
+
+	/* Use the caller cached inode if possible */
+	if (h->in_inode &&  h->in_inode_size >= h->inode_size) {
+		h->inode = (struct ext2_inode_large *) h->in_inode;
+		goto xattrs_read;
+	}
+
+	/* Flush the caller cached inode if provided */
+	if (h->in_inode) {
+		err = ext2fs_write_inode_full(h->fs, h->ino, h->in_inode,
+					     h->in_inode_size);
+		if (err)
+			goto err;
+	}
 
-	if (inode_size < sizeof(*inode))
-		inode_size = sizeof(*inode);
-	err = ext2fs_get_memzero(inode_size, &inode);
+	err = ext2fs_get_memzero(h->inode_size, &h->alloc_inode);
 	if (err)
 		return err;
 
-	err = ext2fs_read_inode_full(handle->fs, handle->ino, EXT2_INODE(inode),
-				     EXT2_INODE_SIZE(handle->fs->super));
+	err = ext2fs_read_inode_full(h->fs, h->ino, EXT2_INODE(h->alloc_inode),
+				     h->inode_size);
 	if (err)
-		goto out;
+		goto err;
 
-	err = ext2fs_xattrs_read_inode(handle, inode);
+	h->inode = h->alloc_inode;
 
-out:
-	ext2fs_free_mem(&inode);
+xattrs_read:
+	err = ext2fs_xattrs_read_inode(h, h->inode);
+	if (err)
+		goto err;
+
+	return 0;
+
+err:
+	h->inode_size = 0;
+	ext2fs_free_mem(&h->alloc_inode);
 
 	return err;
 }
@@ -1272,12 +1301,15 @@ out:
 	return err;
 }
 
-static errcode_t xattr_create_ea_inode(ext2_filsys fs, const void *value,
-				       size_t value_len, ext2_ino_t *ea_ino)
+static errcode_t xattr_create_ea_inode(struct ext2_xattr_handle *handle,
+				       const void *value, size_t value_len,
+				       ext2_ino_t *ea_ino)
 {
+	ext2_filsys fs = handle->fs;
 	struct ext2_inode inode;
 	ext2_ino_t ino;
 	ext2_file_t file;
+	blk64_t	iblk;
 	__u32 hash;
 	errcode_t ret;
 
@@ -1317,16 +1349,30 @@ static errcode_t xattr_create_ea_inode(ext2_filsys fs, const void *value,
 	if (ret)
 		return ret;
 
+	ret = ext2fs_read_inode(fs, ino, &inode);
+	if (ret)
+		return ret;
+
 	ext2fs_inode_alloc_stats2(fs, ino, 1 /* inuse */, 0 /* isdir */);
+	iblk = ext2fs_iblk_get(fs, &inode);
+	ext2fs_iblk_add_blocks(fs, EXT2_INODE(handle->inode), iblk);
+	if (handle->qctx) {
+		quota_data_add(handle->qctx, handle->inode, handle->ino,
+			       EXT2FS_C2B(fs, iblk) * fs->blocksize);
+		quota_data_inodes(handle->qctx, handle->inode, handle->ino, +1);
+	}
 
 	*ea_ino = ino;
 	return 0;
 }
 
-static errcode_t xattr_inode_dec_ref(ext2_filsys fs, ext2_ino_t ino)
+static errcode_t xattr_inode_dec_ref(struct ext2_xattr_handle *handle,
+				     ext2_ino_t ino)
 {
+	ext2_filsys fs = handle->fs;
 	struct ext2_inode_large inode;
 	__u64 ref_count;
+	blk64_t	iblk;
 	errcode_t ret;
 
 	ret = ext2fs_read_inode_full(fs, ino, (struct ext2_inode *)&inode,
@@ -1338,6 +1384,14 @@ static errcode_t xattr_inode_dec_ref(ext2_filsys fs, ext2_ino_t ino)
 	ref_count--;
 	ext2fs_set_ea_inode_ref(EXT2_INODE(&inode), ref_count);
 
+	iblk = ext2fs_iblk_get(fs, EXT2_INODE(&inode));
+	ext2fs_iblk_sub_blocks(fs, EXT2_INODE(handle->inode), iblk);
+	if (handle->qctx) {
+		quota_data_sub(handle->qctx, handle->inode, handle->ino,
+			       EXT2FS_C2B(fs, iblk) * fs->blocksize);
+		quota_data_inodes(handle->qctx, handle->inode, handle->ino, -1);
+	}
+
 	if (ref_count)
 		goto write_out;
 
@@ -1365,7 +1419,8 @@ out:
 	return ret;
 }
 
-static errcode_t xattr_update_entry(ext2_filsys fs, struct ext2_xattr *x,
+static errcode_t xattr_update_entry(struct ext2_xattr_handle *handle,
+				    struct ext2_xattr *x,
 				    const char *name, const char *short_name,
 				    int index, const void *value,
 				    size_t value_len, int in_inode)
@@ -1390,13 +1445,13 @@ static errcode_t xattr_update_entry(ext2_filsys fs, struct ext2_xattr *x,
 	memcpy(new_value, value, value_len);
 
 	if (in_inode) {
-		ret = xattr_create_ea_inode(fs, value, value_len, &ea_ino);
+		ret = xattr_create_ea_inode(handle, value, value_len, &ea_ino);
 		if (ret)
 			goto fail;
 	}
 
 	if (x->ea_ino) {
-		ret = xattr_inode_dec_ref(fs, x->ea_ino);
+		ret = xattr_inode_dec_ref(handle, x->ea_ino);
 		if (ret)
 			goto fail;
 	}
@@ -1419,7 +1474,7 @@ fail:
 	if (new_value)
 		ext2fs_free_mem(&new_value);
 	if (ea_ino)
-		xattr_inode_dec_ref(fs, ea_ino);
+		xattr_inode_dec_ref(handle, ea_ino);
 	return ret;
 }
 
@@ -1486,7 +1541,7 @@ static errcode_t xattr_array_update(struct ext2_xattr_handle *h,
 		}
 
 		/* Update the existing entry. */
-		ret = xattr_update_entry(h->fs, &h->attrs[old_idx], name,
+		ret = xattr_update_entry(h, &h->attrs[old_idx], name,
 					 shortname, name_idx, value,
 					 value_len, in_inode);
 		if (ret)
@@ -1515,7 +1570,7 @@ static errcode_t xattr_array_update(struct ext2_xattr_handle *h,
 
 	if (old_idx >= 0) {
 		/* Update the existing entry. */
-		ret = xattr_update_entry(h->fs, &h->attrs[old_idx], name,
+		ret = xattr_update_entry(h, &h->attrs[old_idx], name,
 					 shortname, name_idx, value,
 					 value_len, in_inode);
 		if (ret)
@@ -1551,7 +1606,7 @@ add_new:
 			return ret;
 	}
 
-	ret = xattr_update_entry(h->fs, &h->attrs[h->count], name, shortname,
+	ret = xattr_update_entry(h, &h->attrs[h->count], name, shortname,
 				 name_idx, value, value_len, in_inode);
 	if (ret)
 		return ret;
@@ -1594,8 +1649,7 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *h,
 			   size_t value_len)
 {
 	ext2_filsys fs = h->fs;
-	const int inode_size = EXT2_INODE_SIZE(fs->super);
-	struct ext2_inode_large *inode = NULL;
+	struct ext2_inode_large *inode = h->inode;
 	struct ext2_xattr *x;
 	char *new_value;
 	int ibody_free, block_free;
@@ -1605,6 +1659,8 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *h,
 	errcode_t ret;
 
 	EXT2_CHECK_MAGIC(h, EXT2_ET_MAGIC_EA_HANDLE);
+	if (!inode || !h->inode_size)
+		return EINVAL;
 
 	ret = ext2fs_get_mem(value_len, &new_value);
 	if (ret)
@@ -1632,23 +1688,14 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *h,
 			break;
 		}
 	}
-
-	ret = ext2fs_get_memzero(inode_size, &inode);
-	if (ret)
-		goto out;
-	ret = ext2fs_read_inode_full(fs, h->ino,
-				     (struct ext2_inode *)inode,
-				     inode_size);
-	if (ret)
-		goto out;
-	if (inode_size > EXT2_GOOD_OLD_INODE_SIZE) {
+	if (h->inode_size > EXT2_GOOD_OLD_INODE_SIZE) {
 		extra_isize = inode->i_extra_isize;
 		if (extra_isize == 0) {
 			extra_isize = fs->super->s_want_extra_isize;
 			if (extra_isize == 0)
 				extra_isize = sizeof(__u32);
 		}
-		ibody_free = inode_size - EXT2_GOOD_OLD_INODE_SIZE;
+		ibody_free = h->inode_size - EXT2_GOOD_OLD_INODE_SIZE;
 		ibody_free -= extra_isize;
 		/* Extended attribute magic and final null entry. */
 		ibody_free -= sizeof(__u32) * 2;
@@ -1694,8 +1741,6 @@ errcode_t ext2fs_xattr_set(struct ext2_xattr_handle *h,
 write_out:
 	ret = ext2fs_xattrs_write(h);
 out:
-	if (inode)
-		ext2fs_free_mem(&inode);
 	ext2fs_free_mem(&new_value);
 	return ret;
 }
@@ -1712,7 +1757,7 @@ errcode_t ext2fs_xattr_remove(struct ext2_xattr_handle *handle,
 			ext2fs_free_mem(&x->name);
 			ext2fs_free_mem(&x->value);
 			if (x->ea_ino)
-				xattr_inode_dec_ref(handle->fs, x->ea_ino);
+				xattr_inode_dec_ref(handle, x->ea_ino);
 			memmove(x, x + 1, (end - x - 1)*sizeof(*x));
 			memset(end - 1, 0, sizeof(*end));
 			if (x < handle->attrs + handle->ibody_count)
@@ -1736,7 +1781,7 @@ errcode_t ext2fs_xattr_remove_all(struct ext2_xattr_handle *handle)
 		ext2fs_free_mem(&x->name);
 		ext2fs_free_mem(&x->value);
 		if (x->ea_ino)
-			xattr_inode_dec_ref(handle->fs, x->ea_ino);
+			xattr_inode_dec_ref(handle, x->ea_ino);
 	}
 
 	handle->ibody_count = 0;
@@ -1744,8 +1789,14 @@ errcode_t ext2fs_xattr_remove_all(struct ext2_xattr_handle *handle)
 	return ext2fs_xattrs_write(handle);
 }
 
-errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
-			     struct ext2_xattr_handle **handle)
+/* If the inode size is set to EXT2_INODE_SIZE(fs), the input inode is used
+ * directly. Otherwise, the ext2fs_xattrs_* functions operate on a separate copy
+ * (with inline xattrs) and update the caller's cached inode on write.
+ */
+errcode_t ext2fs_xattrs_open_inode(ext2_filsys fs, ext2_ino_t ino,
+				   struct ext2_inode *inode, size_t inode_size,
+				   quota_ctx_t qctx,
+				   struct ext2_xattr_handle **handle)
 {
 	struct ext2_xattr_handle *h;
 	errcode_t err;
@@ -1754,6 +1805,9 @@ errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
 	    !ext2fs_has_feature_inline_data(fs->super))
 		return EXT2_ET_MISSING_EA_FEATURE;
 
+	if (inode && inode_size < sizeof(*inode))
+		return EINVAL;
+
 	err = ext2fs_get_memzero(sizeof(*h), &h);
 	if (err)
 		return err;
@@ -1768,11 +1822,23 @@ errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
 	}
 	h->count = 0;
 	h->ino = ino;
+	h->in_inode = inode;
+	h->in_inode_size = inode_size;
+	h->alloc_inode = NULL;
+	h->inode = NULL;
+	h->inode_size = 0;
 	h->fs = fs;
+	h->qctx = qctx;
 	*handle = h;
 	return 0;
 }
 
+errcode_t ext2fs_xattrs_open(ext2_filsys fs, ext2_ino_t ino,
+			     struct ext2_xattr_handle **handle)
+{
+	return ext2fs_xattrs_open_inode(fs, ino, NULL, 0, NULL, handle);
+}
+
 errcode_t ext2fs_xattrs_close(struct ext2_xattr_handle **handle)
 {
 	struct ext2_xattr_handle *h = *handle;
@@ -1780,6 +1846,7 @@ errcode_t ext2fs_xattrs_close(struct ext2_xattr_handle **handle)
 	EXT2_CHECK_MAGIC(h, EXT2_ET_MAGIC_EA_HANDLE);
 	xattrs_free_keys(h);
 	ext2fs_free_mem(&h->attrs);
+	ext2fs_free_mem(&h->alloc_inode);
 	ext2fs_free_mem(handle);
 	return 0;
 }
diff --git a/lib/ext2fs/i_block.c b/lib/ext2fs/i_block.c
index 2eecf02fc..064a5c989 100644
--- a/lib/ext2fs/i_block.c
+++ b/lib/ext2fs/i_block.c
@@ -88,3 +88,17 @@ errcode_t ext2fs_iblk_set(ext2_filsys fs, struct ext2_inode *inode, blk64_t b)
 		return EOVERFLOW;
 	return 0;
 }
+
+blk64_t ext2fs_iblk_get(ext2_filsys fs, struct ext2_inode *inode)
+{
+	blk64_t	blk = inode->i_blocks;
+
+	if (ext2fs_has_feature_huge_file(fs->super))
+		blk += ((blk64_t) inode->osd2.linux2.l_i_blocks_hi) << 32;
+
+	if (!ext2fs_has_feature_huge_file(fs->super) ||
+	    !(inode->i_flags & EXT4_HUGE_FILE_FL))
+		blk /= fs->blocksize / 512;
+
+	return EXT2FS_B2C(fs, blk);
+}
diff --git a/lib/support/quotaio.h b/lib/support/quotaio.h
index 6152416fb..c76486919 100644
--- a/lib/support/quotaio.h
+++ b/lib/support/quotaio.h
@@ -58,7 +58,6 @@ enum quota_type {
 #define QUOTA_PRJ_BIT (1 << PRJQUOTA)
 #define QUOTA_ALL_BIT (QUOTA_USR_BIT | QUOTA_GRP_BIT | QUOTA_PRJ_BIT)
 
-typedef struct quota_ctx *quota_ctx_t;
 struct dict_t;
 
 struct quota_ctx {
diff --git a/tests/d_xattr_ea_inode/expect b/tests/d_xattr_ea_inode/expect
new file mode 100644
index 000000000..aaad9c5b3
--- /dev/null
+++ b/tests/d_xattr_ea_inode/expect
@@ -0,0 +1,137 @@
+debugfs edit extended attributes with ea_inode feature
+mke2fs -Fq -b 4096 -O ea_inode test.img 1m
+Exit status is 0
+Generate xattr value (8292 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (8292 bytes)
+stat /
+Blockcount: 32
+Exit status is 0
+ea_rm / user.test1
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 18/256 blocks
+Exit status is 0
+
+Generate xattr value (4097 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (4097 bytes)
+stat /
+Blockcount: 24
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 12/128 files (0.0% non-contiguous), 20/256 blocks
+Exit status is 0
+
+Generate xattr value (102 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test2
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test2
+Exit status is 0
+Compare xattr values (102 bytes)
+stat /
+Blockcount: 32
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 12/128 files (0.0% non-contiguous), 21/256 blocks
+Exit status is 0
+
+Generate xattr value (5005 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test2
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test2
+Exit status is 0
+Compare xattr values (5005 bytes)
+stat /
+Blockcount: 40
+Exit status is 0
+ea_rm / user.test2
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 12/128 files (0.0% non-contiguous), 20/256 blocks
+Exit status is 0
+
+Generate xattr value (512 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (512 bytes)
+stat /
+Blockcount: 16
+Exit status is 0
+ea_rm / user.test1
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 18/256 blocks
+Exit status is 0
+
+Generate xattr value (1024 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (1024 bytes)
+stat /
+Blockcount: 16
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 19/256 blocks
+Exit status is 0
+
+Generate xattr value (5000 bytes)
+ea_set -f d_xattr_ea_inode.tmp / user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp / user.test1
+Exit status is 0
+Compare xattr values (5000 bytes)
+stat /
+Blockcount: 24
+Exit status is 0
+ea_rm / user.test1
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 18/256 blocks
+Exit status is 0
+
diff --git a/tests/d_xattr_ea_inode/name b/tests/d_xattr_ea_inode/name
new file mode 100644
index 000000000..9e36dc986
--- /dev/null
+++ b/tests/d_xattr_ea_inode/name
@@ -0,0 +1 @@
+edit extended attributes in debugfs with ea_inode feature
diff --git a/tests/d_xattr_ea_inode/script b/tests/d_xattr_ea_inode/script
new file mode 100644
index 000000000..84104549c
--- /dev/null
+++ b/tests/d_xattr_ea_inode/script
@@ -0,0 +1,85 @@
+#!/bin/bash
+
+if ! test -x $DEBUGFS_EXE; then
+	echo "$test_name: $test_description: skipped (no debugfs)"
+	return 0
+fi
+
+OUT=$test_name.log
+EXP=$test_dir/expect
+VERIFY_FSCK_OPT=-yf
+
+TEST_DATA=$test_name.tmp
+VERIFY_DATA=$test_name.ver.tmp
+
+echo "debugfs edit extended attributes with ea_inode feature" > $OUT.new
+
+d_xattr_ea_inode_check() {
+	local xattr_size=$1
+	local xattr_name=$2
+	local ea_rm=$3
+
+	echo "Generate xattr value ($xattr_size bytes)" >> $OUT.new
+	echo $xattr_size |
+		awk '{srand();for(i=0;i<$1;i++) printf("%c",97+int(rand()*26));}' > $TEST_DATA
+
+	echo "ea_set -f $TEST_DATA / $xattr_name" >> $OUT.new
+	$DEBUGFS -w -R "ea_set -f $TEST_DATA / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+	echo Exit status is $? >> $OUT.new
+
+	echo "ea_get -f $VERIFY_DATA / $xattr_name" >> $OUT.new
+	$DEBUGFS -w -R "ea_get -f $VERIFY_DATA / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+	echo Exit status is $? >> $OUT.new
+
+	echo "Compare xattr values ($xattr_size bytes)" >> $OUT.new
+	diff -u $TEST_DATA $VERIFY_DATA >> $OUT.new
+
+	echo "stat /" >> $OUT.new
+	($DEBUGFS -c -R "stat /" $TMPFILE | grep -Eo "Blockcount: [0-9]+") >> $OUT.new 2>&1
+	echo Exit status is $? >> $OUT.new
+
+	if $ea_rm; then
+		echo "ea_rm / $xattr_name" >> $OUT.new
+		$DEBUGFS -w -R "ea_rm / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+		echo Exit status is $? >> $OUT.new
+	fi
+
+	echo e2fsck $VERIFY_FSCK_OPT -N test_filesys >> $OUT.new
+	$FSCK $VERIFY_FSCK_OPT -N test_filesys $TMPFILE >> $OUT.new 2>&1
+	echo Exit status is $? >> $OUT.new
+	echo >> $OUT.new
+}
+
+truncate -s1M $TMPFILE 2>&1
+
+echo "mke2fs -Fq -b 4096 -O ea_inode test.img 1m" >> $OUT.new
+$MKE2FS -Fq -b 4096 -O ea_inode $TMPFILE 1m > /dev/null 2>&1
+echo Exit status is $? >> $OUT.new
+
+d_xattr_ea_inode_check 8292 user.test1 true
+
+d_xattr_ea_inode_check 4097 user.test1 false
+d_xattr_ea_inode_check 102  user.test2 false
+d_xattr_ea_inode_check 5005 user.test2 true
+d_xattr_ea_inode_check 512  user.test1 true
+
+d_xattr_ea_inode_check 1024 user.test1 false
+d_xattr_ea_inode_check 5000 user.test1 true
+
+sed -f $cmd_dir/filter.sed $OUT.new > $OUT
+
+#
+# Do the verification
+#
+
+rm -f $TMPFILE $TEST_DATA $VERIFY_DATA $OUT.new
+
+if cmp -s $OUT $EXP; then
+	echo "$test_name: $test_description: ok"
+	touch $test_name.ok
+else
+	echo "$test_name: $test_description: failed"
+	diff $DIFF_OPTS $EXP $OUT > $test_name.failed
+fi
+
+unset VERIFY_FSCK_OPT VERIFY_DATA TEST_DATA OUT EXP d_xattr_ea_inode_check
-- 
2.43.7


^ permalink raw reply related

* [PATCH 4/4] libext2fs: add ext2fs_xattrs_release_all() helper
From: Etienne AUJAMES @ 2026-05-29 11:58 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger, dongyangli

This patch adds a helper function ext2fs_xattrs_release_all() which
removes all extended attributes and updates the quota accordingly.

The main purpose of this is to handle ea_inode xattrs in e2fsck when
deleting orphan inodes:

 # e2fsck -yf /tmp/ext4
 e2fsck 1.47.3-wc2 (11-Nov-2025)
 Clearing orphaned inode 12 (uid=0, gid=0, mode=0100644, size=0)
 Pass 1: Checking inodes, blocks, and sizes
 Pass 2: Checking directory structure
 Pass 3: Checking directory connectivity
 Pass 4: Checking reference counts
 Regular filesystem inode 13 has EA_INODE flag set. Clear<y>? yes
 Unattached inode 13
 Connect to /lost+found<y>? yes
 Inode 13 ref count is 2, should be 1.  Fix<y>? yes

fuse2fs, debugfs and mke2fs are updated to use this function and
handle ea_inode on inode deletion.

Update d_xattr_ea_inode to check for the inode deletion case.
Add a regression test: f_orphan_ea_inode

Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Change-Id: I4a84a50d43b8b9aab2dfc352a92256c710a3659e
Lustre-bug-id: https://jira.whamcloud.com/browse/LU-20049
---
 debugfs/debugfs.c                |  33 +++++++--
 e2fsck/super.c                   |  67 +++++++++++-------
 lib/ext2fs/ext2fs.h              |   3 +
 lib/ext2fs/ext_attr.c            |  41 +++++++++++
 misc/create_inode_libarchive.c   |  35 ++++-----
 misc/fuse2fs.c                   | 117 +++++++++++--------------------
 tests/d_xattr_ea_inode/expect    |  51 ++++++++++++++
 tests/d_xattr_ea_inode/script    |  55 ++++++++++-----
 tests/f_orphan_ea_inode/expect.1 |   6 ++
 tests/f_orphan_ea_inode/expect.2 |   7 ++
 tests/f_orphan_ea_inode/image.gz | Bin 0 -> 2139 bytes
 tests/f_orphan_ea_inode/name     |   1 +
 tests/f_orphan_ea_inode/script   |   3 +
 13 files changed, 277 insertions(+), 142 deletions(-)
 create mode 100644 tests/f_orphan_ea_inode/expect.1
 create mode 100644 tests/f_orphan_ea_inode/expect.2
 create mode 100644 tests/f_orphan_ea_inode/image.gz
 create mode 100644 tests/f_orphan_ea_inode/name
 create mode 100644 tests/f_orphan_ea_inode/script

diff --git a/debugfs/debugfs.c b/debugfs/debugfs.c
index b9f248be2..d316293d2 100644
--- a/debugfs/debugfs.c
+++ b/debugfs/debugfs.c
@@ -1861,21 +1861,40 @@ static int release_blocks_proc(ext2_filsys fs, blk64_t *blocknr,
 
 static void kill_file_by_inode(ext2_ino_t inode)
 {
-	struct ext2_inode inode_buf;
+	struct ext2_inode_large *inode_buf;
+	size_t inode_size = EXT2_INODE_SIZE(current_fs->super);
+	errcode_t err;
 
-	if (debugfs_read_inode(inode, &inode_buf, 0))
-		return;
-	ext2fs_set_dtime(current_fs,  &inode_buf);
-	if (debugfs_write_inode(inode, &inode_buf, 0))
+	err = ext2fs_get_memzero(inode_size, &inode_buf);
+	if (err)
 		return;
-	if (ext2fs_inode_has_valid_blocks2(current_fs, &inode_buf)) {
+
+	err = ext2fs_read_inode_full(current_fs, inode, EXT2_INODE(inode_buf),
+				     inode_size);
+	if (err) {
+		com_err(__func__, err, "while reading inode %u", inode);
+		goto out;
+	}
+
+	ext2fs_set_dtime(current_fs,  EXT2_INODE(inode_buf));
+	ext2fs_xattrs_release_all(current_fs, inode, inode_buf, inode_size,
+				  NULL);
+	if (ext2fs_inode_has_valid_blocks2(current_fs, EXT2_INODE(inode_buf))) {
 		blk64_t last_cluster = 0;
 		ext2fs_block_iterate3(current_fs, inode, BLOCK_FLAG_READ_ONLY,
 				      NULL, release_blocks_proc, &last_cluster);
 	}
 	printf("\n");
 	ext2fs_inode_alloc_stats2(current_fs, inode, -1,
-				  LINUX_S_ISDIR(inode_buf.i_mode));
+				  LINUX_S_ISDIR(inode_buf->i_mode));
+
+	err = ext2fs_write_inode_full(current_fs, inode, EXT2_INODE(inode_buf),
+				     inode_size);
+	if (err)
+		com_err(__func__, err, "while writing inode %u", inode);
+
+out:
+	ext2fs_free_mem(&inode_buf);
 }
 
 
diff --git a/e2fsck/super.c b/e2fsck/super.c
index c2ccefd54..1a94ba567 100644
--- a/e2fsck/super.c
+++ b/e2fsck/super.c
@@ -156,13 +156,14 @@ static errcode_t truncate_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
 	struct process_block_struct pb = { 0 };
 	e2_blkcnt_t truncate_block = 0;
 	__u32 truncate_offset = 0;
-	blk64_t blk;
+	blk64_t blk, iblks;
 	int ret_flags;
 	errcode_t retval = 0;
 
 	if (!ext2fs_inode_has_valid_blocks2(fs, EXT2_INODE(inode)))
 		return 0;
 
+	iblks = ext2fs_get_stat_i_blocks(fs, EXT2_INODE(inode));
 	if (inode->i_links_count) {
 		truncate_offset = inode->i_size % fs->blocksize;
 		truncate_block = (e2_blkcnt_t)
@@ -190,6 +191,10 @@ static errcode_t truncate_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
 			"release_inode_blocks");
 
 	ext2fs_iblk_sub_blocks(fs, EXT2_INODE(inode), pb.truncated_blocks);
+	iblks -= ext2fs_get_stat_i_blocks(fs, EXT2_INODE(inode));
+	if (ctx->qctx)
+		quota_data_sub(ctx->qctx, inode, ino, iblks * 512);
+
 	if (!truncate_offset)
 		return 0;
 
@@ -217,17 +222,19 @@ static errcode_t truncate_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
  * not deleted.
  */
 static int release_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
-				struct ext2_inode_large *inode, char *block_buf,
+				struct ext2_inode_large *inode,
+				size_t inode_size, char *block_buf,
 				struct problem_context *pctx)
 {
 	ext2_filsys			fs = ctx->fs;
-	blk64_t				free_blks, ino_blks;
+	blk64_t				free_blks;
+	__u32				free_inodes;
 	char				*buf;
 	errcode_t			err;
 	int				rc = 0;
 
+	free_inodes = fs->super->s_free_inodes_count;
 	free_blks = ext2fs_free_blocks_count(fs->super);
-	ino_blks = ext2fs_get_stat_i_blocks(fs, EXT2_INODE(inode));
 	buf = block_buf + 3 * ctx->fs->blocksize;
 	if (truncate_inode_blocks(ctx, ino, inode, buf, pctx)) {
 		rc = 1;
@@ -236,7 +243,7 @@ static int release_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
 	if (inode->i_links_count)
 		goto update_counts;
 
-	err = ext2fs_free_ext_attr(fs, ino, inode);
+	err = ext2fs_xattrs_release_all(fs, ino, inode, inode_size, ctx->qctx);
 	if (err) {
 		com_err(__func__, err,
 			_("while calling ext2fs_free_ext_attr for inode %u"),
@@ -249,9 +256,8 @@ static int release_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
 
 update_counts:
 	ctx->free_blocks += ext2fs_free_blocks_count(fs->super) - free_blks;
-	ino_blks -= ext2fs_get_stat_i_blocks(fs, EXT2_INODE(inode));
-	if (ctx->qctx)
-		quota_data_sub(ctx->qctx, inode, 0, ino_blks << 9);
+	free_inodes = fs->super->s_free_inodes_count - free_inodes;
+	ctx->free_inodes += free_inodes;
 
 	return rc;
 }
@@ -312,44 +318,55 @@ static int release_orphan_inode(e2fsck_t ctx, ext2_ino_t *ino, char *block_buf)
 {
 	ext2_filsys fs = ctx->fs;
 	struct problem_context pctx;
-	struct ext2_inode_large inode;
+	struct ext2_inode_large *inode;
+	size_t inode_size = EXT2_INODE_SIZE(fs->super);
 	ext2_ino_t next_ino;
+	int rc = 1;
+
+	if (ext2fs_get_memzero(inode_size, &inode))
+		return 1;
 
-	e2fsck_read_inode_full(ctx, *ino, EXT2_INODE(&inode),
-				sizeof(inode), "release_orphan_inode");
+	e2fsck_read_inode_full(ctx, *ino, EXT2_INODE(inode),
+				     inode_size, __func__);
 	clear_problem_context(&pctx);
 	pctx.ino = *ino;
-	pctx.inode = EXT2_INODE(&inode);
-	pctx.str = inode.i_links_count ? _("Truncating") : _("Clearing");
+	pctx.inode = EXT2_INODE(inode);
+	pctx.str = inode->i_links_count ? _("Truncating") : _("Clearing");
 
 	fix_problem(ctx, PR_0_ORPHAN_CLEAR_INODE, &pctx);
 
-	next_ino = inode.i_dtime;
+	next_ino = inode->i_dtime;
 	if (next_ino &&
 	    ((next_ino < EXT2_FIRST_INODE(fs->super)) ||
 	     (next_ino > fs->super->s_inodes_count))) {
 		pctx.ino = next_ino;
 		fix_problem(ctx, PR_0_ORPHAN_ILLEGAL_INODE, &pctx);
-		return 1;
+		goto out;
 	}
 
-	if (release_inode_blocks(ctx, *ino, &inode, block_buf, &pctx))
-		return 1;
+	if (release_inode_blocks(ctx, *ino, inode, inode_size, block_buf,
+				 &pctx))
+		goto out;
 
-	if (!inode.i_links_count) {
+	if (!inode->i_links_count) {
 		if (ctx->qctx)
-			quota_data_inodes(ctx->qctx, &inode, *ino, -1);
+			quota_data_inodes(ctx->qctx, inode, *ino, -1);
 		ext2fs_inode_alloc_stats2(fs, *ino, -1,
-					  LINUX_S_ISDIR(inode.i_mode));
+					  LINUX_S_ISDIR(inode->i_mode));
 		ctx->free_inodes++;
-		ext2fs_set_dtime(fs, EXT2_INODE(&inode));
+		ext2fs_set_dtime(fs, EXT2_INODE(inode));
 	} else {
-		inode.i_dtime = 0;
+		inode->i_dtime = 0;
 	}
-	e2fsck_write_inode_full(ctx, *ino, EXT2_INODE(&inode),
-				sizeof(inode), "delete_file");
+	e2fsck_write_inode_full(ctx, *ino, EXT2_INODE(inode),
+				      inode_size, __func__);
 	*ino = next_ino;
-	return 0;
+	rc = 0;
+
+out:
+	ext2fs_free_mem(&inode);
+
+	return rc;
 }
 
 struct process_orphan_block_data {
diff --git a/lib/ext2fs/ext2fs.h b/lib/ext2fs/ext2fs.h
index 56de5ea50..cb3f1a3a1 100644
--- a/lib/ext2fs/ext2fs.h
+++ b/lib/ext2fs/ext2fs.h
@@ -1425,6 +1425,9 @@ errcode_t ext2fs_xattr_inode_max_size(ext2_filsys fs, ext2_ino_t ino,
 #define XATTR_HANDLE_FLAG_RAW	0x0001
 errcode_t ext2fs_xattrs_flags(struct ext2_xattr_handle *handle,
 			      unsigned int *new_flags, unsigned int *old_flags);
+errcode_t ext2fs_xattrs_release_all(ext2_filsys fs, ext2_ino_t ino,
+				    struct ext2_inode_large *inode,
+				    size_t inode_size, quota_ctx_t qctx);
 extern void ext2fs_ext_attr_block_rehash(struct ext2_ext_attr_header *header,
 					 struct ext2_ext_attr_entry *end);
 extern __u32 ext2fs_get_ea_inode_hash(struct ext2_inode *inode);
diff --git a/lib/ext2fs/ext_attr.c b/lib/ext2fs/ext_attr.c
index 3b90b70bb..2a2e79acd 100644
--- a/lib/ext2fs/ext_attr.c
+++ b/lib/ext2fs/ext_attr.c
@@ -1868,3 +1868,44 @@ errcode_t ext2fs_xattrs_flags(struct ext2_xattr_handle *handle,
 		handle->flags = *new_flags;
 	return 0;
 }
+
+errcode_t ext2fs_xattrs_release_all(ext2_filsys fs, ext2_ino_t ino,
+				    struct ext2_inode_large *inode,
+				    size_t inode_size, quota_ctx_t qctx)
+{
+	struct ext2_xattr_handle *h;
+	errcode_t err = 0;
+
+	if (!ext2fs_has_feature_ea_inode(fs->super)) {
+		blk64_t blk = ext2fs_file_acl_block(fs, EXT2_INODE(inode));
+
+		if (!blk)
+			return 0;
+
+		err = ext2fs_free_ext_attr(fs, ino, inode);
+		if (err || !qctx)
+			return err;
+
+		quota_data_sub(qctx, inode, ino,
+			       EXT2FS_C2B(fs, 1) * fs->blocksize);
+		return 0;
+	}
+
+	err = ext2fs_xattrs_open_inode(fs, ino, EXT2_INODE(inode), inode_size,
+				       qctx, &h);
+	if (err)
+		return err;
+
+	err = ext2fs_xattrs_read(h);
+	if (err)
+		goto out_close;
+
+	err = ext2fs_xattr_remove_all(h);
+	if (err)
+		goto out_close;
+
+out_close:
+	ext2fs_xattrs_close(&h);
+
+	return err;
+}
diff --git a/misc/create_inode_libarchive.c b/misc/create_inode_libarchive.c
index fadf0721f..4736e8c22 100644
--- a/misc/create_inode_libarchive.c
+++ b/misc/create_inode_libarchive.c
@@ -261,46 +261,49 @@ static inline unsigned int __round_up(unsigned int quantity, unsigned int size)
 static int remove_inode(ext2_filsys fs, ext2_ino_t ino)
 {
 	errcode_t ret = 0;
-	struct ext2_inode_large inode;
+	struct ext2_inode_large *inode;
+	size_t inode_size = EXT2_INODE_SIZE(fs->super);
 
-	memset(&inode, 0, sizeof(inode));
-	ret = ext2fs_read_inode_full(fs, ino, (struct ext2_inode *)&inode,
-				     sizeof(inode));
+	ret = ext2fs_get_memzero(inode_size, &inode);
+	if (ret)
+		return ret;
+
+	ret = ext2fs_read_inode_full(fs, ino, EXT2_INODE(inode), inode_size);
 	if (ret)
 		goto out;
 
-	switch (inode.i_links_count) {
+	switch (inode->i_links_count) {
 	case 0:
 		return 0; /* XXX: already done? */
 	case 1:
-		inode.i_links_count--;
-		ext2fs_set_dtime(fs, EXT2_INODE(&inode));
+		inode->i_links_count--;
+		ext2fs_set_dtime(fs, EXT2_INODE(inode));
 		break;
 	default:
-		inode.i_links_count--;
+		inode->i_links_count--;
 	}
 
-	if (inode.i_links_count)
+	if (inode->i_links_count)
 		goto write_out;
 
 	/* Nobody holds this file; free its blocks! */
-	ret = ext2fs_free_ext_attr(fs, ino, &inode);
+	ret = ext2fs_xattrs_release_all(fs, ino, inode, inode_size, NULL);
 	if (ret)
 		goto write_out;
 
-	if (ext2fs_inode_has_valid_blocks2(fs, (struct ext2_inode *)&inode)) {
-		ret = ext2fs_punch(fs, ino, (struct ext2_inode *)&inode, NULL,
-				   0, ~0ULL);
+	if (ext2fs_inode_has_valid_blocks2(fs, EXT2_INODE(inode))) {
+		ret = ext2fs_punch(fs, ino, EXT2_INODE(inode), NULL, 0, ~0ULL);
 		if (ret)
 			goto write_out;
 	}
 
-	ext2fs_inode_alloc_stats2(fs, ino, -1, LINUX_S_ISDIR(inode.i_mode));
+	ext2fs_inode_alloc_stats2(fs, ino, -1, LINUX_S_ISDIR(inode->i_mode));
 
 write_out:
-	ret = ext2fs_write_inode_full(fs, ino, (struct ext2_inode *)&inode,
-				      sizeof(inode));
+	ret = ext2fs_write_inode_full(fs, ino, EXT2_INODE(inode), inode_size);
 out:
+	ext2fs_free_mem(&inode);
+
 	return ret;
 }
 
diff --git a/misc/fuse2fs.c b/misc/fuse2fs.c
index 94e289fab..11141f645 100644
--- a/misc/fuse2fs.c
+++ b/misc/fuse2fs.c
@@ -2274,123 +2274,88 @@ static int fuse2fs_unlink(struct fuse2fs *ff, const char *path,
 	return 0;
 }
 
-static int remove_ea_inodes(struct fuse2fs *ff, ext2_ino_t ino,
-			    struct ext2_inode_large *inode)
+static int remove_inode(struct fuse2fs *ff, ext2_ino_t ino)
 {
 	ext2_filsys fs = ff->fs;
-	struct ext2_xattr_handle *h;
 	errcode_t err;
+	struct ext2_inode_large *inode;
+	size_t inode_size = EXT2_INODE_SIZE(fs->super);
 	int ret = 0;
 
-	/*
-	 * The xattr handle maintains its own private copy of the inode, so
-	 * write ours to disk so that we can read it.
-	 */
-	err = fuse2fs_write_inode(fs, ino, inode);
+	err = ext2fs_get_memzero(inode_size, &inode);
 	if (err)
 		return translate_error(fs, ino, err);
 
-	err = ext2fs_xattrs_open(fs, ino, &h);
-	if (err)
-		return translate_error(fs, ino, err);
-
-	err = ext2fs_xattrs_read(h);
+	err = ext2fs_read_inode_full(fs, ino, EXT2_INODE(inode), inode_size);
 	if (err) {
 		ret = translate_error(fs, ino, err);
-		goto out_close;
-	}
-
-	err = ext2fs_xattr_remove_all(h);
-	if (err) {
-		ret = translate_error(fs, ino, err);
-		goto out_close;
+		goto out;
 	}
-
-out_close:
-	ext2fs_xattrs_close(&h);
-	if (ret)
-		return ret;
-
-	/* Now read the inode back in. */
-	err = fuse2fs_read_inode(fs, ino, inode);
-	if (err)
-		return translate_error(fs, ino, err);
-
-	return 0;
-}
-
-static int remove_inode(struct fuse2fs *ff, ext2_ino_t ino)
-{
-	ext2_filsys fs = ff->fs;
-	errcode_t err;
-	struct ext2_inode_large inode;
-	int ret = 0;
-
-	err = fuse2fs_read_inode(fs, ino, &inode);
-	if (err)
-		return translate_error(fs, ino, err);
-
 	dbg_printf(ff, "%s: put ino=%d links=%d\n", __func__, ino,
-		   inode.i_links_count);
+		   inode->i_links_count);
 
-	if (S_ISDIR(inode.i_mode)) {
+	if (S_ISDIR(inode->i_mode)) {
 		/*
 		 * Caller should have checked that this is an empty directory
 		 * before starting the unlink process.  nlink is usually 2, but
 		 * it could be 1 if this dir ever had more than 65000 subdirs.
 		 * Zero the link count.
 		 */
-		if (!ext2fs_dir_link_empty(EXT2_INODE(&inode)))
-			return translate_error(fs, ino, EXT2_ET_INODE_CORRUPTED);
-		inode.i_links_count = 0;
-		ext2fs_set_dtime(fs, EXT2_INODE(&inode));
+		if (!ext2fs_dir_link_empty(EXT2_INODE(inode))) {
+			ret = translate_error(fs, ino, EXT2_ET_INODE_CORRUPTED);
+			goto out;
+		}
+		inode->i_links_count = 0;
+		ext2fs_set_dtime(fs, EXT2_INODE(inode));
 	} else {
 		/*
 		 * Any other file type can be hardlinked, so all we need to do
 		 * is decrement the nlink.
 		 */
-		if (inode.i_links_count == 0)
-			return translate_error(fs, ino, EXT2_ET_INODE_CORRUPTED);
-		inode.i_links_count--;
-		if (!inode.i_links_count)
-			ext2fs_set_dtime(fs, EXT2_INODE(&inode));
+		if (inode->i_links_count == 0) {
+			ret = translate_error(fs, ino, EXT2_ET_INODE_CORRUPTED);
+			goto out;
+		}
+		inode->i_links_count--;
+		if (!inode->i_links_count)
+			ext2fs_set_dtime(fs, EXT2_INODE(inode));
 	}
 
-	ret = update_ctime(fs, ino, &inode);
+	ret = update_ctime(fs, ino, inode);
 	if (ret)
-		return ret;
+		goto out;
 
 	/* Still linked?  Leave it be. */
-	if (inode.i_links_count)
+	if (inode->i_links_count)
 		goto write_out;
 
-	if (ext2fs_has_feature_ea_inode(fs->super)) {
-		ret = remove_ea_inodes(ff, ino, &inode);
-		if (ret)
-			return ret;
-	}
-
 	/* Nobody holds this file; free its blocks! */
-	err = ext2fs_free_ext_attr(fs, ino, &inode);
-	if (err)
-		return translate_error(fs, ino, err);
+	err = ext2fs_xattrs_release_all(fs, ino, inode, inode_size, NULL);
+	if (err) {
+		ret = translate_error(fs, ino, err);
+		goto out;
+	}
 
-	if (ext2fs_inode_has_valid_blocks2(fs, EXT2_INODE(&inode))) {
-		err = ext2fs_punch(fs, ino, EXT2_INODE(&inode), NULL,
+	if (ext2fs_inode_has_valid_blocks2(fs, EXT2_INODE(inode))) {
+		err = ext2fs_punch(fs, ino, EXT2_INODE(inode), NULL,
 				   0, ~0ULL);
-		if (err)
-			return translate_error(fs, ino, err);
+		if (err) {
+			ret = translate_error(fs, ino, err);
+			goto out;
+		}
 	}
 
 	ext2fs_inode_alloc_stats2(fs, ino, -1,
-				  LINUX_S_ISDIR(inode.i_mode));
+				  LINUX_S_ISDIR(inode->i_mode));
 
 write_out:
-	err = fuse2fs_write_inode(fs, ino, &inode);
+	err = ext2fs_write_inode_full(fs, ino, EXT2_INODE(inode), inode_size);
 	if (err)
-		return translate_error(fs, ino, err);
+		ret = translate_error(fs, ino, err);
+out:
+	ext2fs_free_mem(&inode);
 
-	return 0;
+	return ret;
 }
 
 static int __op_unlink(struct fuse2fs *ff, const char *path)
diff --git a/tests/d_xattr_ea_inode/expect b/tests/d_xattr_ea_inode/expect
index aaad9c5b3..e1878c3dc 100644
--- a/tests/d_xattr_ea_inode/expect
+++ b/tests/d_xattr_ea_inode/expect
@@ -135,3 +135,54 @@ Pass 5: Checking group summary information
 test_filesys: 11/128 files (0.0% non-contiguous), 18/256 blocks
 Exit status is 0
 
+write d_xattr_ea_inode.tmp test_file
+Allocated inode: 12
+Exit status is 0
+
+Generate xattr value (1024 bytes)
+ea_set -f d_xattr_ea_inode.tmp test_file user.test1
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp test_file user.test1
+Exit status is 0
+Compare xattr values (1024 bytes)
+stat test_file
+Blockcount: 16
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 12/128 files (0.0% non-contiguous), 20/256 blocks
+Exit status is 0
+
+Generate xattr value (16384 bytes)
+ea_set -f d_xattr_ea_inode.tmp test_file user.test2
+Exit status is 0
+ea_get -f d_xattr_ea_inode.ver.tmp test_file user.test2
+Exit status is 0
+Compare xattr values (16384 bytes)
+stat test_file
+Blockcount: 48
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 13/128 files (0.0% non-contiguous), 24/256 blocks
+Exit status is 0
+
+rm test_file
+
+Exit status is 0
+e2fsck -yf -N test_filesys
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 11/128 files (0.0% non-contiguous), 18/256 blocks
+Exit status is 0
diff --git a/tests/d_xattr_ea_inode/script b/tests/d_xattr_ea_inode/script
index 84104549c..c24eb6cd5 100644
--- a/tests/d_xattr_ea_inode/script
+++ b/tests/d_xattr_ea_inode/script
@@ -15,32 +15,33 @@ VERIFY_DATA=$test_name.ver.tmp
 echo "debugfs edit extended attributes with ea_inode feature" > $OUT.new
 
 d_xattr_ea_inode_check() {
-	local xattr_size=$1
-	local xattr_name=$2
-	local ea_rm=$3
+	local path=$1
+	local xattr_size=$2
+	local xattr_name=$3
+	local ea_rm=$4
 
 	echo "Generate xattr value ($xattr_size bytes)" >> $OUT.new
 	echo $xattr_size |
 		awk '{srand();for(i=0;i<$1;i++) printf("%c",97+int(rand()*26));}' > $TEST_DATA
 
-	echo "ea_set -f $TEST_DATA / $xattr_name" >> $OUT.new
-	$DEBUGFS -w -R "ea_set -f $TEST_DATA / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+	echo "ea_set -f $TEST_DATA $path $xattr_name" >> $OUT.new
+	$DEBUGFS -w -R "ea_set -f $TEST_DATA $path $xattr_name" $TMPFILE >> $OUT.new 2>&1
 	echo Exit status is $? >> $OUT.new
 
-	echo "ea_get -f $VERIFY_DATA / $xattr_name" >> $OUT.new
-	$DEBUGFS -w -R "ea_get -f $VERIFY_DATA / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+	echo "ea_get -f $VERIFY_DATA $path $xattr_name" >> $OUT.new
+	$DEBUGFS -w -R "ea_get -f $VERIFY_DATA $path $xattr_name" $TMPFILE >> $OUT.new 2>&1
 	echo Exit status is $? >> $OUT.new
 
 	echo "Compare xattr values ($xattr_size bytes)" >> $OUT.new
 	diff -u $TEST_DATA $VERIFY_DATA >> $OUT.new
 
-	echo "stat /" >> $OUT.new
-	($DEBUGFS -c -R "stat /" $TMPFILE | grep -Eo "Blockcount: [0-9]+") >> $OUT.new 2>&1
+	echo "stat $path" >> $OUT.new
+	($DEBUGFS -c -R "stat $path" $TMPFILE | grep -Eo "Blockcount: [0-9]+") >> $OUT.new 2>&1
 	echo Exit status is $? >> $OUT.new
 
 	if $ea_rm; then
-		echo "ea_rm / $xattr_name" >> $OUT.new
-		$DEBUGFS -w -R "ea_rm / $xattr_name" $TMPFILE >> $OUT.new 2>&1
+		echo "ea_rm $path $xattr_name" >> $OUT.new
+		$DEBUGFS -w -R "ea_rm $path $xattr_name" $TMPFILE >> $OUT.new 2>&1
 		echo Exit status is $? >> $OUT.new
 	fi
 
@@ -56,15 +57,33 @@ echo "mke2fs -Fq -b 4096 -O ea_inode test.img 1m" >> $OUT.new
 $MKE2FS -Fq -b 4096 -O ea_inode $TMPFILE 1m > /dev/null 2>&1
 echo Exit status is $? >> $OUT.new
 
-d_xattr_ea_inode_check 8292 user.test1 true
+d_xattr_ea_inode_check / 8292 user.test1 true
 
-d_xattr_ea_inode_check 4097 user.test1 false
-d_xattr_ea_inode_check 102  user.test2 false
-d_xattr_ea_inode_check 5005 user.test2 true
-d_xattr_ea_inode_check 512  user.test1 true
+d_xattr_ea_inode_check / 4097 user.test1 false
+d_xattr_ea_inode_check / 102  user.test2 false
+d_xattr_ea_inode_check / 5005 user.test2 true
+d_xattr_ea_inode_check / 512  user.test1 true
 
-d_xattr_ea_inode_check 1024 user.test1 false
-d_xattr_ea_inode_check 5000 user.test1 true
+d_xattr_ea_inode_check / 1024 user.test1 false
+d_xattr_ea_inode_check / 5000 user.test1 true
+
+# Create and remove a file with ea_inode
+echo "test_file_content" > $TEST_DATA
+echo "write $TEST_DATA test_file" >> $OUT.new
+$DEBUGFS -w -R "write $TEST_DATA test_file" $TMPFILE >> $OUT.new 2>&1
+echo Exit status is $? >> $OUT.new
+echo >> $OUT.new
+
+d_xattr_ea_inode_check test_file 1024  user.test1 false
+d_xattr_ea_inode_check test_file 16384 user.test2 false
+
+echo "rm test_file" >> $OUT.new
+$DEBUGFS -w -R "rm test_file" $TMPFILE >> $OUT.new 2>&1
+echo Exit status is $? >> $OUT.new
+
+echo e2fsck $VERIFY_FSCK_OPT -N test_filesys >> $OUT.new
+$FSCK $VERIFY_FSCK_OPT -N test_filesys $TMPFILE >> $OUT.new 2>&1
+echo Exit status is $? >> $OUT.new
 
 sed -f $cmd_dir/filter.sed $OUT.new > $OUT
 
diff --git a/tests/f_orphan_ea_inode/expect.1 b/tests/f_orphan_ea_inode/expect.1
new file mode 100644
index 000000000..3eba3d718
--- /dev/null
+++ b/tests/f_orphan_ea_inode/expect.1
@@ -0,0 +1,6 @@
+test_filesys: Clearing orphaned inode 12 (uid=0, gid=0, mode=0100644, size=0)
+test_filesys: Clearing orphaned inode 13 (uid=1000, gid=1000, mode=0100644, size=6)
+test_filesys: Clearing orphaned inode 14 (uid=1001, gid=1001, mode=0100644, size=0)
+test_filesys: Clearing orphaned inode 15 (uid=1002, gid=1002, mode=0100644, size=6)
+test_filesys: clean, 13/128 files, 23/256 blocks
+Exit status is 0
diff --git a/tests/f_orphan_ea_inode/expect.2 b/tests/f_orphan_ea_inode/expect.2
new file mode 100644
index 000000000..bf76a5c25
--- /dev/null
+++ b/tests/f_orphan_ea_inode/expect.2
@@ -0,0 +1,7 @@
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 13/128 files (0.0% non-contiguous), 23/256 blocks
+Exit status is 0
diff --git a/tests/f_orphan_ea_inode/image.gz b/tests/f_orphan_ea_inode/image.gz
new file mode 100644
index 0000000000000000000000000000000000000000..95f0e53aebd45e041b4b1f53be7c374c8144c1b7
GIT binary patch
literal 2139
zcmeH`ZB){C6vzM8Hrp}7g@%~MtYvweYvyWYCGO-yK4OW}V<wpzXd)%L%m=7B52Z~^
zQ={e6O5&KqaG4HETqaXUP01LVmg2*F0G7z3!f&v>*_*xGyS^{(x%d9g@7#Mo=U!CN
zx^;`kZBo{(O(w?UlOWkxD8Vt(5&DxMNiEw;EPH7y+Ew~8s5^)kG8_|gs>t&FoaZqP
zhX^f4FF$nsWQ%V%)-Su+(l6Bj)v?|nzO6mdz3EPOer?uZ1ll)s&Ai{i%Nu890TO2K
zshf0{b{<Zv_<<poPxa+^D#yc4?<l;OGK`3u+<CLg<*w-W<I&Z&TQxBKQZK^k`xB!I
z(xSjWKVP%@5e;nGe!!u<P+^A{(m#3T?#y#3+&#B^K;w?n8!FVs+wQPuX=y9mv_BLa
z=6Qv2PmV?vhTVTy7c;X9{Xof3m)GR|oFiSpu)UCoih#ockGL374P)C>)fKnfhQuM3
ztE3o$LWqHZD0#-n3TAl<f}q;T^&2Retol@MQ}DTA*g>%tf_w_xidVV}P2DUekE)eD
zT(0*hq{s=2DLZ|=cfjpP$NK%@=c#ATI5bt|@jRbe)6n{i=2q(KOUp^UXH_U#Uun@v
zibGUrxGp$!c<IY}PVE{YCR5k~q_VDufa}qrdE~6*s9_O;x6eT`O1@ed`+(FRn2_dR
zy?yp+&_3XFs02d_6Stp^N{Bi#Yr8ZraBTIL_E9OTb@d@SI&nN#{_FNup8eKF^qsOO
zqmqFvSOVkM=-~sio=0EA9y8h{L<fM1e&apib)L>gIAYUPiAMJD3-(ra9L!`-503X&
zkE^6#8;v0xBSQ8}OeIL;lcoEH!L8Ob?!DL*>IBEr%AXAqYhC(lf<e&c!9rpa1t~yN
zROqvHMj-DAi1se&rBWj}t3|)vFRr%EM^&ByvH|gy4IU$Un2?-Mx~$FA^ip}&LeisK
zMvT5Alb4%Cyn)UP1b6qGr{*f{1-#`qLPf<rz=XvKHf=M{A^}#ihNMxl8`(<!IBOM7
z=?Vu{qUa390%0_(p($2JAJg$)p88HTI~3?`q@4qkMSSqK%?ZGaTsb#=nv};~c|3`S
z;tkGKjZz$r3up$s<cf@zu0*Io+u5{!G4)1V1%K%aHgejF{rVU<@yn7Td!-oe>@~D*
z0Vc{mZQ6tXP$#y`cS52**Q|e%QQ8=WpBK$J4K_}_l~<2P(lgK;5~jzBUMu8J=*SB{
zu6o87DJL#ukb`Hs$o89qW=2h^DO_^T0O|gm1q7WOdI3WMASpos)5bw|%t04(St-?I
zI<+XXZ&F+7Op7CM4HI=7$IonKxlbh#x3CCTLZL{v1h~&|gDaytp6nCp!M9BLOtX(e
z)ppMY*hgR0Ug#Q`KxpXdycinpU<q<si|6gN=q#7j;8mhaAKp)|TL9rd#RA&=v>uJ!
zB?fibNa<QO0K9aI&P3I~31I2szuSflXauk3xuJ7kIrh1K({`9b!eg3<AD9nik=amQ
z4A7F1hyxGu5`+1`8GRLqB}7xt%qp6>bru0;9?UEe*0yT@Se{ymzx_(1Yw1fztv6+{
z&4>Pe{l|1(NBH&nZ1g*N%U^UYwDJ`y93DU>!vnq1xwwn%W;V>4G3=h2i~ss#c00=v
i9Rno8r?2x(P~Yhtfp-M{Zvr7`V4?Up>pcjvf&Kz@_kh^|

literal 0
HcmV?d00001

diff --git a/tests/f_orphan_ea_inode/name b/tests/f_orphan_ea_inode/name
new file mode 100644
index 000000000..b892ff960
--- /dev/null
+++ b/tests/f_orphan_ea_inode/name
@@ -0,0 +1 @@
+clearing orphan inodes with ea_inode and quota features
diff --git a/tests/f_orphan_ea_inode/script b/tests/f_orphan_ea_inode/script
new file mode 100644
index 000000000..9650d07d0
--- /dev/null
+++ b/tests/f_orphan_ea_inode/script
@@ -0,0 +1,3 @@
+FSCK_OPT=-p
+SECOND_FSCK_OPT="-yf"
+. $cmd_dir/run_e2fsck
-- 
2.43.7


^ permalink raw reply related

* [PATCH 0/4] e2fsck: Fix orphan inodes processing
From: Etienne AUJAMES @ 2026-05-29 11:47 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger, dongyangli

e2fsck does not handle properly orphan inodes.

Case 1: bad free_blocks accounting with extent files

 # e2fsck -v /tmp/ext4
 e2fsck 1.47.3-wc2 (11-Nov-2025)
 Truncating orphaned inode 12 (uid=0, gid=0, mode=0100644, size=4096)
 Setting free blocks count to 2554682 (was 2554683)
 /tmp/ext4: clean, 13/655360 files, 66758/2621440 blocks

 # e2fsck -yf /tmp/ext4
 e2fsck 1.47.3-wc2 (11-Nov-2025)
 Pass 1: Checking inodes, blocks, and sizes
 Pass 2: Checking directory structure
 Pass 3: Checking directory connectivity
 Pass 4: Checking reference counts
 Pass 5: Checking group summary information
 Free blocks count wrong (2554682, counted=2554683).
 Fix<y>? yes

Case 2: e2fsck does not support orphan inodes with ea_inode

 # e2fsck -yf /tmp/ext4                                
 e2fsck 1.47.3-wc2 (11-Nov-2025)
 Clearing orphaned inode 12 (uid=0, gid=0, mode=0100644, size=0)
 Pass 1: Checking inodes, blocks, and sizes
 Pass 2: Checking directory structure
 Pass 3: Checking directory connectivity
 Pass 4: Checking reference counts
 Regular filesystem inode 13 has EA_INODE flag set. Clear<y>? yes
 Unattached inode 13
 Connect to /lost+found<y>? yes
 Inode 13 ref count is 2, should be 1.  Fix<y>? yes
 Pass 5: Checking group summary information

Patch 1 fixes the first case.
Patch 2 includes quota function in libext2fs (required by patch 2).
Patch 3 fixes ext2fs_xattrs_* function to update inode iblk and quota.
Patch 4 fixes the second case.

Bugs tracked by: https://jira.whamcloud.com/browse/LU-20049

Etienne AUJAMES (3):
  e2fsck: fix orphaned extent files handling
  libext2fs: update iblock when using ea_inode feature
  libext2fs: add ext2fs_xattrs_release_all() helper

Li Dongyang (1):
  libext2fs: add quota to libext2fs

 debugfs/debugfs.c                             |  33 +-
 debugfs/xattrs.c                              |  19 +-
 e2fsck/pass1.c                                |  12 +-
 e2fsck/super.c                                | 295 +++++++++---------
 lib/ext2fs/Makefile.in                        |  43 +++
 lib/ext2fs/ext2fs.h                           |  10 +
 lib/ext2fs/ext_attr.c                         | 268 +++++++++++-----
 lib/ext2fs/i_block.c                          |  14 +
 lib/support/quotaio.h                         |   1 -
 misc/create_inode_libarchive.c                |  35 ++-
 misc/fuse2fs.c                                | 117 +++----
 tests/d_xattr_ea_inode/expect                 | 188 +++++++++++
 tests/d_xattr_ea_inode/name                   |   1 +
 tests/d_xattr_ea_inode/script                 | 104 ++++++
 tests/f_orphan_ea_inode/expect.1              |   6 +
 tests/f_orphan_ea_inode/expect.2              |   7 +
 tests/f_orphan_ea_inode/image.gz              | Bin 0 -> 2139 bytes
 tests/f_orphan_ea_inode/name                  |   1 +
 tests/f_orphan_ea_inode/script                |   3 +
 .../f_orphan_truncate_extents_inode/expect.1  |   3 +
 .../f_orphan_truncate_extents_inode/expect.2  |   7 +
 .../f_orphan_truncate_extents_inode/image.gz  | Bin 0 -> 2854 bytes
 tests/f_orphan_truncate_extents_inode/name    |   1 +
 tests/f_orphan_truncate_extents_inode/script  |   3 +
 24 files changed, 842 insertions(+), 329 deletions(-)
 create mode 100644 tests/d_xattr_ea_inode/expect
 create mode 100644 tests/d_xattr_ea_inode/name
 create mode 100644 tests/d_xattr_ea_inode/script
 create mode 100644 tests/f_orphan_ea_inode/expect.1
 create mode 100644 tests/f_orphan_ea_inode/expect.2
 create mode 100644 tests/f_orphan_ea_inode/image.gz
 create mode 100644 tests/f_orphan_ea_inode/name
 create mode 100644 tests/f_orphan_ea_inode/script
 create mode 100644 tests/f_orphan_truncate_extents_inode/expect.1
 create mode 100644 tests/f_orphan_truncate_extents_inode/expect.2
 create mode 100644 tests/f_orphan_truncate_extents_inode/image.gz
 create mode 100644 tests/f_orphan_truncate_extents_inode/name
 create mode 100644 tests/f_orphan_truncate_extents_inode/script

-- 
2.43.7


^ permalink raw reply

* Re: [PATCH v4 09/23] ext4: implement writeback path using iomap
From: Zhang Yi @ 2026-05-29 14:12 UTC (permalink / raw)
  To: Ojaswin Mujoo, Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yangerkun,
	yukuai
In-Reply-To: <ahboWtDYXagrgTR4@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>

Hi, Ojaswin!

On 5/27/2026 8:49 PM, Ojaswin Mujoo wrote:
> On Mon, May 11, 2026 at 03:23:29PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Add the iomap writeback path for ext4 buffered I/O. This introduces:
>>
>>   - ext4_iomap_writepages(): the main writeback entry point.
>>   - ext4_writeback_ops: a new iomap_writeback_ops instance to handle
>>     block mapping and I/O submission.
>>   - A new end I/O worker for converting unwritten extents, updating file
>>     size, and handling DATA_ERR_ABORT after I/O completion.
>>
>> Core implementation details:
>>
>>   - ->writeback_range() callback
>>     Calls ext4_iomap_map_writeback_range() to query the longest range of
>>     existing mapped extents. For performance, when a block range is not
>>     yet allocated, it allocates based on the writeback length and delalloc
>>     extent length, rather than allocating for a single folio at a time.
>>     The folio is then added to an iomap_ioend instance.
>>
>>   - ->writeback_submit() callback
>>     Registers ext4_iomap_end_bio() as the end bio callback. This callback
>>     schedules a worker to handle:
>>     - Unwritten extent conversion.
>>     - i_disksize update after data is written back.
>>     - Journal abort on writeback I/O failure.
> 
> Hi Zhang, the changes look good. I have a few comments below:
>>
>> Key changes and considerations:
>>
>> - Append write and unwritten extents
>>    Since data=ordered mode is not used to prevent stale data exposure
>>    during append writebacks, new blocks are always allocated as unwritten
>>    extents (i.e. always enable dioread_nolock), and i_disksize update is
>>    postponed until I/O completion.
> 
> Makes sense.
> 
>>    Additionally, the deadlock that the
>>    reserve handle was expected to resolve does not occur anymore.
> 
> I guess this is since we don't use ordered data so we can't block on
> starting a txn in end io.

Yep.

> 
>>    Therefore, the end I/O worker can start a normal journal handle
>>    instead of a reserve handle when converting unwritten extents.
>>
>> - Lock ordering
>>    The ->writeback_range() callback runs under the folio lock, requiring
>>    the journal handle to be started under that same lock. This reverses
>>    the order compared to the buffer_head writeback path. The lock ordering
>>    documentation in super.c has been updated accordingly.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>   fs/ext4/ext4.h        |   4 +
>>   fs/ext4/inode.c       | 208 +++++++++++++++++++++++++++++++++++++++++-
>>   fs/ext4/page-io.c     | 126 +++++++++++++++++++++++++
>>   fs/ext4/super.c       |   7 +-
>>   fs/iomap/ioend.c      |   3 +-
>>   include/linux/iomap.h |   1 +
>>   6 files changed, 346 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 4832e7f7db82..078feda47e36 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -1173,6 +1173,8 @@ struct ext4_inode_info {
>>   	 */
>>   	struct list_head i_rsv_conversion_list;
>>   	struct work_struct i_rsv_conversion_work;
>> +	struct list_head i_iomap_ioend_list;
>> +	struct work_struct i_iomap_ioend_work;
>>   
>>   	/*
>>   	 * Transactions that contain inode's metadata needed to complete
>> @@ -3870,6 +3872,8 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
>>   		size_t len);
>>   extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
>>   extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
>> +extern void ext4_iomap_end_io(struct work_struct *work);
>> +extern void ext4_iomap_end_bio(struct bio *bio);
>>   
>>   /* mmp.c */
>>   extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 1ae7d3f4a1c8..a80195bd6f20 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -44,6 +44,7 @@
>>   #include <linux/iversion.h>
>>   
>>   #include "ext4_jbd2.h"
>> +#include "ext4_extents.h"
>>   #include "xattr.h"
>>   #include "acl.h"
>>   #include "truncate.h"
>> @@ -4120,10 +4121,215 @@ static void ext4_iomap_readahead(struct readahead_control *rac)
>>   	iomap_bio_readahead(rac, &ext4_iomap_buffered_read_ops);
>>   }
>>   
>> +static int ext4_iomap_map_one_extent(struct inode *inode,
>> +				     struct ext4_map_blocks *map)
>> +{
>> +	struct extent_status es;
>> +	handle_t *handle = NULL;
>> +	int credits, map_flags;
>> +	int retval;
>> +
>> +	credits = ext4_chunk_trans_blocks(inode, map->m_len);
>> +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits);
>> +	if (IS_ERR(handle))
>> +		return PTR_ERR(handle);
>> +
>> +	map->m_flags = 0;
>> +	/*
>> +	 * It is necessary to look up extent and map blocks under i_data_sem
>> +	 * in write mode, otherwise, the delalloc extent may become stale
>> +	 * during concurrent truncate operations.
>> +	 */
>> +	ext4_fc_track_inode(handle, inode);
>> +	down_write(&EXT4_I(inode)->i_data_sem);
>> +	if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) {
>> +		retval = es.es_len - (map->m_lblk - es.es_lblk);
>> +		map->m_len = min_t(unsigned int, retval, map->m_len);
>> +
>> +		if (ext4_es_is_delayed(&es)) {
> 
> I understand that it is okay for us to rely on extent status ==
> delayed here because we never reclaim delayed es entries and hence we
> are sure to not skip any delayed block allocations here.

Yeah, right.

> 
>> +			map->m_flags |= EXT4_MAP_DELAYED;
>> +			trace_ext4_da_write_pages_extent(inode, map);
>> +			/*
>> +			 * Call ext4_map_create_blocks() to allocate any
>> +			 * delayed allocation blocks. It is possible that
>> +			 * we're going to need more metadata blocks, however
>> +			 * we must not fail because we're in writeback and
>> +			 * there is nothing we can do so it might result in
>> +			 * data loss. So use reserved blocks to allocate
>> +			 * metadata if possible.
>> +			 */
>> +			map_flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
>> +				    EXT4_GET_BLOCKS_METADATA_NOFAIL |
>> +				    EXT4_EX_NOCACHE;
>> +
>> +			retval = ext4_map_create_blocks(handle, inode, map,
>> +							map_flags);
>> +			if (retval > 0)
>> +				ext4_fc_track_range(handle, inode, map->m_lblk,
>> +						map->m_lblk + map->m_len - 1);
>> +			goto out;
>> +		} else if (unlikely(ext4_es_is_hole(&es)))
> 
> Now that you've fixed the partial invalidate in iomap (patch 12/23)
> can we still hit this hole case?

Theoretically, no hole should be encountered; this is just defensive
programming.

> 
>> +			goto out;
>> +
>> +		/* Found written or unwritten extent. */
>> +		map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
>> +		map->m_flags = ext4_es_is_written(&es) ?
>> +			       EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
>> +		goto out;
>> +	}
>> +
>> +	retval = ext4_map_query_blocks(handle, inode, map, EXT4_EX_NOCACHE);
>> +out:
>> +	up_write(&EXT4_I(inode)->i_data_sem);
>> +	ext4_journal_stop(handle);
>> +	return retval < 0 ? retval : 0;
>> +}
>> +
>> +static int ext4_iomap_map_writeback_range(struct iomap_writepage_ctx *wpc,
>> +					  loff_t offset, unsigned int dirty_len)
>> +{
>> +	struct inode *inode = wpc->inode;
>> +	struct super_block *sb = inode->i_sb;
>> +	struct journal_s *journal = EXT4_SB(sb)->s_journal;
>> +	struct ext4_map_blocks map;
>> +	unsigned int blkbits = inode->i_blkbits;
>> +	unsigned int index = offset >> blkbits;
>> +	unsigned int blk_end, blk_len;
>> +	int ret;
>> +
>> +	ret = ext4_emergency_state(sb);
>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	/* Check validity of the cached writeback mapping. */
>> +	if (offset >= wpc->iomap.offset &&
>> +	    offset < wpc->iomap.offset + wpc->iomap.length &&
>> +	    ext4_iomap_valid(inode, &wpc->iomap))
>> +		return 0;
>> +
>> +	blk_len = dirty_len >> blkbits;
>> +	blk_end = min_t(unsigned int, (wpc->wbc->range_end >> blkbits),
>> +				      (UINT_MAX - 1));
> 
> This is an interesting idea. I'm just a bit worried when we have
> range_end == LLONG_MAX (bg flush) and we will always be trying to allocate
> MAX_WRITEPAGES, incase of a slightly fragmented FS, we might keep
> falling into slower mballoc criterias and might waste a lot of time
> scanning the groups.

Actually, MAX_WRITEPAGES is not allocated every time; the allocated
length also depends on the length of data that has already been delayed
for writing, and the smaller value is taken. If the user has indeed
performed delalloc writes on data of up to MAX_WRITEPAGES in length,
then regardless of how fragmented the file system is, I will need to
allocate blocks of that length. Reducing the number of calls is always
beneficial.

> 
>> +	if (blk_end > index + blk_len)
>> +		blk_len = blk_end - index + 1;
>> +
>> +retry:
>> +	map.m_lblk = index;
>> +	map.m_len = min_t(unsigned int, MAX_WRITEPAGES_EXTENT_LEN, blk_len);
>> +	ret = ext4_map_blocks(NULL, inode, &map,
>> +			      EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE);
> 
> Do we really need the IO_SUBMIT flag here now that we are:
> 1. Not using ordered data
> 2. We anyways don't use it in ext4_iomap_map_one_extent().
> 
> I think we can drop it.

We can't drop it, because IO_SUBMIT is also used to avoid the check of
ext4_check_map_extents_env() in ext4_map_blocks() under the writeback
path.

Cheers,
Yi.

> 
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	/*
>> +	 * The map is not a delalloc extent, it must either be a hole
>> +	 * or an extent which have already been allocated.
>> +	 */
>> +	if (!(map.m_flags & EXT4_MAP_DELAYED))
>> +		goto out;
>> +
>> +	/* Map one delalloc extent. */
>> +	ret = ext4_iomap_map_one_extent(inode, &map);
>> +	if (ret < 0) {
>> +		if (ext4_emergency_state(sb))
>> +			return ret;
>> +
>> +		/*
>> +		 * Retry transient ENOSPC errors, if
>> +		 * ext4_count_free_blocks() is non-zero, a commit
>> +		 * should free up blocks.
>> +		 */
>> +		if (ret == -ENOSPC && journal && ext4_count_free_clusters(sb)) {
>> +			jbd2_journal_force_commit_nested(journal);
>> +			goto retry;
>> +		}
>> +
>> +		ext4_msg(sb, KERN_CRIT,
>> +			 "Delayed block allocation failed for inode %llu at logical offset %llu with max blocks %u with error %d",
>> +			 inode->i_ino, (unsigned long long)map.m_lblk,
>> +			 (unsigned int)map.m_len, -ret);
>> +		ext4_msg(sb, KERN_CRIT,
>> +			 "This should not happen!! Data will be lost\n");
>> +		if (ret == -ENOSPC)
>> +			ext4_print_free_blocks(inode);
>> +		return ret;
>> +	}
>> +out:
>> +	ext4_set_iomap(inode, &wpc->iomap, &map, offset, dirty_len, 0);
>> +	return 0;
>> +}
>> +
> 
> <snip>
>>


^ permalink raw reply

* [PATCH 1/4] e2fsck: fix orphaned extent files handling
From: Etienne AUJAMES @ 2026-05-29 11:54 UTC (permalink / raw)
  To: linux-ext4; +Cc: adilger, dongyangli

release_inode_blocks() does not handle corectly multi-levels extent
files: it does not count the non-leaf blocks directly released by
ext2fs_block_iterate3().

This patch relies on ext2fs_get_stat_i_blocks() count for quota update
and ext2fs_free_blocks_count() to count number of blocks released by
release_inode_blocks().

Add regression test: f_orphan_truncate_extents_inode

Signed-off-by: Etienne AUJAMES <eaujames@ddn.com>
Change-Id: Ib0c3aaaa685e7bcfae896617cda03005d19539ff
Lustre-bug-id: https://jira.whamcloud.com/browse/LU-20049
---
 e2fsck/super.c                                | 244 +++++++++---------
 .../f_orphan_truncate_extents_inode/expect.1  |   3 +
 .../f_orphan_truncate_extents_inode/expect.2  |   7 +
 .../f_orphan_truncate_extents_inode/image.gz  | Bin 0 -> 2854 bytes
 tests/f_orphan_truncate_extents_inode/name    |   1 +
 tests/f_orphan_truncate_extents_inode/script  |   3 +
 6 files changed, 133 insertions(+), 125 deletions(-)
 create mode 100644 tests/f_orphan_truncate_extents_inode/expect.1
 create mode 100644 tests/f_orphan_truncate_extents_inode/expect.2
 create mode 100644 tests/f_orphan_truncate_extents_inode/image.gz
 create mode 100644 tests/f_orphan_truncate_extents_inode/name
 create mode 100644 tests/f_orphan_truncate_extents_inode/script

diff --git a/e2fsck/super.c b/e2fsck/super.c
index cfc0919a2..c2ccefd54 100644
--- a/e2fsck/super.c
+++ b/e2fsck/super.c
@@ -62,21 +62,14 @@ static int check_super_value64(e2fsck_t ctx, const char *descr,
 	return 1;
 }
 
-/*
- * helper function to release an inode
- */
 struct process_block_struct {
-	e2fsck_t 	ctx;
-	char 		*buf;
+	e2fsck_t	ctx;
+	char		*buf;
 	struct problem_context *pctx;
-	int		truncating;
-	int		truncate_offset;
 	e2_blkcnt_t	truncate_block;
-	int		truncated_blocks;
-	int		abort;
+	e2_blkcnt_t	truncated_blocks;
 	errcode_t	errcode;
 	blk64_t last_cluster;
-	struct ext2_inode_large *inode;
 };
 
 static int release_inode_block(ext2_filsys fs,
@@ -91,7 +84,6 @@ static int release_inode_block(ext2_filsys fs,
 	struct problem_context	*pctx;
 	blk64_t			blk = *block_nr;
 	blk64_t			cluster = EXT2FS_B2C(fs, *block_nr);
-	int			retval = 0;
 
 	pb = (struct process_block_struct *) priv_data;
 	ctx = pb->ctx;
@@ -111,155 +103,157 @@ static int release_inode_block(ext2_filsys fs,
 	if ((blk < fs->super->s_first_data_block) ||
 	    (blk >= ext2fs_blocks_count(fs->super))) {
 		fix_problem(ctx, PR_0_ORPHAN_ILLEGAL_BLOCK_NUM, pctx);
-	return_abort:
-		pb->abort = 1;
+		pb->errcode = EXT2_ET_BAD_BLOCK_NUM;
 		return BLOCK_ABORT;
 	}
 
 	if (!ext2fs_test_block_bitmap2(fs->block_map, blk)) {
 		fix_problem(ctx, PR_0_ORPHAN_ALREADY_CLEARED_BLOCK, pctx);
-		goto return_abort;
+		pb->errcode = EXT2_ET_BAD_BLOCK_NUM;
+		return BLOCK_ABORT;
 	}
 
 	/*
-	 * If we are deleting an orphan, then we leave the fields alone.
-	 * If we are truncating an orphan, then update the inode fields
-	 * and clean up any partial block data.
+	 * We don't remove direct blocks until we've reached
+	 * the truncation block.
 	 */
-	if (pb->truncating) {
-		/*
-		 * We only remove indirect blocks if they are
-		 * completely empty.
-		 */
-		if (blockcnt < 0) {
-			int	i, limit;
-			blk_t	*bp;
-
-			pb->errcode = io_channel_read_blk64(fs->io, blk, 1,
-							pb->buf);
-			if (pb->errcode)
-				goto return_abort;
-
-			limit = fs->blocksize >> 2;
-			for (i = 0, bp = (blk_t *) pb->buf;
-			     i < limit;	 i++, bp++)
-				if (*bp)
-					return 0;
-		}
-		/*
-		 * We don't remove direct blocks until we've reached
-		 * the truncation block.
-		 */
-		if (blockcnt >= 0 && blockcnt < pb->truncate_block)
-			return 0;
-		/*
-		 * If part of the last block needs truncating, we do
-		 * it here.
-		 */
-		if ((blockcnt == pb->truncate_block) && pb->truncate_offset) {
-			pb->errcode = io_channel_read_blk64(fs->io, blk, 1,
-							pb->buf);
-			if (pb->errcode)
-				goto return_abort;
-			memset(pb->buf + pb->truncate_offset, 0,
-			       fs->blocksize - pb->truncate_offset);
-			pb->errcode = io_channel_write_blk64(fs->io, blk, 1,
-							 pb->buf);
-			if (pb->errcode)
-				goto return_abort;
-		}
-		pb->truncated_blocks++;
-		*block_nr = 0;
-		retval |= BLOCK_CHANGED;
+	if (blockcnt >= 0 && blockcnt < pb->truncate_block)
+		return 0;
+
+	/*
+	 * We only remove indirect blocks if they are
+	 * completely empty.
+	 */
+	if (blockcnt < 0) {
+		int	i, limit;
+		blk_t	*bp;
+
+		pb->errcode = io_channel_read_blk64(fs->io, blk, 1,
+						    pb->buf);
+		if (pb->errcode)
+			return BLOCK_ABORT;
+
+		limit = fs->blocksize >> 2;
+		for (i = 0, bp = (blk_t *) pb->buf;
+		     i < limit;	 i++, bp++)
+			if (*bp)
+				return 0;
 	}
 
-	if (ctx->qctx)
-		quota_data_sub(ctx->qctx, pb->inode, 0, ctx->fs->blocksize);
 	ext2fs_block_alloc_stats2(fs, blk, -1);
-	ctx->free_blocks++;
-	return retval;
+	pb->truncated_blocks++;
+	*block_nr = 0;
+
+	return BLOCK_CHANGED;
 }
 
-/*
- * This function releases an inode.  Returns 1 if an inconsistency was
- * found.  If the inode has a link count, then it is being truncated and
- * not deleted.
- */
-static int release_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
-				struct ext2_inode_large *inode, char *block_buf,
-				struct problem_context *pctx)
+static errcode_t truncate_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
+				       struct ext2_inode_large *inode,
+				       char *block_buf,
+				       struct problem_context *pctx)
 {
-	struct process_block_struct 	pb;
-	ext2_filsys			fs = ctx->fs;
-	blk64_t				blk;
-	errcode_t			retval;
-	__u32				count;
+	ext2_filsys fs = ctx->fs;
+	struct process_block_struct pb = { 0 };
+	e2_blkcnt_t truncate_block = 0;
+	__u32 truncate_offset = 0;
+	blk64_t blk;
+	int ret_flags;
+	errcode_t retval = 0;
 
 	if (!ext2fs_inode_has_valid_blocks2(fs, EXT2_INODE(inode)))
-		goto release_acl;
+		return 0;
 
-	pb.buf = block_buf + 3 * ctx->fs->blocksize;
-	pb.ctx = ctx;
-	pb.abort = 0;
-	pb.errcode = 0;
-	pb.pctx = pctx;
-	pb.last_cluster = 0;
-	pb.inode = inode;
 	if (inode->i_links_count) {
-		pb.truncating = 1;
-		pb.truncate_block = (e2_blkcnt_t)
+		truncate_offset = inode->i_size % fs->blocksize;
+		truncate_block = (e2_blkcnt_t)
 			((EXT2_I_SIZE(inode) + fs->blocksize - 1) /
 			 fs->blocksize);
-		pb.truncate_offset = inode->i_size % fs->blocksize;
-	} else {
-		pb.truncating = 0;
-		pb.truncate_block = 0;
-		pb.truncate_offset = 0;
 	}
-	pb.truncated_blocks = 0;
+
+	pb.buf = block_buf;
+	pb.ctx = ctx;
+	pb.pctx = pctx;
+	pb.truncate_block = truncate_block;
 	retval = ext2fs_block_iterate3(fs, ino, BLOCK_FLAG_DEPTH_TRAVERSE,
 				      block_buf, release_inode_block, &pb);
 	if (retval) {
 		com_err("release_inode_blocks", retval,
 			_("while calling ext2fs_block_iterate for inode %u"),
 			ino);
-		return 1;
+		return retval;
 	}
-	if (pb.abort)
-		return 1;
+	if (pb.errcode)
+		return pb.errcode;
 
 	/* Refresh the inode since ext2fs_block_iterate may have changed it */
 	e2fsck_read_inode_full(ctx, ino, EXT2_INODE(inode), sizeof(*inode),
 			"release_inode_blocks");
 
-	if (pb.truncated_blocks)
-		ext2fs_iblk_sub_blocks(fs, EXT2_INODE(inode),
-				pb.truncated_blocks);
-release_acl:
-	blk = ext2fs_file_acl_block(fs, EXT2_INODE(inode));
-	if (blk) {
-		retval = ext2fs_adjust_ea_refcount3(fs, blk, block_buf, -1,
-				&count, ino);
-		if (retval == EXT2_ET_BAD_EA_BLOCK_NUM) {
-			retval = 0;
-			count = 1;
-		}
-		if (retval) {
-			com_err("release_inode_blocks", retval,
-		_("while calling ext2fs_adjust_ea_refcount2 for inode %u"),
-				ino);
-			return 1;
-		}
-		if (count == 0) {
-			if (ctx->qctx)
-				quota_data_sub(ctx->qctx, inode, 0,
-						ctx->fs->blocksize);
-			ext2fs_block_alloc_stats2(fs, blk, -1);
-			ctx->free_blocks++;
-		}
-		ext2fs_file_acl_block_set(fs, EXT2_INODE(inode), 0);
+	ext2fs_iblk_sub_blocks(fs, EXT2_INODE(inode), pb.truncated_blocks);
+	if (!truncate_offset)
+		return 0;
+
+	/* Is there an initialized block at the end? */
+	retval = ext2fs_bmap2(fs, ino, NULL, NULL, 0,
+			   truncate_block, &ret_flags, &blk);
+	if (retval)
+		return retval;
+	if ((blk == 0) || (ret_flags & BMAP_RET_UNINIT))
+		return 0;
+
+	retval = io_channel_read_blk64(fs->io, blk, 1, block_buf);
+	if (retval)
+		return retval;
+
+	memset(block_buf + truncate_offset, 0, fs->blocksize - truncate_offset);
+	retval = io_channel_write_blk64(fs->io, blk, 1, block_buf);
+
+	return retval;
+}
+
+/*
+ * This function releases an inode.  Returns 1 if an inconsistency was
+ * found.  If the inode has a link count, then it is being truncated and
+ * not deleted.
+ */
+static int release_inode_blocks(e2fsck_t ctx, ext2_ino_t ino,
+				struct ext2_inode_large *inode, char *block_buf,
+				struct problem_context *pctx)
+{
+	ext2_filsys			fs = ctx->fs;
+	blk64_t				free_blks, ino_blks;
+	char				*buf;
+	errcode_t			err;
+	int				rc = 0;
+
+	free_blks = ext2fs_free_blocks_count(fs->super);
+	ino_blks = ext2fs_get_stat_i_blocks(fs, EXT2_INODE(inode));
+	buf = block_buf + 3 * ctx->fs->blocksize;
+	if (truncate_inode_blocks(ctx, ino, inode, buf, pctx)) {
+		rc = 1;
+		goto update_counts;
 	}
-	return 0;
+	if (inode->i_links_count)
+		goto update_counts;
+
+	err = ext2fs_free_ext_attr(fs, ino, inode);
+	if (err) {
+		com_err(__func__, err,
+			_("while calling ext2fs_free_ext_attr for inode %u"),
+			ino);
+		rc = 1;
+		goto update_counts;
+	}
+
+	rc = 0;
+
+update_counts:
+	ctx->free_blocks += ext2fs_free_blocks_count(fs->super) - free_blks;
+	ino_blks -= ext2fs_get_stat_i_blocks(fs, EXT2_INODE(inode));
+	if (ctx->qctx)
+		quota_data_sub(ctx->qctx, inode, 0, ino_blks << 9);
+
+	return rc;
 }
 
 /* Load all quota data in preparation for orphan clearing. */
diff --git a/tests/f_orphan_truncate_extents_inode/expect.1 b/tests/f_orphan_truncate_extents_inode/expect.1
new file mode 100644
index 000000000..b24aae7ad
--- /dev/null
+++ b/tests/f_orphan_truncate_extents_inode/expect.1
@@ -0,0 +1,3 @@
+test_filesys: Truncating orphaned inode 12 (uid=0, gid=0, mode=0100644, size=4096)
+test_filesys: clean, 12/128 files, 75/1024 blocks
+Exit status is 0
diff --git a/tests/f_orphan_truncate_extents_inode/expect.2 b/tests/f_orphan_truncate_extents_inode/expect.2
new file mode 100644
index 000000000..7edff9bce
--- /dev/null
+++ b/tests/f_orphan_truncate_extents_inode/expect.2
@@ -0,0 +1,7 @@
+Pass 1: Checking inodes, blocks, and sizes
+Pass 2: Checking directory structure
+Pass 3: Checking directory connectivity
+Pass 4: Checking reference counts
+Pass 5: Checking group summary information
+test_filesys: 12/128 files (16.7% non-contiguous), 75/1024 blocks
+Exit status is 0
diff --git a/tests/f_orphan_truncate_extents_inode/image.gz b/tests/f_orphan_truncate_extents_inode/image.gz
new file mode 100644
index 0000000000000000000000000000000000000000..30681b879455b936e05d4dcbb4feaed6c4ff1eb9
GIT binary patch
literal 2854
zcmeHI{Z|ub7RHY)J?j?rU>6Hbay;x&EDLO;6|5w)E)secX|xtmD~TnV2mvC7HJUid
zQceX863bCRLDH29M-);L`3R7JUqnJ68v*MeBqBnh8Iq!8LP#>R!|wj*U*PHdF!S7d
zp7*))-g)mcB<cI_uW5H?EnSwC`z_~iz|44l$zvCxXV$_uLK&f*VL@Hf4~zB?`7d(G
z9Z{!g&3Sg&a>DO7t*O4TcjJk~J3vBKc=8_z<Sx_ZjK+W5OQMCD&zoA-ZaHK-k{JHC
zZM3Gff_sHcoAcz(H>U;-V<l(x4%b;(x0@n(<rf>BqHdjt*wL`GuHnw>6mHu1+Nyoz
zW#pC>z>l3<rY6NLyqxh`dRgPmwBTT=F8iEJ6rWe+`*)uuF5F_14mkHWWP@|oHuvqk
zFtsZg@btzm3uh`mVERrPNbQ_SpU6QtArK8AfQ6rCaPSmduP5Cr`@p3+K0lm-sT=o1
z<%}^hZj+oB9rI3o${4Y@Q+VB9n~x6jB~<Rlj7xR8Q=6t^&$)n|E!E7`0sp&trZS&*
zN$6dy6=sCTJUS6JNg!<9@XwVyQqmY<>0I@?)`7e%^~C9E?mTr@Do-uCS66sakURfa
zW69B;b{JjAXwR$EWrD^|QH!GH=*g;l(*nlPsL{fW*GAR_*hxR5OWtxT?7yS4mTGsR
z-Qz{f4kh`)haPQK(8^OkjS!YrD%XD=EexyN<QpQ@MPt2z+0Hrd@>&xA*JqnXp0Db;
z@R>W?h~4>m$^%q(oWgL9Y<7tSE54y~V@noy^{m!D0kZ7tIiJk9Exa3{G2eaS_1|D_
zuJ_aksd10YC-MpwE{|3*BGdA!W{E!kIQd@69%FRuNuW#a1==a%mmR}?CfC>H9(;l|
zG-HiC2x}U<q^`@RR(!qis=;$A|1!j5VKLN|K(&`C=|_#~gBWntYZ0CS7yjY|c7HSJ
zbLQr1hCLQfGzKfSU=xYCw-eBpnqimXdOoNaWMXFDytSd){e0FKs758Nn*1LW`bjPK
z*!pf1ygqYRCz)EndT+j1c=LU1VzVZeXRq3mCRJ`Ag=K{BLC{=-wPDLR^scY!65ji>
zT=#;t?f)P2t&e9#>EbU%9;j(3v#DewfZX;C{}M?EUF5dkN(EFu?JuVcKk`wZO}ZLJ
zt0*z8_n@lhh#%@MBle-mb4L(srYJ(h9TR`*l{Qmq{7>8chmh`TbWV)yBO@c2pZD4s
zB|o5no@S1!NF^@~X2Q@#{}($1NHCcK1+s)Hx%6efn9BYZIa7(k>8w8<mY>^qDMOJq
zqKC_DEs&}<j%_?=E(ck22a)e=1V~ydIIJik>KP<WoQOYn6ah_Mm4GHY1gxQ4@$a4c
z7hHc$#s&yrg7@fLAIvh<tLRBGjx0&uFQ#;Cf7#kjA60Q^0=DstL(d?NU)8Md*dW?n
zz(ZPXS~&1p2-{iaFpB>q+=T?2`jsnb3WGRWn2WqIg{kP<d3`7g)*VDvn6@ghP)(#r
zG`ACZRH=b(W}a5X-G`0l(AaWNVmN{nk&Bh?`yUUOL-+~azZ&{r4(m$}X`@wwNT+;8
zvAP-?x&qc!V^A<2)~t5#N=I(l95A<5q_ifx*_H!U2e7tkCI))6Pba|@Drc#~$RKJW
zM5%L1IQ1~)5HHfc&ReJ?Dg_m;^ZqaPt%T?oT<5``ZxzE<z3`z}i-TaC*S-I7A_Cz&
zd%i`+7F<vy<4u*ZjZsXQvU?S{$=i>&h$kh=6M{@uWd*Hb!{0-#$+%n?u3}zX?8jAr
zy*Q}BRooZxB0u8VoPOa$>Q{JHx>)1@4))@U7PB=G_H~_&IHe5db8tST9uL%0!t)eB
z81Jn+MtR8C*%S!1RoJ&7S53vric2^+Ynz0)de_0%YZs$w+bry)$@@|9$-51W5Kx+D
zM6(JwNX)fPrM%QJh^7|Mkyw)kl9V|5(G<^{t(JD0_VGmi(zhP--;(ce2YNa`^Bc;u
z`+mo3>m6G?M1`W#MlOTkj`ST-(;byJS+B@#_jLtUosXJl>q8euJ;ek<5-Fq7f7OP<
z&ZHPUx(%PI2joaq`u$r243dg0;u|i(-puz@f?oKcID(yyu*iuJ{Q*26{+u1}J!(K<
z7C9WM&!nkznL&rUiTqDHqk<mI!k0}ORMzeC!I}_C4Y+$w4S&NOC>nd>j&GfTBK6=8
z8fr(Rh`$Ae+)3_3_)rgsBRXQd&9?6$dXk$15Hu0EZz*x#io|_OF`x}^4O3P09#26U
zo&>RZB{OAkWApe$P?A%uB$dvXVM;S$&>ZsA4+Um!E%)c-B&%fik(~%`##j8u@G>0z
ztg$9S2Z(5J{|Ve;_|Px3^r!)G%f}cTJ2lVgW|T>eQwB#I@JEYAv~LiDAslF1acg>`
z_sBuk7EHy9#(k?1e<f!Lqe>GmWFfC@Q4ml@G%!BYgnLc449ITRBLDrzzQYIY9o*Wl
Qs3(SfSGqhPU{%0>0O93B*Z=?k

literal 0
HcmV?d00001

diff --git a/tests/f_orphan_truncate_extents_inode/name b/tests/f_orphan_truncate_extents_inode/name
new file mode 100644
index 000000000..6f16502b3
--- /dev/null
+++ b/tests/f_orphan_truncate_extents_inode/name
@@ -0,0 +1 @@
+truncating an orphaned extent-mapped inode in preen mode
diff --git a/tests/f_orphan_truncate_extents_inode/script b/tests/f_orphan_truncate_extents_inode/script
new file mode 100644
index 000000000..fb895e9a4
--- /dev/null
+++ b/tests/f_orphan_truncate_extents_inode/script
@@ -0,0 +1,3 @@
+FSCK_OPT=-p
+SECOND_FSCK_OPT="-yf -E no_optimize_extents"
+. $cmd_dir/run_e2fsck
-- 
2.43.7


^ permalink raw reply related

* Re: [PATCH] ext4: fix quota accounting WARN in bigalloc punch hole
From: Zhang Yi @ 2026-05-29 10:37 UTC (permalink / raw)
  To: Qiliang Yuan
  Cc: tytso, adilger.kernel, jack, ritesh.list, ojaswin, libaokun,
	linux-ext4, linux-kernel, yzjaurora
In-Reply-To: <20260529083445.101050-1-realwujing@gmail.com>

On 5/29/2026 4:34 PM, Qiliang Yuan wrote:
> Hi Zhang Yi,
> 
> On 5/29/2026 3:32 PM, Zhang Yi wrote:
>> On 5/28/2026 6:21 PM, Qiliang Yuan wrote:
>>> __es_remove_extent() -> get_rsvd() already correctly excludes
>>> boundary clusters that still contain delayed blocks from resv_used.
>>> Adding pending to resv_used double-counts those boundary clusters,
>>> erroneously releasing reservations that are still needed.
>>
>> Hmm, the analysis doesn't seem correct to me. Do you mean the
>> following case?
>>
>> # Assume the cluster size is 16KB.
>> xfs_io -f -c "pwrite 12k 4k" /mnt/foo
>> xfs_io -d -c "pwrite 0 4k" /mnt/foo
>> xfs_io -c "fpunch 0 4k" /mnt/foo
>>
>> During the direct I/O write, quota space will be added in
>> ext4_mb_new_blocks() because the EXT4_MB_DELALLOC_RESERVED flag is
>> not set. Therefore, in ext4_es_insert_extent(), we should release the
>> quota reservations, since this cluster has already been allocated.
>>
>> Then, in the third operation (punch hole), it will reclaim the added
>> dqb_curspace. This should not cause an insufficiency.
>>
>> Am I missing something?
> 
> Thanks for the review!  Let me explain the issue with your specific
> example.

Thanks for the explanation, some comments below.

> 
> After step 1 (delalloc write 4KB@12KB in a 16KB cluster), we have:
> 
>   ES tree: [0,1) hole, [1,3) delayed, [3,4) hole   (blocks 0..3)

             [0,3) hole, [3,4) delayed(blocks 0..3)  ?

>   Quota:   dqb_rsvspace += 16KB (one cluster reserved)

Right.

> 
> Step 2 (DIO write 4KB@0KB, RWF_DSYNC):
> 
>   The DIO allocates one cluster, but the mapped extent from
>   ext4_ext_map_blocks() only covers the written range, e.g. [0,0].
> 
>   In ext4_es_insert_extent():
> 
>     a) __es_remove_extent([0,0]) removes the [0,1) hole entry, but
>        there are no delayed extents within [0,0], so resv_used = 0.
> 
>        This is correct: the DIO extent [0,0] does not overlap the
>        delayed region [1,3).

Right.

> 
>     b) __revise_pending() scans outside the newly inserted extent [0,0]:
> 
>        - Left boundary (block 0): the range starts at cluster
>          boundary, no blocks to scan on the left → no pending insert.
> 
>        - Right boundary (block 3): blocks [1,3] are outside [0,0]
>          and are delayed → __revise_pending() inserts a pending
>          reservation and returns pending = 1.
> 
>        This pending reservation means: "cluster containing block 3
>        still has delayed blocks, keep this cluster reserved."

Right.

> 
>     c) Then comes the bug:
> 
>          resv_used += pending;   // resv_used = 0 + 1 = 1
> 
>        This causes ext4_da_update_reserve_space() to release 16KB
>        of quota reservation (dquot_release_reservation_block()).  But
>        block 3 is still delayed!  Its quota reservation should NOT
>        be released — no blocks within [0,0] actually used the delalloc
>        reservation.

No. Because this cluster has already been allocated, it needs to occupy
actual quota space and can no longer use reserved counts, so the
reserved counts must be released.

For better understanding, please see the implementation of
ext4_insert_delayed_blocks(). When a new extent is delallocated to an
already allocated cluster using the the reserved count is not increased
(ext4_clu_alloc_state() returns 2).

> 
>     So after DIO:
>       dqb_rsvspace: originally 16KB, now 0 (incorrectly released)
>       dqb_curspace: 16KB (from ext4_mb_new_blocks)
> 
> Step 3 (punch 4KB@0KB):
> 
>   ext4_remove_blocks() sees that block 0's cluster has a pending
>   reservation → calls ext4_rereserve_cluster() → dquot_reclaim_block()
>   tries to move 16KB from dqb_curspace to dqb_rsvspace.  But
>   dqb_curspace may already be insufficient (depending on whether
>   other allocs/frees have happened), triggering:
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
How exactly did it happen? All data cluster allocations are precisely
calculated. In addition, the current example cannot reproduce the issue
you encountered.

> 
>     WARNING at dquot_reclaim_space_nodirty
> 
> Step 4 (delalloc writeback of block 3):
> 
>   ext4_da_update_reserve_space() → dquot_claim_block() tries to
>   move 16KB from dqb_rsvspace to dqb_curspace.  Since rsvspace was
>   incorrectly released and not fully restored by the punch hole's
>   rereseve, this triggers:
> 
>     WARNING at dquot_claim_space_nodirty
> 
> The key point is: pending from __revise_pending() indicates clusters
> that *still contain delayed blocks outside the newly inserted extent*.
> These clusters' quota reservations must be preserved — get_rsvd()

Whether to reserve a quota is determined not by whether there are
delalloc extents left, but by whether there are only pure delalloc
extents in this cluster range.

Cheers,
Yi.

> inside __es_remove_extent() already correctly excludes them from
> resv_used.  Adding pending to resv_used double-counts them.
> 
> I also found a pre-existing retry loop issue: if __es_insert_extent()
> fails with -ENOMEM on the first pass, resv_used carries a stale value
> back to retry and could be double-released.  I'll include a fix for
> that in v2.
> 


^ permalink raw reply

* Re: [PATCH] ext4: fix fast commit wait/wake bit mapping on 64-bit
From: Baokun Li @ 2026-05-29  9:57 UTC (permalink / raw)
  To: Li Chen
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ojaswin Mujoo,
	Ritesh Harjani, Zhang Yi, linux-ext4, linux-kernel,
	Sashiko AI review
In-Reply-To: <20260513085818.552432-1-me@linux.beauty>

On 2026/5/13 16:58, Li Chen wrote:
> On 64-bit, ext4 dynamic inode states live in the upper half of i_flags,
> and ext4_test_inode_state() applies the corresponding +32 offset.
>
> The fast-commit wait and wake paths open-coded the wait key with the raw
> EXT4_STATE_* value. Add small helpers for the state wait word and bit,
> and use them for the FC_COMMITTING and FC_FLUSHING_DATA waits so the wait
> key follows the same mapping as the state helpers.
>
> Fixes: 857d32f26181 ("ext4: rework fast commit commit path")
> Reported-by: Sashiko AI review <sashiko-bot@kernel.org>
> Signed-off-by: Li Chen <chenl311@chinatelecom.cn>

This cleanup looks good! Just two small nits below. Nits aside:

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/ext4/ext4.h        | 20 +++++++++++++++++
>  fs/ext4/fast_commit.c | 50 ++++++++++++++++---------------------------
>  2 files changed, 38 insertions(+), 32 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 94283a991e5c..6569d1d575a0 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2000,6 +2000,8 @@ EXT4_INODE_BIT_FNS(flag, flags, 0)
>  static inline int ext4_test_inode_state(struct inode *inode, int bit);
>  static inline void ext4_set_inode_state(struct inode *inode, int bit);
>  static inline void ext4_clear_inode_state(struct inode *inode, int bit);
> +static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode);
> +static inline int ext4_inode_state_wait_bit(int bit);

The forward declarations above are only needed for macro-generated functions
that can't be found by name search. These two helpers are spelled out
directly,
so the declarations are unnecessary here.

>  #if (BITS_PER_LONG < 64)
>  EXT4_INODE_BIT_FNS(state, state_flags, 0)
>  
> @@ -2015,6 +2017,24 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>  	/* We depend on the fact that callers will set i_flags */
>  }
>  #endif
> +
> +static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode)
> +{
> +#if (BITS_PER_LONG < 64)
> +	return &EXT4_I(inode)->i_state_flags;
> +#else
> +	return &EXT4_I(inode)->i_flags;
> +#endif
> +}
> +
> +static inline int ext4_inode_state_wait_bit(int bit)
> +{
> +#if (BITS_PER_LONG < 64)
> +	return bit;
> +#else
> +	return bit + 32;
> +#endif
> +}

Would it be cleaner to place these inside the existing #if
(BITS_PER_LONG < 64)
/ #else block alongside ext4_clear_state_flags(), the same way the rest
of the
state helpers are organised? That would also avoid repeating the #if guard.


Cheers,
Baokun

>  #else
>  /* Assume that user mode programs are passing in an ext4fs superblock, not
>   * a kernel struct super_block.  This will allow us to call the feature-test
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index b3c22636251d..1775bce9649a 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -239,6 +239,8 @@ void ext4_fc_del(struct inode *inode)
>  	struct ext4_inode_info *ei = EXT4_I(inode);
>  	struct ext4_fc_dentry_update *fc_dentry;
>  	wait_queue_head_t *wq;
> +	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
> +	int wait_bit = ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
>  	int alloc_ctx;
>  
>  	if (ext4_fc_disabled(inode->i_sb))
> @@ -268,17 +270,9 @@ void ext4_fc_del(struct inode *inode)
>  	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
>  		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
>  	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
> -#if (BITS_PER_LONG < 64)
> -		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
> -				EXT4_STATE_FC_FLUSHING_DATA);
> -		wq = bit_waitqueue(&ei->i_state_flags,
> -				   EXT4_STATE_FC_FLUSHING_DATA);
> -#else
> -		DEFINE_WAIT_BIT(wait, &ei->i_flags,
> -				EXT4_STATE_FC_FLUSHING_DATA);
> -		wq = bit_waitqueue(&ei->i_flags,
> -				   EXT4_STATE_FC_FLUSHING_DATA);
> -#endif
> +		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
> +
> +		wq = bit_waitqueue(wait_word, wait_bit);
>  		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
>  		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
>  			ext4_fc_unlock(inode->i_sb, alloc_ctx);
> @@ -542,6 +536,8 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_inode_info *ei = EXT4_I(inode);
>  	wait_queue_head_t *wq;
> +	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
> +	int wait_bit = ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
>  	int ret;
>  
>  	if (S_ISDIR(inode->i_mode))
> @@ -564,17 +560,9 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
>  	lockdep_assert_not_held(&ei->i_data_sem);
>  
>  	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
> -#if (BITS_PER_LONG < 64)
> -		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
> -				EXT4_STATE_FC_COMMITTING);
> -		wq = bit_waitqueue(&ei->i_state_flags,
> -				   EXT4_STATE_FC_COMMITTING);
> -#else
> -		DEFINE_WAIT_BIT(wait, &ei->i_flags,
> -				EXT4_STATE_FC_COMMITTING);
> -		wq = bit_waitqueue(&ei->i_flags,
> -				   EXT4_STATE_FC_COMMITTING);
> -#endif
> +		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
> +
> +		wq = bit_waitqueue(wait_word, wait_bit);
>  		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
>  		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
>  			schedule();
> @@ -1034,6 +1022,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  	int ret = 0;
>  	u32 crc = 0;
>  	int alloc_ctx;
> +	int flushing_wait_bit =
> +		ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
>  
>  	/*
>  	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
> @@ -1059,11 +1049,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
>  		ext4_clear_inode_state(&iter->vfs_inode,
>  				       EXT4_STATE_FC_FLUSHING_DATA);
> -#if (BITS_PER_LONG < 64)
> -		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
> -#else
> -		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
> -#endif
> +		wake_up_bit(ext4_inode_state_wait_word(&iter->vfs_inode),
> +			    flushing_wait_bit);
>  	}
>  
>  	/*
> @@ -1279,6 +1266,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
>  	struct ext4_inode_info *ei;
>  	struct ext4_fc_dentry_update *fc_dentry;
>  	int alloc_ctx;
> +	int committing_wait_bit =
> +		ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
>  
>  	if (full && sbi->s_fc_bh)
>  		sbi->s_fc_bh = NULL;
> @@ -1315,11 +1304,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
>  		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
>  		 */
>  		smp_mb();
> -#if (BITS_PER_LONG < 64)
> -		wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
> -#else
> -		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
> -#endif
> +		wake_up_bit(ext4_inode_state_wait_word(&ei->vfs_inode),
> +			    committing_wait_bit);
>  	}
>  
>  	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {



^ permalink raw reply

* [tytso-ext4:dev 1/5] ERROR: modpost: "ext4fs_dirhash" [fs/ext4/ext4-test.ko] undefined!
From: kernel test robot @ 2026-05-29  9:52 UTC (permalink / raw)
  To: Guan-Chun Wu; +Cc: oe-kbuild-all, linux-ext4, Theodore Ts'o, Chen Hao Yu

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
head:   87280ac61ae6ef27fc6ee78b733689754e425592
commit: a4841d91a7f57d38d3a63c18b4ce68ee5e06829b [1/5] ext4: add Kunit coverage for directory hash computation
config: x86_64-rhel-9.4-kunit (https://download.01.org/0day-ci/archive/20260529/202605291142.INrrPtVc-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260529/202605291142.INrrPtVc-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605291142.INrrPtVc-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "ext4fs_dirhash" [fs/ext4/ext4-test.ko] undefined!

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH v4 08/23] ext4: implement buffered write path using iomap
From: Zhang Yi @ 2026-05-29  9:13 UTC (permalink / raw)
  To: Ojaswin Mujoo
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	libaokun, jack, ritesh.list, djwong, hch, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <ahXUBgoivo5CFl39@li-dc0c254c-257c-11b2-a85c-98b6c1322444.ibm.com>

Hi, Ojaswin!

On 5/27/2026 1:10 AM, Ojaswin Mujoo wrote:
> On Mon, May 11, 2026 at 03:23:28PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> Introduce two new iomap_ops instances for ext4 buffered writes:
>>
>>  - ext4_iomap_buffered_da_write_ops: for delayed allocation mode, using
>>    ext4_da_map_blocks() to map delalloc extents.
>>  - ext4_iomap_buffered_write_ops: for non-delayed allocation mode, using
>>    ext4_iomap_get_blocks() to directly allocate blocks.
>>
>> Also add ext4_iomap_valid() for the iomap infrastructure to check extent
>> validity.
>>
>> Key changes and considerations:
>>
>>  - Unwritten extents for new blocks (dioread_nolock always on)
>>    Since data=ordered mode is not used to prevent stale data exposure in
>>    the non-delayed allocation path, new blocks are always allocated as
>>    unwritten extents.
> 
> Okay makes sense.
> 
>>
>>  - Short write and write failure handling
>>    a. Delalloc path: On short write or failure, the stale delalloc range
>>       must be dropped and its space reservation released. Otherwise, a
>>       clean folio may cover leftover delalloc extents, causing
>>       inaccurate space reservation accounting.
> 
> Hmm, okay so in the usual buffer head path, seems like during a short
> write we still zero the new buffers we couldn't write and keep it dirty
> (folio_zero_new_buffers()). This way they are still written back and
> the delalloc reservations are used up.
> 

In fact, in the normal buffer head path, writeback does not consume
delalloc reservations. Instead, the reservations are retained until the
inode is released or the area is written again using delalloc. This is
because i_size is not updated during short writes. Therefore, when a
zeroed dirty folio is written back, no block mapping is created for it.
For details, please see the lblk >= blocks judgment in
mpage_process_page_bufs().

This will not lead to duplicate space statistics, because
ext4_da_map_blocks() only reserves space when inserting a new delalloc
extent. Therefore, this does not pose a serious issue. However, It may
cause some temporary and minor space leaks. Nevertheless, I think it
would be better if delalloc extents could be released for the buffer
head path when short writes occur.

> However in iomap we don't mark the range that we couldnt write as dirty
> so we need to make sure we clear up the stale delalloc mappings. Is this
> correct?
> 
Yeah.

Thanks,
Yi.

> Regards,
> Ojaswin
> 
>>    b. Non-delalloc path: No cleanup of allocated blocks is needed on
>>       short write.
>>
>>  - Lock ordering reversal
>>    The folio lock and transaction start ordering is reversed compared to
>>    the buffer_head buffered write path. To handle this, the journal
>>    handle must be stopped in iomap_begin() callbacks. The lock ordering
>>    documentation in super.c has been updated accordingly.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>  fs/ext4/ext4.h  |   4 ++
>>  fs/ext4/file.c  |  20 +++++-
>>  fs/ext4/inode.c | 165 +++++++++++++++++++++++++++++++++++++++++++++++-
>>  fs/ext4/super.c |  10 ++-
>>  4 files changed, 192 insertions(+), 7 deletions(-)
>>
>> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
>> index 1e27d73d7427..4832e7f7db82 100644
>> --- a/fs/ext4/ext4.h
>> +++ b/fs/ext4/ext4.h
>> @@ -3057,6 +3057,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>>  int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>>  				struct buffer_head *bh);
>>  void ext4_set_inode_mapping_order(struct inode *inode);
>> +int ext4_nonda_switch(struct super_block *sb);
>>  #define FALL_BACK_TO_NONDELALLOC 1
>>  #define CONVERT_INLINE_DATA	 2
>>  
>> @@ -3926,6 +3927,9 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
>>  
>>  extern const struct iomap_ops ext4_iomap_ops;
>>  extern const struct iomap_ops ext4_iomap_report_ops;
>> +extern const struct iomap_ops ext4_iomap_buffered_write_ops;
>> +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
>> +extern const struct iomap_write_ops ext4_iomap_write_ops;
>>  
>>  static inline int ext4_buffer_uptodate(struct buffer_head *bh)
>>  {
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index eb1a323962b1..7f9bfbbc4a4e 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -299,6 +299,21 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>>  	return count;
>>  }
>>  
>> +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
>> +					 struct iov_iter *from)
>> +{
>> +	struct inode *inode = file_inode(iocb->ki_filp);
>> +	const struct iomap_ops *iomap_ops;
>> +
>> +	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
>> +		iomap_ops = &ext4_iomap_buffered_da_write_ops;
>> +	else
>> +		iomap_ops = &ext4_iomap_buffered_write_ops;
>> +
>> +	return iomap_file_buffered_write(iocb, from, iomap_ops,
>> +					 &ext4_iomap_write_ops, NULL);
>> +}
>> +
>>  static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>>  					struct iov_iter *from)
>>  {
>> @@ -313,7 +328,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>>  	if (ret <= 0)
>>  		goto out;
>>  
>> -	ret = generic_perform_write(iocb, from);
>> +	if (ext4_inode_buffered_iomap(inode))
>> +		ret = ext4_iomap_buffered_write(iocb, from);
>> +	else
>> +		ret = generic_perform_write(iocb, from);
>>  
>>  out:
>>  	inode_unlock(inode);
>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>> index 39577a6b65b9..1ae7d3f4a1c8 100644
>> --- a/fs/ext4/inode.c
>> +++ b/fs/ext4/inode.c
>> @@ -3097,7 +3097,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
>>  	return ret;
>>  }
>>  
>> -static int ext4_nonda_switch(struct super_block *sb)
>> +int ext4_nonda_switch(struct super_block *sb)
>>  {
>>  	s64 free_clusters, dirty_clusters;
>>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
>> @@ -3467,6 +3467,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
>>  	return inode_state_read_once(inode) & I_DIRTY_DATASYNC;
>>  }
>>  
>> +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
>> +{
>> +	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
>> +}
>> +
>> +const struct iomap_write_ops ext4_iomap_write_ops = {
>> +	.iomap_valid = ext4_iomap_valid,
>> +};
>> +
>>  static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>>  			   struct ext4_map_blocks *map, loff_t offset,
>>  			   loff_t length, unsigned int flags)
>> @@ -3501,6 +3510,8 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>>  	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>>  		iomap->flags |= IOMAP_F_MERGED;
>>  
>> +	iomap->validity_cookie = map->m_seq;
>> +
>>  	/*
>>  	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
>>  	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
>> @@ -3908,8 +3919,12 @@ const struct iomap_ops ext4_iomap_report_ops = {
>>  	.iomap_begin = ext4_iomap_begin_report,
>>  };
>>  
>> +/* Map blocks */
>> +typedef int (ext4_get_blocks_t)(struct inode *, struct ext4_map_blocks *);
>> +
>>  static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
>> -		loff_t length, struct ext4_map_blocks *map)
>> +		loff_t length, ext4_get_blocks_t get_blocks,
>> +		struct ext4_map_blocks *map)
>>  {
>>  	u8 blkbits = inode->i_blkbits;
>>  
>> @@ -3921,6 +3936,9 @@ static int ext4_iomap_map_blocks(struct inode *inode, loff_t offset,
>>  	map->m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>>  			   EXT4_MAX_LOGICAL_BLOCK) - map->m_lblk + 1;
>>  
>> +	if (get_blocks)
>> +		return get_blocks(inode, map);
>> +
>>  	return ext4_map_blocks(NULL, inode, map, 0);
>>  }
>>  
>> @@ -3938,7 +3956,7 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
>>  	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>>  		return -ERANGE;
>>  
>> -	ret = ext4_iomap_map_blocks(inode, offset, length, &map);
>> +	ret = ext4_iomap_map_blocks(inode, offset, length, NULL, &map);
>>  	if (ret < 0)
>>  		return ret;
>>  
>> @@ -3946,6 +3964,147 @@ static int ext4_iomap_buffered_read_begin(struct inode *inode, loff_t offset,
>>  	return 0;
>>  }
>>  
>> +static int ext4_iomap_get_blocks(struct inode *inode,
>> +				 struct ext4_map_blocks *map)
>> +{
>> +	loff_t i_size = i_size_read(inode);
>> +	handle_t *handle;
>> +	int ret;
>> +
>> +	/*
>> +	 * Check if the blocks have already been allocated, this could
>> +	 * avoid initiating a new journal transaction and return the
>> +	 * mapping information directly.
>> +	 */
>> +	if ((map->m_lblk + map->m_len) <=
>> +	    round_up(i_size, i_blocksize(inode)) >> inode->i_blkbits) {
>> +		ret = ext4_map_blocks(NULL, inode, map, 0);
>> +		if (ret < 0)
>> +			return ret;
>> +		if (map->m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN |
>> +				    EXT4_MAP_DELAYED))
>> +			return 0;
>> +	}
>> +
>> +	handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
>> +			ext4_chunk_trans_blocks(inode, map->m_len));
>> +	if (IS_ERR(handle))
>> +		return PTR_ERR(handle);
>> +
>> +	ret = ext4_map_blocks(handle, inode, map,
>> +			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
>> +	/*
>> +	 * Stop handle here following the lock ordering of the folio lock
>> +	 * and the transaction start.
>> +	 */
>> +	ext4_journal_stop(handle);
>> +
>> +	return ret;
>> +}
>> +
>> +static int ext4_iomap_buffered_do_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap, bool delalloc)
>> +{
>> +	int ret, retries = 0;
>> +	struct ext4_map_blocks map;
>> +	ext4_get_blocks_t *get_blocks;
>> +
>> +	ret = ext4_emergency_state(inode->i_sb);
>> +	if (unlikely(ret))
>> +		return ret;
>> +
>> +	/* Inline data and non-extent are not supported. */
>> +	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>> +		return -ERANGE;
>> +	if (WARN_ON_ONCE(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
>> +		return -EINVAL;
>> +	if (WARN_ON_ONCE(!(flags & IOMAP_WRITE)))
>> +		return -EINVAL;
>> +
>> +	if (delalloc)
>> +		get_blocks = ext4_da_map_blocks;
>> +	else
>> +		get_blocks = ext4_iomap_get_blocks;
>> +retry:
>> +	ret = ext4_iomap_map_blocks(inode, offset, length, get_blocks, &map);
>> +	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
>> +		goto retry;
>> +	if (ret < 0)
>> +		return ret;
>> +
>> +	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
>> +	return 0;
>> +}
>> +
>> +static int ext4_iomap_buffered_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap)
>> +{
>> +	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
>> +						  iomap, srcmap, false);
>> +}
>> +
>> +static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
>> +		loff_t offset, loff_t length, unsigned int flags,
>> +		struct iomap *iomap, struct iomap *srcmap)
>> +{
>> +	return ext4_iomap_buffered_do_write_begin(inode, offset, length, flags,
>> +						  iomap, srcmap, true);
>> +}
>> +
>> +/*
>> + * On write failure, drop the stale delayed allocation range and release
>> + * its reserved space for both start and end blocks. Otherwise, we may
>> + * leave a range of delayed extents covered by a clean folio, which can
>> + * result in inaccurate space reservation accounting.
>> + */
>> +static void ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
>> +				     loff_t length, struct iomap *iomap)
>> +{
>> +	down_write(&EXT4_I(inode)->i_data_sem);
>> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
>> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
>> +	up_write(&EXT4_I(inode)->i_data_sem);
>> +}
>> +
>> +static int ext4_iomap_buffered_da_write_end(struct inode *inode, loff_t offset,
>> +					    loff_t length, ssize_t written,
>> +					    unsigned int flags,
>> +					    struct iomap *iomap)
>> +{
>> +	loff_t start_byte, end_byte;
>> +
>> +	/* If we didn't reserve the blocks, we're not allowed to punch them. */
>> +	if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
>> +		return 0;
>> +
>> +	/* Nothing to do if we've written the entire delalloc extent */
>> +	start_byte = iomap_last_written_block(inode, offset, written);
>> +	end_byte = round_up(offset + length, i_blocksize(inode));
>> +	if (start_byte >= end_byte)
>> +		return 0;
>> +
>> +	filemap_invalidate_lock(inode->i_mapping);
>> +	iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
>> +				     iomap, ext4_iomap_punch_delalloc);
>> +	filemap_invalidate_unlock(inode->i_mapping);
>> +	return 0;
>> +}
>> +
>> +/*
>> + * Since we always allocate unwritten extents, there is no need for
>> + * iomap_end to clean up allocated blocks on a short write.
>> + */
>> +const struct iomap_ops ext4_iomap_buffered_write_ops = {
>> +	.iomap_begin = ext4_iomap_buffered_write_begin,
>> +};
>> +
>> +const struct iomap_ops ext4_iomap_buffered_da_write_ops = {
>> +	.iomap_begin = ext4_iomap_buffered_da_write_begin,
>> +	.iomap_end = ext4_iomap_buffered_da_write_end,
>> +};
>> +
>>  const struct iomap_ops ext4_iomap_buffered_read_ops = {
>>  	.iomap_begin = ext4_iomap_buffered_read_begin,
>>  };
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 6a77db4d3124..9bc294b769db 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -104,9 +104,13 @@ static const struct fs_parameter_spec ext4_param_specs[];
>>   *   -> page lock -> i_data_sem (rw)
>>   *
>>   * buffered write path:
>> - * sb_start_write -> i_mutex -> mmap_lock
>> - * sb_start_write -> i_mutex -> transaction start -> page lock ->
>> - *   i_data_sem (rw)
>> + * sb_start_write -> i_rwsem (w) -> mmap_lock
>> + * - buffer_head path:
>> + *   sb_start_write -> i_rwsem (w) -> transaction start -> folio lock ->
>> + *     i_data_sem (rw)
>> + * - iomap path:
>> + *   sb_start_write -> i_rwsem (w) -> transaction start -> i_data_sem (rw)
>> + *   sb_start_write -> i_rwsem (w) -> folio lock (not under an active handle)
>>   *
>>   * truncate:
>>   * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
>> -- 
>> 2.52.0
>>


^ permalink raw reply

* [PATCH v2 4/4] fs: retire sget()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>

sget() and sget_fc() have lived side by side as near-duplicate
find-or-create-and-publish helpers for the legacy and fs_context mount
APIs. The three remaining in-tree callers (CIFS plus the ext4 extents
and mballoc KUnit tests) have all been moved to sget_fc(). Nothing
calls sget() anymore.

Delete sget() from fs/super.c and the prototype in <linux/fs.h>.
Update the two comments that referred to "sget()" or "sget{_fc}()" to
just say "sget_fc()".

This removes ~60 lines of code that only existed to be kept in
lockstep with sget_fc() on every superblock publish-path change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/btrfs/super.c   |  2 +-
 fs/super.c         | 71 ++++--------------------------------------------------
 include/linux/fs.h |  4 ---
 3 files changed, 6 insertions(+), 71 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index b26aa9169e83..636154861d7c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2052,7 +2052,7 @@ static int btrfs_get_tree_subvol(struct fs_context *fc)
 	 * then open_ctree will properly initialize the file system specific
 	 * settings later.  btrfs_init_fs_info initializes the static elements
 	 * of the fs_info (locks and such) to make cleanup easier if we find a
-	 * superblock with our given fs_devices later on at sget() time.
+	 * superblock with our given fs_devices later on at sget_fc() time.
 	 */
 	fs_info = kvzalloc_obj(struct btrfs_fs_info);
 	if (!fs_info)
diff --git a/fs/super.c b/fs/super.c
index 378e81efe643..5fe8cea9f8fe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -328,7 +328,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
 	/*
-	 * sget() can have s_umount recursion.
+	 * sget_fc() can have s_umount recursion.
 	 *
 	 * When it cannot find a suitable sb, it allocates a new
 	 * one (this one), and tries again to find a suitable old
@@ -439,7 +439,7 @@ static void kill_super_notify(struct super_block *sb)
 
 	/*
 	 * Remove it from @fs_supers so it isn't found by new
-	 * sget{_fc}() walkers anymore. Any concurrent mounter still
+	 * sget_fc() walkers anymore. Any concurrent mounter still
 	 * managing to grab a temporary reference is guaranteed to
 	 * already see SB_DYING and will wait until we notify them about
 	 * SB_DEAD.
@@ -517,7 +517,7 @@ EXPORT_SYMBOL(deactivate_super);
  * @sb: superblock to acquire
  *
  * Acquire a temporary reference on a superblock and try to trade it for
- * an active reference. This is used in sget{_fc}() to wait for a
+ * an active reference. This is used in sget_fc() to wait for a
  * superblock to either become SB_BORN or for it to pass through
  * sb->kill() and be marked as SB_DEAD.
  *
@@ -673,11 +673,11 @@ void generic_shutdown_super(struct super_block *sb)
 	/*
 	 * Broadcast to everyone that grabbed a temporary reference to this
 	 * superblock before we removed it from @fs_supers that the superblock
-	 * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
+	 * is dying. Every walker of @fs_supers outside of sget_fc() will now
 	 * discard this superblock and treat it as dead.
 	 *
 	 * We leave the superblock on @fs_supers so it can be found by
-	 * sget{_fc}() until we passed sb->kill_sb().
+	 * sget_fc() until we passed sb->kill_sb().
 	 */
 	super_wake(sb, SB_DYING);
 	super_unlock_excl(sb);
@@ -808,67 +808,6 @@ struct super_block *sget_fc(struct fs_context *fc,
 }
 EXPORT_SYMBOL(sget_fc);
 
-/**
- *	sget	-	find or create a superblock
- *	@type:	  filesystem type superblock should belong to
- *	@test:	  comparison callback
- *	@set:	  setup callback
- *	@flags:	  mount flags
- *	@data:	  argument to each of them
- */
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags,
-			void *data)
-{
-	struct user_namespace *user_ns = current_user_ns();
-	struct super_block *s = NULL;
-	struct super_block *old;
-	int err;
-
-retry:
-	spin_lock(&sb_lock);
-	if (test) {
-		hlist_for_each_entry(old, &type->fs_supers, s_instances) {
-			if (!test(old, data))
-				continue;
-			if (user_ns != old->s_user_ns) {
-				spin_unlock(&sb_lock);
-				destroy_unused_super(s);
-				return ERR_PTR(-EBUSY);
-			}
-			if (!grab_super(old))
-				goto retry;
-			destroy_unused_super(s);
-			return old;
-		}
-	}
-	if (!s) {
-		spin_unlock(&sb_lock);
-		s = alloc_super(type, flags, user_ns);
-		if (!s)
-			return ERR_PTR(-ENOMEM);
-		goto retry;
-	}
-
-	err = set(s, data);
-	if (err) {
-		spin_unlock(&sb_lock);
-		destroy_unused_super(s);
-		return ERR_PTR(err);
-	}
-	s->s_type = type;
-	strscpy(s->s_id, type->name, sizeof(s->s_id));
-	list_add_tail(&s->s_list, &super_blocks);
-	hlist_add_head(&s->s_instances, &type->fs_supers);
-	spin_unlock(&sb_lock);
-	get_filesystem(type);
-	shrinker_register(s->s_shrink);
-	return s;
-}
-EXPORT_SYMBOL(sget);
-
 void drop_super(struct super_block *sb)
 {
 	super_unlock_shared(sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..6dbe3218dc1e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2327,10 +2327,6 @@ void free_anon_bdev(dev_t);
 struct super_block *sget_fc(struct fs_context *fc,
 			    int (*test)(struct super_block *, struct fs_context *),
 			    int (*set)(struct super_block *, struct fs_context *));
-struct super_block *sget(struct file_system_type *type,
-			int (*test)(struct super_block *,void *),
-			int (*set)(struct super_block *,void *),
-			int flags, void *data);
 struct super_block *sget_dev(struct fs_context *fc, dev_t dev);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 3/4] smb: client: convert cifs_smb3_do_mount() to sget_fc()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>

The CIFS mount path already runs through fs_context: smb3_get_tree()
calls smb3_get_tree_common() with a struct fs_context * in hand. But
the fc is dropped on the way to sget(). Plumb it through to sget_fc()
so the legacy sget() interface can go.

cifs_smb3_do_mount() now takes (struct fs_context *, struct
smb3_fs_context *). The old (fs_type, flags) pair is reconstructed
from fc->fs_type and fc->sb_flags. The flags argument was always
passed as 0 by the sole caller anyway. The cifs_dbg diagnostic now
prints fc->sb_flags directly.

cifs_match_super() and cifs_set_super() were the two void-data
callbacks for sget(). The match callback now takes
(struct super_block *, struct fs_context *) and reads struct
cifs_mnt_data out of fc->sget_key. The set callback is gone entirely:
sget_fc() pre-populates sb->s_fs_info from fc->s_fs_info before
invoking set() so set_anon_super_fc() (which just allocates an anon
bdev) is sufficient.

Before sget_fc() we stash cifs_sb in fc->s_fs_info, the per-mount data
in fc->sget_key and force fc->sb_flags to SB_NODIRATIME | SB_NOATIME
to reproduce the previous hard-coded behaviour (alloc_super() reads
fc->sb_flags). The original sb_flags is saved and restored around the
call so the rest of the mount path sees the same fc semantics as
before.

mnt_data.flags keeps its historical value of 0 so the CIFS_MS_MASK
comparison in compare_mount_options() returns the same (always-equal)
result.

No functional change. With this in place sget() has no remaining CIFS
caller.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/smb/client/cifsfs.c     | 43 ++++++++++++++++++++++++++-----------------
 fs/smb/client/cifsfs.h     |  3 ++-
 fs/smb/client/cifsproto.h  |  3 ++-
 fs/smb/client/connect.c    |  5 +++--
 fs/smb/client/fs_context.c |  2 +-
 5 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 9f76b0347fa9..d5074e3fbb85 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -12,6 +12,7 @@
 
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/filelock.h>
 #include <linux/mount.h>
 #include <linux/slab.h>
@@ -966,26 +967,19 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb)
 	return dentry;
 }
 
-static int cifs_set_super(struct super_block *sb, void *data)
-{
-	struct cifs_mnt_data *mnt_data = data;
-	sb->s_fs_info = mnt_data->cifs_sb;
-	return set_anon_super(sb, NULL);
-}
-
 struct dentry *
-cifs_smb3_do_mount(struct file_system_type *fs_type,
-	      int flags, struct smb3_fs_context *old_ctx)
+cifs_smb3_do_mount(struct fs_context *fc, struct smb3_fs_context *old_ctx)
 {
 	struct cifs_mnt_data mnt_data;
 	struct cifs_sb_info *cifs_sb;
 	struct super_block *sb;
 	struct dentry *root;
+	unsigned int saved_sb_flags;
 	int rc;
 
 	if (cifsFYI) {
-		cifs_dbg(FYI, "%s: devname=%s flags=0x%x\n", __func__,
-			 old_ctx->source, flags);
+		cifs_dbg(FYI, "%s: devname=%s sb_flags=0x%x\n", __func__,
+			 old_ctx->source, fc->sb_flags);
 	} else {
 		cifs_info("Attempting to mount %s\n", old_ctx->source);
 	}
@@ -1012,7 +1006,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
 
 	rc = cifs_mount(cifs_sb, cifs_sb->ctx);
 	if (rc) {
-		if (!(flags & SB_SILENT))
+		if (!(fc->sb_flags & SB_SILENT))
 			cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n",
 				 rc);
 		root = ERR_PTR(rc);
@@ -1021,12 +1015,27 @@ cifs_smb3_do_mount(struct file_system_type *fs_type,
 
 	mnt_data.ctx = cifs_sb->ctx;
 	mnt_data.cifs_sb = cifs_sb;
-	mnt_data.flags = flags;
+	mnt_data.flags = 0;
 
-	/* BB should we make this contingent on mount parm? */
-	flags |= SB_NODIRATIME | SB_NOATIME;
-
-	sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data);
+	/*
+	 * sb->s_flags is set from fc->sb_flags by alloc_super(). CIFS has
+	 * historically forced SB_NODIRATIME | SB_NOATIME on every mount and
+	 * ignored the caller-supplied SB_* flags. Preserve that behaviour by
+	 * overriding fc->sb_flags around the sget_fc() call.
+	 *
+	 * Hand cifs_sb to sget_fc() via fc->s_fs_info; sget_fc() copies it
+	 * onto sb->s_fs_info before running set() and clears fc->s_fs_info
+	 * on successful publish. Pass the rest of the per-mount context to
+	 * cifs_match_super() through fc->sget_key.
+	 */
+	saved_sb_flags = fc->sb_flags;
+	fc->sb_flags = SB_NODIRATIME | SB_NOATIME;
+	fc->s_fs_info = cifs_sb;
+	fc->sget_key = &mnt_data;
+	sb = sget_fc(fc, cifs_match_super, set_anon_super_fc);
+	fc->sget_key = NULL;
+	fc->s_fs_info = NULL;
+	fc->sb_flags = saved_sb_flags;
 	if (IS_ERR(sb)) {
 		cifs_umount(cifs_sb);
 		return ERR_CAST(sb);
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index c455b15f2778..0a93f48924a5 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -144,8 +144,9 @@ ssize_t cifs_file_copychunk_range(unsigned int xid, struct file *src_file,
 long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg);
 void cifs_setsize(struct inode *inode, loff_t offset);
 
+struct fs_context;
 struct smb3_fs_context;
-struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, int flags,
+struct dentry *cifs_smb3_do_mount(struct fs_context *fc,
 				  struct smb3_fs_context *old_ctx);
 
 char *cifs_silly_fullpath(struct dentry *dentry);
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 4a25afda9448..a39572cbaadb 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -19,6 +19,7 @@
 struct statfs;
 struct smb_rqst;
 struct smb3_fs_context;
+struct fs_context;
 
 /*
  *****************************************************************
@@ -236,7 +237,7 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx);
 int cifs_mount_get_session(struct cifs_mount_ctx *mnt_ctx);
 int cifs_is_path_remote(struct cifs_mount_ctx *mnt_ctx);
 int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx);
-int cifs_match_super(struct super_block *sb, void *data);
+int cifs_match_super(struct super_block *sb, struct fs_context *fc);
 int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx);
 void cifs_umount(struct cifs_sb_info *cifs_sb);
 void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dcde25da468d..79762e6bbe50 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -6,6 +6,7 @@
  *
  */
 #include <linux/fs.h>
+#include <linux/fs_context.h>
 #include <linux/net.h>
 #include <linux/string.h>
 #include <linux/sched/mm.h>
@@ -2991,9 +2992,9 @@ static int match_prepath(struct super_block *sb,
 }
 
 int
-cifs_match_super(struct super_block *sb, void *data)
+cifs_match_super(struct super_block *sb, struct fs_context *fc)
 {
-	struct cifs_mnt_data *mnt_data = data;
+	struct cifs_mnt_data *mnt_data = fc->sget_key;
 	struct smb3_fs_context *ctx;
 	struct cifs_sb_info *cifs_sb;
 	struct TCP_Server_Info *tcp_srv;
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index b9544eb0381b..6aba4e1c9c27 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -920,7 +920,7 @@ static int smb3_get_tree_common(struct fs_context *fc)
 	struct dentry *root;
 	int rc = 0;
 
-	root = cifs_smb3_do_mount(fc->fs_type, 0, ctx);
+	root = cifs_smb3_do_mount(fc, ctx);
 	if (IS_ERR(root))
 		return PTR_ERR(root);
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 2/4] ext4: convert mballoc KUnit test to sget_fc()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>

Same treatment as the extents KUnit test. The mballoc test uses sget()
as a thin "give me an initialized superblock" wrapper for a fake
file_system_type. Move it onto sget_fc() so sget() can go away.

Add a no-op mbt_init_fs_context() so fs_context_for_mount() has
something to call on the fake fs_type. mbt_set() now takes a struct
fs_context * (still a no-op). mbt_ext4_alloc_super_block() allocates
the fc, hands it to sget_fc() and drops the fc reference once the sb
is published.

No functional change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/mballoc-test.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index 90ed505fa4b1..d90da44aadbd 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -5,6 +5,7 @@
 
 #include <kunit/test.h>
 #include <kunit/static_stub.h>
+#include <linux/fs_context.h>
 #include <linux/random.h>
 
 #include "ext4.h"
@@ -63,8 +64,14 @@ static void mbt_kill_sb(struct super_block *sb)
 	generic_shutdown_super(sb);
 }
 
+static int mbt_init_fs_context(struct fs_context *fc)
+{
+	return 0;
+}
+
 static struct file_system_type mbt_fs_type = {
 	.name			= "mballoc test",
+	.init_fs_context	= mbt_init_fs_context,
 	.kill_sb		= mbt_kill_sb,
 };
 
@@ -127,7 +134,7 @@ static void mbt_mb_release(struct super_block *sb)
 	kfree(sb->s_bdev);
 }
 
-static int mbt_set(struct super_block *sb, void *data)
+static int mbt_set(struct super_block *sb, struct fs_context *fc)
 {
 	return 0;
 }
@@ -136,13 +143,19 @@ static struct super_block *mbt_ext4_alloc_super_block(void)
 {
 	struct mbt_ext4_super_block *fsb;
 	struct super_block *sb;
+	struct fs_context *fc;
 	struct ext4_sb_info *sbi;
 
 	fsb = kzalloc_obj(*fsb);
 	if (fsb == NULL)
 		return NULL;
 
-	sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+	fc = fs_context_for_mount(&mbt_fs_type, 0);
+	if (IS_ERR(fc))
+		goto out;
+
+	sb = sget_fc(fc, NULL, mbt_set);
+	put_fs_context(fc);
 	if (IS_ERR(sb))
 		goto out;
 

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 1/4] ext4: convert extents KUnit test to sget_fc()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)
In-Reply-To: <20260529-work-sget-v2-0-57bbe08604e4@kernel.org>

The extents KUnit test uses sget() to get an initialized superblock for
its fake file_system_type. sget() predates fs_context and we want to
retire it. Switch this caller over to sget_fc().

Add a no-op ext_init_fs_context() so fs_context_for_mount() has
something to call on the fake fs_type. ext_set() now takes a struct
fs_context * (still a no-op). extents_kunit_init() allocates the fc,
hands it to sget_fc() and drops the fc reference once the sb is
published. sget_fc() does not retain a pointer to it.

No functional change for the test.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
 fs/ext4/extents-test.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
index 6b53a3f39fcd..bd7795a82607 100644
--- a/fs/ext4/extents-test.c
+++ b/fs/ext4/extents-test.c
@@ -37,6 +37,7 @@
 
 #include <kunit/test.h>
 #include <kunit/static_stub.h>
+#include <linux/fs_context.h>
 #include <linux/gfp_types.h>
 #include <linux/stddef.h>
 
@@ -130,14 +131,20 @@ static void ext_kill_sb(struct super_block *sb)
 	generic_shutdown_super(sb);
 }
 
-static int ext_set(struct super_block *sb, void *data)
+static int ext_init_fs_context(struct fs_context *fc)
+{
+	return 0;
+}
+
+static int ext_set(struct super_block *sb, struct fs_context *fc)
 {
 	return 0;
 }
 
 static struct file_system_type ext_fs_type = {
-	.name = "extents test",
-	.kill_sb = ext_kill_sb,
+	.name		 = "extents test",
+	.init_fs_context = ext_init_fs_context,
+	.kill_sb	 = ext_kill_sb,
 };
 
 static void extents_kunit_exit(struct kunit *test)
@@ -223,6 +230,7 @@ static int extents_kunit_init(struct kunit *test)
 	struct ext4_inode_info *ei;
 	struct inode *inode;
 	struct super_block *sb;
+	struct fs_context *fc;
 	struct ext4_sb_info *sbi = NULL;
 	struct kunit_ext_test_param *param =
 		(struct kunit_ext_test_param *)(test->param_value);
@@ -232,7 +240,13 @@ static int extents_kunit_init(struct kunit *test)
 	if (sbi == NULL)
 		return -ENOMEM;
 
-	sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL);
+	fc = fs_context_for_mount(&ext_fs_type, 0);
+	if (IS_ERR(fc)) {
+		kfree(sbi);
+		return PTR_ERR(fc);
+	}
+	sb = sget_fc(fc, NULL, ext_set);
+	put_fs_context(fc);
 	if (IS_ERR(sb)) {
 		kfree(sbi);
 		return PTR_ERR(sb);

-- 
2.47.3


^ permalink raw reply related

* [PATCH v2 0/4] super: retire sget()
From: Christian Brauner @ 2026-05-29  8:43 UTC (permalink / raw)
  To: linux-fsdevel
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ritesh Harjani (IBM),
	linux-ext4, linux-cifs, Alexander Viro,
	Christian Brauner (Amutable)

CIFS plus the two ext4 KUnit tests (extents-test, mballoc-test) were the
last in-tree callers, and all three convert cleanly to sget_fc(). That
lets sget() and its prototype come out, taking ~60 lines that only
existed to be kept in lockstep with sget_fc() on every publish-path
change.

Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
Changes in v2:
- Move some changes into a separate series.
- Link to v1: https://patch.msgid.link/20260526-work-sget-v1-0-263f7025cedd@kernel.org

---
Christian Brauner (4):
      ext4: convert extents KUnit test to sget_fc()
      ext4: convert mballoc KUnit test to sget_fc()
      smb: client: convert cifs_smb3_do_mount() to sget_fc()
      fs: retire sget()

 fs/btrfs/super.c           |  2 +-
 fs/ext4/extents-test.c     | 22 +++++++++++---
 fs/ext4/mballoc-test.c     | 17 +++++++++--
 fs/smb/client/cifsfs.c     | 43 +++++++++++++++++-----------
 fs/smb/client/cifsfs.h     |  3 +-
 fs/smb/client/cifsproto.h  |  3 +-
 fs/smb/client/connect.c    |  5 ++--
 fs/smb/client/fs_context.c |  2 +-
 fs/super.c                 | 71 ++++------------------------------------------
 include/linux/fs.h         |  4 ---
 10 files changed, 73 insertions(+), 99 deletions(-)
---
base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731
change-id: 20260526-work-sget-6bc80b96cba5


^ permalink raw reply

* Re: [PATCH] ext4: fix quota accounting WARN in bigalloc punch hole
From: Qiliang Yuan @ 2026-05-29  8:34 UTC (permalink / raw)
  To: yi.zhang
  Cc: tytso, adilger.kernel, jack, ritesh.list, ojaswin, libaokun,
	linux-ext4, linux-kernel, yzjaurora
In-Reply-To: <73e91ebd-27de-4834-af2f-9b4ac19a4100@huaweicloud.com>

Hi Zhang Yi,

On 5/29/2026 3:32 PM, Zhang Yi wrote:
> On 5/28/2026 6:21 PM, Qiliang Yuan wrote:
>> __es_remove_extent() -> get_rsvd() already correctly excludes
>> boundary clusters that still contain delayed blocks from resv_used.
>> Adding pending to resv_used double-counts those boundary clusters,
>> erroneously releasing reservations that are still needed.
>
> Hmm, the analysis doesn't seem correct to me. Do you mean the
> following case?
>
> # Assume the cluster size is 16KB.
> xfs_io -f -c "pwrite 12k 4k" /mnt/foo
> xfs_io -d -c "pwrite 0 4k" /mnt/foo
> xfs_io -c "fpunch 0 4k" /mnt/foo
>
> During the direct I/O write, quota space will be added in
> ext4_mb_new_blocks() because the EXT4_MB_DELALLOC_RESERVED flag is
> not set. Therefore, in ext4_es_insert_extent(), we should release the
> quota reservations, since this cluster has already been allocated.
>
> Then, in the third operation (punch hole), it will reclaim the added
> dqb_curspace. This should not cause an insufficiency.
>
> Am I missing something?

Thanks for the review!  Let me explain the issue with your specific
example.

After step 1 (delalloc write 4KB@12KB in a 16KB cluster), we have:

  ES tree: [0,1) hole, [1,3) delayed, [3,4) hole   (blocks 0..3)
  Quota:   dqb_rsvspace += 16KB (one cluster reserved)

Step 2 (DIO write 4KB@0KB, RWF_DSYNC):

  The DIO allocates one cluster, but the mapped extent from
  ext4_ext_map_blocks() only covers the written range, e.g. [0,0].

  In ext4_es_insert_extent():

    a) __es_remove_extent([0,0]) removes the [0,1) hole entry, but
       there are no delayed extents within [0,0], so resv_used = 0.

       This is correct: the DIO extent [0,0] does not overlap the
       delayed region [1,3).

    b) __revise_pending() scans outside the newly inserted extent [0,0]:

       - Left boundary (block 0): the range starts at cluster
         boundary, no blocks to scan on the left → no pending insert.

       - Right boundary (block 3): blocks [1,3] are outside [0,0]
         and are delayed → __revise_pending() inserts a pending
         reservation and returns pending = 1.

       This pending reservation means: "cluster containing block 3
       still has delayed blocks, keep this cluster reserved."

    c) Then comes the bug:

         resv_used += pending;   // resv_used = 0 + 1 = 1

       This causes ext4_da_update_reserve_space() to release 16KB
       of quota reservation (dquot_release_reservation_block()).  But
       block 3 is still delayed!  Its quota reservation should NOT
       be released — no blocks within [0,0] actually used the delalloc
       reservation.

    So after DIO:
      dqb_rsvspace: originally 16KB, now 0 (incorrectly released)
      dqb_curspace: 16KB (from ext4_mb_new_blocks)

Step 3 (punch 4KB@0KB):

  ext4_remove_blocks() sees that block 0's cluster has a pending
  reservation → calls ext4_rereserve_cluster() → dquot_reclaim_block()
  tries to move 16KB from dqb_curspace to dqb_rsvspace.  But
  dqb_curspace may already be insufficient (depending on whether
  other allocs/frees have happened), triggering:

    WARNING at dquot_reclaim_space_nodirty

Step 4 (delalloc writeback of block 3):

  ext4_da_update_reserve_space() → dquot_claim_block() tries to
  move 16KB from dqb_rsvspace to dqb_curspace.  Since rsvspace was
  incorrectly released and not fully restored by the punch hole's
  rereseve, this triggers:

    WARNING at dquot_claim_space_nodirty

The key point is: pending from __revise_pending() indicates clusters
that *still contain delayed blocks outside the newly inserted extent*.
These clusters' quota reservations must be preserved — get_rsvd()
inside __es_remove_extent() already correctly excludes them from
resv_used.  Adding pending to resv_used double-counts them.

I also found a pre-existing retry loop issue: if __es_insert_extent()
fails with -ENOMEM on the first pass, resv_used carries a stale value
back to retry and could be double-released.  I'll include a fix for
that in v2.

-- 
Qiliang Yuan

^ permalink raw reply

* Re: [PATCH] ext4: validate dx count against limit in dx_csum
From: Baokun Li @ 2026-05-29  8:00 UTC (permalink / raw)
  To: Artem Blagodarenko; +Cc: adilger.kernel, linux-ext4
In-Reply-To: <20260528160557.6956-1-ablagodarenko@thelustrecollective.com>

On 2026/5/29 00:05, Artem Blagodarenko wrote:
> From: Artem Blagodarenko <artem.blagodarenko@gmail.com>
>
> Sashiko AI, during inspection of the "ext4: replace ext4_dir_entry
> with ext4_dir_entry_2" series, reported a missing bounds check on
> count against limit in DX block checksum verification and setup
> paths.
>
> The code validates that limit fits within block boundaries, but does
> not verify that count <= limit. A maliciously crafted filesystem image
> could therefore provide an artificially large count value.
>
> Since ext4_dx_csum() uses count to determine the checksum region
> size, this may result in an out-of-bounds access crossing page
> boundaries into unmapped memory, potentially leading to a crash during
> checksum verification.
>
> The same issue exists in ext4_dx_csum_set().
>
> The bug was not introduced by the patch under review, so the fix is sent
> separately.
>
> Signed-off-by: Artem Blagodarenko artem.blagodarenko@gmail.com

Looks good, feel free to add:

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/ext4/namei.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index a316fc2ac41b..20b7bac69889 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -472,7 +472,8 @@ static int ext4_dx_csum_verify(struct inode *inode,
>  	}
>  	limit = le16_to_cpu(c->limit);
>  	count = le16_to_cpu(c->count);
> -	if (count_offset + (limit * sizeof(struct dx_entry)) >
> +	if (!count || count > limit ||
> +	    count_offset + (limit * sizeof(struct dx_entry)) >
>  	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
>  		warn_no_space_for_csum(inode);
>  		return 0;
> @@ -501,7 +502,8 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry_2 *diren
>  	}
>  	limit = le16_to_cpu(c->limit);
>  	count = le16_to_cpu(c->count);
> -	if (count_offset + (limit * sizeof(struct dx_entry)) >
> +	if (!count || count > limit ||
> +	    count_offset + (limit * sizeof(struct dx_entry)) >
>  	    EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
>  		warn_no_space_for_csum(inode);
>  		return;



^ permalink raw reply

* Re: [PATCH] jbd2: update outdated comment for jbd2_journal_try_to_free_buffers()
From: Baokun Li @ 2026-05-29  7:44 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-ext4, linux-fsdevel, linux-kernel, tytso, adilger.kernel,
	jack, ojaswin, ritesh.list, yi.zhang, yizhang089, yangerkun,
	yukuai
In-Reply-To: <20260522030540.3896201-1-yi.zhang@huaweicloud.com>

On 2026/5/22 11:05, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> jbd2_journal_try_to_free_buffers() currently only tries to remove
> checkpointed data buffers from the checkpoint list for data=journal
> mode, and bails out if any buffer is still attached to a transaction.
> For data=ordered and writeback modes, data buffers never have
> journal_heads, so the function degenerates to a plain
> try_to_free_buffers() call.
>
> Besides, The release of metadata buffers has been delegated to the jbd2
> journal shrinker in commit 4ba3fcdde7e3 ("jbd2,ext4: add a shrinker to
> release checkpointed buffers"). jbd2_journal_try_to_free_buffers() is
> not used for handling metadata buffers now.
>
> However, the comment above the function still references
> jbd2_journal_dirty_data(), __jbd2_journal_unfile_buffer(), t_datalist,
> BKL, and BUF_CLEAN, all of which were removed in commit 87c89c232c8f
> ("jbd2: Remove data=ordered mode support using jbd buffer heads").
>
> Replace it with a description of what the function actually does now.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good!

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/jbd2/transaction.c | 39 ++++++++++++---------------------------
>  1 file changed, 12 insertions(+), 27 deletions(-)
>
> diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
> index 4885903bbd10..239bcf88ed1c 100644
> --- a/fs/jbd2/transaction.c
> +++ b/fs/jbd2/transaction.c
> @@ -2139,38 +2139,23 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh)
>  }
>  
>  /**
> - * jbd2_journal_try_to_free_buffers() - try to free page buffers.
> + * jbd2_journal_try_to_free_buffers() - try to free folio buffers.
>   * @journal: journal for operation
>   * @folio: Folio to detach data from.
>   *
> - * For all the buffers on this page,
> - * if they are fully written out ordered data, move them onto BUF_CLEAN
> - * so try_to_free_buffers() can reap them.
> + * For each buffer_head on @folio, if the buffer has a journal_head but
> + * is not attached to a running or committing transaction, try to remove
> + * it from the checkpoint list.  This is needed for data=journal mode
> + * where data buffers are journaled: once they are checkpointed, the
> + * journal_head can be detached and the buffer freed.  If any buffer is
> + * still attached to a transaction, the folio cannot be released and we
> + * bail out.  Otherwise we call try_to_free_buffers() to detach all
> + * buffer_heads from the folio.
>   *
> - * This function returns non-zero if we wish try_to_free_buffers()
> - * to be called. We do this if the page is releasable by try_to_free_buffers().
> - * We also do it if the page has locked or dirty buffers and the caller wants
> - * us to perform sync or async writeout.
> + * For data=ordered and writeback modes, data buffers never have
> + * journal_heads, so this degenerates to a plain try_to_free_buffers().
>   *
> - * This complicates JBD locking somewhat.  We aren't protected by the
> - * BKL here.  We wish to remove the buffer from its committing or
> - * running transaction's ->t_datalist via __jbd2_journal_unfile_buffer.
> - *
> - * This may *change* the value of transaction_t->t_datalist, so anyone
> - * who looks at t_datalist needs to lock against this function.
> - *
> - * Even worse, someone may be doing a jbd2_journal_dirty_data on this
> - * buffer.  So we need to lock against that.  jbd2_journal_dirty_data()
> - * will come out of the lock with the buffer dirty, which makes it
> - * ineligible for release here.
> - *
> - * Who else is affected by this?  hmm...  Really the only contender
> - * is do_get_write_access() - it could be looking at the buffer while
> - * journal_try_to_free_buffer() is changing its state.  But that
> - * cannot happen because we never reallocate freed data as metadata
> - * while the data is part of a transaction.  Yes?
> - *
> - * Return false on failure, true on success
> + * Return: true if the folio's buffers were freed, false otherwise
>   */
>  bool jbd2_journal_try_to_free_buffers(journal_t *journal, struct folio *folio)
>  {



^ permalink raw reply

* Re: [PATCH] ext4: Fix ERR_PTR(0) in ext4_mkdir()
From: Baokun Li @ 2026-05-29  7:37 UTC (permalink / raw)
  To: Hongling Zeng
  Cc: tytso, adilger.kernel, jack, ojaswin, ritesh.list, yi.zhang,
	linux-ext4, linux-kernel, zhongling0719, neil, brauner, jlayton
In-Reply-To: <20260520074634.53656-1-zenghongling@kylinos.cn>

On 2026/5/20 15:46, Hongling Zeng wrote:
> When mkdir succeeds, ext4_mkdir() returns ERR_PTR(0) which is incorrect.
> It should return NULL instead for success and ERR_PTR() only with
> negative error codes for failure.
>
> Fixes: 88d5baf69082 ("Change inode_operations.mkdir to return struct dentry *")
> Signed-off-by: Hongling Zeng <zenghongling@kylinos.cn>

Looks good.

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/ext4/namei.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
> index 4a47fbd8dd30..8cadaeb15b2b 100644
> --- a/fs/ext4/namei.c
> +++ b/fs/ext4/namei.c
> @@ -3054,7 +3054,7 @@ static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir,
>  out_retry:
>  	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
>  		goto retry;
> -	return ERR_PTR(err);
> +	return err ? ERR_PTR(err) : NULL;
>  }
>  
>  /*



^ permalink raw reply

* Re: [PATCH] ext4: fix quota accounting WARN in bigalloc punch hole
From: Zhang Yi @ 2026-05-29  7:32 UTC (permalink / raw)
  To: Qiliang Yuan, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani (IBM)
  Cc: linux-ext4, linux-kernel, Zijing Yin
In-Reply-To: <20260528-fix-ext4-bigalloc-punch-hole-quota-v2-v1-1-32871356273b@gmail.com>

Hi, Qiliang!

On 5/28/2026 6:21 PM, Qiliang Yuan wrote:
> When doing direct I/O write on a bigalloc filesystem, the allocated
> extent might not cover entire clusters at its boundaries, leaving
> delayed blocks in those boundary clusters.  In ext4_es_insert_extent(),
> __revise_pending() inserts new pending reservations for those boundary
> clusters, and the return value (pending=true) was added to resv_used,
> causing ext4_da_update_reserve_space() to incorrectly release the
> quota reservations for those boundary clusters.
> 
> Later when PUNCH_HOLE removes the DIO-allocated blocks, the
> extent removal path detects the pending reservation via
> ext4_is_pending() and calls ext4_rereserve_cluster().  This tries
> to reclaim quota from dq_dqb.dqb_curspace back to dqb_rsvspace,
> but since the quota was already incorrectly released, dqb_curspace
> is insufficient, triggering:
> 
>   WARNING at dquot_reclaim_space_nodirty+0x77c/0x8c0

Hmm, the analysis doesn't seem correct to me. Do you mean the
following case?

# Assume the cluster size is 16KB.
xfs_io -f -c "pwrite 12k 4k" /mnt/foo
xfs_io -d -c "pwrite 0 4k" /mnt/foo
xfs_io -c "fpunch 0 4k" /mnt/foo

During the direct I/O write, quota space will be added in
ext4_mb_new_blocks() because the EXT4_MB_DELALLOC_RESERVED flag is
not set. Therefore, in ext4_es_insert_extent(), we should release the
quota reservations, since this cluster has already been allocated.

Then, in the third operation (punch hole), it will reclaim the added
dqb_curspace. This should not cause an insufficiency.

Am I missing something?

> 
> The subsequent delalloc writeback then fires a second WARN from
> dquot_claim_space_nodirty() for the same reason: dqb_rsvspace was
> depleted by the earlier incorrect release.
> 
> __es_remove_extent() -> get_rsvd() already correctly excludes
> boundary clusters that still contain delayed blocks from resv_used.
> Adding pending to resv_used double-counts those boundary clusters,
> erroneously releasing reservations that are still needed.
> 
> Remove the pending variable and the resv_used += pending addition.

That's not correct. Assume the following case.

# Assume the cluster size is 16KB.
xfs_io -f -c "pwrite 0 16k" /mnt/foo
xfs_io -c "sync_range -w 0 4k" /mnt/foo

Although only part of the cluster is written back, the cluster has been
allocated. Therefore, the quota needs to be claimed. However, since we
only wrote back a portion of a cluster, __es_remove_extent() will not
return the reserved clusters that need to be consumed (i.e., resv_used
is zero). Therefore, we need to determine whether a new pending
allocation has been created by checking the pending status, so that we
can correctly claim the quota.

Thanks,
Yi

> 
> Fixes: c543e2429640 ("ext4: update delalloc data reserve spcae in ext4_es_insert_extent()")
> Reported-by: Zijing Yin <yzjaurora@gmail.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=221570
> Signed-off-by: Qiliang Yuan <realwujing@gmail.com>
> ---
>  fs/ext4/extents_status.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
> index 6e4a191e82191..fefe0bb8ac4d1 100644
> --- a/fs/ext4/extents_status.c
> +++ b/fs/ext4/extents_status.c
> @@ -909,7 +909,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  	struct extent_status newes;
>  	ext4_lblk_t end = lblk + len - 1;
>  	int err1 = 0, err2 = 0, err3 = 0;
> -	int resv_used = 0, pending = 0;
> +	int resv_used = 0;
>  	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
>  	struct extent_status *es1 = NULL;
>  	struct extent_status *es2 = NULL;
> @@ -977,7 +977,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  			__free_pending(pr);
>  			pr = NULL;
>  		}
> -		pending = err3;
>  	}
>  	ext4_es_inc_seq(inode);
>  error:
> @@ -998,7 +997,6 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
>  	 * any previously delayed allocated clusters instead of claim them
>  	 * again.
>  	 */
> -	resv_used += pending;
>  	if (resv_used)
>  		ext4_da_update_reserve_space(inode, resv_used,
>  					     delalloc_reserve_used);
> 
> ---
> base-commit: eb3f4b7426cfd2b79d65b7d37155480b32259a11
> change-id: 20260528-fix-ext4-bigalloc-punch-hole-quota-v2-2adca315d1ba
> 
> Best regards,


^ permalink raw reply

* Re: [PATCH] ext4: convert legacy ext4_debug() to standard pr_debug()
From: Baokun Li @ 2026-05-29  7:32 UTC (permalink / raw)
  To: lirongqing
  Cc: Theodore Ts'o, Jan Kara, Zhang Yi, Andreas Dilger,
	Ojaswin Mujoo, Ritesh Harjani, linux-ext4, linux-kernel
In-Reply-To: <20260521074634.2697-1-lirongqing@baidu.com>

On 2026/5/21 15:46, lirongqing wrote:
> From: Li RongQing <lirongqing@baidu.com>
>
> The ext4 file system historically implemented its own debug logging macro
> ext4_debug() via EXT4FS_DEBUG conditional compilation. This legacy
> implementation suffers from two major drawbacks:
>
> 1. It makes two consecutive un-serialized printk() calls, which can
>    lead to severe log interleaving and corruption under multi-core
>    concurrent workloads.
> 2. It completely bypasses the standard modern kernel dynamic debug
>    (CONFIG_DYNAMIC_DEBUG) infrastructure.
>
> Clean up the legacy implementation by leveraging pr_debug(). This squashes
> the multiple printk() calls into a single atomic execution, ensuring
> log integrity, while seamlessly hooking ext4 into the kernel's native
> dynamic debug framework.
>
> The redundant __FILE__ and __LINE__ macros are intentionally removed from
> the string format because the dynamic debug infrastructure can already
> append them automatically at runtime (via the '+fl' flags) if desired.
> This avoids redundancy and double-logging in modern production/debugging
> environments while keeping the macro clean and robust against dangling
> comma compiler errors.
>
> Signed-off-by: Li RongQing <lirongqing@baidu.com>

Thanks for cleaning this up.

> ---
>  fs/ext4/ext4.h | 20 ++------------------
>  1 file changed, 2 insertions(+), 18 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 94283a9..39e86ff 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -62,24 +62,8 @@
>   */
>  #define DOUBLE_CHECK__
>  
> -/*
> - * Define EXT4FS_DEBUG to produce debug messages
> - */
> -#undef EXT4FS_DEBUG
> -

This looks like it's just disabling EXT4FS_DEBUG, but it's actually
the only toggle point — enabling debug required manually flipping this
to #define EXT4FS_DEBUG in the source.

Since balloc.c, ialloc.c, and page-io.c still have #ifdef EXT4FS_DEBUG
blocks that weren't cleaned up here, it would be cleaner to replace
those with CONFIG_EXT4_DEBUG so they fall under a single
Kconfig-controlled umbrella.

> -/*
> - * Debug code
> - */
> -#ifdef EXT4FS_DEBUG
> -#define ext4_debug(f, a...)						\
> -	do {								\
> -		printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",	\
> -			__FILE__, __LINE__, __func__);			\
> -		printk(KERN_DEBUG f, ## a);				\
> -	} while (0)
> -#else
> -#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
> -#endif
> +#define ext4_debug(fmt, ...)						\
> +	pr_debug("EXT4-fs DEBUG %s: " fmt, __func__,  ##__VA_ARGS__)

Nit: an extra space between __func__, and ##__VA_ARGS__.

>  
>   /*
>    * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c

Also, could ext4_debug be moved next to ext_debug and placed under
#ifdef CONFIG_EXT4_DEBUG together, for a cleaner layout?


Cheers,
Baokun


^ permalink raw reply

* Re: [PATCH] ext4: Remove mention of PageWriteback
From: Baokun Li @ 2026-05-29  6:33 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, Ojaswin Mujoo,
	Ritesh Harjani (IBM), Zhang Yi, linux-ext4, linux-kernel
In-Reply-To: <20260526190805.341676-1-willy@infradead.org>

On 2026/5/27 03:08, Matthew Wilcox (Oracle) wrote:
> Update a comment to refer to the concept of writeback instead of the
> (now obsolete) detail of how it's implemented.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>

Looks good to me.

Reviewed-by: Baokun Li <libaokun@linux.alibaba.com>

> ---
>  fs/ext4/page-io.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dc82e7b57e75..bc674aa4a656 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
>   * written. On IO failure, check if journal abort is needed. Note that
>   * we are protected from truncate touching same part of extent tree by the
>   * fact that truncate code waits for all DIO to finish (thus exclusion from
> - * direct IO is achieved) and also waits for PageWriteback bits. Thus we
> + * direct IO is achieved) and also waits for writeback to complete. Thus we
>   * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
>   * completed (happens from ext4_free_ioend()).
>   */



^ permalink raw reply

* Re: [PATCH] ext4: Remove mention of PageWriteback
From: Ojaswin Mujoo @ 2026-05-29  6:01 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ritesh Harjani (IBM), Zhang Yi, linux-ext4, linux-kernel
In-Reply-To: <20260526190805.341676-1-willy@infradead.org>

On Tue, May 26, 2026 at 08:08:02PM +0100, Matthew Wilcox (Oracle) wrote:
> Update a comment to refer to the concept of writeback instead of the
> (now obsolete) detail of how it's implemented.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  fs/ext4/page-io.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
> index dc82e7b57e75..bc674aa4a656 100644
> --- a/fs/ext4/page-io.c
> +++ b/fs/ext4/page-io.c
> @@ -168,7 +168,7 @@ static void ext4_release_io_end(ext4_io_end_t *io_end)
>   * written. On IO failure, check if journal abort is needed. Note that
>   * we are protected from truncate touching same part of extent tree by the
>   * fact that truncate code waits for all DIO to finish (thus exclusion from
> - * direct IO is achieved) and also waits for PageWriteback bits. Thus we
> + * direct IO is achieved) and also waits for writeback to complete. Thus we

Looks good,

Reviewed-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>

Regards,
Ojaswin

>   * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
>   * completed (happens from ext4_free_ioend()).
>   */
> -- 
> 2.47.3
> 

^ permalink raw reply

* Re: [PATCH v6 06/11] fstests: verify f_fsid for cloned filesystems
From: Darrick J. Wong @ 2026-05-29  4:39 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <e029755044a32de6ec2a0d3391f3ff0089fc3c30.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:37PM +0800, Anand Jain wrote:
> Verify that the cloned filesystem provides an f_fsid that is persistent
> across mount cycles, yet unique from the original filesystem's f_fsid.

Might want to add that last part to the test description itself, because
otherwise I don't know what 'verify' means.

> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  tests/generic/802     | 67 +++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/802.out |  7 +++++
>  2 files changed, 74 insertions(+)
>  create mode 100644 tests/generic/802
>  create mode 100644 tests/generic/802.out
> 
> diff --git a/tests/generic/802 b/tests/generic/802
> new file mode 100644
> index 000000000000..653e74e11b53
> --- /dev/null
> +++ b/tests/generic/802
> @@ -0,0 +1,67 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Anand Jain <asj@kernel.org>.  All Rights Reserved.
> +#
> +# FS QA Test 802
> +# Verify f_fsid and s_uuid of cloned filesystems across mount cycle.
> +
> +. ./common/preamble
> +
> +_begin_fstest auto quick mount clone
> +
> +_require_test
> +_require_block_device $TEST_DEV
> +_require_loop
> +
> +[ "$FSTYP" = "btrfs" ] && _fixed_by_kernel_commit xxxxxxxxxxxx \
> +	"btrfs: use on-disk uuid for s_uuid in temp_fsid mounts"
> +[ "$FSTYP" = "btrfs" ] && _fixed_by_kernel_commit xxxxxxxxxxxx \
> +	"btrfs: derive f_fsid from on-disk fsuuid and dev_t"

_fixed_by_fs_commit?

> +
> +_cleanup()
> +{
> +	cd /
> +	rm -r -f $tmp.*
> +	umount $mnt1 $mnt2 2>/dev/null
> +	_loop_image_destroy "${devs[@]}" 2> /dev/null
> +}
> +
> +# Setup base loop device and its clone
> +devs=()
> +_loop_image_create_clone devs
> +mkdir -p $TEST_DIR/$seq
> +mnt1=$TEST_DIR/$seq/mnt1
> +mnt2=$TEST_DIR/$seq/mnt2
> +mkdir -p $mnt1
> +mkdir -p $mnt2
> +
> +# Mount both filesystems simultaneously using mandatory clone mount options
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> +						_fail "Failed to mount dev1"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> +						_fail "Failed to mount dev2"
> +
> +# Capture baseline filesystem IDs for comparison
> +fsid_scratch=$(stat -f -c "%i" $mnt1)
> +fsid_clone=$(stat -f -c "%i" $mnt2)
> +
> +echo "**** fsid initially ****"
> +echo $fsid_scratch | sed -e "s/$fsid_scratch/FSID_SCRATCH/g"
> +echo $fsid_clone | sed -e "s/$fsid_clone/FSID_CLONE/g"

Why echo only to sed?

--D

> +
> +# Verify that the fsids remain stable after a mount cycle, even when the
> +# mount order is reversed.
> +echo "**** fsid after mount cycle ****"
> +_unmount $mnt1
> +_unmount $mnt2
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> +						_fail "Failed to mount dev2"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> +						_fail "Failed to mount dev1"
> +
> +# Compare post mount-cycle values against the baseline
> +stat -f -c "%i" $mnt1 | sed -e "s/$fsid_scratch/FSID_SCRATCH/g"
> +stat -f -c "%i" $mnt2 | sed -e "s/$fsid_clone/FSID_CLONE/g"
> +
> +status=0
> +exit
> diff --git a/tests/generic/802.out b/tests/generic/802.out
> new file mode 100644
> index 000000000000..d1e008f122bb
> --- /dev/null
> +++ b/tests/generic/802.out
> @@ -0,0 +1,7 @@
> +QA output created by 802
> +**** fsid initially ****
> +FSID_SCRATCH
> +FSID_CLONE
> +**** fsid after mount cycle ****
> +FSID_SCRATCH
> +FSID_CLONE
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH v6 05/11] fstests: verify fanotify isolation on cloned filesystems
From: Darrick J. Wong @ 2026-05-29  4:36 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <ef076b330a047d2f19ed48f5b7166f419433bb73.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:36PM +0800, Anand Jain wrote:
> Verify that fanotify events are correctly routed to the appropriate
> watcher when cloned filesystems are mounted.
> Helps verify kernel's event notification distinguishes between devices
> sharing the same FSID/UUID.
> 
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  tests/generic/801     | 135 ++++++++++++++++++++++++++++++++++++++++++
>  tests/generic/801.out |   7 +++
>  2 files changed, 142 insertions(+)
>  create mode 100644 tests/generic/801
>  create mode 100644 tests/generic/801.out
> 
> diff --git a/tests/generic/801 b/tests/generic/801
> new file mode 100644
> index 000000000000..3bfb87d41922
> --- /dev/null
> +++ b/tests/generic/801
> @@ -0,0 +1,135 @@
> +#! /bin/bash
> +# SPDX-License-Identifier: GPL-2.0
> +# Copyright (c) 2026 Anand Jain <asj@kernel.org>.  All Rights Reserved.
> +#
> +# FS QA Test 801
> +# Verify fanotify FID functionality on cloned filesystems by setting up
> +# watchers and making sure notifications are in the correct logs files.
> +
> +. ./common/preamble
> +
> +_begin_fstest auto quick mount clone
> +
> +_require_test
> +_require_block_device $TEST_DEV
> +_require_loop
> +_require_command "$FSNOTIFYWAIT_PROG" fsnotifywait
> +_require_unique_f_fsid
> +
> +_cleanup()
> +{
> +	cd /
> +	[[ -n $pid1 ]] && { kill -TERM "$pid1" 2> /dev/null; wait $pid1; }
> +	[[ -n $pid2 ]] && { kill -TERM "$pid2" 2> /dev/null; wait $pid2; }
> +
> +	if [ "$semanage_added" = "yes" ]; then
> +		semanage permissive -d unconfined_t >/dev/null 2>&1 || true
> +	fi
> +
> +	umount $mnt1 $mnt2 2>/dev/null
> +	_loop_image_destroy "${devs[@]}" 2> /dev/null
> +	rm -r -f $tmp.*
> +}
> +
> +# Run fsnotifywait in unbuffered mode to watch filesystem-wide create events
> +monitor_fanotify()
> +{
> +	local mmnt=$1
> +	exec stdbuf -oL $FSNOTIFYWAIT_PROG -m -F -S -e create "$mmnt" 2>&1

I guess you need stdbuf to force fsnotifywait to run in linebuffered
mode even if you pipe/redirect it somewhere?

> +}
> +
> +# Transform f_fsid into the hi.lo format used in fanotify FID logs
> +fsid_to_fid_parts()
> +{
> +	local fsid=$1
> +	# Pad to 16 hex chars (64-bit), then split into two 32-bit halves
> +	local padded=$(printf '%016x' "0x${fsid}")
> +	local hi=$(printf '%x' "0x${padded:0:8}")   # strips leading zeros
> +	local lo=$(printf '%x' "0x${padded:8:8}")   # strips leading zeros
> +	echo "${hi}.${lo}"
> +}
> +
> +# Create base loop device and its clone
> +devs=()
> +_loop_image_create_clone devs
> +mkdir -p $TEST_DIR/$seq
> +mnt1=$TEST_DIR/$seq/mnt1
> +mnt2=$TEST_DIR/$seq/mnt2
> +mkdir -p $mnt1
> +mkdir -p $mnt2
> +
> +# Mount both base and clone filesystems using required clone mount options
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[0]} $mnt1 || \
> +						_fail "Failed to mount dev1"
> +_mount $(_common_dev_mount_options) $(_clone_mount_option) ${devs[1]} $mnt2 || \
> +						_fail "Failed to mount dev2"
> +
> +# Fetch filesystem IDs to verify the kernel can differentiate between them
> +fsid1=$(stat -f -c "%i" $mnt1)
> +fsid2=$(stat -f -c "%i" $mnt2)
> +
> +log1=$tmp.fanotify1
> +log2=$tmp.fanotify2
> +
> +pid1=""
> +pid2=""
> +echo "Setup FID fanotify watchers on both mnt1 and mnt2"
> +
> +# Permit unconfined_t domains when SELinux is enforcing to prevent fanotify
> +# blockages
> +semanage_added="no"
> +if [ "$(getenforce 2>/dev/null)" = "Enforcing" ]; then
> +    if ! semanage permissive -l | grep -q "unconfined_t"; then
> +        semanage permissive -a unconfined_t >/dev/null 2>&1 && semanage_added="yes"
> +    fi
> +fi

Is there a cleaner way to manage setting up and automatically undoing
this step?

There might not be, since iirc the suggestion to register cleanup
functions in a cleanups=() array and call them all in reverse order
didn't go anywhere.

> +
> +# Start asynchronous fanotify monitors
> +( monitor_fanotify "$mnt1" > "$log1" ) &
> +pid1=$!
> +( monitor_fanotify "$mnt2" > "$log2" ) &
> +pid2=$!
> +sleep 2
> +
> +echo "Trigger file creation on mnt1"
> +touch $mnt1/file_on_mnt1
> +sync
> +sleep 1
> +
> +echo "Trigger file creation on mnt2"
> +touch $mnt2/file_on_mnt2
> +sync
> +sleep 1
> +
> +echo "Verify fsid in the fanotify"
> +kill $pid1 $pid2
> +wait $pid1 $pid2 2>/dev/null
> +pid1=""
> +pid2=""
> +
> +e_fsid1=$(fsid_to_fid_parts "$fsid1")
> +e_fsid2=$(fsid_to_fid_parts "$fsid2")
> +
> +# Dump debug details to the full log
> +echo $fsid1 $e_fsid1 $fsid2 $e_fsid2 >> $seqres.full
> +cat $log1 >> $seqres.full
> +cat $log2 >> $seqres.full
> +
> +# Ensure monitor 1 only captured events belonging to mnt 1 and fsid 1
> +if grep -qF "$e_fsid1" "$log1" && ! grep -qF "$e_fsid2" "$log1"; then
> +	echo "SUCCESS: mnt1 events found"
> +else
> +	[ ! -s "$log1" ] && echo "  - mnt1 received no events."
> +	grep -qF "$e_fsid2" "$log1" && echo "  - mnt1 received event from mnt2."
> +fi
> +
> +# Ensure monitor 2 only captured events belonging to mnt 2 and fsid 2
> +if grep -qF "$e_fsid2" "$log2" && ! grep -qF "$e_fsid1" "$log2"; then
> +	echo "SUCCESS: mnt2 events found"
> +else
> +	[ ! -s "$log2" ] && echo "  - mnt2 received no events."
> +	grep -qF "$e_fsid1" "$log2" && echo "  - mnt2 received event from mnt1."
> +fi
> +
> +status=0
> +exit
> diff --git a/tests/generic/801.out b/tests/generic/801.out
> new file mode 100644
> index 000000000000..d7b318d9f27c
> --- /dev/null
> +++ b/tests/generic/801.out
> @@ -0,0 +1,7 @@
> +QA output created by 801
> +Setup FID fanotify watchers on both mnt1 and mnt2
> +Trigger file creation on mnt1
> +Trigger file creation on mnt2
> +Verify fsid in the fanotify
> +SUCCESS: mnt1 events found
> +SUCCESS: mnt2 events found
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

* Re: [PATCH v6 04/11] fstests: add _require_unique_f_fsid() helper
From: Darrick J. Wong @ 2026-05-29  4:30 UTC (permalink / raw)
  To: Anand Jain
  Cc: fstests, linux-btrfs, linux-ext4, linux-xfs, linux-f2fs-devel,
	zlang, hch
In-Reply-To: <983ed0f63318c930379ee74220f23aa558c16d51.1779939330.git.asj@kernel.org>

On Thu, May 28, 2026 at 12:05:35PM +0800, Anand Jain wrote:
> Add a helper to check if the target filesystem supports unique f_fsid
> tracking across cloned or snapshot instances.
> 
> Certain filesystems like XFS, Btrfs, and F2FS ensure unique f_fsid
> identifiers per filesystem instance. However, Ext4 derives its f_fsid
> directly from its superblock UUID, which leads to identical f_fsid
> values on cloned images until the UUID is manually modified by userspace.
> 
> Introduce _require_unique_f_fsid() to allow test cases requiring strict
> f_fsid uniqueness to skip gracefully on unsupported filesystems.
> 
> Signed-off-by: Anand Jain <asj@kernel.org>
> ---
>  common/rc | 21 +++++++++++++++++++++
>  1 file changed, 21 insertions(+)
> 
> diff --git a/common/rc b/common/rc
> index 937f478963b4..5446552aed92 100644
> --- a/common/rc
> +++ b/common/rc
> @@ -6314,6 +6314,27 @@ _require_fanotify_ioerrors()
>  	_notrun "$FSTYP does not support fanotify ioerrors"
>  }
>  
> +# Ext4 derives f_fsid from the superblock UUID, meaning clones share the
> +# same f_fsid until their UUIDs diverge. Conversely, XFS, Btrfs,
> +# and F2FS ensure f_fsid remains unique per filesystem instance (often by
> +# deriving it from the UUID and underlying block device.)
> +#
> +# Across all filesystems, a UUID collision causes libblkid tools to return
> +# non-deterministic device mappings. It is ultimately the responsibility

"device mappings", as in /dev/disk/by-id/$UUID ?

> +# of the userspace utility or use-case to enforce uniqueness when a clone
> +# diverges. For details, see mailing list thread discussions titled:
> +#      "ext4: derive f_fsid from block device to avoid collisions".

How about providing a direct lore link?

--D

> +_require_unique_f_fsid()
> +{
> +	# Skip the test if the filesystem does not enforce unique f_fsids
> +	# natively. Checking this dynamically requires recreating a clone
> +	# layout, so we use a static lookup based on FSTYP.
> +	if [ "$FSTYP" == "ext4" ]; then
> +		_notrun "Target filesystem ($FSTYP) does not guarantee unique f_fsid on clones."
> +	fi
> +}
> +
> +
>  # Computes a percentage of the available space in a filesystem and
>  # returns that quantity in MB. The percentage must not contain a percent
>  # sign ("%").
> -- 
> 2.43.0
> 
> 

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox