Linux EXT4 FS development

Linux EXT4 FS development
 help / color / mirror / Atom feed

* [PATCH 03/16] blk-crypto: Allow control over whether hardware is used
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

fscrypt uses inline encryption hardware only when the "inlinecrypt"
mount option is given.  I'd like to keep that behavior even after
standardizing on the blk-crypto API for file contents encryption.  That
is, the default should continue to be the well-tested CPU-based
encryption code, and the use of inline encryption hardware should
continue to be an opt-in feature for systems where it's beneficial and
has been fully validated (including verifying ciphertext correctness).

To support this use case, add an allow_hw field to struct
blk_crypto_config.

For now it's always set to true.  Later commits will change that.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 block/blk-crypto.c          | 8 +++++++-
 drivers/md/dm-inlinecrypt.c | 2 +-
 fs/crypto/inline_crypt.c    | 3 ++-
 include/linux/blk-crypto.h  | 6 +++++-
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index dd83fc5af282..c157db869183 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -298,20 +298,21 @@ int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
  * @key_type: type of the key -- either raw or hardware-wrapped
  * @crypto_mode: identifier for the encryption algorithm to use
  * @dun_bytes: number of bytes that will be used to specify the DUN when this
  *	       key is used
  * @data_unit_size: the data unit size to use for en/decryption
+ * @allow_hw: true if using inline encryption hardware is allowed
  *
  * Return: 0 on success, -errno on failure.  The caller is responsible for
  *	   zeroizing both blk_key and key_bytes when done with them.
  */
 int blk_crypto_init_key(struct blk_crypto_key *blk_key,
 			const u8 *key_bytes, size_t key_size,
 			enum blk_crypto_key_type key_type,
 			enum blk_crypto_mode_num crypto_mode,
 			unsigned int dun_bytes,
-			unsigned int data_unit_size)
+			unsigned int data_unit_size, bool allow_hw)
 {
 	const struct blk_crypto_mode *mode;
 
 	memset(blk_key, 0, sizeof(*blk_key));
 
@@ -326,10 +327,12 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key,
 		break;
 	case BLK_CRYPTO_KEY_TYPE_HW_WRAPPED:
 		if (key_size < mode->security_strength ||
 		    key_size > BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE)
 			return -EINVAL;
+		if (!allow_hw)
+			return -EINVAL;
 		break;
 	default:
 		return -EINVAL;
 	}
 
@@ -341,10 +344,11 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key,
 
 	blk_key->crypto_cfg.crypto_mode = crypto_mode;
 	blk_key->crypto_cfg.dun_bytes = dun_bytes;
 	blk_key->crypto_cfg.data_unit_size = data_unit_size;
 	blk_key->crypto_cfg.key_type = key_type;
+	blk_key->crypto_cfg.allow_hw = allow_hw;
 	blk_key->data_unit_size_bits = ilog2(data_unit_size);
 	blk_key->size = key_size;
 	memcpy(blk_key->bytes, key_bytes, key_size);
 
 	return 0;
@@ -366,10 +370,12 @@ bool blk_crypto_config_supported_natively(struct block_device *bdev,
 {
 	struct blk_crypto_profile *profile = bdev_get_queue(bdev)->crypto_profile;
 
 	if (!profile)
 		return false;
+	if (!cfg->allow_hw)
+		return false;
 	if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size))
 		return false;
 	if (profile->max_dun_bytes_supported < cfg->dun_bytes)
 		return false;
 	if (!(profile->key_types_supported & cfg->key_type))
diff --git a/drivers/md/dm-inlinecrypt.c b/drivers/md/dm-inlinecrypt.c
index be1b4aa8f28b..a0f039c1e153 100644
--- a/drivers/md/dm-inlinecrypt.c
+++ b/drivers/md/dm-inlinecrypt.c
@@ -404,11 +404,11 @@ static int inlinecrypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		       (ctx->sector_bits - SECTOR_SHIFT);
 	dun_bytes = DIV_ROUND_UP(fls64(ctx->max_dun), 8);
 
 	err = blk_crypto_init_key(&ctx->key, key_bytes, ctx->key_size,
 				  ctx->key_type, cipher->mode_num,
-				  dun_bytes, ctx->sector_size);
+				  dun_bytes, ctx->sector_size, true);
 	if (err) {
 		ti->error = "Error initializing blk-crypto key";
 		goto bad;
 	}
 
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index 47324062fee5..0d4c0dd04d20 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -132,10 +132,11 @@ int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci,
 	crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode;
 	crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits;
 	crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci);
 	crypto_cfg.key_type = is_hw_wrapped_key ?
 		BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : BLK_CRYPTO_KEY_TYPE_RAW;
+	crypto_cfg.allow_hw = true;
 
 	devs = fscrypt_get_devices(sb, &num_devs);
 	if (IS_ERR(devs))
 		return PTR_ERR(devs);
 
@@ -173,11 +174,11 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
 	if (!blk_key)
 		return -ENOMEM;
 
 	err = blk_crypto_init_key(blk_key, key_bytes, key_size, key_type,
 				  crypto_mode, fscrypt_get_dun_bytes(ci),
-				  1U << ci->ci_data_unit_bits);
+				  1U << ci->ci_data_unit_bits, true);
 	if (err) {
 		fscrypt_err(inode, "error %d initializing blk-crypto key", err);
 		goto fail;
 	}
 
diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h
index f7c3cb4a342f..7b9dca89aec9 100644
--- a/include/linux/blk-crypto.h
+++ b/include/linux/blk-crypto.h
@@ -75,16 +75,20 @@ enum blk_crypto_key_type {
  *	key.  This is the size in bytes of each individual plaintext and
  *	ciphertext.  This is always a power of 2.  It might be e.g. the
  *	filesystem block size or the disk sector size.
  * @dun_bytes: the maximum number of bytes of DUN used when using this key
  * @key_type: the type of this key -- either raw or hardware-wrapped
+ * @allow_hw: true if inline encryption hardware will be used if available;
+ *	      false to always use CPU-based encryption (requires
+ *	      CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)
  */
 struct blk_crypto_config {
 	enum blk_crypto_mode_num crypto_mode;
 	unsigned int data_unit_size;
 	unsigned int dun_bytes;
 	enum blk_crypto_key_type key_type;
+	bool allow_hw;
 };
 
 /**
  * struct blk_crypto_key - an inline encryption key
  * @crypto_cfg: the crypto mode, data unit size, key type, and other
@@ -148,11 +152,11 @@ bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc,
 int blk_crypto_init_key(struct blk_crypto_key *blk_key,
 			const u8 *key_bytes, size_t key_size,
 			enum blk_crypto_key_type key_type,
 			enum blk_crypto_mode_num crypto_mode,
 			unsigned int dun_bytes,
-			unsigned int data_unit_size);
+			unsigned int data_unit_size, bool allow_hw);
 
 int blk_crypto_start_using_key(struct block_device *bdev,
 			       const struct blk_crypto_key *key);
 
 void blk_crypto_evict_key(struct block_device *bdev,
-- 
2.54.0


^ permalink raw reply related

* [PATCH 02/16] blk-crypto: Fold __blk_crypto_cfg_supported() into its caller
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

__blk_crypto_cfg_supported() is called only by
blk_crypto_config_supported_natively(), so fold it in.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 block/blk-crypto-profile.c | 22 ----------------------
 block/blk-crypto.c         | 23 +++++++++++++++++++++--
 2 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index cf447ba4a66e..53126c091b0b 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -333,32 +333,10 @@ void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot)
 		spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
 		wake_up(&profile->idle_slots_wait_queue);
 	}
 }
 
-/**
- * __blk_crypto_cfg_supported() - Check whether the given crypto profile
- *				  supports the given crypto configuration.
- * @profile: the crypto profile to check
- * @cfg: the crypto configuration to check for
- *
- * Return: %true if @profile supports the given @cfg.
- */
-bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
-				const struct blk_crypto_config *cfg)
-{
-	if (!profile)
-		return false;
-	if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size))
-		return false;
-	if (profile->max_dun_bytes_supported < cfg->dun_bytes)
-		return false;
-	if (!(profile->key_types_supported & cfg->key_type))
-		return false;
-	return true;
-}
-
 /*
  * This is an internal function that evicts a key from an inline encryption
  * device that can be either a real device or the blk-crypto-fallback "device".
  * It is used only by blk_crypto_evict_key(); see that function for details.
  */
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 15e25e41b166..dd83fc5af282 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -349,15 +349,34 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key,
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(blk_crypto_init_key);
 
+
+/**
+ * blk_crypto_config_supported_natively() - Check whether a block device
+ *					    supports hardware inline encryption
+ *					    with the given configuration.
+ * @bdev: the block device
+ * @cfg: the crypto configuration to check for
+ *
+ * Return: %true if @bdev supports hardware inline encryption with @cfg.
+ */
 bool blk_crypto_config_supported_natively(struct block_device *bdev,
 					  const struct blk_crypto_config *cfg)
 {
-	return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile,
-					  cfg);
+	struct blk_crypto_profile *profile = bdev_get_queue(bdev)->crypto_profile;
+
+	if (!profile)
+		return false;
+	if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size))
+		return false;
+	if (profile->max_dun_bytes_supported < cfg->dun_bytes)
+		return false;
+	if (!(profile->key_types_supported & cfg->key_type))
+		return false;
+	return true;
 }
 
 /*
  * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the
  * block_device it's submitted to supports inline crypto, or the
-- 
2.54.0


^ permalink raw reply related

* [PATCH 01/16] blk-crypto: Simplify check for fallback support
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers
In-Reply-To: <20260624050334.124606-1-ebiggers@kernel.org>

Since blk-crypto-fallback supports all blk_crypto_keys except wrapped
keys, just check for that condition directly instead of using
__blk_crypto_cfg_supported().  With this done,
__blk_crypto_cfg_supported() is used only to check for hardware support.

Signed-off-by: Eric Biggers <ebiggers@kernel.org>
---
 block/blk-crypto-fallback.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 2a5c52ab74b4..2a8f40a65158 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -494,12 +494,11 @@ bool blk_crypto_fallback_bio_prep(struct bio *bio)
 		/* User didn't call blk_crypto_start_using_key() first */
 		bio_io_error(bio);
 		return false;
 	}
 
-	if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile,
-					&bc->bc_key->crypto_cfg)) {
+	if (bc->bc_key->crypto_cfg.key_type != BLK_CRYPTO_KEY_TYPE_RAW) {
 		bio_endio_status(bio, BLK_STS_NOTSUPP);
 		return false;
 	}
 
 	if (bio_data_dir(bio) == WRITE) {
-- 
2.54.0


^ permalink raw reply related

* [PATCH 00/16] fscrypt: Standardize on blk-crypto
From: Eric Biggers @ 2026-06-24  5:03 UTC (permalink / raw)
  To: linux-fscrypt
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-block,
	Christoph Hellwig, Theodore Ts'o, Andreas Dilger, Baokun Li,
	Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Jaegeuk Kim,
	Chao Yu, Eric Biggers

This series can also be retrieved from:

    git fetch https://git.kernel.org/pub/scm/linux/kernel/git/ebiggers/linux.git/ fscrypt-blk-crypto-v1

Currently, ext4 and f2fs (i.e., the block-based filesystems with fscrypt
support) have two file contents encryption implementations:

 - Filesystem-layer, where code in fs/crypto/ directly invokes
   crypto_skcipher to en/decrypt data using the CPU.  This
   implementation requires the management of bounce pages at the
   filesystem level.  It doesn't support direct I/O or large folios.

 - blk-crypto (also known as inline encryption), where the filesystem
   assigns bio_crypt_ctxs to bios, which are then processed either by
   the CPU using blk-crypto-fallback.c or by inline encryption hardware.
   This supports direct I/O and is compatible with large folios.

Currently, the latter implementation is enabled only when the
"inlinecrypt" mount option is given.

The persistence of the fs-layer implementation is mainly for historical
reasons, as it came first.  It's becoming increasingly hard to maintain,
especially as the filesystems get refactored to use iomap, large folios,
etc.  It's time to remove it and just rely on the similar code in
blk-crypto-fallback.  This series does that.

Some fs-layer encryption support remains in fs/crypto/ for non-block
based filesystems (UBIFS and CephFS), as well as directories and
symlinks.  So it's not entirely gone, but it's reduced.

To be clear, this just changes an internal implementation detail.  ext4
and f2fs continue to fully support encryption (fscrypt), regardless of
the presence of inline encryption hardware on the system.

Eric Biggers (16):
  blk-crypto: Simplify check for fallback support
  blk-crypto: Fold __blk_crypto_cfg_supported() into its caller
  blk-crypto: Allow control over whether hardware is used
  fscrypt: Fully disallow IV_INO_LBLK_32 with s_blocksize != PAGE_SIZE
  fscrypt: Always use blk-crypto for contents on block-based filesystems
  ext4: Remove fs-layer file contents en/decryption code
  ext4: Make ext4_bio_write_folio() return void
  ext4: Further de-generalize the bio postprocessing code
  f2fs: Remove fs-layer file contents en/decryption code
  fs/buffer: Remove fs-layer decryption code
  fscrypt: Replace calls to fscrypt_inode_uses_inline_crypto()
  fscrypt: Remove fscrypt_dio_supported()
  fscrypt: Remove fs-layer zeroout code
  fscrypt: Remove unused functions and workqueue
  fscrypt: Merge bio.c and inline_crypt.c into block.c
  fscrypt: Add safety checks to non-block-based en/decryption

 Documentation/filesystems/fscrypt.rst       |  39 ++-
 arch/loongarch/configs/loongson32_defconfig |   1 -
 arch/loongarch/configs/loongson64_defconfig |   1 -
 block/blk-crypto-fallback.c                 |   3 +-
 block/blk-crypto-profile.c                  |  22 --
 block/blk-crypto.c                          |  31 ++-
 drivers/md/dm-inlinecrypt.c                 |   2 +-
 fs/buffer.c                                 |  45 +---
 fs/crypto/Kconfig                           |   8 +-
 fs/crypto/Makefile                          |   3 +-
 fs/crypto/bio.c                             | 216 ---------------
 fs/crypto/{inline_crypt.c => block.c}       | 283 +++++++++-----------
 fs/crypto/crypto.c                          | 140 ++++------
 fs/crypto/fscrypt_private.h                 |  28 +-
 fs/crypto/keysetup.c                        |  31 +--
 fs/crypto/policy.c                          |  17 ++
 fs/ext4/crypto.c                            |   2 +-
 fs/ext4/ext4.h                              |   6 +-
 fs/ext4/inode.c                             |  64 +----
 fs/ext4/page-io.c                           |  74 +----
 fs/ext4/readpage.c                          | 140 +++-------
 fs/ext4/super.c                             |   6 +-
 fs/f2fs/compress.c                          |  28 +-
 fs/f2fs/data.c                              |  93 +------
 fs/f2fs/f2fs.h                              |   2 -
 fs/f2fs/file.c                              |   2 -
 fs/f2fs/segment.c                           |   2 -
 fs/f2fs/super.c                             |   2 +-
 include/linux/blk-crypto.h                  |   6 +-
 include/linux/fscrypt.h                     |  96 ++-----
 30 files changed, 357 insertions(+), 1036 deletions(-)
 delete mode 100644 fs/crypto/bio.c
 rename fs/crypto/{inline_crypt.c => block.c} (61%)


base-commit: 1dc18801be29bc54709aa355b8acd80e183b03cd
prerequisite-patch-id: 319d2891e88c7df1ebb5ebf434d18b68f770399f
prerequisite-patch-id: f6157c86deab0ff5ec953ae3ed6b0e84f37741bf
prerequisite-patch-id: 5330c9e4b65644baae81bd177a46be6223d2b494
prerequisite-patch-id: 073cb85332cc58e4b5066bf8f7ac948c0d9a2bac
prerequisite-patch-id: 4b1b7521df7ce7157156dbbc373c699060b21e3f
prerequisite-patch-id: edfd2a34a97697517828f233e478e5b7f8cf85c2
-- 
2.54.0


^ permalink raw reply

* Re: [PATCH] ext4, jbd2: abort journal on file data write error under data_err=abort
From: Aditya Prakash Srivastava @ 2026-06-24  3:57 UTC (permalink / raw)
  To: Zhang Yi
  Cc: tytso, jack, adilger.kernel, libaokun, ojaswin, ritesh.list,
	Zhang Yi, linux-ext4, linux-kernel, Anthony Rebello
In-Reply-To: <d94027e3-53e7-4317-bc8d-c72bbd49186f@huaweicloud.com>

Hi Yi,

Thank you very much for your feedback and for pointing out commit
ce51afb8cc5e!

I was investigating the open Bugzilla ticket #207729, which was still
marked as ASSIGNED and unresolved without any links to the fix. I'm very
glad to hear that the abort logic has already been cleanly integrated
into the end-I/O workqueue path by commit ce51afb8cc5e.

I have added a comment to Bugzilla ticket #207729 referencing
commit ce51afb8cc5e so that the ticket can be resolved/closed and
others don't duplicate this work in the future.

Thanks again for the review and guidance!

Best regards,
Aditya

On Wed, Jun 24, 2026 at 8:12 AM Zhang Yi <yi.zhang@huaweicloud.com> wrote:
>
> On 6/24/2026 12:11 AM, Aditya Srivastava wrote:
> > From: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
> >
> > The "data_err=abort" mount option in ext4 is designed to abort the
> > journal and force the filesystem into read-only mode if a file data
> > writeback failure is detected (to prevent silent data loss and stale
> > data exposure).
> >
> > However, in standard data=ordered mode, file data writeback is executed
> > and waited for during transaction commit in
> > journal_finish_inode_data_buffers(). When
> > filemap_fdatawait_range_keep_errors() detects and returns a data writeback
> > error (e.g. -EIO), JBD2 merely prints a warning message and then
> > discards the error. This results in the transaction committing
> > successfully, exposing stale/corrupted data and defeating the purpose
> > of the data_err=abort option.
>
> Hi, Aditya!
>
> Thanks for the patch. However, commit ce51afb8cc5e ("ext4: abort
> journal on data writeback failure if in data_err=abort mode") has
> already moved the abort logic into the end IO workqueue. With that
> commit, it seems the journal abort is triggered properly upon a data
> writeback failure. The issue described in this Bugzilla looks
> outdated. Have you encountered any actual issues so far?
>
> Thanks,
> Yi.
>
> >
> > Fix this by:
> > 1. Defining a new JBD2 configuration flag, JBD2_ABORT_ON_DATA_ERR.
> > 2. In JBD2, if JBD2_ABORT_ON_DATA_ERR is set, abort the transaction commit
> >    and the journal thread via jbd2_journal_abort() if writeback fails.
> > 3. In ext4, configure JBD2_ABORT_ON_DATA_ERR on the journal based on the
> >    ext4 "data_err=abort" mount option in ext4_init_journal_params().
> >
> > Reported-by: Anthony Rebello <arebello@redhat.com>
> > Closes: https://bugzilla.kernel.org/show_bug.cgi?id=207729
> > Suggested-by: Jan Kara <jack@suse.cz>
> > Signed-off-by: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
> > ---
> >  fs/ext4/super.c      | 4 ++++
> >  fs/jbd2/commit.c     | 2 ++
> >  include/linux/jbd2.h | 1 +
> >  3 files changed, 7 insertions(+)
> >
> > diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> > index 7283108d7609..de34490a5b68 100644
> > --- a/fs/ext4/super.c
> > +++ b/fs/ext4/super.c
> > @@ -5875,6 +5875,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
> >               journal->j_flags |= JBD2_BARRIER;
> >       else
> >               journal->j_flags &= ~JBD2_BARRIER;
> > +     if (test_opt(sb, DATA_ERR_ABORT))
> > +             journal->j_flags |= JBD2_ABORT_ON_DATA_ERR;
> > +     else
> > +             journal->j_flags &= ~JBD2_ABORT_ON_DATA_ERR;
> >       /*
> >        * Always enable journal cycle record option, letting the journal
> >        * records log transactions continuously between each mount.
> > diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> > index d8577725a2fb..49acc9d0809e 100644
> > --- a/fs/jbd2/commit.c
> > +++ b/fs/jbd2/commit.c
> > @@ -768,6 +768,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
> >               printk(KERN_WARNING
> >                       "JBD2: Detected IO errors %d while flushing file data on %s\n",
> >                       err, journal->j_devname);
> > +             if (journal->j_flags & JBD2_ABORT_ON_DATA_ERR)
> > +                     jbd2_journal_abort(journal, err);
> >               err = 0;
> >       }
> >
> > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> > index 7e785aa6d35d..e39679656de6 100644
> > --- a/include/linux/jbd2.h
> > +++ b/include/linux/jbd2.h
> > @@ -1410,6 +1410,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,        FAST_COMMIT)
> >  #define JBD2_FLUSHED 0x008   /* The journal superblock has been flushed */
> >  #define JBD2_LOADED  0x010   /* The journal superblock has been loaded */
> >  #define JBD2_BARRIER 0x020   /* Use IDE barriers */
> > +#define JBD2_ABORT_ON_DATA_ERR       0x040   /* Abort the journal on file data write errors */
> >  #define JBD2_CYCLE_RECORD            0x080   /* Journal cycled record log on
> >                                                * clean and empty filesystem
> >                                                * logging area */
>

^ permalink raw reply

* [Bug 207729] Mounting EXT4 with data_err=abort does not abort journal on data block write failure
From: bugzilla-daemon @ 2026-06-24  3:56 UTC (permalink / raw)
  To: linux-ext4
In-Reply-To: <bug-207729-13602@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=207729

Aditya Prakash Srivastava (aditya.ansh182@gmail.com) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|                            |aditya.ansh182@gmail.com

--- Comment #5 from Aditya Prakash Srivastava (aditya.ansh182@gmail.com) ---
This has been resolved by commit ce51afb8cc5e ("ext4: abort
journal on data writeback failure if in data_err=abort mode")

-- 
You may reply to this email to add a comment.

You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [PATCH] ext4, jbd2: abort journal on file data write error under data_err=abort
From: Zhang Yi @ 2026-06-24  2:42 UTC (permalink / raw)
  To: Aditya Srivastava, tytso
  Cc: jack, adilger.kernel, libaokun, ojaswin, ritesh.list, Zhang Yi,
	linux-ext4, linux-kernel, Anthony Rebello
In-Reply-To: <20260623161131.2189-1-aditya.ansh182@gmail.com>

On 6/24/2026 12:11 AM, Aditya Srivastava wrote:
> From: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
> 
> The "data_err=abort" mount option in ext4 is designed to abort the
> journal and force the filesystem into read-only mode if a file data
> writeback failure is detected (to prevent silent data loss and stale
> data exposure).
> 
> However, in standard data=ordered mode, file data writeback is executed
> and waited for during transaction commit in
> journal_finish_inode_data_buffers(). When
> filemap_fdatawait_range_keep_errors() detects and returns a data writeback
> error (e.g. -EIO), JBD2 merely prints a warning message and then
> discards the error. This results in the transaction committing
> successfully, exposing stale/corrupted data and defeating the purpose
> of the data_err=abort option.

Hi, Aditya!

Thanks for the patch. However, commit ce51afb8cc5e ("ext4: abort
journal on data writeback failure if in data_err=abort mode") has
already moved the abort logic into the end IO workqueue. With that
commit, it seems the journal abort is triggered properly upon a data
writeback failure. The issue described in this Bugzilla looks
outdated. Have you encountered any actual issues so far?

Thanks,
Yi.

> 
> Fix this by:
> 1. Defining a new JBD2 configuration flag, JBD2_ABORT_ON_DATA_ERR.
> 2. In JBD2, if JBD2_ABORT_ON_DATA_ERR is set, abort the transaction commit
>    and the journal thread via jbd2_journal_abort() if writeback fails.
> 3. In ext4, configure JBD2_ABORT_ON_DATA_ERR on the journal based on the
>    ext4 "data_err=abort" mount option in ext4_init_journal_params().
> 
> Reported-by: Anthony Rebello <arebello@redhat.com>
> Closes: https://bugzilla.kernel.org/show_bug.cgi?id=207729
> Suggested-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
> ---
>  fs/ext4/super.c      | 4 ++++
>  fs/jbd2/commit.c     | 2 ++
>  include/linux/jbd2.h | 1 +
>  3 files changed, 7 insertions(+)
> 
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 7283108d7609..de34490a5b68 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -5875,6 +5875,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
>  		journal->j_flags |= JBD2_BARRIER;
>  	else
>  		journal->j_flags &= ~JBD2_BARRIER;
> +	if (test_opt(sb, DATA_ERR_ABORT))
> +		journal->j_flags |= JBD2_ABORT_ON_DATA_ERR;
> +	else
> +		journal->j_flags &= ~JBD2_ABORT_ON_DATA_ERR;
>  	/*
>  	 * Always enable journal cycle record option, letting the journal
>  	 * records log transactions continuously between each mount.
> diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
> index d8577725a2fb..49acc9d0809e 100644
> --- a/fs/jbd2/commit.c
> +++ b/fs/jbd2/commit.c
> @@ -768,6 +768,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
>  		printk(KERN_WARNING
>  			"JBD2: Detected IO errors %d while flushing file data on %s\n",
>  			err, journal->j_devname);
> +		if (journal->j_flags & JBD2_ABORT_ON_DATA_ERR)
> +			jbd2_journal_abort(journal, err);
>  		err = 0;
>  	}
>  
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 7e785aa6d35d..e39679656de6 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -1410,6 +1410,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
>  #define JBD2_FLUSHED	0x008	/* The journal superblock has been flushed */
>  #define JBD2_LOADED	0x010	/* The journal superblock has been loaded */
>  #define JBD2_BARRIER	0x020	/* Use IDE barriers */
> +#define JBD2_ABORT_ON_DATA_ERR	0x040	/* Abort the journal on file data write errors */
>  #define JBD2_CYCLE_RECORD		0x080	/* Journal cycled record log on
>  						 * clean and empty filesystem
>  						 * logging area */


^ permalink raw reply

* [PATCH] ext4, jbd2: abort journal on file data write error under data_err=abort
From: Aditya Srivastava @ 2026-06-23 16:11 UTC (permalink / raw)
  To: tytso
  Cc: jack, adilger.kernel, libaokun, ojaswin, ritesh.list, yi.zhang,
	linux-ext4, linux-kernel, Aditya Prakash Srivastava,
	Anthony Rebello

From: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>

The "data_err=abort" mount option in ext4 is designed to abort the
journal and force the filesystem into read-only mode if a file data
writeback failure is detected (to prevent silent data loss and stale
data exposure).

However, in standard data=ordered mode, file data writeback is executed
and waited for during transaction commit in
journal_finish_inode_data_buffers(). When
filemap_fdatawait_range_keep_errors() detects and returns a data writeback
error (e.g. -EIO), JBD2 merely prints a warning message and then
discards the error. This results in the transaction committing
successfully, exposing stale/corrupted data and defeating the purpose
of the data_err=abort option.

Fix this by:
1. Defining a new JBD2 configuration flag, JBD2_ABORT_ON_DATA_ERR.
2. In JBD2, if JBD2_ABORT_ON_DATA_ERR is set, abort the transaction commit
   and the journal thread via jbd2_journal_abort() if writeback fails.
3. In ext4, configure JBD2_ABORT_ON_DATA_ERR on the journal based on the
   ext4 "data_err=abort" mount option in ext4_init_journal_params().

Reported-by: Anthony Rebello <arebello@redhat.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=207729
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
---
 fs/ext4/super.c      | 4 ++++
 fs/jbd2/commit.c     | 2 ++
 include/linux/jbd2.h | 1 +
 3 files changed, 7 insertions(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7283108d7609..de34490a5b68 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -5875,6 +5875,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
 		journal->j_flags |= JBD2_BARRIER;
 	else
 		journal->j_flags &= ~JBD2_BARRIER;
+	if (test_opt(sb, DATA_ERR_ABORT))
+		journal->j_flags |= JBD2_ABORT_ON_DATA_ERR;
+	else
+		journal->j_flags &= ~JBD2_ABORT_ON_DATA_ERR;
 	/*
 	 * Always enable journal cycle record option, letting the journal
 	 * records log transactions continuously between each mount.
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d8577725a2fb..49acc9d0809e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -768,6 +768,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 		printk(KERN_WARNING
 			"JBD2: Detected IO errors %d while flushing file data on %s\n",
 			err, journal->j_devname);
+		if (journal->j_flags & JBD2_ABORT_ON_DATA_ERR)
+			jbd2_journal_abort(journal, err);
 		err = 0;
 	}

diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 7e785aa6d35d..e39679656de6 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -1410,6 +1410,7 @@ JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit,	FAST_COMMIT)
 #define JBD2_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JBD2_LOADED	0x010	/* The journal superblock has been loaded */
 #define JBD2_BARRIER	0x020	/* Use IDE barriers */
+#define JBD2_ABORT_ON_DATA_ERR	0x040	/* Abort the journal on file data write errors */
 #define JBD2_CYCLE_RECORD		0x080	/* Journal cycled record log on
 						 * clean and empty filesystem
 						 * logging area */
-- 
2.47.3

^ permalink raw reply related

* [syzbot ci] Re: ext4: deferred iput framework for EA inodes
From: syzbot ci @ 2026-06-23 13:13 UTC (permalink / raw)
  To: adilger.kernel, jack, libaokun, linux-ext4, linux-kernel, ojaswin,
	ritesh.list, tytso, yi.zhang, yun.zhou
  Cc: syzbot, syzkaller-bugs
In-Reply-To: <20260623083540.2744885-1-yun.zhou@windriver.com>

syzbot ci has tested the following series

[v9] ext4: deferred iput framework for EA inodes
https://lore.kernel.org/all/20260623083540.2744885-1-yun.zhou@windriver.com
* [PATCH v9 1/4] fs: add iput_if_not_last() helper
* [PATCH v9 2/4] ext4: introduce ext4_put_ea_inode() for safe deferred iput
* [PATCH v9 3/4] ext4: convert all EA inode iput() calls to ext4_put_ea_inode()
* [PATCH v9 4/4] ext4: remove ea_inode_array mechanism in favor of ext4_put_ea_inode()

and found the following issue:
WARNING: ODEBUG bug in flush_delayed_work

Full report is available here:
https://ci.syzbot.org/series/acc1a7bd-816f-451e-86ee-a62f88ad8fcc

***

WARNING: ODEBUG bug in flush_delayed_work

tree:      torvalds
URL:       https://kernel.googlesource.com/pub/scm/linux/kernel/git/torvalds/linux
base:      502d801f0ab03e4f32f9a33d203154ce84887921
arch:      amd64
compiler:  Debian clang version 22.1.6 (++20260514074242+fc4aad7b5db3-1~exp1~20260514074407.73), Debian LLD 22.1.6
config:    https://ci.syzbot.org/builds/79d2cdf0-fde6-4e70-a88c-04d44c9d3e44/config
syz repro: https://ci.syzbot.org/findings/9b2e3f5d-279a-469f-9f2a-05f96ea11587/syz_repro

EXT4-fs warning (device loop2): ext4_multi_mount_protect:287: Invalid MMP block in superblock
------------[ cut here ]------------
ODEBUG: assert_init not available (active state 0) object: ffff8881bf08c9e0 object type: timer_list hint: 0x0
WARNING: lib/debugobjects.c:632 at debug_print_object+0xec/0x230 lib/debugobjects.c:629, CPU#1: syz.2.19/5833
Modules linked in:
CPU: 1 UID: 0 PID: 5833 Comm: syz.2.19 Not tainted syzkaller #0 PREEMPT(full) 
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:debug_print_object+0x18a/0x230 lib/debugobjects.c:629
Code: f8 48 c1 e8 03 80 3c 18 00 74 08 4c 89 ff e8 5d 04 71 fd 4d 8b 0f 4c 89 ef 48 8b 74 24 08 48 89 ea 44 89 e1 4d 89 f0 ff 34 24 <67> 48 0f b9 3a 48 83 c4 08 ff 05 a3 23 70 0b 48 83 c4 10 5b 41 5c
RSP: 0018:ffffc9000179f848 EFLAGS: 00010046

RAX: 1ffffffff179eacc RBX: dffffc0000000000 RCX: 0000000000000000
RDX: ffffffff8c2a9980 RSI: ffffffff8c2a93e0 RDI: ffffffff903ddbf0
RBP: ffffffff8c2a9980 R08: ffff8881bf08c9e0 R09: ffffffff8bcf69c0
R10: dffffc0000000000 R11: ffffffff81b25990 R12: 0000000000000000
R13: ffffffff903ddbf0 R14: ffff8881bf08c9e0 R15: ffffffff8bcf5660
FS:  00007fd11e12e6c0(0000) GS:ffff8882a922d000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffe46d80f88 CR3: 00000001bf40e000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 debug_object_assert_init+0x237/0x370 lib/debugobjects.c:1075
 debug_timer_assert_init kernel/time/timer.c:803 [inline]
 debug_assert_init kernel/time/timer.c:848 [inline]
 __try_to_del_timer_sync kernel/time/timer.c:1457 [inline]
 __timer_delete_sync+0x181/0x520 kernel/time/timer.c:1621
 flush_delayed_work+0x48/0x100 kernel/workqueue.c:4414
 ext4_drain_ea_inode_work fs/ext4/xattr.h:196 [inline]
 __ext4_fill_super fs/ext4/super.c:5795 [inline]
 ext4_fill_super+0x54c7/0x66d0 fs/ext4/super.c:5844
 get_tree_bdev_flags+0x430/0x4f0 fs/super.c:1634
 vfs_get_tree+0x92/0x2a0 fs/super.c:1694
 fc_mount fs/namespace.c:1198 [inline]
 do_new_mount_fc fs/namespace.c:3765 [inline]
 do_new_mount+0x319/0xdc0 fs/namespace.c:3841
 do_mount fs/namespace.c:4174 [inline]
 __do_sys_mount fs/namespace.c:4390 [inline]
 __se_sys_mount+0x31d/0x420 fs/namespace.c:4367
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0x174/0x580 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7fd11d19e0ca
Code: 48 c7 c2 e8 ff ff ff f7 d8 64 89 02 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007fd11e12de58 EFLAGS: 00000246
 ORIG_RAX: 00000000000000a5
RAX: ffffffffffffffda RBX: 00007fd11e12dee0 RCX: 00007fd11d19e0ca
RDX: 0000200000000400 RSI: 0000200000000340 RDI: 00007fd11e12dea0
RBP: 0000200000000400 R08: 00007fd11e12dee0 R09: 000000000021c91c
R10: 000000000021c91c R11: 0000000000000246 R12: 0000200000000340
R13: 00007fd11e12dea0 R14: 000000000000051a R15: 0000200000000240
 </TASK>
----------------
Code disassembly (best guess):
   0:	f8                   	clc
   1:	48 c1 e8 03          	shr    $0x3,%rax
   5:	80 3c 18 00          	cmpb   $0x0,(%rax,%rbx,1)
   9:	74 08                	je     0x13
   b:	4c 89 ff             	mov    %r15,%rdi
   e:	e8 5d 04 71 fd       	call   0xfd710470
  13:	4d 8b 0f             	mov    (%r15),%r9
  16:	4c 89 ef             	mov    %r13,%rdi
  19:	48 8b 74 24 08       	mov    0x8(%rsp),%rsi
  1e:	48 89 ea             	mov    %rbp,%rdx
  21:	44 89 e1             	mov    %r12d,%ecx
  24:	4d 89 f0             	mov    %r14,%r8
  27:	ff 34 24             	push   (%rsp)
* 2a:	67 48 0f b9 3a       	ud1    (%edx),%rdi <-- trapping instruction
  2f:	48 83 c4 08          	add    $0x8,%rsp
  33:	ff 05 a3 23 70 0b    	incl   0xb7023a3(%rip)        # 0xb7023dc
  39:	48 83 c4 10          	add    $0x10,%rsp
  3d:	5b                   	pop    %rbx
  3e:	41 5c                	pop    %r12


***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
  Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.

To test a patch for this bug, please reply with `#syz test`
(should be on a separate line).

The patch should be attached to the email.
Note: arguments like custom git repos and branches are not supported.

^ permalink raw reply

* [PATCH v2] ext4: fix ABBA deadlock in ext4_xattr_inode_cache_find()
From: Aditya Srivastava @ 2026-06-23  9:59 UTC (permalink / raw)
  To: tytso
  Cc: jack, adilger.kernel, libaokun, ritesh.list, yi.zhang, linux-ext4,
	linux-kernel, Aditya Prakash Srivastava, Colin Ian King

From: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>

Syzbot/stress-ng reported an ABBA deadlock in ext4 when exercising
concurrent xattr workloads (using the ea_inode mount/format option).

The deadlock occurs between the running transaction and the eviction
thread:
- Task 1 (stress-ng): Holds a reference to a shared mbcache_entry (ce)
  and calls ext4_xattr_inode_cache_find() -> ext4_iget() to retrieve
  the corresponding EA inode. Since the EA inode is currently being
  evicted, ext4_iget() blocks in __wait_on_freeing_inode() waiting for
  eviction to complete.
- Task 2 (eviction thread): Currently evicting the same EA inode in
  ext4_evict_ea_inode(). It calls mb_cache_entry_wait_unused(oe) which
  blocks waiting for Task 1 to release the reference to the mbcache_entry.

To break this deadlock, perform a non-blocking lookup of the EA inode
using VFS's find_inode_nowait() API. If the EA inode is currently being
evicted (marked with I_FREEING or I_WILL_FREE), simply skip it (treat
as a cache miss) rather than waiting for eviction to complete. If the
returned inode is found to be I_NEW, wait for its initialization to
complete using wait_on_new_inode().

This deadlock was made much easier to hit after commit 0a46ef234756
("ext4: do not create EA inode under buffer lock") which removed
synchronization on the buffer lock.

Reported-by: Colin Ian King <colin.i.king@gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219283
Fixes: 0a46ef234756 ("ext4: do not create EA inode under buffer lock")
Signed-off-by: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
---
Changes in v2:
  - Read inode state locklessly using inode_state_read_once() to resolve
    a lockdep assertion on cache hit.
  - Manually restore essential inode/ea_inode validations on the retrieved
    inode (is_bad_inode, EXT4_EA_INODE_FL, file_acl, and xattr checks) to
    match VFS safety guarantees and prevent using corrupted/failed inodes.

 fs/ext4/xattr.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 982a1f831e22..ef13e7a76153 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1523,6 +1523,20 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 	return ea_inode;
 }
 
+static int ext4_xattr_inode_match(struct inode *inode, u64 ino, void *data)
+{
+	if (inode->i_ino != ino)
+		return 0;
+	spin_lock(&inode->i_lock);
+	if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
+		spin_unlock(&inode->i_lock);
+		return 0;
+	}
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	return 1;
+}
+
 static struct inode *
 ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 			    size_t value_len, u32 hash)
@@ -1549,10 +1563,19 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 	}
 
 	while (ce) {
-		ea_inode = ext4_iget(inode->i_sb, ce->e_value,
-				     EXT4_IGET_EA_INODE);
-		if (IS_ERR(ea_inode))
+		ea_inode = find_inode_nowait(inode->i_sb, ce->e_value,
+					     ext4_xattr_inode_match, NULL);
+		if (!ea_inode)
 			goto next_entry;
+		if (inode_state_read_once(ea_inode) & I_NEW)
+			wait_on_new_inode(ea_inode);
+		if (is_bad_inode(ea_inode) ||
+		    !(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) ||
+		    ext4_test_inode_state(ea_inode, EXT4_STATE_XATTR) ||
+		    EXT4_I(ea_inode)->i_file_acl) {
+			iput(ea_inode);
+			goto next_entry;
+		}
 		ext4_xattr_inode_set_class(ea_inode);
 		if (i_size_read(ea_inode) == value_len &&
 		    !ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
-- 
2.47.3


^ permalink raw reply related

* [PATCH] ext4: cancel dirty accounting for folios without buffers
From: Zhu Jia @ 2026-06-23  9:49 UTC (permalink / raw)
  To: tytso, adilger.kernel
  Cc: libaokun, jack, ojaswin, ritesh.list, yi.zhang, linux-ext4,
	linux-kernel, Zhu Jia, stable

Since commit cc5095747edf ("ext4: don't BUG if someone dirty pages
without asking ext4 first"), mpage_prepare_extent_to_map() handles dirty
folios without buffer heads by warning, clearing PG_dirty, and skipping
them. ext4 cannot write these folios because there are no buffer heads to
map and submit.

That recovery leaves dirty accounting behind: folio_clear_dirty() clears
PG_dirty but does not undo the accounting charged when the folio was
dirtied. We have seen this in production as Dirty/nr_dirty staying high
while Writeback/nr_writeback and device write IO stayed near zero, with
many writer tasks blocked in balance_dirty_pages() throttling. Thus the
warning-and-skip recovery can still become a dirty-throttle DoS.

Use folio_cancel_dirty() so dropping PG_dirty also cancels the dirty
accounting.

Fixes: cc5095747edf ("ext4: don't BUG if someone dirty pages without asking ext4 first")
Cc: stable@vger.kernel.org
Signed-off-by: Zhu Jia <zhujia.zj@bytedance.com>
---
 fs/ext4/inode.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d1..7ea280e70c06e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2715,7 +2715,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			 */
 			if (!folio_buffers(folio)) {
 				ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
-				folio_clear_dirty(folio);
+				/*
+				 * folio_cancel_dirty() pairs the dropped dirty
+				 * state with dirty accounting, but leaves stale
+				 * PAGECACHE_TAG_DIRTY/TOWRITE tags behind. Later
+				 * writeback may rescan this clean folio.
+				 */
+				folio_cancel_dirty(folio);
 				folio_unlock(folio);
 				continue;
 			}
-- 
2.20.1

^ permalink raw reply related

* [PATCH] ext4: fix ABBA deadlock in ext4_xattr_inode_cache_find()
From: Aditya Srivastava @ 2026-06-23  9:05 UTC (permalink / raw)
  To: tytso
  Cc: jack, adilger.kernel, libaokun, ritesh.list, yi.zhang, linux-ext4,
	linux-kernel, Aditya Prakash Srivastava, Colin Ian King

From: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>

Syzbot/stress-ng reported an ABBA deadlock in ext4 when exercising
concurrent xattr workloads (using the ea_inode mount/format option).

The deadlock occurs between the running transaction and the eviction
thread:
- Task 1 (stress-ng): Holds a reference to a shared mbcache_entry (ce)
  and calls ext4_xattr_inode_cache_find() -> ext4_iget() to retrieve
  the corresponding EA inode. Since the EA inode is currently being
  evicted, ext4_iget() blocks in __wait_on_freeing_inode() waiting for
  eviction to complete.
- Task 2 (eviction thread): Currently evicting the same EA inode in
  ext4_evict_ea_inode(). It calls mb_cache_entry_wait_unused(oe) which
  blocks waiting for Task 1 to release the reference to the mbcache_entry.

To break this deadlock, perform a non-blocking lookup of the EA inode
using VFS's find_inode_nowait() API. If the EA inode is currently being
evicted (marked with I_FREEING or I_WILL_FREE), simply skip it (treat
as a cache miss) rather than waiting for eviction to complete. If the
returned inode is found to be I_NEW, wait for its initialization to
complete using wait_on_new_inode().

This deadlock was made much easier to hit after commit 0a46ef234756
("ext4: do not create EA inode under buffer lock") which removed
synchronization on the buffer lock.

Reported-by: Colin Ian King <colin.i.king@gmail.com>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219283
Fixes: 0a46ef234756 ("ext4: do not create EA inode under buffer lock")
Signed-off-by: Aditya Prakash Srivastava <aditya.ansh182@gmail.com>
---
 fs/ext4/xattr.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 982a1f831e22..8c0082362a9b 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1523,6 +1523,20 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 	return ea_inode;
 }
 
+static int ext4_xattr_inode_match(struct inode *inode, u64 ino, void *data)
+{
+	if (inode->i_ino != ino)
+		return 0;
+	spin_lock(&inode->i_lock);
+	if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
+		spin_unlock(&inode->i_lock);
+		return 0;
+	}
+	__iget(inode);
+	spin_unlock(&inode->i_lock);
+	return 1;
+}
+
 static struct inode *
 ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 			    size_t value_len, u32 hash)
@@ -1549,10 +1563,12 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 	}
 
 	while (ce) {
-		ea_inode = ext4_iget(inode->i_sb, ce->e_value,
-				     EXT4_IGET_EA_INODE);
-		if (IS_ERR(ea_inode))
+		ea_inode = find_inode_nowait(inode->i_sb, ce->e_value,
+					     ext4_xattr_inode_match, NULL);
+		if (!ea_inode)
 			goto next_entry;
+		if (inode_state_read(ea_inode) & I_NEW)
+			wait_on_new_inode(ea_inode);
 		ext4_xattr_inode_set_class(ea_inode);
 		if (i_size_read(ea_inode) == value_len &&
 		    !ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
-- 
2.47.3


^ permalink raw reply related

* [PATCH v9 2/4] ext4: introduce ext4_put_ea_inode() for safe deferred iput
From: Yun Zhou @ 2026-06-23  8:52 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, viro, brauner
  Cc: linux-fsdevel, linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623085243.2816425-1-yun.zhou@windriver.com>

Calling iput() on EA inodes while holding xattr_sem or a jbd2 handle
can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem,
creating a lock ordering issue during mount (!SB_ACTIVE).

Add ext4_put_ea_inode() which uses iput_if_not_last() as a fast path.
If this is not the last reference, it is dropped immediately.  If this
is the last reference, the inode is linked onto a per-sb lock-free llist
via i_ea_iput_node (embedded in ext4_inode_info, sharing space with the
unused xattr_sem of EA inodes via a union) and a delayed worker
(1 jiffy) performs the final iput() in a clean context.  This avoids
per-iput memory allocation.

Convert the first call site: ext4_xattr_block_set()'s "Drop the
previous xattr block" path, which previously called
ext4_xattr_inode_array_free() under xattr_sem + jbd2 handle.

The worker is drained in ext4_put_super() before quota shutdown using
a loop to handle re-arming (evicting an EA inode may queue further EA
inodes). Initialization is placed before journal loading since fast
commit replay may trigger evictions that call ext4_put_ea_inode().

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h  | 13 ++++++++-
 fs/ext4/super.c | 18 +++++++++++-
 fs/ext4/xattr.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/xattr.h | 14 ++++++++++
 4 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b37c136ea3ab..b9b0ada7774b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1070,8 +1070,14 @@ struct ext4_inode_info {
 	 * between readers of EAs and writers of regular file data, so
 	 * instead we synchronize on xattr_sem when reading or changing
 	 * EAs.
+	 *
+	 * EA inodes (EXT4_EA_INODE_FL) do not use xattr_sem; they reuse
+	 * the space for deferred iput linkage.
 	 */
-	struct rw_semaphore xattr_sem;
+	union {
+		struct rw_semaphore xattr_sem;
+		struct llist_node i_ea_iput_node;
+	};
 
 	/*
 	 * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
@@ -1770,6 +1776,11 @@ struct ext4_sb_info {
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_ea_block_cache;
 	struct mb_cache *s_ea_inode_cache;
+
+	/* Deferred iput for EA inodes to avoid lock ordering issues */
+	struct llist_head s_ea_inode_to_free;
+	struct delayed_work s_ea_inode_work;
+
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Journal triggers for checksum computation */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 245f67d10ded..97f0e7c1b254 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1303,6 +1303,8 @@ static void ext4_put_super(struct super_block *sb)
 			 &sb->s_uuid);
 
 	ext4_unregister_li_request(sb);
+	/* Drain deferred EA inode iputs while quota is still active. */
+	ext4_drain_ea_inode_work(sbi);
 	ext4_quotas_off(sb, EXT4_MAXQUOTAS);
 
 	destroy_workqueue(sbi->rsv_conversion_wq);
@@ -1423,6 +1425,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
 #endif
 	ei->jinode = NULL;
+	/*
+	 * Reinitialize xattr_sem every allocation because EA inodes
+	 * share this space with i_ea_iput_node (via union) which may
+	 * have overwritten the semaphore when the slab object was
+	 * previously used as an EA inode.
+	 */
+	init_rwsem(&ei->xattr_sem);
 	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
 	spin_lock_init(&ei->i_completed_io_lock);
 	ei->i_sync_tid = 0;
@@ -1488,7 +1497,6 @@ static void init_once(void *foo)
 	struct ext4_inode_info *ei = foo;
 
 	INIT_LIST_HEAD(&ei->i_orphan);
-	init_rwsem(&ei->xattr_sem);
 	init_rwsem(&ei->i_data_sem);
 	inode_init_once(&ei->vfs_inode);
 	ext4_fc_init_inode(&ei->vfs_inode);
@@ -5508,6 +5516,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	 * The first inode we look at is the journal inode.  Don't try
 	 * root first: it may be modified in the journal!
 	 */
+	ext4_init_ea_inode_work(sbi);
+
 	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
 		err = ext4_load_and_init_journal(sb, es, ctx);
 		if (err)
@@ -5747,6 +5757,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	return 0;
 
 failed_mount9:
+	/* Drain deferred EA inode iputs before quota shutdown */
+	ext4_drain_ea_inode_work(sbi);
 	ext4_quotas_off(sb, EXT4_MAXQUOTAS);
 failed_mount8: __maybe_unused
 	ext4_release_orphan_info(sb);
@@ -5767,6 +5779,8 @@ failed_mount8: __maybe_unused
 	if (EXT4_SB(sb)->rsv_conversion_wq)
 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
 failed_mount_wq:
+	/* Drain deferred EA inode iputs before freeing structures */
+	ext4_drain_ea_inode_work(sbi);
 	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
 	sbi->s_ea_inode_cache = NULL;
 
@@ -5777,6 +5791,8 @@ failed_mount8: __maybe_unused
 		ext4_journal_destroy(sbi, sbi->s_journal);
 	}
 failed_mount3a:
+	/* Drain deferred EA inode iputs from journal replay */
+	ext4_drain_ea_inode_work(sbi);
 	ext4_es_unregister_shrinker(sbi);
 failed_mount3:
 	/* flush s_sb_upd_work before sbi destroy */
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 982a1f831e22..ecdad5920b14 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -117,6 +117,8 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+				struct ext4_xattr_inode_array *array);
 
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
@@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		ext4_xattr_release_block(handle, inode, bs->bh,
 					 &ea_inode_array,
 					 0 /* extra_credits */);
-		ext4_xattr_inode_array_free(ea_inode_array);
+		ext4_xattr_inode_array_free_deferred(inode->i_sb,
+						     ea_inode_array);
 	}
 	error = 0;
 
@@ -3025,6 +3028,74 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 	kfree(ea_inode_array);
 }
 
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+				struct ext4_xattr_inode_array *array)
+{
+	int idx;
+
+	if (array == NULL)
+		return;
+
+	for (idx = 0; idx < array->count; ++idx)
+		ext4_put_ea_inode(sb, array->inodes[idx]);
+	kfree(array);
+}
+
+/*
+ * Worker function for deferred EA inode iput.  Processes all inodes queued
+ * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
+ */
+static void ext4_ea_inode_work(struct work_struct *work)
+{
+	struct ext4_sb_info *sbi = container_of(to_delayed_work(work),
+						struct ext4_sb_info,
+						s_ea_inode_work);
+	struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free);
+	struct llist_node *next;
+
+	while (node) {
+		struct ext4_inode_info *ei = container_of(node,
+					struct ext4_inode_info, i_ea_iput_node);
+		next = node->next;
+		iput(&ei->vfs_inode);
+		node = next;
+	}
+}
+
+/*
+ * Release a VFS reference on an EA inode.  Must be used instead of iput()
+ * in any context where xattr_sem or a jbd2 handle is held.
+ *
+ * If this is not the last reference, drops it immediately via
+ * iput_if_not_last() with no further action needed.
+ *
+ * If this is the last reference, the inode is linked onto a per-sb
+ * llist via i_ea_iput_node (embedded in ext4_inode_info, sharing space
+ * with the unused xattr_sem) and a delayed worker performs the final
+ * iput() in a clean context.
+ */
+void ext4_put_ea_inode(struct super_block *sb, struct inode *inode)
+{
+	if (!inode)
+		return;
+	WARN_ON_ONCE(!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL));
+	if (iput_if_not_last(inode))
+		return;
+	llist_add(&EXT4_I(inode)->i_ea_iput_node,
+		  &EXT4_SB(sb)->s_ea_inode_to_free);
+	/*
+	 * Use a short delay to allow multiple EA inodes to accumulate,
+	 * reducing workqueue wakeups when several are released together.
+	 */
+	schedule_delayed_work(&EXT4_SB(sb)->s_ea_inode_work, 1);
+}
+
+void ext4_init_ea_inode_work(struct ext4_sb_info *sbi)
+{
+	init_llist_head(&sbi->s_ea_inode_to_free);
+	INIT_DELAYED_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work);
+}
+
 /*
  * ext4_xattr_block_cache_insert()
  *
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1fedf44d4fb6..9883ba5569a1 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -190,6 +190,20 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
 extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
+extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi);
+extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
+
+/*
+ * Drain all pending deferred EA inode iputs.  Must be called before
+ * freeing resources that eviction depends on (quota, block allocator).
+ * Loops because worker iput may trigger eviction that re-queues.
+ */
+static inline void ext4_drain_ea_inode_work(struct ext4_sb_info *sbi)
+{
+	while (flush_delayed_work(&sbi->s_ea_inode_work) ||
+	       !llist_empty(&sbi->s_ea_inode_to_free))
+		;
+}
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 0/4] ext4: deferred iput framework for EA inodes
From: Yun Zhou @ 2026-06-23  8:52 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, viro, brauner
  Cc: linux-fsdevel, linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623083540.2744885-5-yun.zhou@windriver.com>

This series introduces a deferred-iput framework for EA inodes to
eliminate a class of lock ordering issues in ext4 xattr code.

The problem: iput() on EA inodes while holding xattr_sem or a jbd2
handle can trigger eviction, which may acquire those same locks or
s_writepages_rwsem, creating circular dependencies.  The immediate
deadlock (during mount-time orphan cleanup) is fixed by two separate
patches already reviewed and posted:

  ext4: skip extra isize expansion during mount to prevent deadlock
  ext4: set EXT4_STATE_NO_EXPAND in ext4_evict_inode

This series provides the structural fix that makes the code safe
regardless of calling context:

Patch 1 adds a VFS helper iput_if_not_last() which drops an inode
reference only if it is not the last one, using atomic_add_unless().
This provides a proper VFS abstraction for filesystems that need to
conditionally defer final iput.

Patch 2 introduces ext4_put_ea_inode() using iput_if_not_last() as
a fast path (single atomic, zero overhead for the common case).  If
this is the last reference, the inode is linked onto a per-sb llist
(via i_ea_iput_node embedded in ext4_inode_info, union with xattr_sem
which is unused for EA inodes) and a delayed worker (1 jiffy) performs
the final iput() in a clean context.  No per-iput allocation needed.
Also moves init_rwsem(xattr_sem) from init_once to ext4_alloc_inode
to handle slab reuse after the union field has been overwritten.

Patch 3 converts all EA inode iput() calls in xattr code to use
ext4_put_ea_inode() uniformly -- no exceptions to reason about.

Patch 4 removes the now-redundant ea_inode_array mechanism (parameter
threading, struct, expand/free functions), replaced entirely by direct
ext4_put_ea_inode() calls.  This is a net code reduction.

Link: https://syzkaller.appspot.com/bug?extid=5d19358d7eb30ffb0cc5

v9:
 - Add iput_if_not_last() as proper VFS helper (per reviewer: don't
   let filesystems manipulate inode refcount without VFS abstraction).
 - Use iput_if_not_last() + llist_node embedded in ext4_inode_info
   (union with xattr_sem) to avoid per-iput allocation entirely.
 - Convert ALL EA inode iput() calls uniformly -- no exceptions.
 - Remove entire ea_inode_array mechanism.
 - Add WARN_ON_ONCE in ext4_put_ea_inode() to catch misuse on non-EA
   inodes (protects the xattr_sem union safety).
 - Fix worker re-arm: ext4_drain_ea_inode_work() loops to handle
   nested EA inode evictions re-scheduling work.
 - Move INIT_DELAYED_WORK before journal loading (fast commit replay
   may trigger evictions).
 - Drain before ext4_quotas_off() for correct quota accounting.
 - Add flush in failed_mount_wq and failed_mount3a error paths for
   journal replay case.
 - Move init_rwsem(xattr_sem) from init_once to ext4_alloc_inode to
   handle slab object reuse after union overwrite.
 - Encapsulate worker init into ext4_init_ea_inode_work(), making
   ext4_ea_inode_work() static to xattr.c.

(Resending with VFS maintainers Cc'd -- no code changes from initial posting)

Yun Zhou (4):
  fs: add iput_if_not_last() helper
  ext4: introduce ext4_put_ea_inode() for safe deferred iput
  ext4: convert all EA inode iput() calls to ext4_put_ea_inode()
  ext4: remove ea_inode_array mechanism in favor of ext4_put_ea_inode()

 fs/ext4/ext4.h     |  13 ++++-
 fs/ext4/inode.c    |   6 +-
 fs/ext4/super.c    |  18 +++++-
 fs/ext4/xattr.c    | 150 ++++++++++++++++++++-------------------------
 fs/ext4/xattr.h    |  21 ++++---
 include/linux/fs.h |  13 ++++
 6 files changed, 125 insertions(+), 96 deletions(-)

-- 
2.43.0

^ permalink raw reply

* [PATCH v9 4/4] ext4: remove ea_inode_array mechanism in favor of ext4_put_ea_inode()
From: Yun Zhou @ 2026-06-23  8:52 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, viro, brauner
  Cc: linux-fsdevel, linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623085243.2816425-1-yun.zhou@windriver.com>

Now that ext4_put_ea_inode() handles deferred iput safely for all cases
(using iput_if_not_last + embedded llist_node), the ea_inode_array
mechanism for batching deferred iputs is redundant.

Remove:
- ext4_expand_inode_array() and ext4_xattr_inode_array_free()
- ext4_xattr_inode_array_free_deferred()
- struct ext4_xattr_inode_array and EIA_INCR/EIA_MASK defines
- ea_inode_array parameter from ext4_xattr_inode_dec_ref_all(),
  ext4_xattr_release_block(), and ext4_xattr_delete_inode()
- ea_inode_array variable from ext4_evict_inode()

Instead, ext4_xattr_inode_dec_ref_all() now calls ext4_put_ea_inode()
directly after processing each EA inode.  This simplifies the code
by eliminating multi-layer parameter threading and removes the need
for callers to manage array lifetime.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c |  6 +---
 fs/ext4/xattr.c | 95 +++----------------------------------------------
 fs/ext4/xattr.h |  7 ----
 3 files changed, 6 insertions(+), 102 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d131371ad3d..6f1b84e46a2e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -176,7 +176,6 @@ void ext4_evict_inode(struct inode *inode)
 	 * (xattr block freeing), bitmap, group descriptor (inode freeing)
 	 */
 	int extra_credits = 6;
-	struct ext4_xattr_inode_array *ea_inode_array = NULL;
 	bool freeze_protected = false;
 
 	trace_ext4_evict_inode(inode);
@@ -282,8 +281,7 @@ void ext4_evict_inode(struct inode *inode)
 	}
 
 	/* Remove xattr references. */
-	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
-				      extra_credits);
+	err = ext4_xattr_delete_inode(handle, inode, extra_credits);
 	if (err) {
 		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
 stop_handle:
@@ -291,7 +289,6 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_orphan_del(NULL, inode);
 		if (freeze_protected)
 			sb_end_intwrite(inode->i_sb);
-		ext4_xattr_inode_array_free(ea_inode_array);
 		goto no_delete;
 	}
 
@@ -321,7 +318,6 @@ void ext4_evict_inode(struct inode *inode)
 	ext4_journal_stop(handle);
 	if (freeze_protected)
 		sb_end_intwrite(inode->i_sb);
-	ext4_xattr_inode_array_free(ea_inode_array);
 	return;
 no_delete:
 	/*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 90b693b78a45..7f334349bd4f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -114,12 +114,6 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
 #define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_ea_inode_cache)
 
-static int
-ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
-			struct inode *inode);
-static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
-				struct ext4_xattr_inode_array *array);
-
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -1162,7 +1156,6 @@ static void
 ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 			     struct buffer_head *bh,
 			     struct ext4_xattr_entry *first, bool block_csum,
-			     struct ext4_xattr_inode_array **ea_inode_array,
 			     int extra_credits, bool skip_quota)
 {
 	struct inode *ea_inode;
@@ -1199,14 +1192,6 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err)
 			continue;
 
-		err = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (err) {
-			ext4_warning_inode(ea_inode,
-					   "Expand inode array err=%d", err);
-			ext4_put_ea_inode(parent->i_sb, ea_inode);
-			continue;
-		}
-
 		err = ext4_journal_ensure_credits_fn(handle, credits, credits,
 			ext4_free_metadata_revoke_credits(parent->i_sb, 1),
 			ext4_xattr_restart_fn(handle, parent, bh, block_csum,
@@ -1214,6 +1199,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err < 0) {
 			ext4_warning_inode(ea_inode, "Ensure credits err=%d",
 					   err);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			continue;
 		}
 		if (err > 0) {
@@ -1223,6 +1209,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 				ext4_warning_inode(ea_inode,
 						"Re-get write access err=%d",
 						err);
+				ext4_put_ea_inode(parent->i_sb, ea_inode);
 				continue;
 			}
 		}
@@ -1231,6 +1218,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err) {
 			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
 					   err);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			continue;
 		}
 
@@ -1247,6 +1235,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		entry->e_value_inum = 0;
 		entry->e_value_size = 0;
 
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 		dirty = true;
 	}
 
@@ -1273,7 +1262,6 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh,
-			 struct ext4_xattr_inode_array **ea_inode_array,
 			 int extra_credits)
 {
 	struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
@@ -1315,7 +1303,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
 						     BFIRST(bh),
 						     true /* block_csum */,
-						     ea_inode_array,
 						     extra_credits,
 						     true /* skip_quota */);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
@@ -2184,13 +2171,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
 	/* Drop the previous xattr block. */
 	if (bs->bh && bs->bh != new_bh) {
-		struct ext4_xattr_inode_array *ea_inode_array = NULL;
-
 		ext4_xattr_release_block(handle, inode, bs->bh,
-					 &ea_inode_array,
 					 0 /* extra_credits */);
-		ext4_xattr_inode_array_free_deferred(inode->i_sb,
-						     ea_inode_array);
 	}
 	error = 0;
 
@@ -2866,46 +2848,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	return error;
 }
 
-#define EIA_INCR 16 /* must be 2^n */
-#define EIA_MASK (EIA_INCR - 1)
-
-/* Add the large xattr @inode into @ea_inode_array for deferred iput().
- * If @ea_inode_array is new or full it will be grown and the old
- * contents copied over.
- */
-static int
-ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
-			struct inode *inode)
-{
-	if (*ea_inode_array == NULL) {
-		/*
-		 * Start with 15 inodes, so it fits into a power-of-two size.
-		 */
-		(*ea_inode_array) = kmalloc_flex(**ea_inode_array, inodes,
-						 EIA_MASK, GFP_NOFS);
-		if (*ea_inode_array == NULL)
-			return -ENOMEM;
-		(*ea_inode_array)->count = 0;
-	} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
-		/* expand the array once all 15 + n * 16 slots are full */
-		struct ext4_xattr_inode_array *new_array = NULL;
-
-		new_array = kmalloc_flex(**ea_inode_array, inodes,
-					 (*ea_inode_array)->count + EIA_INCR,
-					 GFP_NOFS);
-		if (new_array == NULL)
-			return -ENOMEM;
-		memcpy(new_array, *ea_inode_array,
-		       struct_size(*ea_inode_array, inodes,
-				   (*ea_inode_array)->count));
-		kfree(*ea_inode_array);
-		*ea_inode_array = new_array;
-	}
-	(*ea_inode_array)->count++;
-	(*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
-	return 0;
-}
-
 /*
  * ext4_xattr_delete_inode()
  *
@@ -2916,7 +2858,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
  * references on xattr block and xattr inodes.
  */
 int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			    struct ext4_xattr_inode_array **ea_inode_array,
 			    int extra_credits)
 {
 	struct buffer_head *bh = NULL;
@@ -2955,7 +2896,6 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
 						     IFIRST(header),
 						     false /* block_csum */,
-						     ea_inode_array,
 						     extra_credits,
 						     false /* skip_quota */);
 	}
@@ -2994,7 +2934,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 		}
 
-		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+		ext4_xattr_release_block(handle, inode, bh,
 					 extra_credits);
 		/*
 		 * Update i_file_acl value in the same transaction that releases
@@ -3016,31 +2956,6 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	return error;
 }
 
-void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
-{
-	int idx;
-
-	if (ea_inode_array == NULL)
-		return;
-
-	for (idx = 0; idx < ea_inode_array->count; ++idx)
-		iput(ea_inode_array->inodes[idx]);
-	kfree(ea_inode_array);
-}
-
-static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
-				struct ext4_xattr_inode_array *array)
-{
-	int idx;
-
-	if (array == NULL)
-		return;
-
-	for (idx = 0; idx < array->count; ++idx)
-		ext4_put_ea_inode(sb, array->inodes[idx]);
-	kfree(array);
-}
-
 /*
  * Worker function for deferred EA inode iput.  Processes all inodes queued
  * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 9883ba5569a1..8214a31fe001 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -131,11 +131,6 @@ struct ext4_xattr_ibody_find {
 	struct ext4_iloc iloc;
 };
 
-struct ext4_xattr_inode_array {
-	unsigned int count;
-	struct inode *inodes[] __counted_by(count);
-};
-
 extern const struct xattr_handler ext4_xattr_user_handler;
 extern const struct xattr_handler ext4_xattr_trusted_handler;
 extern const struct xattr_handler ext4_xattr_security_handler;
@@ -187,9 +182,7 @@ extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
 				bool is_create);
 
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
-extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi);
 extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 1/4] fs: add iput_if_not_last() helper
From: Yun Zhou @ 2026-06-23  8:52 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, viro, brauner
  Cc: linux-fsdevel, linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623085243.2816425-1-yun.zhou@windriver.com>

Add a helper that drops an inode reference only if the caller does not
hold the last one.  Returns true if the reference was dropped, false
otherwise.

This is useful for filesystems that need to release inode references
in contexts where triggering final iput (and thus eviction) would be
unsafe due to lock ordering constraints.  The caller can check the
return value and defer the final iput to a safe context.

Unlike iput_not_last(), which hits a BUG_ON() if called with the last
reference, this variant is designed to be called speculatively.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 include/linux/fs.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6da44573ce45..4916a9d54347 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2418,6 +2418,19 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 void iput_not_last(struct inode *);
+
+/**
+ * iput_if_not_last - drop an inode reference only if it is not the last one
+ * @inode: inode to put
+ *
+ * Returns true if the reference was dropped, false if this was the last
+ * reference and the caller must arrange for final iput() in a safe context.
+ */
+static inline bool iput_if_not_last(struct inode *inode)
+{
+	return atomic_add_unless(&inode->i_count, -1, 1);
+}
+
 int inode_update_time(struct inode *inode, enum fs_update_time type,
 		unsigned int flags);
 int generic_update_time(struct inode *inode, enum fs_update_time type,
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 3/4] ext4: convert all EA inode iput() calls to ext4_put_ea_inode()
From: Yun Zhou @ 2026-06-23  8:52 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang, viro, brauner
  Cc: linux-fsdevel, linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623085243.2816425-1-yun.zhou@windriver.com>

Convert all iput() calls on EA inodes in xattr code paths to use
ext4_put_ea_inode().  This establishes a uniform rule: every EA inode
reference release in ext4 xattr code goes through ext4_put_ea_inode(),
eliminating the need to analyze each call site individually for lock
safety.

Converted sites:

- ext4_xattr_inode_get() read path
- ext4_xattr_inode_inc_ref_all() main loop and cleanup path
- ext4_xattr_inode_dec_ref_all() error paths
- ext4_xattr_inode_create() error path
- ext4_xattr_inode_cache_find() mismatch path
- ext4_xattr_inode_lookup_create() out_err
- ext4_xattr_set_entry() old_ea_inode
- ext4_xattr_block_set() new block path, cleanup, and tmp_inode
- ext4_xattr_ibody_set() error and success paths
- ext4_xattr_delete_inode() quota loop

For most of these, iput_if_not_last() will succeed (the EA inode has
other references), making the overhead a single atomic operation.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/xattr.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ecdad5920b14..90b693b78a45 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -569,7 +569,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
 					ea_inode->i_ino, true /* reusable */);
 	}
 out:
-	iput(ea_inode);
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	return err;
 }
 
@@ -1106,10 +1106,10 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
 		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
 		if (err) {
 			ext4_warning_inode(ea_inode, "inc ref error %d", err);
-			iput(ea_inode);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			goto cleanup;
 		}
-		iput(ea_inode);
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 	}
 	return 0;
 
@@ -1135,7 +1135,7 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
 		if (err)
 			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
 					   err);
-		iput(ea_inode);
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 	}
 	return saved_err;
 }
@@ -1203,7 +1203,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err) {
 			ext4_warning_inode(ea_inode,
 					   "Expand inode array err=%d", err);
-			iput(ea_inode);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			continue;
 		}
 
@@ -1507,7 +1507,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 			if (ext4_xattr_inode_dec_ref(handle, ea_inode))
 				ext4_warning_inode(ea_inode,
 					"cleanup dec ref error %d", err);
-			iput(ea_inode);
+			ext4_put_ea_inode(inode->i_sb, ea_inode);
 			return ERR_PTR(err);
 		}
 
@@ -1566,7 +1566,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 			kvfree(ea_data);
 			return ea_inode;
 		}
-		iput(ea_inode);
+		ext4_put_ea_inode(inode->i_sb, ea_inode);
 	next_entry:
 		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
 	}
@@ -1617,7 +1617,7 @@ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
 				      ea_inode->i_ino, true /* reusable */);
 	return ea_inode;
 out_err:
-	iput(ea_inode);
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	ext4_xattr_inode_free_quota(inode, NULL, value_len);
 	return ERR_PTR(err);
 }
@@ -1850,7 +1850,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 
 	ret = 0;
 out:
-	iput(old_ea_inode);
+	ext4_put_ea_inode(inode->i_sb, old_ea_inode);
 	return ret;
 }
 
@@ -2012,7 +2012,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 				old_ea_inode_quota = le32_to_cpu(
 						s->here->e_value_size);
 			}
-			iput(tmp_inode);
+			ext4_put_ea_inode(inode->i_sb, tmp_inode);
 
 			s->here->e_value_inum = 0;
 			s->here->e_value_size = 0;
@@ -2152,7 +2152,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 					ext4_warning_inode(ea_inode,
 							   "dec ref error=%d",
 							   error);
-				iput(ea_inode);
+				ext4_put_ea_inode(inode->i_sb, ea_inode);
 				ea_inode = NULL;
 			}
 
@@ -2206,7 +2206,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
 		}
-		iput(ea_inode);
+		ext4_put_ea_inode(inode->i_sb, ea_inode);
 	}
 	if (ce)
 		mb_cache_entry_put(ea_block_cache, ce);
@@ -2288,7 +2288,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
-			iput(ea_inode);
+			ext4_put_ea_inode(inode->i_sb, ea_inode);
 		}
 		return error;
 	}
@@ -2300,7 +2300,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 		header->h_magic = cpu_to_le32(0);
 		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
 	}
-	iput(ea_inode);
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	return 0;
 }
 
@@ -2989,7 +2989,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 					continue;
 				ext4_xattr_inode_free_quota(inode, ea_inode,
 					      le32_to_cpu(entry->e_value_size));
-				iput(ea_inode);
+				ext4_put_ea_inode(inode->i_sb, ea_inode);
 			}
 
 		}
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 4/4] ext4: remove ea_inode_array mechanism in favor of ext4_put_ea_inode()
From: Yun Zhou @ 2026-06-23  8:35 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623083540.2744885-1-yun.zhou@windriver.com>

Now that ext4_put_ea_inode() handles deferred iput safely for all cases
(using iput_if_not_last + embedded llist_node), the ea_inode_array
mechanism for batching deferred iputs is redundant.

Remove:
- ext4_expand_inode_array() and ext4_xattr_inode_array_free()
- ext4_xattr_inode_array_free_deferred()
- struct ext4_xattr_inode_array and EIA_INCR/EIA_MASK defines
- ea_inode_array parameter from ext4_xattr_inode_dec_ref_all(),
  ext4_xattr_release_block(), and ext4_xattr_delete_inode()
- ea_inode_array variable from ext4_evict_inode()

Instead, ext4_xattr_inode_dec_ref_all() now calls ext4_put_ea_inode()
directly after processing each EA inode.  This simplifies the code
by eliminating multi-layer parameter threading and removes the need
for callers to manage array lifetime.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/inode.c |  6 +---
 fs/ext4/xattr.c | 95 +++----------------------------------------------
 fs/ext4/xattr.h |  7 ----
 3 files changed, 6 insertions(+), 102 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0d131371ad3d..6f1b84e46a2e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -176,7 +176,6 @@ void ext4_evict_inode(struct inode *inode)
 	 * (xattr block freeing), bitmap, group descriptor (inode freeing)
 	 */
 	int extra_credits = 6;
-	struct ext4_xattr_inode_array *ea_inode_array = NULL;
 	bool freeze_protected = false;
 
 	trace_ext4_evict_inode(inode);
@@ -282,8 +281,7 @@ void ext4_evict_inode(struct inode *inode)
 	}
 
 	/* Remove xattr references. */
-	err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
-				      extra_credits);
+	err = ext4_xattr_delete_inode(handle, inode, extra_credits);
 	if (err) {
 		ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
 stop_handle:
@@ -291,7 +289,6 @@ void ext4_evict_inode(struct inode *inode)
 		ext4_orphan_del(NULL, inode);
 		if (freeze_protected)
 			sb_end_intwrite(inode->i_sb);
-		ext4_xattr_inode_array_free(ea_inode_array);
 		goto no_delete;
 	}
 
@@ -321,7 +318,6 @@ void ext4_evict_inode(struct inode *inode)
 	ext4_journal_stop(handle);
 	if (freeze_protected)
 		sb_end_intwrite(inode->i_sb);
-	ext4_xattr_inode_array_free(ea_inode_array);
 	return;
 no_delete:
 	/*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 90b693b78a45..7f334349bd4f 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -114,12 +114,6 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
 #define EA_INODE_CACHE(inode)	(((struct ext4_sb_info *) \
 				inode->i_sb->s_fs_info)->s_ea_inode_cache)
 
-static int
-ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
-			struct inode *inode);
-static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
-				struct ext4_xattr_inode_array *array);
-
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
 {
@@ -1162,7 +1156,6 @@ static void
 ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 			     struct buffer_head *bh,
 			     struct ext4_xattr_entry *first, bool block_csum,
-			     struct ext4_xattr_inode_array **ea_inode_array,
 			     int extra_credits, bool skip_quota)
 {
 	struct inode *ea_inode;
@@ -1199,14 +1192,6 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err)
 			continue;
 
-		err = ext4_expand_inode_array(ea_inode_array, ea_inode);
-		if (err) {
-			ext4_warning_inode(ea_inode,
-					   "Expand inode array err=%d", err);
-			ext4_put_ea_inode(parent->i_sb, ea_inode);
-			continue;
-		}
-
 		err = ext4_journal_ensure_credits_fn(handle, credits, credits,
 			ext4_free_metadata_revoke_credits(parent->i_sb, 1),
 			ext4_xattr_restart_fn(handle, parent, bh, block_csum,
@@ -1214,6 +1199,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err < 0) {
 			ext4_warning_inode(ea_inode, "Ensure credits err=%d",
 					   err);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			continue;
 		}
 		if (err > 0) {
@@ -1223,6 +1209,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 				ext4_warning_inode(ea_inode,
 						"Re-get write access err=%d",
 						err);
+				ext4_put_ea_inode(parent->i_sb, ea_inode);
 				continue;
 			}
 		}
@@ -1231,6 +1218,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err) {
 			ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d",
 					   err);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			continue;
 		}
 
@@ -1247,6 +1235,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		entry->e_value_inum = 0;
 		entry->e_value_size = 0;
 
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 		dirty = true;
 	}
 
@@ -1273,7 +1262,6 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh,
-			 struct ext4_xattr_inode_array **ea_inode_array,
 			 int extra_credits)
 {
 	struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode);
@@ -1315,7 +1303,6 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_dec_ref_all(handle, inode, bh,
 						     BFIRST(bh),
 						     true /* block_csum */,
-						     ea_inode_array,
 						     extra_credits,
 						     true /* skip_quota */);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
@@ -2184,13 +2171,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 
 	/* Drop the previous xattr block. */
 	if (bs->bh && bs->bh != new_bh) {
-		struct ext4_xattr_inode_array *ea_inode_array = NULL;
-
 		ext4_xattr_release_block(handle, inode, bs->bh,
-					 &ea_inode_array,
 					 0 /* extra_credits */);
-		ext4_xattr_inode_array_free_deferred(inode->i_sb,
-						     ea_inode_array);
 	}
 	error = 0;
 
@@ -2866,46 +2848,6 @@ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 	return error;
 }
 
-#define EIA_INCR 16 /* must be 2^n */
-#define EIA_MASK (EIA_INCR - 1)
-
-/* Add the large xattr @inode into @ea_inode_array for deferred iput().
- * If @ea_inode_array is new or full it will be grown and the old
- * contents copied over.
- */
-static int
-ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
-			struct inode *inode)
-{
-	if (*ea_inode_array == NULL) {
-		/*
-		 * Start with 15 inodes, so it fits into a power-of-two size.
-		 */
-		(*ea_inode_array) = kmalloc_flex(**ea_inode_array, inodes,
-						 EIA_MASK, GFP_NOFS);
-		if (*ea_inode_array == NULL)
-			return -ENOMEM;
-		(*ea_inode_array)->count = 0;
-	} else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) {
-		/* expand the array once all 15 + n * 16 slots are full */
-		struct ext4_xattr_inode_array *new_array = NULL;
-
-		new_array = kmalloc_flex(**ea_inode_array, inodes,
-					 (*ea_inode_array)->count + EIA_INCR,
-					 GFP_NOFS);
-		if (new_array == NULL)
-			return -ENOMEM;
-		memcpy(new_array, *ea_inode_array,
-		       struct_size(*ea_inode_array, inodes,
-				   (*ea_inode_array)->count));
-		kfree(*ea_inode_array);
-		*ea_inode_array = new_array;
-	}
-	(*ea_inode_array)->count++;
-	(*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode;
-	return 0;
-}
-
 /*
  * ext4_xattr_delete_inode()
  *
@@ -2916,7 +2858,6 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
  * references on xattr block and xattr inodes.
  */
 int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-			    struct ext4_xattr_inode_array **ea_inode_array,
 			    int extra_credits)
 {
 	struct buffer_head *bh = NULL;
@@ -2955,7 +2896,6 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh,
 						     IFIRST(header),
 						     false /* block_csum */,
-						     ea_inode_array,
 						     extra_credits,
 						     false /* skip_quota */);
 	}
@@ -2994,7 +2934,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 
 		}
 
-		ext4_xattr_release_block(handle, inode, bh, ea_inode_array,
+		ext4_xattr_release_block(handle, inode, bh,
 					 extra_credits);
 		/*
 		 * Update i_file_acl value in the same transaction that releases
@@ -3016,31 +2956,6 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 	return error;
 }
 
-void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
-{
-	int idx;
-
-	if (ea_inode_array == NULL)
-		return;
-
-	for (idx = 0; idx < ea_inode_array->count; ++idx)
-		iput(ea_inode_array->inodes[idx]);
-	kfree(ea_inode_array);
-}
-
-static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
-				struct ext4_xattr_inode_array *array)
-{
-	int idx;
-
-	if (array == NULL)
-		return;
-
-	for (idx = 0; idx < array->count; ++idx)
-		ext4_put_ea_inode(sb, array->inodes[idx]);
-	kfree(array);
-}
-
 /*
  * Worker function for deferred EA inode iput.  Processes all inodes queued
  * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 9883ba5569a1..8214a31fe001 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -131,11 +131,6 @@ struct ext4_xattr_ibody_find {
 	struct ext4_iloc iloc;
 };
 
-struct ext4_xattr_inode_array {
-	unsigned int count;
-	struct inode *inodes[] __counted_by(count);
-};
-
 extern const struct xattr_handler ext4_xattr_user_handler;
 extern const struct xattr_handler ext4_xattr_trusted_handler;
 extern const struct xattr_handler ext4_xattr_security_handler;
@@ -187,9 +182,7 @@ extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode,
 				bool is_create);
 
 extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
-				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
-extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
 extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi);
 extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 3/4] ext4: convert all EA inode iput() calls to ext4_put_ea_inode()
From: Yun Zhou @ 2026-06-23  8:35 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623083540.2744885-1-yun.zhou@windriver.com>

Convert all iput() calls on EA inodes in xattr code paths to use
ext4_put_ea_inode().  This establishes a uniform rule: every EA inode
reference release in ext4 xattr code goes through ext4_put_ea_inode(),
eliminating the need to analyze each call site individually for lock
safety.

Converted sites:

- ext4_xattr_inode_get() read path
- ext4_xattr_inode_inc_ref_all() main loop and cleanup path
- ext4_xattr_inode_dec_ref_all() error paths
- ext4_xattr_inode_create() error path
- ext4_xattr_inode_cache_find() mismatch path
- ext4_xattr_inode_lookup_create() out_err
- ext4_xattr_set_entry() old_ea_inode
- ext4_xattr_block_set() new block path, cleanup, and tmp_inode
- ext4_xattr_ibody_set() error and success paths
- ext4_xattr_delete_inode() quota loop

For most of these, iput_if_not_last() will succeed (the EA inode has
other references), making the overhead a single atomic operation.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
---
 fs/ext4/xattr.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index ecdad5920b14..90b693b78a45 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -569,7 +569,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry,
 					ea_inode->i_ino, true /* reusable */);
 	}
 out:
-	iput(ea_inode);
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	return err;
 }
 
@@ -1106,10 +1106,10 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
 		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
 		if (err) {
 			ext4_warning_inode(ea_inode, "inc ref error %d", err);
-			iput(ea_inode);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			goto cleanup;
 		}
-		iput(ea_inode);
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 	}
 	return 0;
 
@@ -1135,7 +1135,7 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent,
 		if (err)
 			ext4_warning_inode(ea_inode, "cleanup dec ref error %d",
 					   err);
-		iput(ea_inode);
+		ext4_put_ea_inode(parent->i_sb, ea_inode);
 	}
 	return saved_err;
 }
@@ -1203,7 +1203,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent,
 		if (err) {
 			ext4_warning_inode(ea_inode,
 					   "Expand inode array err=%d", err);
-			iput(ea_inode);
+			ext4_put_ea_inode(parent->i_sb, ea_inode);
 			continue;
 		}
 
@@ -1507,7 +1507,7 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
 			if (ext4_xattr_inode_dec_ref(handle, ea_inode))
 				ext4_warning_inode(ea_inode,
 					"cleanup dec ref error %d", err);
-			iput(ea_inode);
+			ext4_put_ea_inode(inode->i_sb, ea_inode);
 			return ERR_PTR(err);
 		}
 
@@ -1566,7 +1566,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 			kvfree(ea_data);
 			return ea_inode;
 		}
-		iput(ea_inode);
+		ext4_put_ea_inode(inode->i_sb, ea_inode);
 	next_entry:
 		ce = mb_cache_entry_find_next(ea_inode_cache, ce);
 	}
@@ -1617,7 +1617,7 @@ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
 				      ea_inode->i_ino, true /* reusable */);
 	return ea_inode;
 out_err:
-	iput(ea_inode);
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	ext4_xattr_inode_free_quota(inode, NULL, value_len);
 	return ERR_PTR(err);
 }
@@ -1850,7 +1850,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 
 	ret = 0;
 out:
-	iput(old_ea_inode);
+	ext4_put_ea_inode(inode->i_sb, old_ea_inode);
 	return ret;
 }
 
@@ -2012,7 +2012,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 				old_ea_inode_quota = le32_to_cpu(
 						s->here->e_value_size);
 			}
-			iput(tmp_inode);
+			ext4_put_ea_inode(inode->i_sb, tmp_inode);
 
 			s->here->e_value_inum = 0;
 			s->here->e_value_size = 0;
@@ -2152,7 +2152,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 					ext4_warning_inode(ea_inode,
 							   "dec ref error=%d",
 							   error);
-				iput(ea_inode);
+				ext4_put_ea_inode(inode->i_sb, ea_inode);
 				ea_inode = NULL;
 			}
 
@@ -2206,7 +2206,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
 		}
-		iput(ea_inode);
+		ext4_put_ea_inode(inode->i_sb, ea_inode);
 	}
 	if (ce)
 		mb_cache_entry_put(ea_block_cache, ce);
@@ -2288,7 +2288,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 
 			ext4_xattr_inode_free_quota(inode, ea_inode,
 						    i_size_read(ea_inode));
-			iput(ea_inode);
+			ext4_put_ea_inode(inode->i_sb, ea_inode);
 		}
 		return error;
 	}
@@ -2300,7 +2300,7 @@ int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
 		header->h_magic = cpu_to_le32(0);
 		ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
 	}
-	iput(ea_inode);
+	ext4_put_ea_inode(inode->i_sb, ea_inode);
 	return 0;
 }
 
@@ -2989,7 +2989,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 					continue;
 				ext4_xattr_inode_free_quota(inode, ea_inode,
 					      le32_to_cpu(entry->e_value_size));
-				iput(ea_inode);
+				ext4_put_ea_inode(inode->i_sb, ea_inode);
 			}
 
 		}
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 1/4] fs: add iput_if_not_last() helper
From: Yun Zhou @ 2026-06-23  8:35 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623083540.2744885-1-yun.zhou@windriver.com>

Add a helper that drops an inode reference only if the caller does not
hold the last one.  Returns true if the reference was dropped, false
otherwise.

This is useful for filesystems that need to release inode references
in contexts where triggering final iput (and thus eviction) would be
unsafe due to lock ordering constraints.  The caller can check the
return value and defer the final iput to a safe context.

Unlike iput_not_last(), which hits a BUG_ON() if called with the last
reference, this variant is designed to be called speculatively.

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 include/linux/fs.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6da44573ce45..4916a9d54347 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2418,6 +2418,19 @@ static inline void super_set_sysfs_name_generic(struct super_block *sb, const ch
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 void iput_not_last(struct inode *);
+
+/**
+ * iput_if_not_last - drop an inode reference only if it is not the last one
+ * @inode: inode to put
+ *
+ * Returns true if the reference was dropped, false if this was the last
+ * reference and the caller must arrange for final iput() in a safe context.
+ */
+static inline bool iput_if_not_last(struct inode *inode)
+{
+	return atomic_add_unless(&inode->i_count, -1, 1);
+}
+
 int inode_update_time(struct inode *inode, enum fs_update_time type,
 		unsigned int flags);
 int generic_update_time(struct inode *inode, enum fs_update_time type,
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 2/4] ext4: introduce ext4_put_ea_inode() for safe deferred iput
From: Yun Zhou @ 2026-06-23  8:35 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou
In-Reply-To: <20260623083540.2744885-1-yun.zhou@windriver.com>

Calling iput() on EA inodes while holding xattr_sem or a jbd2 handle
can trigger write_inode_now() -> ext4_writepages() -> s_writepages_rwsem,
creating a lock ordering issue during mount (!SB_ACTIVE).

Add ext4_put_ea_inode() which uses iput_if_not_last() as a fast path.
If this is not the last reference, it is dropped immediately.  If this
is the last reference, the inode is linked onto a per-sb lock-free llist
via i_ea_iput_node (embedded in ext4_inode_info, sharing space with the
unused xattr_sem of EA inodes via a union) and a delayed worker
(1 jiffy) performs the final iput() in a clean context.  This avoids
per-iput memory allocation.

Convert the first call site: ext4_xattr_block_set()'s "Drop the
previous xattr block" path, which previously called
ext4_xattr_inode_array_free() under xattr_sem + jbd2 handle.

The worker is drained in ext4_put_super() before quota shutdown using
a loop to handle re-arming (evicting an EA inode may queue further EA
inodes). Initialization is placed before journal loading since fast
commit replay may trigger evictions that call ext4_put_ea_inode().

Signed-off-by: Yun Zhou <yun.zhou@windriver.com>
Suggested-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h  | 13 ++++++++-
 fs/ext4/super.c | 18 +++++++++++-
 fs/ext4/xattr.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/xattr.h | 14 ++++++++++
 4 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index b37c136ea3ab..b9b0ada7774b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1070,8 +1070,14 @@ struct ext4_inode_info {
 	 * between readers of EAs and writers of regular file data, so
 	 * instead we synchronize on xattr_sem when reading or changing
 	 * EAs.
+	 *
+	 * EA inodes (EXT4_EA_INODE_FL) do not use xattr_sem; they reuse
+	 * the space for deferred iput linkage.
 	 */
-	struct rw_semaphore xattr_sem;
+	union {
+		struct rw_semaphore xattr_sem;
+		struct llist_node i_ea_iput_node;
+	};
 
 	/*
 	 * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
@@ -1770,6 +1776,11 @@ struct ext4_sb_info {
 	struct ext4_es_stats s_es_stats;
 	struct mb_cache *s_ea_block_cache;
 	struct mb_cache *s_ea_inode_cache;
+
+	/* Deferred iput for EA inodes to avoid lock ordering issues */
+	struct llist_head s_ea_inode_to_free;
+	struct delayed_work s_ea_inode_work;
+
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Journal triggers for checksum computation */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 245f67d10ded..97f0e7c1b254 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1303,6 +1303,8 @@ static void ext4_put_super(struct super_block *sb)
 			 &sb->s_uuid);
 
 	ext4_unregister_li_request(sb);
+	/* Drain deferred EA inode iputs while quota is still active. */
+	ext4_drain_ea_inode_work(sbi);
 	ext4_quotas_off(sb, EXT4_MAXQUOTAS);
 
 	destroy_workqueue(sbi->rsv_conversion_wq);
@@ -1423,6 +1425,13 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	memset(&ei->i_dquot, 0, sizeof(ei->i_dquot));
 #endif
 	ei->jinode = NULL;
+	/*
+	 * Reinitialize xattr_sem every allocation because EA inodes
+	 * share this space with i_ea_iput_node (via union) which may
+	 * have overwritten the semaphore when the slab object was
+	 * previously used as an EA inode.
+	 */
+	init_rwsem(&ei->xattr_sem);
 	INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
 	spin_lock_init(&ei->i_completed_io_lock);
 	ei->i_sync_tid = 0;
@@ -1488,7 +1497,6 @@ static void init_once(void *foo)
 	struct ext4_inode_info *ei = foo;
 
 	INIT_LIST_HEAD(&ei->i_orphan);
-	init_rwsem(&ei->xattr_sem);
 	init_rwsem(&ei->i_data_sem);
 	inode_init_once(&ei->vfs_inode);
 	ext4_fc_init_inode(&ei->vfs_inode);
@@ -5508,6 +5516,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	 * The first inode we look at is the journal inode.  Don't try
 	 * root first: it may be modified in the journal!
 	 */
+	ext4_init_ea_inode_work(sbi);
+
 	if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
 		err = ext4_load_and_init_journal(sb, es, ctx);
 		if (err)
@@ -5747,6 +5757,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	return 0;
 
 failed_mount9:
+	/* Drain deferred EA inode iputs before quota shutdown */
+	ext4_drain_ea_inode_work(sbi);
 	ext4_quotas_off(sb, EXT4_MAXQUOTAS);
 failed_mount8: __maybe_unused
 	ext4_release_orphan_info(sb);
@@ -5767,6 +5779,8 @@ failed_mount8: __maybe_unused
 	if (EXT4_SB(sb)->rsv_conversion_wq)
 		destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
 failed_mount_wq:
+	/* Drain deferred EA inode iputs before freeing structures */
+	ext4_drain_ea_inode_work(sbi);
 	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
 	sbi->s_ea_inode_cache = NULL;
 
@@ -5777,6 +5791,8 @@ failed_mount8: __maybe_unused
 		ext4_journal_destroy(sbi, sbi->s_journal);
 	}
 failed_mount3a:
+	/* Drain deferred EA inode iputs from journal replay */
+	ext4_drain_ea_inode_work(sbi);
 	ext4_es_unregister_shrinker(sbi);
 failed_mount3:
 	/* flush s_sb_upd_work before sbi destroy */
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 982a1f831e22..ecdad5920b14 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -117,6 +117,8 @@ const struct xattr_handler * const ext4_xattr_handlers[] = {
 static int
 ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
 			struct inode *inode);
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+				struct ext4_xattr_inode_array *array);
 
 #ifdef CONFIG_LOCKDEP
 void ext4_xattr_inode_set_class(struct inode *ea_inode)
@@ -2187,7 +2189,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
 		ext4_xattr_release_block(handle, inode, bs->bh,
 					 &ea_inode_array,
 					 0 /* extra_credits */);
-		ext4_xattr_inode_array_free(ea_inode_array);
+		ext4_xattr_inode_array_free_deferred(inode->i_sb,
+						     ea_inode_array);
 	}
 	error = 0;
 
@@ -3025,6 +3028,74 @@ void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array)
 	kfree(ea_inode_array);
 }
 
+static void ext4_xattr_inode_array_free_deferred(struct super_block *sb,
+				struct ext4_xattr_inode_array *array)
+{
+	int idx;
+
+	if (array == NULL)
+		return;
+
+	for (idx = 0; idx < array->count; ++idx)
+		ext4_put_ea_inode(sb, array->inodes[idx]);
+	kfree(array);
+}
+
+/*
+ * Worker function for deferred EA inode iput.  Processes all inodes queued
+ * on s_ea_inode_to_free in a context free of xattr_sem/jbd2 handle locks.
+ */
+static void ext4_ea_inode_work(struct work_struct *work)
+{
+	struct ext4_sb_info *sbi = container_of(to_delayed_work(work),
+						struct ext4_sb_info,
+						s_ea_inode_work);
+	struct llist_node *node = llist_del_all(&sbi->s_ea_inode_to_free);
+	struct llist_node *next;
+
+	while (node) {
+		struct ext4_inode_info *ei = container_of(node,
+					struct ext4_inode_info, i_ea_iput_node);
+		next = node->next;
+		iput(&ei->vfs_inode);
+		node = next;
+	}
+}
+
+/*
+ * Release a VFS reference on an EA inode.  Must be used instead of iput()
+ * in any context where xattr_sem or a jbd2 handle is held.
+ *
+ * If this is not the last reference, drops it immediately via
+ * iput_if_not_last() with no further action needed.
+ *
+ * If this is the last reference, the inode is linked onto a per-sb
+ * llist via i_ea_iput_node (embedded in ext4_inode_info, sharing space
+ * with the unused xattr_sem) and a delayed worker performs the final
+ * iput() in a clean context.
+ */
+void ext4_put_ea_inode(struct super_block *sb, struct inode *inode)
+{
+	if (!inode)
+		return;
+	WARN_ON_ONCE(!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL));
+	if (iput_if_not_last(inode))
+		return;
+	llist_add(&EXT4_I(inode)->i_ea_iput_node,
+		  &EXT4_SB(sb)->s_ea_inode_to_free);
+	/*
+	 * Use a short delay to allow multiple EA inodes to accumulate,
+	 * reducing workqueue wakeups when several are released together.
+	 */
+	schedule_delayed_work(&EXT4_SB(sb)->s_ea_inode_work, 1);
+}
+
+void ext4_init_ea_inode_work(struct ext4_sb_info *sbi)
+{
+	init_llist_head(&sbi->s_ea_inode_to_free);
+	INIT_DELAYED_WORK(&sbi->s_ea_inode_work, ext4_ea_inode_work);
+}
+
 /*
  * ext4_xattr_block_cache_insert()
  *
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 1fedf44d4fb6..9883ba5569a1 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -190,6 +190,20 @@ extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
 				   struct ext4_xattr_inode_array **array,
 				   int extra_credits);
 extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
+extern void ext4_init_ea_inode_work(struct ext4_sb_info *sbi);
+extern void ext4_put_ea_inode(struct super_block *sb, struct inode *inode);
+
+/*
+ * Drain all pending deferred EA inode iputs.  Must be called before
+ * freeing resources that eviction depends on (quota, block allocator).
+ * Loops because worker iput may trigger eviction that re-queues.
+ */
+static inline void ext4_drain_ea_inode_work(struct ext4_sb_info *sbi)
+{
+	while (flush_delayed_work(&sbi->s_ea_inode_work) ||
+	       !llist_empty(&sbi->s_ea_inode_to_free))
+		;
+}
 
 extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
 			    struct ext4_inode *raw_inode, handle_t *handle);
-- 
2.43.0


^ permalink raw reply related

* [PATCH v9 0/4] ext4: deferred iput framework for EA inodes
From: Yun Zhou @ 2026-06-23  8:35 UTC (permalink / raw)
  To: tytso, adilger.kernel, libaokun, jack, ojaswin, ritesh.list,
	yi.zhang
  Cc: linux-ext4, linux-kernel, yun.zhou

This series introduces a deferred-iput framework for EA inodes to
eliminate a class of lock ordering issues in ext4 xattr code.

The problem: iput() on EA inodes while holding xattr_sem or a jbd2
handle can trigger eviction, which may acquire those same locks or
s_writepages_rwsem, creating circular dependencies.  The immediate
deadlock (during mount-time orphan cleanup) is fixed by two separate
patches already reviewed and posted:

  ext4: skip extra isize expansion during mount to prevent deadlock
  ext4: set EXT4_STATE_NO_EXPAND in ext4_evict_inode

This series provides the structural fix that makes the code safe
regardless of calling context:

Patch 1 adds a VFS helper iput_if_not_last() which drops an inode
reference only if it is not the last one, using atomic_add_unless().
This provides a proper VFS abstraction for filesystems that need to
conditionally defer final iput.

Patch 2 introduces ext4_put_ea_inode() using iput_if_not_last() as
a fast path (single atomic, zero overhead for the common case).  If
this is the last reference, the inode is linked onto a per-sb llist
(via i_ea_iput_node embedded in ext4_inode_info, union with xattr_sem
which is unused for EA inodes) and a delayed worker (1 jiffy) performs
the final iput() in a clean context.  No per-iput allocation needed.
Also moves init_rwsem(xattr_sem) from init_once to ext4_alloc_inode
to handle slab reuse after the union field has been overwritten.

Patch 3 converts all EA inode iput() calls in xattr code to use
ext4_put_ea_inode() uniformly -- no exceptions to reason about.

Patch 4 removes the now-redundant ea_inode_array mechanism (parameter
threading, struct, expand/free functions), replaced entirely by direct
ext4_put_ea_inode() calls.  This is a net code reduction.

Link: https://syzkaller.appspot.com/bug?extid=5d19358d7eb30ffb0cc5

v9:
 - Add iput_if_not_last() as proper VFS helper (per reviewer: don't
   let filesystems manipulate inode refcount without VFS abstraction).
 - Use iput_if_not_last() + llist_node embedded in ext4_inode_info
   (union with xattr_sem) to avoid per-iput allocation entirely.
 - Convert ALL EA inode iput() calls uniformly -- no exceptions.
 - Remove entire ea_inode_array mechanism.
 - Add WARN_ON_ONCE in ext4_put_ea_inode() to catch misuse on non-EA
   inodes (protects the xattr_sem union safety).
 - Fix worker re-arm: ext4_drain_ea_inode_work() loops to handle
   nested EA inode evictions re-scheduling work.
 - Move INIT_DELAYED_WORK before journal loading (fast commit replay
   may trigger evictions).
 - Drain before ext4_quotas_off() for correct quota accounting.
 - Add flush in failed_mount_wq and failed_mount3a error paths for
   journal replay case.
 - Move init_rwsem(xattr_sem) from init_once to ext4_alloc_inode to
   handle slab object reuse after union overwrite.
 - Encapsulate worker init into ext4_init_ea_inode_work(), making
   ext4_ea_inode_work() static to xattr.c.

Yun Zhou (4):
  fs: add iput_if_not_last() helper
  ext4: introduce ext4_put_ea_inode() for safe deferred iput
  ext4: convert all EA inode iput() calls to ext4_put_ea_inode()
  ext4: remove ea_inode_array mechanism in favor of ext4_put_ea_inode()

 fs/ext4/ext4.h     |  13 ++++-
 fs/ext4/inode.c    |   6 +-
 fs/ext4/super.c    |  18 +++++-
 fs/ext4/xattr.c    | 150 ++++++++++++++++++++-------------------------
 fs/ext4/xattr.h    |  21 ++++---
 include/linux/fs.h |  13 ++++
 6 files changed, 125 insertions(+), 96 deletions(-)

-- 
2.43.0

^ permalink raw reply

* [RFC PATCH v2 2/2] ext4: fast commit: allocate the range array lazily
From: Daejun Park @ 2026-06-23  8:26 UTC (permalink / raw)
  To: tytso@mit.edu, adilger.kernel@dilger.ca
  Cc: me@linux.beauty, libaokun@linux.alibaba.com,
	harshadshirwadkar@gmail.com, yi.zhang@huaweicloud.com,
	ojaswin@linux.ibm.com, linux-ext4@vger.kernel.org,
	linux-kernel@vger.kernel.org, Daejun Park
In-Reply-To: <20260623082506epcms2p81c2f8abfee34b88e5a645fbfaaf4bf9e@epcms2p8>

Patch 1 keeps the disjoint-range set in a fixed EXT4_FC_MAX_RANGES+1
array.  Embedding that in every ext4_inode_info costs ~140 bytes on
inodes that never use fast commit or only ever touch a single
contiguous range.

Keep the first range inline in i_fc_range and allocate the array only
when a second disjoint range appears; free it when the inode is
evicted.  The tracking path runs under i_fc_lock, so the array is
allocated with GFP_ATOMIC; on allocation failure we coalesce into the
inline range -- i.e. the original single-range behaviour -- so memory
pressure never forces a full commit.  The per-inode fast-commit
footprint drops to 20 bytes.

Signed-off-by: Daejun Park <daejun7.park@samsung.com>
---
 fs/ext4/ext4.h        | 23 +++++++++++-----
 fs/ext4/fast_commit.c | 64 ++++++++++++++++++++++++++++++++++++++++---
 fs/ext4/super.c       |  1 +
 3 files changed, 77 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8e93d30766fd..d93cc52cd01e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1106,14 +1106,23 @@ struct ext4_inode_info {
 					 */
 
 	/*
-	 * Disjoint lblk ranges modified in this fast commit.  Tracking the
+	 * Logical block ranges modified in this fast commit.  Tracking the
 	 * actual modified ranges (instead of one coalesced [min,max]) avoids
 	 * snapshotting the whole spanned extent map for scattered allocations.
-	 * Sorted by start, mutually disjoint.  Bounded by EXT4_FC_MAX_RANGES;
-	 * the extra slot is transient room used while inserting before an
-	 * overflow merge.  Protected by i_fc_lock.
+	 *
+	 * The first range is kept inline in i_fc_range, so the common case of a
+	 * single contiguous dirty region needs no allocation.  When a second
+	 * disjoint range appears the inode is upgraded to the i_fc_ranges array
+	 * (EXT4_FC_MAX_RANGES + 1 entries, sorted and mutually disjoint; the
+	 * extra slot is transient room used while inserting before an overflow
+	 * merge), allocated then and freed when the inode is evicted.  If that
+	 * allocation fails we fall back to coalescing into i_fc_range, i.e. the
+	 * original single coalesced-range behaviour.  i_fc_nr_ranges counts the
+	 * valid ranges; while i_fc_ranges is NULL it is 0 or 1.  Protected by
+	 * i_fc_lock.
 	 */
-	struct ext4_fc_lblk_range i_fc_ranges[EXT4_FC_MAX_RANGES + 1];
+	struct ext4_fc_lblk_range i_fc_range;
+	struct ext4_fc_lblk_range *i_fc_ranges;
 	unsigned int i_fc_nr_ranges;
 
 	/*
@@ -1135,8 +1144,8 @@ struct ext4_inode_info {
 	spinlock_t i_raw_lock;	/* protects updates to the raw inode */
 
 	/*
-	 * Protect concurrent accesses on i_fc_ranges, i_fc_nr_ranges
-	 * and inode's EXT4_FC_STATE_COMMITTING state bit.
+	 * Protect concurrent accesses on i_fc_range, i_fc_ranges,
+	 * i_fc_nr_ranges and inode's EXT4_FC_STATE_COMMITTING state bit.
 	 */
 	spinlock_t i_fc_lock;
 
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 1ea3742a55b1..9d36365eae02 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -230,6 +230,7 @@ void ext4_fc_init_inode(struct inode *inode)
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
 	ext4_fc_reset_inode(inode);
+	ei->i_fc_ranges = NULL;
 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 	ext4_clear_inode_state(inode, EXT4_STATE_FC_REQUEUE);
 	INIT_LIST_HEAD(&ei->i_fc_list);
@@ -691,6 +692,8 @@ static int __track_range(handle_t *handle, struct inode *inode, void *arg,
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct __track_range_args *__arg =
 		(struct __track_range_args *)arg;
+	ext4_lblk_t start = __arg->start, end = __arg->end;
+	ext4_lblk_t s0, e0;
 
 	if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) {
 		ext4_debug("Special inode %llu being modified\n", inode->i_ino);
@@ -701,12 +704,61 @@ static int __track_range(handle_t *handle, struct inode *inode, void *arg,
 	 * A sub-block punch hole rounds up the start and down the end, passing
 	 * end == start - 1: no whole block changed, so there is nothing to
 	 * track.  (ext4_fc_track_template has already reset the range set for a
-	 * new transaction.)
+	 * new transaction, so we need not do it here.)
 	 */
-	if (__arg->end < __arg->start)
+	if (end < start)
 		return 0;
 
-	ext4_fc_range_add(ei, __arg->start, __arg->end);
+	/* Already upgraded to the heap array: full multi-interval tracking. */
+	if (ei->i_fc_ranges) {
+		ext4_fc_range_add(ei, start, end);
+		return 0;
+	}
+
+	/* First range of this commit stays inline, no allocation needed. */
+	if (ei->i_fc_nr_ranges == 0) {
+		ei->i_fc_range.start = start;
+		ei->i_fc_range.len = end - start + 1;
+		ei->i_fc_nr_ranges = 1;
+		return 0;
+	}
+
+	/* One inline range so far. */
+	s0 = ei->i_fc_range.start;
+	e0 = s0 + ei->i_fc_range.len - 1;
+
+	/* Disjoint from it: try to upgrade to the array for exact tracking. */
+	if (start > e0 + 1 || end + 1 < s0) {
+		struct ext4_fc_lblk_range *heap;
+
+		/*
+		 * GFP_ATOMIC: we hold i_fc_lock.  __GFP_NOWARN: failure is not
+		 * fatal -- we fall back to the single coalesced range below --
+		 * so it must not splat under memory pressure.
+		 */
+		heap = kmalloc_array(EXT4_FC_MAX_RANGES + 1, sizeof(*heap),
+				     GFP_ATOMIC | __GFP_NOWARN);
+		if (heap) {
+			heap[0] = ei->i_fc_range;
+			ei->i_fc_ranges = heap;
+			ext4_fc_range_add(ei, start, end);
+			return 0;
+		}
+		/*
+		 * Out of memory: fall back to the original single coalesced
+		 * range by absorbing the gap below.  This over-logs the spanned
+		 * extents but stays a valid fast commit (no full-commit
+		 * fallback), so there is nothing to mark ineligible.
+		 */
+	}
+
+	/* Overlapping/adjacent, or array allocation failed: coalesce inline. */
+	if (start < s0)
+		s0 = start;
+	if (end > e0)
+		e0 = end;
+	ei->i_fc_range.start = s0;
+	ei->i_fc_range.len = e0 - s0 + 1;
 
 	return 0;
 }
@@ -1178,7 +1230,11 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 			*nr_rangesp = 0;
 		return 0;
 	}
-	memcpy(tracked, ei->i_fc_ranges, nr_tracked * sizeof(tracked[0]));
+	if (ei->i_fc_ranges)
+		memcpy(tracked, ei->i_fc_ranges,
+		       nr_tracked * sizeof(tracked[0]));
+	else
+		tracked[0] = ei->i_fc_range;	/* inline single-range mode */
 	ei->i_fc_nr_ranges = 0;
 	spin_unlock(&ei->i_fc_lock);
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f2c52cc74676..cfa7e6bec385 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1455,6 +1455,7 @@ static void ext4_free_in_core_inode(struct inode *inode)
 		pr_warn("%s: inode %llu still in fc list",
 			__func__, inode->i_ino);
 	}
+	kfree(EXT4_I(inode)->i_fc_ranges);
 	kmem_cache_free(ext4_inode_cachep, EXT4_I(inode));
 }
 
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH v2 1/2] ext4: fast commit: track disjoint modified ranges per inode
From: Daejun Park @ 2026-06-23  8:25 UTC (permalink / raw)
  To: tytso@mit.edu, adilger.kernel@dilger.ca
  Cc: me@linux.beauty, libaokun@linux.alibaba.com,
	harshadshirwadkar@gmail.com, yi.zhang@huaweicloud.com,
	ojaswin@linux.ibm.com, linux-ext4@vger.kernel.org,
	linux-kernel@vger.kernel.org, Daejun Park
In-Reply-To: <20260623082331epcms2p4798e9d26b06f9a005bcca7b2edf395d3@epcms2p4>

Fast commit tracks a single coalesced logical range per inode
(i_fc_lblk_start, i_fc_lblk_len).  When an inode is modified at
several disjoint offsets between two commits (e.g. random writes), that
range widens to span [min, max] of all touched offsets.  At commit time
ext4_fc_snapshot_inode_data() walks that whole span through the extent
status tree, emitting an ADD_RANGE per mapped segment and a DEL_RANGE
per hole -- including the unmodified ones.  On scattered allocations
this produces hundreds to thousands of ranges per commit and exceeds
EXT4_FC_SNAPSHOT_MAX_RANGES, which fails the snapshot and falls back to
a full jbd2 commit -- the heavy path fast commit is meant to avoid.

Replace the single range with a bounded set of up to EXT4_FC_MAX_RANGES
(16) sorted, mutually disjoint ranges.  ext4_fc_range_add() inserts and
merges into it; on overflow the two ranges separated by the smallest
gap are coalesced, so the worst case degrades to the old single-span
behaviour.  ext4_fc_snapshot_inode_data() now walks only the tracked
ranges.  The on-disk fast-commit (TLV) format is unchanged.

On a sparse random-write workload (1 GiB span, 16 disjoint 4 KiB writes
per fsync, 300 fsyncs, dev 7.1.0-rc4): ranges per commit 1095 -> 16,
full-commit fallback 76% -> 0.7%, snap_fail_ranges_cap 226 -> 0.

Signed-off-by: Daejun Park <daejun7.park@samsung.com>
---
 fs/ext4/ext4.h        |  31 ++++++--
 fs/ext4/fast_commit.c | 165 +++++++++++++++++++++++++++++++++---------
 2 files changed, 156 insertions(+), 40 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ddc903738c6b..8e93d30766fd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1042,6 +1042,20 @@ enum ext4_fc_snap_err {
 	EXT4_FC_SNAP_ERR_INODE_LOC,
 };
 
+/*
+ * Maximum number of disjoint logical-block ranges tracked per inode for a
+ * single fast commit.  Scattered allocations that exceed this get their two
+ * closest ranges merged (see ext4_fc_range_add()), degrading gracefully to
+ * the old single coalesced-range behaviour.
+ */
+#define EXT4_FC_MAX_RANGES 16
+
+/* In-memory record of an lblk range modified in the current fast commit. */
+struct ext4_fc_lblk_range {
+	ext4_lblk_t start;
+	ext4_lblk_t len;
+};
+
 /*
  * fourth extended file system inode data in memory
  */
@@ -1091,11 +1105,16 @@ struct ext4_inode_info {
 					 * protected by sbi->s_fc_lock.
 					 */
 
-	/* Start of lblk range that needs to be committed in this fast commit */
-	ext4_lblk_t i_fc_lblk_start;
-
-	/* End of lblk range that needs to be committed in this fast commit */
-	ext4_lblk_t i_fc_lblk_len;
+	/*
+	 * Disjoint lblk ranges modified in this fast commit.  Tracking the
+	 * actual modified ranges (instead of one coalesced [min,max]) avoids
+	 * snapshotting the whole spanned extent map for scattered allocations.
+	 * Sorted by start, mutually disjoint.  Bounded by EXT4_FC_MAX_RANGES;
+	 * the extra slot is transient room used while inserting before an
+	 * overflow merge.  Protected by i_fc_lock.
+	 */
+	struct ext4_fc_lblk_range i_fc_ranges[EXT4_FC_MAX_RANGES + 1];
+	unsigned int i_fc_nr_ranges;
 
 	/*
 	 * Commit-time fast commit snapshots.
@@ -1116,7 +1135,7 @@ struct ext4_inode_info {
 	spinlock_t i_raw_lock;	/* protects updates to the raw inode */
 
 	/*
-	 * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+	 * Protect concurrent accesses on i_fc_ranges, i_fc_nr_ranges
 	 * and inode's EXT4_FC_STATE_COMMITTING state bit.
 	 */
 	spinlock_t i_fc_lock;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 4ef796b9b6cb..1ea3742a55b1 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -222,8 +222,7 @@ static inline void ext4_fc_reset_inode(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 
-	ei->i_fc_lblk_start = 0;
-	ei->i_fc_lblk_len = 0;
+	ei->i_fc_nr_ranges = 0;
 }
 
 void ext4_fc_init_inode(struct inode *inode)
@@ -582,7 +581,7 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
 	if (update)
 		return -EEXIST;
 
-	EXT4_I(inode)->i_fc_lblk_len = 0;
+	EXT4_I(inode)->i_fc_nr_ranges = 0;
 
 	return 0;
 }
@@ -622,12 +621,74 @@ struct __track_range_args {
 	ext4_lblk_t start, end;
 };
 
+/*
+ * Record that logical block range [start, end] was modified in the current
+ * fast commit.  Maintains a small, bounded set of sorted, mutually disjoint
+ * ranges, merging the new range with any it overlaps or is adjacent to.  When
+ * the set would exceed EXT4_FC_MAX_RANGES, the consecutive pair separated by
+ * the smallest gap is merged (absorbing that gap), so the worst case degrades
+ * gracefully to the old single coalesced-range behaviour.  Tracking the actual
+ * modified ranges (rather than one [min,max] span) keeps
+ * ext4_fc_snapshot_inode_data() from snapshotting the whole spanned extent map
+ * on scattered allocations.  Caller holds ei->i_fc_lock; ei->i_fc_ranges is
+ * non-NULL with room for EXT4_FC_MAX_RANGES + 1 entries.
+ */
+static void ext4_fc_range_add(struct ext4_inode_info *ei,
+			      ext4_lblk_t start, ext4_lblk_t end)
+{
+	struct ext4_fc_lblk_range *r = ei->i_fc_ranges;
+	unsigned int n = ei->i_fc_nr_ranges;
+	unsigned int i, j;
+
+	/* Skip ranges lying entirely before [start - 1] (no overlap/adjacency). */
+	i = 0;
+	while (i < n && r[i].start + r[i].len < start)
+		i++;
+
+	/* Absorb every range overlapping or adjacent to the growing [start,end]. */
+	j = i;
+	while (j < n && r[j].start <= end + 1) {
+		if (r[j].start < start)
+			start = r[j].start;
+		if (r[j].start + r[j].len - 1 > end)
+			end = r[j].start + r[j].len - 1;
+		j++;
+	}
+
+	/* Replace r[i..j-1] with the merged range (j == i is a plain insert). */
+	if (j != i + 1)
+		memmove(&r[i + 1], &r[j], (n - j) * sizeof(*r));
+	r[i].start = start;
+	r[i].len = end - start + 1;
+	ei->i_fc_nr_ranges = n - (j - i) + 1;
+
+	/* Overflow: merge the consecutive pair separated by the smallest gap. */
+	while (ei->i_fc_nr_ranges > EXT4_FC_MAX_RANGES) {
+		ext4_lblk_t best_gap = ~0U;
+		unsigned int best = 0;
+
+		n = ei->i_fc_nr_ranges;
+		for (i = 0; i + 1 < n; i++) {
+			ext4_lblk_t gap = r[i + 1].start -
+					  (r[i].start + r[i].len);
+
+			if (gap < best_gap) {
+				best_gap = gap;
+				best = i;
+			}
+		}
+		r[best].len = r[best + 1].start + r[best + 1].len - r[best].start;
+		memmove(&r[best + 1], &r[best + 2],
+			(n - best - 2) * sizeof(*r));
+		ei->i_fc_nr_ranges = n - 1;
+	}
+}
+
 /* __track_fn for tracking data updates */
 static int __track_range(handle_t *handle, struct inode *inode, void *arg,
 			 bool update)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	ext4_lblk_t oldstart;
 	struct __track_range_args *__arg =
 		(struct __track_range_args *)arg;
 
@@ -636,17 +697,16 @@ static int __track_range(handle_t *handle, struct inode *inode, void *arg,
 		return -ECANCELED;
 	}
 
-	oldstart = ei->i_fc_lblk_start;
+	/*
+	 * A sub-block punch hole rounds up the start and down the end, passing
+	 * end == start - 1: no whole block changed, so there is nothing to
+	 * track.  (ext4_fc_track_template has already reset the range set for a
+	 * new transaction.)
+	 */
+	if (__arg->end < __arg->start)
+		return 0;
 
-	if (update && ei->i_fc_lblk_len > 0) {
-		ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start);
-		ei->i_fc_lblk_len =
-			max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) -
-				ei->i_fc_lblk_start + 1;
-	} else {
-		ei->i_fc_lblk_start = __arg->start;
-		ei->i_fc_lblk_len = __arg->end - __arg->start + 1;
-	}
+	ext4_fc_range_add(ei, __arg->start, __arg->end);
 
 	return 0;
 }
@@ -994,31 +1054,26 @@ static void ext4_fc_free_inode_snap(struct inode *inode)
 	ei->i_fc_snap = NULL;
 }
 
-static int ext4_fc_snapshot_inode_data(struct inode *inode,
+/*
+ * Snapshot one modified lblk range [start_lblk, end_lblk] into @ranges by
+ * walking the extent status tree, emitting an ADD_RANGE per mapped segment and
+ * a DEL_RANGE per hole.  *nr_ranges accumulates the number of ranges produced
+ * for this inode across calls; together with nr_ranges_total (ranges already
+ * produced by earlier inodes in this commit) it is bounded against
+ * EXT4_FC_SNAPSHOT_MAX_RANGES.
+ */
+static int ext4_fc_snapshot_lblk_range(struct inode *inode,
+				       ext4_lblk_t start_lblk,
+				       ext4_lblk_t end_lblk,
 				       struct list_head *ranges,
 				       unsigned int nr_ranges_total,
-				       unsigned int *nr_rangesp,
+				       unsigned int *nr_ranges,
 				       int *snap_err)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_snap_stats *stats =
 		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
-	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
-	unsigned int nr_ranges = 0;
+	ext4_lblk_t cur_lblk = start_lblk;
 
-	spin_lock(&ei->i_fc_lock);
-	if (ei->i_fc_lblk_len == 0) {
-		spin_unlock(&ei->i_fc_lock);
-		if (nr_rangesp)
-			*nr_rangesp = 0;
-		return 0;
-	}
-	start_lblk = ei->i_fc_lblk_start;
-	end_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
-	ei->i_fc_lblk_len = 0;
-	spin_unlock(&ei->i_fc_lock);
-
-	cur_lblk = start_lblk;
 	ext4_debug("snapshot data ranges %u-%u for inode %llu\n",
 		   start_lblk, end_lblk,
 		   (unsigned long long)inode->i_ino);
@@ -1050,7 +1105,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 			continue;
 		}
 
-		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
+		if (nr_ranges_total + *nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
 			atomic64_inc(&stats->snap_fail_ranges_cap);
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_RANGES_CAP);
@@ -1063,7 +1118,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 			return -ENOMEM;
 		}
-		nr_ranges++;
+		(*nr_ranges)++;
 
 		range->lblk = cur_lblk;
 		range->len = len;
@@ -1101,6 +1156,48 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		cur_lblk += range->len;
 	}
 
+	return 0;
+}
+
+static int ext4_fc_snapshot_inode_data(struct inode *inode,
+				       struct list_head *ranges,
+				       unsigned int nr_ranges_total,
+				       unsigned int *nr_rangesp,
+				       int *snap_err)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_lblk_range tracked[EXT4_FC_MAX_RANGES + 1];
+	unsigned int nr_ranges = 0, nr_tracked, t;
+	int ret;
+
+	spin_lock(&ei->i_fc_lock);
+	nr_tracked = ei->i_fc_nr_ranges;
+	if (nr_tracked == 0) {
+		spin_unlock(&ei->i_fc_lock);
+		if (nr_rangesp)
+			*nr_rangesp = 0;
+		return 0;
+	}
+	memcpy(tracked, ei->i_fc_ranges, nr_tracked * sizeof(tracked[0]));
+	ei->i_fc_nr_ranges = 0;
+	spin_unlock(&ei->i_fc_lock);
+
+	/*
+	 * Snapshot only the actually-modified ranges, not the whole [min,max]
+	 * span: this is what keeps scattered allocations from blowing past
+	 * EXT4_FC_SNAPSHOT_MAX_RANGES and falling back to a full commit.
+	 */
+	for (t = 0; t < nr_tracked; t++) {
+		ext4_lblk_t s = tracked[t].start;
+		ext4_lblk_t e = s + tracked[t].len - 1;
+
+		ret = ext4_fc_snapshot_lblk_range(inode, s, e, ranges,
+						  nr_ranges_total, &nr_ranges,
+						  snap_err);
+		if (ret)
+			return ret;
+	}
+
 	if (nr_rangesp)
 		*nr_rangesp = nr_ranges;
 	return 0;
-- 
2.43.0


^ permalink raw reply related

* [RFC PATCH v2 0/2] ext4: speed up fast commit on random writes
From: Daejun Park @ 2026-06-23  8:23 UTC (permalink / raw)
  To: tytso@mit.edu, adilger.kernel@dilger.ca
  Cc: me@linux.beauty, libaokun@linux.alibaba.com,
	harshadshirwadkar@gmail.com, yi.zhang@huaweicloud.com,
	ojaswin@linux.ibm.com, linux-ext4@vger.kernel.org,
	linux-kernel@vger.kernel.org, Daejun Park
In-Reply-To: <CGME20260623082331epcms2p4798e9d26b06f9a005bcca7b2edf395d3@epcms2p4>

Fast commit is meant to make fsync cheap, but on random-write workloads it
defeats itself.  ext4 still tracks a single coalesced [min,max] logical range
per inode (i_fc_lblk_start/len).  When an inode is dirtied at several disjoint
offsets between two commits, that span widens to cover them all, and at commit
time ext4_fc_snapshot_inode_data() walks the whole span through the extent
status tree -- emitting an ADD_RANGE per mapped segment and a DEL_RANGE per
hole inside it.  For scattered writes that is hundreds to thousands of ranges
even though only a handful of regions were actually modified.

The recently merged fast-commit snapshot work did not change this: it caps a
snapshot at EXT4_FC_SNAPSHOT_MAX_RANGES (2048) ranges and falls back to a full
commit once the ranges emitted for the span reach that cap.  Measured on dev
(7.1.0-rc4) with a sparse random-write workload (1 GiB span, R disjoint 4 KiB
writes per fsync, 300 fsyncs):

  R=16 regions/fsync:  ranges/commit 1095,  full-commit fallback 76%
                       (snap_fail_ranges_cap on 226 of 300 fsyncs)

Fast commit barely functions on this workload.

This series tracks the actually-modified disjoint ranges instead of one span,
and snapshots only those:

  1/2 replaces the single [min,max] range with a small, bounded set of sorted,
      disjoint ranges (up to EXT4_FC_MAX_RANGES = 16; the two closest are
      merged on overflow, so the worst case degrades to the old single-span
      behaviour).  ext4_fc_snapshot_inode_data() then walks only the tracked
      ranges.  The on-disk TLV format is unchanged.

  2/2 allocates the range array lazily: the first range stays inline, the
      array is allocated only when a second disjoint range appears, and on an
      allocation failure we fall back to the inline single range.  Per-inode
      fast-commit footprint stays ~20 bytes.

Result on the same workload (dev, patched):

  R=16:  ranges/commit 1095 -> 16,  fallback 76% -> 0.7%,
         snap_fail_ranges_cap 226 -> 0


Testing (on dev, patched):
  - crash recovery: deterministic writes + fsync, kill -9 QEMU (power loss),
    reboot -> fast-commit replay -> verify every fsync'd block, e2fsck -fn.
    9600/9600 blocks verified, 0 mismatch, e2fsck clean.  Run with R=64, so the
    overflow-merge path is exercised.
  - ext4/generic fast-commit xfstests (ext4/044, ext4/045, generic/455,
    generic/456, generic/457, generic/470, generic/482): ext4/044, ext4/045
    and generic/456 pass; generic/457 and generic/470 are skipped
    (reflink/dax unsupported on ext4); generic/455 fails identically on
    unpatched dev (pre-existing, unrelated to this series); generic/482
    failed once in a combined run, but the failure did not reproduce (3/3
    passes in isolation on the patched kernel).


Changes since v1 [1]:
  - Rebased from v6.17-rc3 onto ext4.git dev; re-implemented on top of the
    merged fast-commit snapshot model (v1 targeted the old
    ext4_fc_write_inode_data(), which no longer exists).

[1] v1: https://lore.kernel.org/linux-ext4/20260611044733epcms2p38013ae683a283555526f70e4eab6d2a9@epcms2p3/

Daejun Park (2):
  ext4: fast commit: track disjoint modified ranges per inode
  ext4: fast commit: allocate the range array lazily

 fs/ext4/ext4.h        |  42 ++++++--
 fs/ext4/fast_commit.c | 219 +++++++++++++++++++++++++++++++++++-------
 fs/ext4/super.c       |   1 +
 3 files changed, 222 insertions(+), 40 deletions(-)


base-commit: c143957520c6c9b5cd72e0de8b52b814f0c576fe
--
2.43.0

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox