Linux EXT4 FS development
 help / color / mirror / Atom feed
From: Li Chen <me@linux.beauty>
To: Zhang Yi <yi.zhang@huaweicloud.com>,
	Theodore Ts'o <tytso@mit.edu>,
	Andreas Dilger <adilger.kernel@dilger.ca>,
	Baokun Li <libaokun@linux.alibaba.com>, Jan Kara <jack@suse.cz>,
	Ojaswin Mujoo <ojaswin@linux.ibm.com>,
	"Ritesh Harjani (IBM)" <ritesh.list@gmail.com>,
	Zhang Yi <yi.zhang@huawei.com>,
	linux-ext4@vger.kernel.org, linux-kernel@vger.kernel.org
Cc: Steven Rostedt <rostedt@goodmis.org>,
	Masami Hiramatsu <mhiramat@kernel.org>,
	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>,
	linux-trace-kernel@vger.kernel.org
Subject: [RFC v8 4/7] ext4: fast commit: avoid self-deadlock in inode snapshotting
Date: Fri, 15 May 2026 17:18:24 +0800	[thread overview]
Message-ID: <20260515091829.194810-5-me@linux.beauty> (raw)
In-Reply-To: <20260515091829.194810-1-me@linux.beauty>

ext4_fc_snapshot_inodes() used igrab()/iput() to pin inodes while building
commit-time snapshots. Because ext4_fc_del() waits for
EXT4_STATE_FC_COMMITTING to clear, a final iput() in the commit thread can
trigger ext4_clear_inode()->ext4_fc_del() and self-deadlock: the commit
thread ends up waiting for the very fast commit it is running to finish.

ext4_fc_del() also has to re-check EXT4_STATE_FC_COMMITTING after
waiting on EXT4_STATE_FC_FLUSHING_DATA. The commit thread clears
FLUSHING_DATA before it sets COMMITTING, so a waiter woken from the
flush wait must not delete the inode based on an old COMMITTING
check.

Avoid taking extra references. Collect inode pointers under s_fc_lock and
rely on EXT4_STATE_FC_COMMITTING to pin inodes until ext4_fc_cleanup()
clears the bit.

Also set EXT4_STATE_FC_COMMITTING for create-only inodes referenced
from the dentry update queue, and wake up waiters when ext4_fc_cleanup()
clears the bit.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v8:
- Factor out small ext4_fc_wait_inode_state()/ext4_fc_wake_inode_state()
  helpers so the repeated FC state wait/wake mapping is kept in one place.
- Re-check EXT4_STATE_FC_COMMITTING after waking from
  EXT4_STATE_FC_FLUSHING_DATA in ext4_fc_del(), so list deletion only
  happens after both predicates pass under the same s_fc_lock critical
  section.

 fs/ext4/fast_commit.c | 124 +++++++++++++++++++++++++-----------------
 1 file changed, 75 insertions(+), 49 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 673668860e2d..8a6981e50ffe 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -235,6 +235,37 @@ static bool ext4_fc_eligible(struct super_block *sb)
 		!(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE));
 }
 
+/*
+ * Wait for an inode fast-commit state bit to clear while dropping the
+ * fast-commit lock around schedule().
+ */
+static void ext4_fc_wait_inode_state(struct inode *inode, int bit,
+				     int *alloc_ctx)
+{
+	wait_queue_head_t *wq;
+	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
+	int wait_bit = ext4_inode_state_wait_bit(bit);
+
+	while (ext4_test_inode_state(inode, bit)) {
+		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
+
+		wq = bit_waitqueue(wait_word, wait_bit);
+		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+		if (ext4_test_inode_state(inode, bit)) {
+			ext4_fc_unlock(inode->i_sb, *alloc_ctx);
+			schedule();
+			*alloc_ctx = ext4_fc_lock(inode->i_sb);
+		}
+		finish_wait(wq, &wait.wq_entry);
+	}
+}
+
+static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit)
+{
+	wake_up_bit(ext4_inode_state_wait_word(inode),
+		    ext4_inode_state_wait_bit(bit));
+}
+
 /*
  * Remove inode from fast commit list. If the inode is being committed
  * we wait until inode commit is done.
@@ -243,12 +274,6 @@ void ext4_fc_del(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_dentry_update *fc_dentry;
-	wait_queue_head_t *wq;
-	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
-	int committing_wait_bit =
-		ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
-	int flushing_wait_bit =
-		ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
 	int alloc_ctx;
 
 	if (ext4_fc_disabled(inode->i_sb))
@@ -263,32 +288,19 @@ void ext4_fc_del(struct inode *inode)
 
 	/*
 	 * Wait for ongoing fast commit to finish. We cannot remove the inode
-	 * from fast commit lists while it is being committed.
+	 * from fast commit lists while it is being committed. If we wake from
+	 * FC_FLUSHING_DATA, re-check FC_COMMITTING before deleting because the
+	 * commit thread sets FC_COMMITTING only after clearing FLUSHING_DATA.
 	 */
-	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-		DEFINE_WAIT_BIT(wait, wait_word, committing_wait_bit);
+	for (;;) {
+		ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_COMMITTING,
+					 &alloc_ctx);
 
-		wq = bit_waitqueue(wait_word, committing_wait_bit);
-		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-			ext4_fc_unlock(inode->i_sb, alloc_ctx);
-			schedule();
-			alloc_ctx = ext4_fc_lock(inode->i_sb);
-		}
-		finish_wait(wq, &wait.wq_entry);
-	}
-
-	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
-		DEFINE_WAIT_BIT(wait, wait_word, flushing_wait_bit);
+		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA))
+			break;
 
-		wq = bit_waitqueue(wait_word, flushing_wait_bit);
-		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
-			ext4_fc_unlock(inode->i_sb, alloc_ctx);
-			schedule();
-			alloc_ctx = ext4_fc_lock(inode->i_sb);
-		}
-		finish_wait(wq, &wait.wq_entry);
+		ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA,
+					 &alloc_ctx);
 	}
 
 	ext4_fc_free_inode_snap(inode);
@@ -1184,13 +1196,12 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-		inodes[i] = igrab(&iter->vfs_inode);
-		if (inodes[i])
-			i++;
+		inodes[i++] = &iter->vfs_inode;
 	}
 
 	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 		struct ext4_inode_info *ei;
+		struct inode *inode;
 
 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
 			continue;
@@ -1200,12 +1211,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 		/* See the comment in ext4_fc_commit_dentry_updates(). */
 		ei = list_first_entry(&fc_dentry->fcd_dilist,
 				      struct ext4_inode_info, i_fc_dilist);
+		inode = &ei->vfs_inode;
 		if (!list_empty(&ei->i_fc_list))
 			continue;
 
-		inodes[i] = igrab(&ei->vfs_inode);
-		if (inodes[i])
-			i++;
+		/*
+		 * Create-only inodes may only be referenced via fcd_dilist and
+		 * not appear on s_fc_q[MAIN]. They may hit the last iput while
+		 * we are snapshotting, but inode eviction calls ext4_fc_del(),
+		 * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING
+		 * so the inode stays pinned and the snapshot stays valid until
+		 * ext4_fc_cleanup().
+		 */
+		ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+		inodes[i++] = inode;
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
 
@@ -1215,10 +1234,6 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 			break;
 	}
 
-	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
-		if (inodes[nr_inodes])
-			iput(inodes[nr_inodes]);
-	}
 	kvfree(inodes);
 	return ret;
 }
@@ -1234,8 +1249,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	int ret = 0;
 	u32 crc = 0;
 	int alloc_ctx;
-	int flushing_wait_bit =
-		ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
 
 	/*
 	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
@@ -1261,8 +1274,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 		ext4_clear_inode_state(&iter->vfs_inode,
 				       EXT4_STATE_FC_FLUSHING_DATA);
-		wake_up_bit(ext4_inode_state_wait_word(&iter->vfs_inode),
-			    flushing_wait_bit);
+		ext4_fc_wake_inode_state(&iter->vfs_inode,
+					 EXT4_STATE_FC_FLUSHING_DATA);
 	}
 
 	/*
@@ -1285,8 +1298,9 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	jbd2_journal_lock_updates(journal);
 	/*
 	 * The journal is now locked. No more handles can start and all the
-	 * previous handles are now drained. We now mark the inodes on the
-	 * commit queue as being committed.
+	 * previous handles are now drained. Snapshotting happens in this
+	 * window so log writing can consume only stable snapshots without
+	 * doing logical-to-physical mapping.
 	 */
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
@@ -1482,8 +1496,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 	struct ext4_inode_info *ei;
 	struct ext4_fc_dentry_update *fc_dentry;
 	int alloc_ctx;
-	int committing_wait_bit =
-		ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
 
 	if (full && sbi->s_fc_bh)
 		sbi->s_fc_bh = NULL;
@@ -1521,8 +1533,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 		 * barrier in prepare_to_wait() in ext4_fc_del().
 		 */
 		smp_mb();
-		wake_up_bit(ext4_inode_state_wait_word(&ei->vfs_inode),
-			    committing_wait_bit);
+		ext4_fc_wake_inode_state(&ei->vfs_inode,
+					 EXT4_STATE_FC_COMMITTING);
 	}
 
 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
@@ -1537,6 +1549,20 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 					      struct ext4_inode_info,
 					      i_fc_dilist);
 			ext4_fc_free_inode_snap(&ei->vfs_inode);
+			spin_lock(&ei->i_fc_lock);
+			ext4_clear_inode_state(&ei->vfs_inode,
+					       EXT4_STATE_FC_REQUEUE);
+			ext4_clear_inode_state(&ei->vfs_inode,
+					       EXT4_STATE_FC_COMMITTING);
+			spin_unlock(&ei->i_fc_lock);
+			/*
+			 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+			 * visible before the wakeup. Pairs with the barrier in
+			 * prepare_to_wait() in ext4_fc_wait_inode_state().
+			 */
+			smp_mb();
+			ext4_fc_wake_inode_state(&ei->vfs_inode,
+						 EXT4_STATE_FC_COMMITTING);
 		}
 		list_del_init(&fc_dentry->fcd_dilist);
 
-- 
2.53.0

  parent reply	other threads:[~2026-05-15  9:21 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-15  9:18 [RFC v8 0/7] ext4: fast commit: snapshot inode state for FC log Li Chen
2026-05-15  9:18 ` [RFC v8 1/7] ext4: fast commit: snapshot inode state before writing log Li Chen
2026-05-15  9:18 ` [RFC v8 2/7] ext4: lockdep: handle i_data_sem subclassing for special inodes Li Chen
2026-05-15  9:18 ` [RFC v8 3/7] ext4: fast commit: avoid waiting for FC_COMMITTING Li Chen
2026-05-15  9:18 ` Li Chen [this message]
2026-05-15  9:18 ` [RFC v8 5/7] ext4: fast commit: avoid i_data_sem by dropping ext4_map_blocks() in snapshots Li Chen
2026-05-15  9:18 ` [RFC v8 6/7] ext4: fast commit: add lock_updates tracepoint Li Chen
2026-05-15  9:18 ` [RFC v8 7/7] ext4: fast commit: export snapshot stats in fc_info Li Chen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260515091829.194810-5-me@linux.beauty \
    --to=me@linux.beauty \
    --cc=adilger.kernel@dilger.ca \
    --cc=jack@suse.cz \
    --cc=libaokun@linux.alibaba.com \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mhiramat@kernel.org \
    --cc=ojaswin@linux.ibm.com \
    --cc=ritesh.list@gmail.com \
    --cc=rostedt@goodmis.org \
    --cc=tytso@mit.edu \
    --cc=yi.zhang@huawei.com \
    --cc=yi.zhang@huaweicloud.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.