Linux EXT4 FS development
 help / color / mirror / Atom feed
* Re: improve the swap_activate interface
From: Christoph Hellwig @ 2026-05-15 11:33 UTC (permalink / raw)
  To: Steve French
  Cc: Christoph Hellwig, Andrew Morton, Chris Li, Kairui Song,
	Christian Brauner, Darrick J . Wong, Jens Axboe, David Sterba,
	Theodore Ts'o, Jaegeuk Kim, Chao Yu, Trond Myklebust,
	Anna Schumaker, Namjae Jeon, Hyunchul Lee, Steve French,
	Paulo Alcantara, Carlos Maiolino, Damien Le Moal, Naohiro Aota,
	linux-xfs, linux-fsdevel, linux-doc, linux-mm, linux-block,
	linux-btrfs, linux-ext4, linux-f2fs-devel, linux-nfs, linux-cifs
In-Reply-To: <CAH2r5msnYVb3hhXHwqDVHGGC1h4E6mLCRS4ktCrQoD9zdUW81g@mail.gmail.com>

On Wed, May 13, 2026 at 03:34:03PM -0500, Steve French wrote:
> I just tried this on 7.1-rc3 with the swap patches (full kernel build,
> on Ubuntu 25,10) and boot failed with out of memory which I had never
> seen before.  Any idea how to workaround this with the swap patch
> series, or is there a fix for this in the swap series already?

Is that a failure with the patches or also with the baseline?


^ permalink raw reply

* [RFC v8 7/7] ext4: fast commit: export snapshot stats in fc_info
From: Li Chen @ 2026-05-15  9:18 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260515091829.194810-1-me@linux.beauty>

Snapshot-based fast commit can fall back when the commit-time snapshot
cannot be built (e.g. extent status cache misses). It is useful to
quantify the updates-locked window and to see why snapshotting failed.

Add best-effort snapshot counters to the ext4 superblock and extend
/proc/fs/ext4/<sb_id>/fc_info to report the number of snapshotted
inodes and ranges, snapshot failure reasons, and the average/max time
spent with journal updates locked.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v8:
- Treat stale snapshot inode sizing as a capacity fallback instead of
  letting log writing later report a missing snapshot.
- Use atomic64_t for the snapshot counters so fc_info cannot observe
  torn 64-bit values on 32-bit systems.

Changes in v7:
- Address Sashiko review by using READ_ONCE() + div64_u64() for the fc_info
  lock_updates average.

Changes in v6:
- Start consuming locked_ns in fc_info, so this patch intentionally moves
  lock_updates_ns_{total,max,samples} accounting here.
- Guard the tracepoint call with trace_ext4_fc_lock_updates_enabled() and
  use trace_call__ext4_fc_lock_updates() to avoid the double static_branch
  at the guarded call site.
- Keep the stats unconditionally while avoiding extra tracepoint
  overhead when ext4_fc_lock_updates is disabled.

 fs/ext4/ext4.h        |  31 ++++++++++++++
 fs/ext4/fast_commit.c |  96 ++++++++++++++++++++++++++++++++++++++-----
 fs/ext4/super.c       |   1 +
 3 files changed, 118 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index dd09d00a73af..ddc903738c6b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1550,6 +1550,36 @@ struct ext4_orphan_info {
 						 * file blocks */
 };
 
+/*
+ * Ext4 fast commit snapshot statistics.
+ *
+ * These are best-effort counters intended for debugging / performance
+ * introspection; they are not exact under concurrent updates.
+ */
+struct ext4_fc_snap_stats {
+	atomic64_t lock_updates_ns_total;
+	atomic64_t lock_updates_ns_max;
+	atomic64_t lock_updates_samples;
+
+	atomic64_t snap_inodes;
+	atomic64_t snap_ranges;
+
+	atomic64_t snap_fail_es_miss;
+	atomic64_t snap_fail_es_delayed;
+	atomic64_t snap_fail_es_other;
+
+	atomic64_t snap_fail_inodes_cap;
+	atomic64_t snap_fail_ranges_cap;
+	atomic64_t snap_fail_nomem;
+	atomic64_t snap_fail_inode_loc;
+
+	/*
+	 * Missing inode snapshots during log writing should never happen.
+	 * Keep this counter to help catch unexpected regressions.
+	 */
+	atomic64_t snap_fail_no_snap;
+};
+
 /*
  * fourth extended-fs super-block data in memory
  */
@@ -1824,6 +1854,7 @@ struct ext4_sb_info {
 	struct mutex s_fc_lock;
 	struct buffer_head *s_fc_bh;
 	struct ext4_fc_stats s_fc_stats;
+	struct ext4_fc_snap_stats s_fc_snap_stats;
 	tid_t s_fc_ineligible_tid;
 #ifdef CONFIG_EXT4_DEBUG
 	int s_fc_debug_max_replay;
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index dc08f8ff43d9..4ef796b9b6cb 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -281,6 +281,19 @@ static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit)
 		    ext4_inode_state_wait_bit(bit));
 }
 
+static void ext4_fc_snap_stats_update_max(atomic64_t *stat, u64 value)
+{
+	u64 old = atomic64_read(stat);
+
+	while (value > old) {
+		u64 prev = atomic64_cmpxchg(stat, old, value);
+
+		if (prev == old)
+			break;
+		old = prev;
+	}
+}
+
 /*
  * Remove inode from fast commit list. If the inode is being committed
  * we wait until inode commit is done.
@@ -868,6 +881,8 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
 	struct ext4_fc_inode fc_inode;
 	struct ext4_fc_tl tl;
 	u8 *dst;
@@ -875,13 +890,17 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 	int inode_len;
 	int ret;
 
-	if (!snap)
+	if (!snap) {
+		atomic64_inc(&stats->snap_fail_no_snap);
 		return -ECANCELED;
+	}
 
 	src = snap->inode_buf;
 	inode_len = snap->inode_len;
-	if (!src || inode_len == 0)
+	if (!src || inode_len == 0) {
+		atomic64_inc(&stats->snap_fail_no_snap);
 		return -ECANCELED;
+	}
 
 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
@@ -911,13 +930,17 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
 	struct ext4_fc_add_range fc_ext;
 	struct ext4_fc_del_range lrange;
 	struct ext4_extent *ex;
 	struct ext4_fc_range *range;
 
-	if (!snap)
+	if (!snap) {
+		atomic64_inc(&stats->snap_fail_no_snap);
 		return -ECANCELED;
+	}
 
 	list_for_each_entry(range, &snap->data_list, list) {
 		if (range->tag == EXT4_FC_TAG_DEL_RANGE) {
@@ -978,6 +1001,8 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 				       int *snap_err)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
 	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
 	unsigned int nr_ranges = 0;
 
@@ -1005,11 +1030,13 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		u64 remaining = (u64)end_lblk - cur_lblk + 1;
 
 		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
+			atomic64_inc(&stats->snap_fail_es_miss);
 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
 			return -EAGAIN;
 		}
 
 		if (ext4_es_is_delayed(&es)) {
+			atomic64_inc(&stats->snap_fail_es_delayed);
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_ES_DELAYED);
 			return -EAGAIN;
@@ -1024,6 +1051,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		}
 
 		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
+			atomic64_inc(&stats->snap_fail_ranges_cap);
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_RANGES_CAP);
 			return -E2BIG;
@@ -1031,6 +1059,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 
 		range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
 		if (!range) {
+			atomic64_inc(&stats->snap_fail_nomem);
 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 			return -ENOMEM;
 		}
@@ -1058,6 +1087,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 				range->len = max;
 		} else {
 			kmem_cache_free(ext4_fc_range_cachep, range);
+			atomic64_inc(&stats->snap_fail_es_other);
 			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
 			return -EAGAIN;
 		}
@@ -1081,6 +1111,8 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 				  unsigned int *nr_rangesp, int *snap_err)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_snap_stats *stats =
+		&EXT4_SB(inode->i_sb)->s_fc_snap_stats;
 	struct ext4_fc_inode_snap *snap;
 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 	struct ext4_iloc iloc;
@@ -1091,6 +1123,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 
 	ret = ext4_get_inode_loc_noio(inode, &iloc);
 	if (ret) {
+		atomic64_inc(&stats->snap_fail_inode_loc);
 		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
 		return ret;
 	}
@@ -1102,6 +1135,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 
 	snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
 	if (!snap) {
+		atomic64_inc(&stats->snap_fail_nomem);
 		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 		brelse(iloc.bh);
 		return -ENOMEM;
@@ -1126,6 +1160,8 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 	list_splice_tail_init(&ranges, &snap->data_list);
 	ext4_fc_unlock(inode->i_sb, alloc_ctx);
 
+	atomic64_inc(&stats->snap_inodes);
+	atomic64_add(nr_ranges, &stats->snap_ranges);
 	if (nr_rangesp)
 		*nr_rangesp = nr_ranges;
 	return 0;
@@ -1229,12 +1265,10 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 	int ret = 0;
 	int alloc_ctx;
 
-	if (!inodes_size)
-		return 0;
-
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 		if (i >= inodes_size) {
+			atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap);
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_INODES_CAP);
 			ret = -E2BIG;
@@ -1260,6 +1294,7 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 			continue;
 
 		if (i >= inodes_size) {
+			atomic64_inc(&sbi->s_fc_snap_stats.snap_fail_inodes_cap);
 			ext4_fc_set_snap_err(snap_err,
 					     EXT4_FC_SNAP_ERR_INODES_CAP);
 			ret = -E2BIG;
@@ -1303,6 +1338,7 @@ static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
 	struct ext4_inode_info *iter;
 	struct ext4_fc_head head;
 	struct inode *inode;
@@ -1362,8 +1398,13 @@ static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 		return ret;
 
 	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
-	if (ret)
+	if (ret) {
+		if (ret == -E2BIG)
+			atomic64_inc(&snap_stats->snap_fail_inodes_cap);
+		else if (ret == -ENOMEM)
+			atomic64_inc(&snap_stats->snap_fail_nomem);
 		return ret;
+	}
 
 	/* Step 4: Mark all inodes as being committed. */
 	jbd2_journal_lock_updates(journal);
@@ -1384,12 +1425,15 @@ static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
 				      &snap_inodes, &snap_ranges, &snap_err);
 	jbd2_journal_unlock_updates(journal);
-	if (trace_ext4_fc_lock_updates_enabled()) {
-		locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
-		trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns,
-						 snap_inodes, snap_ranges,
-						 ret, snap_err);
-	}
+	locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
+	atomic64_add(locked_ns, &snap_stats->lock_updates_ns_total);
+	atomic64_inc(&snap_stats->lock_updates_samples);
+	ext4_fc_snap_stats_update_max(&snap_stats->lock_updates_ns_max,
+				      locked_ns);
+	if (trace_ext4_fc_lock_updates_enabled())
+		trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns,
+						 snap_inodes, snap_ranges,
+						 ret, snap_err);
 	kvfree(inodes);
 	if (ret)
 		return ret;
@@ -2657,11 +2701,26 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
 {
 	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
 	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
+	struct ext4_fc_snap_stats *snap_stats = &sbi->s_fc_snap_stats;
+	u64 lock_avg_ns = 0;
+	u64 lock_updates_samples;
+	u64 lock_updates_ns_total;
+	u64 lock_updates_ns_max;
 	int i;
 
 	if (v != SEQ_START_TOKEN)
 		return 0;
 
+	lock_updates_samples =
+		atomic64_read(&snap_stats->lock_updates_samples);
+	lock_updates_ns_total =
+		atomic64_read(&snap_stats->lock_updates_ns_total);
+	lock_updates_ns_max =
+		atomic64_read(&snap_stats->lock_updates_ns_max);
+	if (lock_updates_samples)
+		lock_avg_ns = div64_u64(lock_updates_ns_total,
+					lock_updates_samples);
+
 	seq_printf(seq,
 		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
 		   stats->fc_num_commits, stats->fc_ineligible_commits,
@@ -2672,6 +2731,23 @@ int ext4_fc_info_show(struct seq_file *seq, void *v)
 		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
 			stats->fc_ineligible_reason_count[i]);
 
+	seq_printf(seq,
+		   "Snapshot stats:\n%llu inodes\n%llu ranges\n%lluus lock_updates_avg\n%lluus lock_updates_max\n",
+		   atomic64_read(&snap_stats->snap_inodes),
+		   atomic64_read(&snap_stats->snap_ranges),
+		   div_u64(lock_avg_ns, 1000),
+		   div_u64(lock_updates_ns_max, 1000));
+	seq_printf(seq,
+		   "Snapshot failures:\n%llu es_miss\n%llu es_delayed\n%llu es_other\n%llu inodes_cap\n%llu ranges_cap\n%llu nomem\n%llu inode_loc\n%llu no_snap\n",
+		   atomic64_read(&snap_stats->snap_fail_es_miss),
+		   atomic64_read(&snap_stats->snap_fail_es_delayed),
+		   atomic64_read(&snap_stats->snap_fail_es_other),
+		   atomic64_read(&snap_stats->snap_fail_inodes_cap),
+		   atomic64_read(&snap_stats->snap_fail_ranges_cap),
+		   atomic64_read(&snap_stats->snap_fail_nomem),
+		   atomic64_read(&snap_stats->snap_fail_inode_loc),
+		   atomic64_read(&snap_stats->snap_fail_no_snap));
+
 	return 0;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3c869f0001c5..f1f8819a2a23 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4544,6 +4544,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
 	sbi->s_fc_ineligible_tid = 0;
 	mutex_init(&sbi->s_fc_lock);
 	memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
+	memset(&sbi->s_fc_snap_stats, 0, sizeof(sbi->s_fc_snap_stats));
 	sbi->s_fc_replay_state.fc_regions = NULL;
 	sbi->s_fc_replay_state.fc_regions_size = 0;
 	sbi->s_fc_replay_state.fc_regions_used = 0;
-- 
2.53.0

^ permalink raw reply related

* [RFC v8 6/7] ext4: fast commit: add lock_updates tracepoint
From: Li Chen @ 2026-05-15  9:18 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, linux-ext4, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260515091829.194810-1-me@linux.beauty>

Commit-time fast commit snapshots run under jbd2_journal_lock_updates(),
so it is useful to quantify the time spent with updates locked and to
understand why snapshotting can fail.

Add a new tracepoint, ext4_fc_lock_updates, reporting the time spent in
the updates-locked window along with the number of snapshotted inodes
and ranges. Record the first snapshot failure reason in a stable snap_err
field for tooling.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
Reviewed-by: Steven Rostedt (Google) <rostedt@goodmis.org>
---
Changes in v8:
- Use trace_call__ext4_fc_lock_updates() at the guarded call site as
  suggested by Steven Rostedt, avoiding a second static_branch check.

Changes in v7:
- Address Sashiko review by reporting successfully snapshotted inode counts
  in ext4_fc_lock_updates when snapshotting stops early.

Changes in v6:
- Drop explicit ext4_fc_snap_err assignments and rely on enum
  auto-increment.
- Treat locked_ns as trace-only in this patch and calculate it only when
  ext4_fc_lock_updates is enabled, as suggested by Steven Rostedt.

 fs/ext4/ext4.h              | 15 ++++++++
 fs/ext4/fast_commit.c       | 74 +++++++++++++++++++++++++++++--------
 include/trace/events/ext4.h | 61 ++++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 15 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 927173bc8381..dd09d00a73af 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1027,6 +1027,21 @@ enum {
 
 struct ext4_fc_inode_snap;
 
+/*
+ * Snapshot failure reasons for ext4_fc_lock_updates tracepoint.
+ * Keep these stable for tooling.
+ */
+enum ext4_fc_snap_err {
+	EXT4_FC_SNAP_ERR_NONE = 0,
+	EXT4_FC_SNAP_ERR_ES_MISS,
+	EXT4_FC_SNAP_ERR_ES_DELAYED,
+	EXT4_FC_SNAP_ERR_ES_OTHER,
+	EXT4_FC_SNAP_ERR_INODES_CAP,
+	EXT4_FC_SNAP_ERR_RANGES_CAP,
+	EXT4_FC_SNAP_ERR_NOMEM,
+	EXT4_FC_SNAP_ERR_INODE_LOC,
+};
+
 /*
  * fourth extended file system inode data in memory
  */
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 9e73c83b0e25..dc08f8ff43d9 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -194,6 +194,12 @@ static struct kmem_cache *ext4_fc_range_cachep;
 #define EXT4_FC_SNAPSHOT_MAX_INODES	1024
 #define EXT4_FC_SNAPSHOT_MAX_RANGES	2048
 
+static inline void ext4_fc_set_snap_err(int *snap_err, int err)
+{
+	if (snap_err && *snap_err == EXT4_FC_SNAP_ERR_NONE)
+		*snap_err = err;
+}
+
 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
 	BUFFER_TRACE(bh, "");
@@ -968,11 +974,12 @@ static void ext4_fc_free_inode_snap(struct inode *inode)
 static int ext4_fc_snapshot_inode_data(struct inode *inode,
 				       struct list_head *ranges,
 				       unsigned int nr_ranges_total,
-				       unsigned int *nr_rangesp)
+				       unsigned int *nr_rangesp,
+				       int *snap_err)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	unsigned int nr_ranges = 0;
 	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
+	unsigned int nr_ranges = 0;
 
 	spin_lock(&ei->i_fc_lock);
 	if (ei->i_fc_lblk_len == 0) {
@@ -997,11 +1004,16 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		ext4_lblk_t len;
 		u64 remaining = (u64)end_lblk - cur_lblk + 1;
 
-		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL))
+		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL)) {
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_MISS);
 			return -EAGAIN;
+		}
 
-		if (ext4_es_is_delayed(&es))
+		if (ext4_es_is_delayed(&es)) {
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_ES_DELAYED);
 			return -EAGAIN;
+		}
 
 		len = es.es_len - (cur_lblk - es.es_lblk);
 		if (len > remaining)
@@ -1011,12 +1023,17 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 			continue;
 		}
 
-		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES)
+		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES) {
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_RANGES_CAP);
 			return -E2BIG;
+		}
 
 		range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
-		if (!range)
+		if (!range) {
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 			return -ENOMEM;
+		}
 		nr_ranges++;
 
 		range->lblk = cur_lblk;
@@ -1041,6 +1058,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 				range->len = max;
 		} else {
 			kmem_cache_free(ext4_fc_range_cachep, range);
+			ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_ES_OTHER);
 			return -EAGAIN;
 		}
 
@@ -1060,7 +1078,7 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 
 static int ext4_fc_snapshot_inode(struct inode *inode,
 				  unsigned int nr_ranges_total,
-				  unsigned int *nr_rangesp)
+				  unsigned int *nr_rangesp, int *snap_err)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_inode_snap *snap;
@@ -1072,8 +1090,10 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 	int alloc_ctx;
 
 	ret = ext4_get_inode_loc_noio(inode, &iloc);
-	if (ret)
+	if (ret) {
+		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_INODE_LOC);
 		return ret;
+	}
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
 		inode_len = EXT4_INODE_SIZE(inode->i_sb);
@@ -1082,6 +1102,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 
 	snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
 	if (!snap) {
+		ext4_fc_set_snap_err(snap_err, EXT4_FC_SNAP_ERR_NOMEM);
 		brelse(iloc.bh);
 		return -ENOMEM;
 	}
@@ -1092,7 +1113,7 @@ static int ext4_fc_snapshot_inode(struct inode *inode,
 	brelse(iloc.bh);
 
 	ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
-					  &nr_ranges);
+					  &nr_ranges, snap_err);
 	if (ret) {
 		kfree(snap);
 		ext4_fc_free_ranges(&ranges);
@@ -1193,7 +1214,10 @@ static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
 					 unsigned int *nr_inodesp);
 
 static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
-				   unsigned int inodes_size)
+				   unsigned int inodes_size,
+				   unsigned int *nr_inodesp,
+				   unsigned int *nr_rangesp,
+				   int *snap_err)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1211,6 +1235,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 		if (i >= inodes_size) {
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_INODES_CAP);
 			ret = -E2BIG;
 			goto unlock;
 		}
@@ -1234,6 +1260,8 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 			continue;
 
 		if (i >= inodes_size) {
+			ext4_fc_set_snap_err(snap_err,
+					     EXT4_FC_SNAP_ERR_INODES_CAP);
 			ret = -E2BIG;
 			goto unlock;
 		}
@@ -1258,16 +1286,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
 		unsigned int inode_ranges = 0;
 
 		ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
-					     &inode_ranges);
+					     &inode_ranges, snap_err);
 		if (ret)
 			break;
 		nr_ranges += inode_ranges;
 	}
 
+	if (nr_inodesp)
+		*nr_inodesp = idx;
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
 	return ret;
 }
 
-static int ext4_fc_perform_commit(journal_t *journal)
+static int ext4_fc_perform_commit(journal_t *journal, tid_t commit_tid)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1276,10 +1308,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	struct inode *inode;
 	struct inode **inodes;
 	unsigned int inodes_size;
+	unsigned int snap_inodes = 0;
+	unsigned int snap_ranges = 0;
+	int snap_err = EXT4_FC_SNAP_ERR_NONE;
 	struct blk_plug plug;
 	int ret = 0;
 	u32 crc = 0;
 	int alloc_ctx;
+	ktime_t lock_start;
+	u64 locked_ns;
 
 	/*
 	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
@@ -1324,13 +1361,13 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	if (ret)
 		return ret;
 
-
 	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
 	if (ret)
 		return ret;
 
 	/* Step 4: Mark all inodes as being committed. */
 	jbd2_journal_lock_updates(journal);
+	lock_start = ktime_get();
 	/*
 	 * The journal is now locked. No more handles can start and all the
 	 * previous handles are now drained. Snapshotting happens in this
@@ -1344,8 +1381,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
 
-	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
+	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
+				      &snap_inodes, &snap_ranges, &snap_err);
 	jbd2_journal_unlock_updates(journal);
+	if (trace_ext4_fc_lock_updates_enabled()) {
+		locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
+		trace_call__ext4_fc_lock_updates(sb, commit_tid, locked_ns,
+						 snap_inodes, snap_ranges,
+						 ret, snap_err);
+	}
 	kvfree(inodes);
 	if (ret)
 		return ret;
@@ -1550,7 +1594,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 		journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
 	set_task_ioprio(current, journal_ioprio);
 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
-	ret = ext4_fc_perform_commit(journal);
+	ret = ext4_fc_perform_commit(journal, commit_tid);
 	if (ret < 0) {
 		if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
 			status = EXT4_FC_STATUS_INELIGIBLE;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f493642cf121..7028a28316fa 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -107,6 +107,26 @@ TRACE_DEFINE_ENUM(EXT4_FC_REASON_VERITY);
 TRACE_DEFINE_ENUM(EXT4_FC_REASON_MOVE_EXT);
 TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);
 
+#undef EM
+#undef EMe
+#define EM(a)	TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
+#define EMe(a)	TRACE_DEFINE_ENUM(EXT4_FC_SNAP_ERR_##a);
+
+#define TRACE_SNAP_ERR						\
+	EM(NONE)						\
+	EM(ES_MISS)						\
+	EM(ES_DELAYED)						\
+	EM(ES_OTHER)						\
+	EM(INODES_CAP)						\
+	EM(RANGES_CAP)						\
+	EM(NOMEM)						\
+	EMe(INODE_LOC)
+
+TRACE_SNAP_ERR
+
+#undef EM
+#undef EMe
+
 #define show_fc_reason(reason)						\
 	__print_symbolic(reason,					\
 		{ EXT4_FC_REASON_XATTR,		"XATTR"},		\
@@ -2818,6 +2838,47 @@ TRACE_EVENT(ext4_fc_commit_stop,
 		  __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
 );
 
+#define EM(a)	{ EXT4_FC_SNAP_ERR_##a, #a },
+#define EMe(a)	{ EXT4_FC_SNAP_ERR_##a, #a }
+
+TRACE_EVENT(ext4_fc_lock_updates,
+	    TP_PROTO(struct super_block *sb, tid_t commit_tid, u64 locked_ns,
+		     unsigned int nr_inodes, unsigned int nr_ranges, int err,
+		     int snap_err),
+
+	TP_ARGS(sb, commit_tid, locked_ns, nr_inodes, nr_ranges, err, snap_err),
+
+	TP_STRUCT__entry(/* entry */
+		__field(dev_t, dev)
+		__field(tid_t, tid)
+		__field(u64, locked_ns)
+		__field(unsigned int, nr_inodes)
+		__field(unsigned int, nr_ranges)
+		__field(int, err)
+		__field(int, snap_err)
+	),
+
+	TP_fast_assign(/* assign */
+		__entry->dev = sb->s_dev;
+		__entry->tid = commit_tid;
+		__entry->locked_ns = locked_ns;
+		__entry->nr_inodes = nr_inodes;
+		__entry->nr_ranges = nr_ranges;
+		__entry->err = err;
+		__entry->snap_err = snap_err;
+	),
+
+	TP_printk("dev %d,%d tid %u locked_ns %llu nr_inodes %u nr_ranges %u err %d snap_err %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid,
+		  __entry->locked_ns, __entry->nr_inodes, __entry->nr_ranges,
+		  __entry->err, __print_symbolic(__entry->snap_err,
+						 TRACE_SNAP_ERR))
+);
+
+#undef EM
+#undef EMe
+#undef TRACE_SNAP_ERR
+
 #define FC_REASON_NAME_STAT(reason)					\
 	show_fc_reason(reason),						\
 	__entry->fc_ineligible_rc[reason]
-- 
2.53.0

^ permalink raw reply related

* [RFC v8 5/7] ext4: fast commit: avoid i_data_sem by dropping ext4_map_blocks() in snapshots
From: Li Chen @ 2026-05-15  9:18 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260515091829.194810-1-me@linux.beauty>

Commit-time snapshots run under jbd2_journal_lock_updates(), so the work
done there must stay bounded.

The snapshot path still used ext4_map_blocks() to build data ranges. This
can take i_data_sem and pulls the mapping code into the snapshot logic.
Build inode data range snapshots from the extent status tree instead.

The extent status tree is a cache, not an authoritative source. If the
needed information is missing or unstable (e.g. delayed allocation), treat
the transaction as fast commit ineligible and fall back to full commit.

Also cap the number of inodes and ranges snapshotted per fast commit and
allocate range records from a dedicated slab cache. The inode pointer
array is allocated outside the updates-locked window.

Testing: QEMU/KVM guest, virtio-pmem + dax, ext4 -O fast_commit, mounted
dax,noatime. Ran python3 500x {4K write + fsync}, fallocate 256M, and
python3 500x {creat + fsync(dir)} without lockdep splats or errors.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v7:
- Address Sashiko review by guarding snapshot range arithmetic near
  EXT_MAX_BLOCKS to avoid cur_lblk / remaining-range wraparound in the
  snapshot walk.

 fs/ext4/fast_commit.c | 253 ++++++++++++++++++++++++++++++------------
 1 file changed, 179 insertions(+), 74 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 8a6981e50ffe..9e73c83b0e25 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -184,6 +184,15 @@
 
 #include <trace/events/ext4.h>
 static struct kmem_cache *ext4_fc_dentry_cachep;
+static struct kmem_cache *ext4_fc_range_cachep;
+
+/*
+ * Avoid spending unbounded time/memory snapshotting highly fragmented files
+ * under jbd2_journal_lock_updates(). If we exceed this limit, fall back to
+ * full commit.
+ */
+#define EXT4_FC_SNAPSHOT_MAX_INODES	1024
+#define EXT4_FC_SNAPSHOT_MAX_RANGES	2048
 
 static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
@@ -939,7 +948,7 @@ static void ext4_fc_free_ranges(struct list_head *head)
 
 	list_for_each_entry_safe(range, range_n, head, list) {
 		list_del(&range->list);
-		kfree(range);
+		kmem_cache_free(ext4_fc_range_cachep, range);
 	}
 }
 
@@ -957,16 +966,19 @@ static void ext4_fc_free_inode_snap(struct inode *inode)
 }
 
 static int ext4_fc_snapshot_inode_data(struct inode *inode,
-				       struct list_head *ranges)
+				       struct list_head *ranges,
+				       unsigned int nr_ranges_total,
+				       unsigned int *nr_rangesp)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
+	unsigned int nr_ranges = 0;
 	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
-	struct ext4_map_blocks map;
-	int ret;
 
 	spin_lock(&ei->i_fc_lock);
 	if (ei->i_fc_lblk_len == 0) {
 		spin_unlock(&ei->i_fc_lock);
+		if (nr_rangesp)
+			*nr_rangesp = 0;
 		return 0;
 	}
 	start_lblk = ei->i_fc_lblk_start;
@@ -980,61 +992,82 @@ static int ext4_fc_snapshot_inode_data(struct inode *inode,
 		   (unsigned long long)inode->i_ino);
 
 	while (cur_lblk <= end_lblk) {
+		struct extent_status es;
 		struct ext4_fc_range *range;
+		ext4_lblk_t len;
+		u64 remaining = (u64)end_lblk - cur_lblk + 1;
 
-		map.m_lblk = cur_lblk;
-		map.m_len = end_lblk - cur_lblk + 1;
-		ret = ext4_map_blocks(NULL, inode, &map,
-				      EXT4_GET_BLOCKS_IO_SUBMIT |
-				      EXT4_EX_NOCACHE);
-		if (ret < 0)
-			return -ECANCELED;
+		if (!ext4_es_lookup_extent(inode, cur_lblk, NULL, &es, NULL))
+			return -EAGAIN;
+
+		if (ext4_es_is_delayed(&es))
+			return -EAGAIN;
 
-		if (map.m_len == 0) {
+		len = es.es_len - (cur_lblk - es.es_lblk);
+		if (len > remaining)
+			len = remaining;
+		if (len == 0) {
 			cur_lblk++;
 			continue;
 		}
 
-		range = kmalloc(sizeof(*range), GFP_NOFS);
+		if (nr_ranges_total + nr_ranges >= EXT4_FC_SNAPSHOT_MAX_RANGES)
+			return -E2BIG;
+
+		range = kmem_cache_alloc(ext4_fc_range_cachep, GFP_NOFS);
 		if (!range)
 			return -ENOMEM;
+		nr_ranges++;
 
-		range->lblk = map.m_lblk;
-		range->len = map.m_len;
+		range->lblk = cur_lblk;
+		range->len = len;
 		range->pblk = 0;
 		range->unwritten = false;
 
-		if (ret == 0) {
+		if (ext4_es_is_hole(&es)) {
 			range->tag = EXT4_FC_TAG_DEL_RANGE;
-		} else {
-			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
-				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
-
-			/* Limit the number of blocks in one extent */
-			map.m_len = min(max, map.m_len);
+		} else if (ext4_es_is_written(&es) ||
+			   ext4_es_is_unwritten(&es)) {
+			unsigned int max;
 
 			range->tag = EXT4_FC_TAG_ADD_RANGE;
-			range->len = map.m_len;
-			range->pblk = map.m_pblk;
-			range->unwritten = !!(map.m_flags & EXT4_MAP_UNWRITTEN);
+			range->pblk = ext4_es_pblock(&es) +
+				      (cur_lblk - es.es_lblk);
+			range->unwritten = ext4_es_is_unwritten(&es);
+
+			max = range->unwritten ? EXT_UNWRITTEN_MAX_LEN :
+						 EXT_INIT_MAX_LEN;
+			if (range->len > max)
+				range->len = max;
+		} else {
+			kmem_cache_free(ext4_fc_range_cachep, range);
+			return -EAGAIN;
 		}
 
 		INIT_LIST_HEAD(&range->list);
 		list_add_tail(&range->list, ranges);
 
-		cur_lblk += map.m_len;
+		if ((u64)range->len > (u64)end_lblk - cur_lblk)
+			break;
+
+		cur_lblk += range->len;
 	}
 
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
 	return 0;
 }
 
-static int ext4_fc_snapshot_inode(struct inode *inode)
+static int ext4_fc_snapshot_inode(struct inode *inode,
+				  unsigned int nr_ranges_total,
+				  unsigned int *nr_rangesp)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_inode_snap *snap;
 	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
 	struct ext4_iloc iloc;
 	LIST_HEAD(ranges);
+	unsigned int nr_ranges = 0;
 	int ret;
 	int alloc_ctx;
 
@@ -1058,7 +1091,8 @@ static int ext4_fc_snapshot_inode(struct inode *inode)
 	memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len);
 	brelse(iloc.bh);
 
-	ret = ext4_fc_snapshot_inode_data(inode, &ranges);
+	ret = ext4_fc_snapshot_inode_data(inode, &ranges, nr_ranges_total,
+					  &nr_ranges);
 	if (ret) {
 		kfree(snap);
 		ext4_fc_free_ranges(&ranges);
@@ -1071,10 +1105,11 @@ static int ext4_fc_snapshot_inode(struct inode *inode)
 	list_splice_tail_init(&ranges, &snap->data_list);
 	ext4_fc_unlock(inode->i_sb, alloc_ctx);
 
+	if (nr_rangesp)
+		*nr_rangesp = nr_ranges;
 	return 0;
 }
 
-
 /* Flushes data of all the inodes in the commit queue. */
 static int ext4_fc_flush_data(journal_t *journal)
 {
@@ -1153,49 +1188,32 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 	return 0;
 }
 
-static int ext4_fc_snapshot_inodes(journal_t *journal)
+static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
+					 struct inode ***inodesp,
+					 unsigned int *nr_inodesp);
+
+static int ext4_fc_snapshot_inodes(journal_t *journal, struct inode **inodes,
+				   unsigned int inodes_size)
 {
 	struct super_block *sb = journal->j_private;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_inode_info *iter;
 	struct ext4_fc_dentry_update *fc_dentry;
-	struct inode **inodes;
-	unsigned int nr_inodes = 0;
 	unsigned int i = 0;
+	unsigned int idx;
+	unsigned int nr_ranges = 0;
 	int ret = 0;
 	int alloc_ctx;
 
-	alloc_ctx = ext4_fc_lock(sb);
-	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
-		nr_inodes++;
-
-	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
-		struct ext4_inode_info *ei;
-
-		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
-			continue;
-		if (list_empty(&fc_dentry->fcd_dilist))
-			continue;
-
-		/* See the comment in ext4_fc_commit_dentry_updates(). */
-		ei = list_first_entry(&fc_dentry->fcd_dilist,
-				      struct ext4_inode_info, i_fc_dilist);
-		if (!list_empty(&ei->i_fc_list))
-			continue;
-
-		nr_inodes++;
-	}
-	ext4_fc_unlock(sb, alloc_ctx);
-
-	if (!nr_inodes)
+	if (!inodes_size)
 		return 0;
 
-	inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS);
-	if (!inodes)
-		return -ENOMEM;
-
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+		if (i >= inodes_size) {
+			ret = -E2BIG;
+			goto unlock;
+		}
 		inodes[i++] = &iter->vfs_inode;
 	}
 
@@ -1215,6 +1233,10 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 		if (!list_empty(&ei->i_fc_list))
 			continue;
 
+		if (i >= inodes_size) {
+			ret = -E2BIG;
+			goto unlock;
+		}
 		/*
 		 * Create-only inodes may only be referenced via fcd_dilist and
 		 * not appear on s_fc_q[MAIN]. They may hit the last iput while
@@ -1226,15 +1248,22 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 		ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 		inodes[i++] = inode;
 	}
+unlock:
 	ext4_fc_unlock(sb, alloc_ctx);
 
-	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
-		ret = ext4_fc_snapshot_inode(inodes[nr_inodes]);
+	if (ret)
+		return ret;
+
+	for (idx = 0; idx < i; idx++) {
+		unsigned int inode_ranges = 0;
+
+		ret = ext4_fc_snapshot_inode(inodes[idx], nr_ranges,
+					     &inode_ranges);
 		if (ret)
 			break;
+		nr_ranges += inode_ranges;
 	}
 
-	kvfree(inodes);
 	return ret;
 }
 
@@ -1245,6 +1274,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	struct ext4_inode_info *iter;
 	struct ext4_fc_head head;
 	struct inode *inode;
+	struct inode **inodes;
+	unsigned int inodes_size;
 	struct blk_plug plug;
 	int ret = 0;
 	u32 crc = 0;
@@ -1294,6 +1325,10 @@ static int ext4_fc_perform_commit(journal_t *journal)
 		return ret;
 
 
+	ret = ext4_fc_alloc_snapshot_inodes(sb, &inodes, &inodes_size);
+	if (ret)
+		return ret;
+
 	/* Step 4: Mark all inodes as being committed. */
 	jbd2_journal_lock_updates(journal);
 	/*
@@ -1309,8 +1344,9 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
 
-	ret = ext4_fc_snapshot_inodes(journal);
+	ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
 	jbd2_journal_unlock_updates(journal);
+	kvfree(inodes);
 	if (ret)
 		return ret;
 
@@ -1366,6 +1402,64 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	return ret;
 }
 
+static unsigned int ext4_fc_count_snapshot_inodes(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_inode_info *iter;
+	struct ext4_fc_dentry_update *fc_dentry;
+	unsigned int nr_inodes = 0;
+	int alloc_ctx;
+
+	alloc_ctx = ext4_fc_lock(sb);
+	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
+		nr_inodes++;
+
+	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		struct ext4_inode_info *ei;
+
+		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
+			continue;
+		if (list_empty(&fc_dentry->fcd_dilist))
+			continue;
+
+		/* See the comment in ext4_fc_commit_dentry_updates(). */
+		ei = list_first_entry(&fc_dentry->fcd_dilist,
+				      struct ext4_inode_info, i_fc_dilist);
+		if (!list_empty(&ei->i_fc_list))
+			continue;
+
+		nr_inodes++;
+	}
+	ext4_fc_unlock(sb, alloc_ctx);
+
+	return nr_inodes;
+}
+
+static int ext4_fc_alloc_snapshot_inodes(struct super_block *sb,
+					 struct inode ***inodesp,
+					 unsigned int *nr_inodesp)
+{
+	unsigned int nr_inodes = ext4_fc_count_snapshot_inodes(sb);
+	struct inode **inodes;
+
+	*inodesp = NULL;
+	*nr_inodesp = 0;
+
+	if (!nr_inodes)
+		return 0;
+
+	if (nr_inodes > EXT4_FC_SNAPSHOT_MAX_INODES)
+		return -E2BIG;
+
+	inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS);
+	if (!inodes)
+		return -ENOMEM;
+
+	*inodesp = inodes;
+	*nr_inodesp = nr_inodes;
+	return 0;
+}
+
 static void ext4_fc_update_stats(struct super_block *sb, int status,
 				 u64 commit_time, int nblks, tid_t commit_tid)
 {
@@ -1458,7 +1552,10 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
 	fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
 	ret = ext4_fc_perform_commit(journal);
 	if (ret < 0) {
-		status = EXT4_FC_STATUS_FAILED;
+		if (ret == -EAGAIN || ret == -E2BIG || ret == -ECANCELED)
+			status = EXT4_FC_STATUS_INELIGIBLE;
+		else
+			status = EXT4_FC_STATUS_FAILED;
 		goto fallback;
 	}
 	nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before;
@@ -1539,26 +1636,27 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 
 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
 		fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN],
-					     struct ext4_fc_dentry_update,
-					     fcd_list);
+						 struct ext4_fc_dentry_update,
+						 fcd_list);
 		list_del_init(&fc_dentry->fcd_list);
 		if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT &&
-		    !list_empty(&fc_dentry->fcd_dilist)) {
+			!list_empty(&fc_dentry->fcd_dilist)) {
 			/* See the comment in ext4_fc_commit_dentry_updates(). */
 			ei = list_first_entry(&fc_dentry->fcd_dilist,
-					      struct ext4_inode_info,
-					      i_fc_dilist);
+						  struct ext4_inode_info,
+						  i_fc_dilist);
 			ext4_fc_free_inode_snap(&ei->vfs_inode);
 			spin_lock(&ei->i_fc_lock);
 			ext4_clear_inode_state(&ei->vfs_inode,
-					       EXT4_STATE_FC_REQUEUE);
+						   EXT4_STATE_FC_REQUEUE);
 			ext4_clear_inode_state(&ei->vfs_inode,
-					       EXT4_STATE_FC_COMMITTING);
+						   EXT4_STATE_FC_COMMITTING);
 			spin_unlock(&ei->i_fc_lock);
 			/*
 			 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
-			 * visible before we send the wakeup. Pairs with implicit
-			 * barrier in prepare_to_wait() in ext4_fc_del().
+			 * visible before we send the wakeup. Pairs with
+			 * implicit barrier in prepare_to_wait() in
+			 * ext4_fc_del().
 			 */
 			smp_mb();
 			ext4_fc_wake_inode_state(&ei->vfs_inode,
@@ -2538,13 +2636,20 @@ int __init ext4_fc_init_dentry_cache(void)
 	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
 					   SLAB_RECLAIM_ACCOUNT);
 
-	if (ext4_fc_dentry_cachep == NULL)
+	if (!ext4_fc_dentry_cachep)
 		return -ENOMEM;
 
+	ext4_fc_range_cachep = KMEM_CACHE(ext4_fc_range, SLAB_RECLAIM_ACCOUNT);
+	if (!ext4_fc_range_cachep) {
+		kmem_cache_destroy(ext4_fc_dentry_cachep);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
 void ext4_fc_destroy_dentry_cache(void)
 {
+	kmem_cache_destroy(ext4_fc_range_cachep);
 	kmem_cache_destroy(ext4_fc_dentry_cachep);
 }
-- 
2.53.0

^ permalink raw reply related

* [RFC v8 4/7] ext4: fast commit: avoid self-deadlock in inode snapshotting
From: Li Chen @ 2026-05-15  9:18 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260515091829.194810-1-me@linux.beauty>

ext4_fc_snapshot_inodes() used igrab()/iput() to pin inodes while building
commit-time snapshots. With ext4_fc_del() waiting for
EXT4_STATE_FC_COMMITTING, iput() can trigger
ext4_clear_inode()->ext4_fc_del() in the commit thread and deadlock waiting
for the fast commit to finish.

ext4_fc_del() also has to re-check EXT4_STATE_FC_COMMITTING after
waiting on EXT4_STATE_FC_FLUSHING_DATA. The commit thread clears
FLUSHING_DATA before it sets COMMITTING, so a waiter woken from the
flush wait must not delete the inode based on an old COMMITTING
check.

Avoid taking extra references. Collect inode pointers under s_fc_lock and
rely on EXT4_STATE_FC_COMMITTING to pin inodes until ext4_fc_cleanup()
clears the bit.

Also set EXT4_STATE_FC_COMMITTING for create-only inodes referenced
from the dentry update queue, and wake up waiters when ext4_fc_cleanup()
clears the bit.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v8:
- Factor out small ext4_fc_wait_inode_state()/ext4_fc_wake_inode_state()
  helpers so the repeated FC state wait/wake mapping is kept in one place.
- Re-check EXT4_STATE_FC_COMMITTING after waking from
  EXT4_STATE_FC_FLUSHING_DATA in ext4_fc_del(), so list deletion only
  happens after both predicates pass under the same s_fc_lock critical
  section.

 fs/ext4/fast_commit.c | 124 +++++++++++++++++++++++++-----------------
 1 file changed, 75 insertions(+), 49 deletions(-)

diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 673668860e2d..8a6981e50ffe 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -235,6 +235,37 @@ static bool ext4_fc_eligible(struct super_block *sb)
 		!(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE));
 }
 
+/*
+ * Wait for an inode fast-commit state bit to clear while dropping the
+ * fast-commit lock around schedule().
+ */
+static void ext4_fc_wait_inode_state(struct inode *inode, int bit,
+				     int *alloc_ctx)
+{
+	wait_queue_head_t *wq;
+	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
+	int wait_bit = ext4_inode_state_wait_bit(bit);
+
+	while (ext4_test_inode_state(inode, bit)) {
+		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
+
+		wq = bit_waitqueue(wait_word, wait_bit);
+		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+		if (ext4_test_inode_state(inode, bit)) {
+			ext4_fc_unlock(inode->i_sb, *alloc_ctx);
+			schedule();
+			*alloc_ctx = ext4_fc_lock(inode->i_sb);
+		}
+		finish_wait(wq, &wait.wq_entry);
+	}
+}
+
+static inline void ext4_fc_wake_inode_state(struct inode *inode, int bit)
+{
+	wake_up_bit(ext4_inode_state_wait_word(inode),
+		    ext4_inode_state_wait_bit(bit));
+}
+
 /*
  * Remove inode from fast commit list. If the inode is being committed
  * we wait until inode commit is done.
@@ -243,12 +274,6 @@ void ext4_fc_del(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	struct ext4_fc_dentry_update *fc_dentry;
-	wait_queue_head_t *wq;
-	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
-	int committing_wait_bit =
-		ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
-	int flushing_wait_bit =
-		ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
 	int alloc_ctx;
 
 	if (ext4_fc_disabled(inode->i_sb))
@@ -263,32 +288,19 @@ void ext4_fc_del(struct inode *inode)
 
 	/*
 	 * Wait for ongoing fast commit to finish. We cannot remove the inode
-	 * from fast commit lists while it is being committed.
+	 * from fast commit lists while it is being committed. If we wake from
+	 * FC_FLUSHING_DATA, re-check FC_COMMITTING before deleting because the
+	 * commit thread sets FC_COMMITTING only after clearing FLUSHING_DATA.
 	 */
-	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-		DEFINE_WAIT_BIT(wait, wait_word, committing_wait_bit);
+	for (;;) {
+		ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_COMMITTING,
+					 &alloc_ctx);
 
-		wq = bit_waitqueue(wait_word, committing_wait_bit);
-		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-			ext4_fc_unlock(inode->i_sb, alloc_ctx);
-			schedule();
-			alloc_ctx = ext4_fc_lock(inode->i_sb);
-		}
-		finish_wait(wq, &wait.wq_entry);
-	}
-
-	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
-		DEFINE_WAIT_BIT(wait, wait_word, flushing_wait_bit);
+		if (!ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA))
+			break;
 
-		wq = bit_waitqueue(wait_word, flushing_wait_bit);
-		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
-			ext4_fc_unlock(inode->i_sb, alloc_ctx);
-			schedule();
-			alloc_ctx = ext4_fc_lock(inode->i_sb);
-		}
-		finish_wait(wq, &wait.wq_entry);
+		ext4_fc_wait_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA,
+					 &alloc_ctx);
 	}
 
 	ext4_fc_free_inode_snap(inode);
@@ -1184,13 +1196,12 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
-		inodes[i] = igrab(&iter->vfs_inode);
-		if (inodes[i])
-			i++;
+		inodes[i++] = &iter->vfs_inode;
 	}
 
 	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
 		struct ext4_inode_info *ei;
+		struct inode *inode;
 
 		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
 			continue;
@@ -1200,12 +1211,20 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 		/* See the comment in ext4_fc_commit_dentry_updates(). */
 		ei = list_first_entry(&fc_dentry->fcd_dilist,
 				      struct ext4_inode_info, i_fc_dilist);
+		inode = &ei->vfs_inode;
 		if (!list_empty(&ei->i_fc_list))
 			continue;
 
-		inodes[i] = igrab(&ei->vfs_inode);
-		if (inodes[i])
-			i++;
+		/*
+		 * Create-only inodes may only be referenced via fcd_dilist and
+		 * not appear on s_fc_q[MAIN]. They may hit the last iput while
+		 * we are snapshotting, but inode eviction calls ext4_fc_del(),
+		 * which waits for FC_COMMITTING to clear. Mark them FC_COMMITTING
+		 * so the inode stays pinned and the snapshot stays valid until
+		 * ext4_fc_cleanup().
+		 */
+		ext4_set_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+		inodes[i++] = inode;
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
 
@@ -1215,10 +1234,6 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 			break;
 	}
 
-	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
-		if (inodes[nr_inodes])
-			iput(inodes[nr_inodes]);
-	}
 	kvfree(inodes);
 	return ret;
 }
@@ -1234,8 +1249,6 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	int ret = 0;
 	u32 crc = 0;
 	int alloc_ctx;
-	int flushing_wait_bit =
-		ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
 
 	/*
 	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
@@ -1261,8 +1274,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 		ext4_clear_inode_state(&iter->vfs_inode,
 				       EXT4_STATE_FC_FLUSHING_DATA);
-		wake_up_bit(ext4_inode_state_wait_word(&iter->vfs_inode),
-			    flushing_wait_bit);
+		ext4_fc_wake_inode_state(&iter->vfs_inode,
+					 EXT4_STATE_FC_FLUSHING_DATA);
 	}
 
 	/*
@@ -1285,8 +1298,9 @@ static int ext4_fc_perform_commit(journal_t *journal)
 	jbd2_journal_lock_updates(journal);
 	/*
 	 * The journal is now locked. No more handles can start and all the
-	 * previous handles are now drained. We now mark the inodes on the
-	 * commit queue as being committed.
+	 * previous handles are now drained. Snapshotting happens in this
+	 * window so log writing can consume only stable snapshots without
+	 * doing logical-to-physical mapping.
 	 */
 	alloc_ctx = ext4_fc_lock(sb);
 	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
@@ -1482,8 +1496,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 	struct ext4_inode_info *ei;
 	struct ext4_fc_dentry_update *fc_dentry;
 	int alloc_ctx;
-	int committing_wait_bit =
-		ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
 
 	if (full && sbi->s_fc_bh)
 		sbi->s_fc_bh = NULL;
@@ -1521,8 +1533,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 		 * barrier in prepare_to_wait() in ext4_fc_del().
 		 */
 		smp_mb();
-		wake_up_bit(ext4_inode_state_wait_word(&ei->vfs_inode),
-			    committing_wait_bit);
+		ext4_fc_wake_inode_state(&ei->vfs_inode,
+					 EXT4_STATE_FC_COMMITTING);
 	}
 
 	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {
@@ -1537,6 +1549,20 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 					      struct ext4_inode_info,
 					      i_fc_dilist);
 			ext4_fc_free_inode_snap(&ei->vfs_inode);
+			spin_lock(&ei->i_fc_lock);
+			ext4_clear_inode_state(&ei->vfs_inode,
+					       EXT4_STATE_FC_REQUEUE);
+			ext4_clear_inode_state(&ei->vfs_inode,
+					       EXT4_STATE_FC_COMMITTING);
+			spin_unlock(&ei->i_fc_lock);
+			/*
+			 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+			 * visible before we send the wakeup. Pairs with implicit
+			 * barrier in prepare_to_wait() in ext4_fc_del().
+			 */
+			smp_mb();
+			ext4_fc_wake_inode_state(&ei->vfs_inode,
+						 EXT4_STATE_FC_COMMITTING);
 		}
 		list_del_init(&fc_dentry->fcd_dilist);
 
-- 
2.53.0

^ permalink raw reply related

* [RFC v8 3/7] ext4: fast commit: avoid waiting for FC_COMMITTING
From: Li Chen @ 2026-05-15  9:18 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260515091829.194810-1-me@linux.beauty>

ext4_fc_track_inode() can be called while holding i_data_sem (e.g.
fallocate). Waiting for EXT4_STATE_FC_COMMITTING in that case risks an
ABBA deadlock: i_data_sem -> wait(FC_COMMITTING) vs FC_COMMITTING ->
wait(i_data_sem) in the commit task.

Now that fast commit snapshots inode state at commit time, updates during
log writing do not need to block. Drop the wait and lockdep assertion in
ext4_fc_track_inode(), and make ext4_fc_del() wait for FC_COMMITTING so an
inode cannot be removed while the commit thread is still using it.

When an inode is modified during a fast commit, mark it with
EXT4_STATE_FC_REQUEUE so cleanup keeps it queued for the next fast commit.
This is needed because jbd2_fc_end_commit() invokes the cleanup callback
with tid == 0, so tid-based requeue logic would requeue every inode.

Testing: tracepoint ext4:ext4_fc_commit_stop with two fsyncs in the same
transaction. nblks is the number of journal blocks written for that fast
commit. Before this change, the second fsync still wrote almost the same
fast commit log (nblks 10->9), because tid == 0 in jbd2_fc_end_commit()
caused the tid-based requeue logic to keep all inodes queued. After this
change, only inodes modified during the commit are requeued, and the
second fsync wrote a nearly empty fast commit (nblks 10->1).

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in RFC v8:
- Base the series on "ext4: fix fast commit wait/wake bit mapping on
  64-bit", so the FC_COMMITTING/FC_FLUSHING_DATA wait and wake paths use
  the shared helper mapping.

 fs/ext4/ext4.h        |   1 +
 fs/ext4/fast_commit.c | 106 ++++++++++++++++++++----------------------
 2 files changed, 51 insertions(+), 56 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 115a3c94db16..927173bc8381 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1991,6 +1991,7 @@ enum {
 	EXT4_STATE_FC_COMMITTING,	/* Fast commit ongoing */
 	EXT4_STATE_FC_FLUSHING_DATA,	/* Fast commit flushing data */
 	EXT4_STATE_ORPHAN_FILE,		/* Inode orphaned in orphan file */
+	EXT4_STATE_FC_REQUEUE,		/* Inode modified during fast commit */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 0c49144e8ca2..673668860e2d 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -62,9 +62,8 @@
  *     setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state
  *     needed for log writing.
  * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows
- *     starting of new handles. If new handles try to start an update on
- *     any of the inodes that are being committed, ext4_fc_track_inode()
- *     will block until those inodes have finished the fast commit.
+ *     starting of new handles. Updates to inodes being fast committed are
+ *     tracked for requeue rather than blocking.
  * [6] Commit all the directory entry updates in the fast commit space.
  * [7] Commit all the changed inodes in the fast commit space.
  * [8] Write tail tag (this tag ensures the atomicity, please read the following
@@ -218,6 +217,7 @@ void ext4_fc_init_inode(struct inode *inode)
 
 	ext4_fc_reset_inode(inode);
 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
+	ext4_clear_inode_state(inode, EXT4_STATE_FC_REQUEUE);
 	INIT_LIST_HEAD(&ei->i_fc_list);
 	INIT_LIST_HEAD(&ei->i_fc_dilist);
 	ei->i_fc_snap = NULL;
@@ -245,7 +245,10 @@ void ext4_fc_del(struct inode *inode)
 	struct ext4_fc_dentry_update *fc_dentry;
 	wait_queue_head_t *wq;
 	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
-	int wait_bit = ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
+	int committing_wait_bit =
+		ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
+	int flushing_wait_bit =
+		ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
 	int alloc_ctx;
 
 	if (ext4_fc_disabled(inode->i_sb))
@@ -259,26 +262,26 @@ void ext4_fc_del(struct inode *inode)
 	}
 
 	/*
-	 * Since ext4_fc_del is called from ext4_evict_inode while having a
-	 * handle open, there is no need for us to wait here even if a fast
-	 * commit is going on. That is because, if this inode is being
-	 * committed, ext4_mark_inode_dirty would have waited for inode commit
-	 * operation to finish before we come here. So, by the time we come
-	 * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
-	 * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
-	 * here.
-	 *
-	 * We may come here without any handles open in the "no_delete" case of
-	 * ext4_evict_inode as well. However, if that happens, we first mark the
-	 * file system as fast commit ineligible anyway. So, even in that case,
-	 * it is okay to remove the inode from the fc list.
+	 * Wait for ongoing fast commit to finish. We cannot remove the inode
+	 * from fast commit lists while it is being committed.
 	 */
-	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
-		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+		DEFINE_WAIT_BIT(wait, wait_word, committing_wait_bit);
+
+		wq = bit_waitqueue(wait_word, committing_wait_bit);
+		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+			ext4_fc_unlock(inode->i_sb, alloc_ctx);
+			schedule();
+			alloc_ctx = ext4_fc_lock(inode->i_sb);
+		}
+		finish_wait(wq, &wait.wq_entry);
+	}
+
 	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
-		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
+		DEFINE_WAIT_BIT(wait, wait_word, flushing_wait_bit);
 
-		wq = bit_waitqueue(wait_word, wait_bit);
+		wq = bit_waitqueue(wait_word, flushing_wait_bit);
 		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
 			ext4_fc_unlock(inode->i_sb, alloc_ctx);
@@ -287,19 +290,22 @@ void ext4_fc_del(struct inode *inode)
 		}
 		finish_wait(wq, &wait.wq_entry);
 	}
+
 	ext4_fc_free_inode_snap(inode);
 	list_del_init(&ei->i_fc_list);
 
 	/*
-	 * Since this inode is getting removed, let's also remove all FC
-	 * dentry create references, since it is not needed to log it anyways.
+	 * Since this inode is getting removed, let's also remove all FC dentry
+	 * create references, since it is not needed to log it anyways.
 	 */
 	if (list_empty(&ei->i_fc_dilist)) {
 		ext4_fc_unlock(inode->i_sb, alloc_ctx);
 		return;
 	}
 
-	fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist);
+	fc_dentry = list_first_entry(&ei->i_fc_dilist,
+				     struct ext4_fc_dentry_update,
+				     fcd_dilist);
 	WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT);
 	list_del_init(&fc_dentry->fcd_list);
 	list_del_init(&fc_dentry->fcd_dilist);
@@ -371,6 +377,8 @@ static int ext4_fc_track_template(
 
 	tid = handle->h_transaction->t_tid;
 	spin_lock(&ei->i_fc_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+		ext4_set_inode_state(inode, EXT4_STATE_FC_REQUEUE);
 	if (tid == ei->i_sync_tid) {
 		update = true;
 	} else {
@@ -541,10 +549,6 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
 
 void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 {
-	struct ext4_inode_info *ei = EXT4_I(inode);
-	wait_queue_head_t *wq;
-	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
-	int wait_bit = ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
 	int ret;
 
 	if (S_ISDIR(inode->i_mode))
@@ -560,21 +564,11 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
 		return;
 
 	/*
-	 * If we come here, we may sleep while waiting for the inode to
-	 * commit. We shouldn't be holding i_data_sem when we go to sleep since
-	 * the commit path needs to grab the lock while committing the inode.
+	 * Fast commit snapshots inode state at commit time, so there's no need
+	 * to wait for EXT4_STATE_FC_COMMITTING here. If the inode is already
+	 * on the commit queue, ext4_fc_cleanup() will requeue it for the new
+	 * transaction once the current commit finishes.
 	 */
-	lockdep_assert_not_held(&ei->i_data_sem);
-
-	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
-		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
-
-		wq = bit_waitqueue(wait_word, wait_bit);
-		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
-			schedule();
-		finish_wait(wq, &wait.wq_entry);
-	}
 
 	/*
 	 * From this point on, this inode will not be committed either
@@ -1499,32 +1493,32 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 
 	alloc_ctx = ext4_fc_lock(sb);
 	while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+		bool requeue;
+
 		ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
 					struct ext4_inode_info,
 					i_fc_list);
 		list_del_init(&ei->i_fc_list);
 		ext4_fc_free_inode_snap(&ei->vfs_inode);
+		spin_lock(&ei->i_fc_lock);
+		if (full)
+			requeue = !tid_geq(tid, ei->i_sync_tid);
+		else
+			requeue = ext4_test_inode_state(&ei->vfs_inode,
+							EXT4_STATE_FC_REQUEUE);
+		if (!requeue)
+			ext4_fc_reset_inode(&ei->vfs_inode);
+		ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_REQUEUE);
 		ext4_clear_inode_state(&ei->vfs_inode,
 				       EXT4_STATE_FC_COMMITTING);
-		if (tid_geq(tid, ei->i_sync_tid)) {
-			ext4_fc_reset_inode(&ei->vfs_inode);
-		} else if (full) {
-			/*
-			 * We are called after a full commit, inode has been
-			 * modified while the commit was running. Re-enqueue
-			 * the inode into STAGING, which will then be splice
-			 * back into MAIN. This cannot happen during
-			 * fastcommit because the journal is locked all the
-			 * time in that case (and tid doesn't increase so
-			 * tid check above isn't reliable).
-			 */
+		spin_unlock(&ei->i_fc_lock);
+		if (requeue)
 			list_add_tail(&ei->i_fc_list,
 				      &sbi->s_fc_q[FC_Q_STAGING]);
-		}
 		/*
 		 * Make sure clearing of EXT4_STATE_FC_COMMITTING is
 		 * visible before we send the wakeup. Pairs with implicit
-		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
+		 * barrier in prepare_to_wait() in ext4_fc_del().
 		 */
 		smp_mb();
 		wake_up_bit(ext4_inode_state_wait_word(&ei->vfs_inode),
-- 
2.53.0

^ permalink raw reply related

* [RFC v8 2/7] ext4: lockdep: handle i_data_sem subclassing for special inodes
From: Li Chen @ 2026-05-15  9:18 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260515091829.194810-1-me@linux.beauty>

Fast commit can hold s_fc_lock while writing journal blocks. Mapping the
journal inode can take its i_data_sem. Normal inode update paths can take a
data inode i_data_sem and then s_fc_lock, which makes lockdep report a
circular dependency.

lockdep treats all i_data_sem instances as one lock class and cannot
distinguish the journal inode i_data_sem from a regular inode i_data_sem.
The journal inode is not tracked by fast commit and no FC waiters ever
depend on it, so this is not a real ABBA deadlock. Assign the journal inode
a dedicated i_data_sem lockdep subclass to avoid the false positive.

Inode cache objects can be recycled, so also reset i_data_sem to
I_DATA_SEM_NORMAL when allocating an ext4 inode. Otherwise a new inode may
inherit an old subclass (journal/quota/ea) and trigger lockdep warnings.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes v6:
- Rebase onto linux-next master as of 2026-04-08.
- Refresh the patch context around upstream ext4_alloc_inode() changes,
  without changing the subclassing logic.

 fs/ext4/ext4.h  | 4 +++-
 fs/ext4/super.c | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e337a37bb6fb..115a3c94db16 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1015,12 +1015,14 @@ do {										\
  *			  than the first
  *  I_DATA_SEM_QUOTA  - Used for quota inodes only
  *  I_DATA_SEM_EA     - Used for ea_inodes only
+ *  I_DATA_SEM_JOURNAL - Used for journal inode only
  */
 enum {
 	I_DATA_SEM_NORMAL = 0,
 	I_DATA_SEM_OTHER,
 	I_DATA_SEM_QUOTA,
-	I_DATA_SEM_EA
+	I_DATA_SEM_EA,
+	I_DATA_SEM_JOURNAL
 };
 
 struct ext4_fc_inode_snap;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6a77db4d3124..3c869f0001c5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1431,6 +1431,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ext4_fc_init_inode(&ei->vfs_inode);
 	spin_lock_init(&ei->i_fc_lock);
 	mmb_init(&ei->i_metadata_bhs, &ei->vfs_inode.i_data);
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_NORMAL);
+#endif
 	return &ei->vfs_inode;
 }
 
@@ -5910,6 +5913,11 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
 		return ERR_PTR(-EFSCORRUPTED);
 	}
 
+#ifdef CONFIG_LOCKDEP
+	lockdep_set_subclass(&EXT4_I(journal_inode)->i_data_sem,
+			     I_DATA_SEM_JOURNAL);
+#endif
+
 	ext4_debug("Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	return journal_inode;
-- 
2.53.0

^ permalink raw reply related

* [RFC v8 1/7] ext4: fast commit: snapshot inode state before writing log
From: Li Chen @ 2026-05-15  9:18 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani (IBM), Zhang Yi, linux-ext4,
	linux-kernel
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers,
	linux-trace-kernel
In-Reply-To: <20260515091829.194810-1-me@linux.beauty>

Fast commit writes inode metadata and data range updates after unlocking
journal updates. New handles can start at that point, so the log writing
path must not look at live inode state.

Add a commit-time per-inode snapshot and populate it while journal updates
are locked and existing handles are drained. Store the snapshot behind
ext4_inode_info->i_fc_snap so ext4_inode_info only grows by one pointer.
The snapshot contains a copy of the on-disk inode plus the data range
records needed for fast commit TLVs.

Snapshotting runs under jbd2_journal_lock_updates(). Avoid triggering I/O
there by using ext4_get_inode_loc_noio() and falling back to full commit
if the inode table block is not present or not uptodate.

Log writing then only serializes the snapshot, so it no longer needs to
call ext4_map_blocks() and take i_data_sem under s_fc_lock. The snapshot
is installed and freed under s_fc_lock and is released from fast commit
cleanup and inode eviction.

Signed-off-by: Li Chen <chenl311@chinatelecom.cn>
---
Changes in v7:
- Drop the stale i_fc_wait initialization after rebasing onto the new
  linux-next base.

Changes in v6:
- Rebase onto linux-next master as of 2026-04-08.
- Fix the inode debug print format after rebasing.

 fs/ext4/ext4.h        |  22 ++-
 fs/ext4/fast_commit.c | 331 +++++++++++++++++++++++++++++++++++-------
 fs/ext4/inode.c       |  51 +++++++
 3 files changed, 352 insertions(+), 52 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6569d1d575a0..e337a37bb6fb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1023,6 +1023,7 @@ enum {
 	I_DATA_SEM_EA
 };
 
+struct ext4_fc_inode_snap;
 
 /*
  * fourth extended file system inode data in memory
@@ -1079,6 +1080,22 @@ struct ext4_inode_info {
 	/* End of lblk range that needs to be committed in this fast commit */
 	ext4_lblk_t i_fc_lblk_len;
 
+	/*
+	 * Commit-time fast commit snapshots.
+	 *
+	 * i_fc_snap is installed and freed under sbi->s_fc_lock. The fast
+	 * commit log writing path reads the snapshot under sbi->s_fc_lock while
+	 * serializing fast commit TLVs.
+	 *
+	 * The snapshot lifetime is bounded by EXT4_STATE_FC_COMMITTING and the
+	 * corresponding cleanup / eviction paths.
+	 *
+	 * i_fc_snap points to per-inode snapshot data for fast commit:
+	 * - a raw inode snapshot for EXT4_FC_TAG_INODE
+	 * - data range records for EXT4_FC_TAG_{ADD,DEL}_RANGE
+	 */
+	struct ext4_fc_inode_snap *i_fc_snap;
+
 	spinlock_t i_raw_lock;	/* protects updates to the raw inode */
 
 	/*
@@ -3100,8 +3117,9 @@ extern int  ext4_file_getattr(struct mnt_idmap *, const struct path *,
 			      struct kstat *, u32, unsigned int);
 extern void ext4_dirty_inode(struct inode *, int);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
-extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
-extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
+int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc);
+int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc);
+int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
 			  struct ext4_iloc *iloc);
 extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 1775bce9649a..0c49144e8ca2 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -56,21 +56,23 @@
  *     deleted while it is being flushed.
  * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
  *     state.
- * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
- *     all the exsiting handles finish and no new handles can start.
- * [4] Mark all the fast commit eligible inodes as undergoing fast commit
- *     by setting "EXT4_STATE_FC_COMMITTING" state.
- * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * [3] Lock the journal by calling jbd2_journal_lock_updates(). This ensures
+ *     that all the existing handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit by
+ *     setting "EXT4_STATE_FC_COMMITTING" state, and snapshot the inode state
+ *     needed for log writing.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates(). This allows
  *     starting of new handles. If new handles try to start an update on
  *     any of the inodes that are being committed, ext4_fc_track_inode()
  *     will block until those inodes have finished the fast commit.
  * [6] Commit all the directory entry updates in the fast commit space.
- * [7] Commit all the changed inodes in the fast commit space and clear
- *     "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [7] Commit all the changed inodes in the fast commit space.
  * [8] Write tail tag (this tag ensures the atomicity, please read the following
  *     section for more details).
+ * [9] Clear "EXT4_STATE_FC_COMMITTING" and wake up waiters in
+ *     ext4_fc_cleanup().
  *
- * All the inode updates must be enclosed within jbd2_jounrnal_start()
+ * All the inode updates must be enclosed within jbd2_journal_start()
  * and jbd2_journal_stop() similar to JBD2 journaling.
  *
  * Fast Commit Ineligibility
@@ -200,6 +202,8 @@ static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 	unlock_buffer(bh);
 }
 
+static void ext4_fc_free_inode_snap(struct inode *inode);
+
 static inline void ext4_fc_reset_inode(struct inode *inode)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
@@ -216,6 +220,7 @@ void ext4_fc_init_inode(struct inode *inode)
 	ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING);
 	INIT_LIST_HEAD(&ei->i_fc_list);
 	INIT_LIST_HEAD(&ei->i_fc_dilist);
+	ei->i_fc_snap = NULL;
 }
 
 static bool ext4_fc_disabled(struct super_block *sb)
@@ -248,6 +253,7 @@ void ext4_fc_del(struct inode *inode)
 
 	alloc_ctx = ext4_fc_lock(inode->i_sb);
 	if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
+		ext4_fc_free_inode_snap(inode);
 		ext4_fc_unlock(inode->i_sb, alloc_ctx);
 		return;
 	}
@@ -281,6 +287,7 @@ void ext4_fc_del(struct inode *inode)
 		}
 		finish_wait(wq, &wait.wq_entry);
 	}
+	ext4_fc_free_inode_snap(inode);
 	list_del_init(&ei->i_fc_list);
 
 	/*
@@ -817,6 +824,21 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 	return true;
 }
 
+struct ext4_fc_range {
+	struct list_head list;
+	u16 tag;
+	ext4_lblk_t lblk;
+	ext4_lblk_t len;
+	ext4_fsblk_t pblk;
+	bool unwritten;
+};
+
+struct ext4_fc_inode_snap {
+	struct list_head data_list;
+	unsigned int inode_len;
+	u8 inode_buf[];
+};
+
 /*
  * Writes inode in the fast commit space under TLV with tag @tag.
  * Returns 0 on success, error on failure.
@@ -824,21 +846,21 @@ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc,
 static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 {
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
-	int ret;
-	struct ext4_iloc iloc;
+	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
 	struct ext4_fc_inode fc_inode;
 	struct ext4_fc_tl tl;
 	u8 *dst;
+	u8 *src;
+	int inode_len;
+	int ret;
 
-	ret = ext4_get_inode_loc(inode, &iloc);
-	if (ret)
-		return ret;
+	if (!snap)
+		return -ECANCELED;
 
-	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
-		inode_len = EXT4_INODE_SIZE(inode->i_sb);
-	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
-		inode_len += ei->i_extra_isize;
+	src = snap->inode_buf;
+	inode_len = snap->inode_len;
+	if (!src || inode_len == 0)
+		return -ECANCELED;
 
 	fc_inode.fc_ino = cpu_to_le32(inode->i_ino);
 	tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
@@ -854,10 +876,9 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
 	dst += EXT4_FC_TAG_BASE_LEN;
 	memcpy(dst, &fc_inode, sizeof(fc_inode));
 	dst += sizeof(fc_inode);
-	memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len);
+	memcpy(dst, src, inode_len);
 	ret = 0;
 err:
-	brelse(iloc.bh);
 	return ret;
 }
 
@@ -867,12 +888,74 @@ static int ext4_fc_write_inode(struct inode *inode, u32 *crc)
  */
 static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 {
-	ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size;
 	struct ext4_inode_info *ei = EXT4_I(inode);
-	struct ext4_map_blocks map;
+	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
 	struct ext4_fc_add_range fc_ext;
 	struct ext4_fc_del_range lrange;
 	struct ext4_extent *ex;
+	struct ext4_fc_range *range;
+
+	if (!snap)
+		return -ECANCELED;
+
+	list_for_each_entry(range, &snap->data_list, list) {
+		if (range->tag == EXT4_FC_TAG_DEL_RANGE) {
+			lrange.fc_ino = cpu_to_le32(inode->i_ino);
+			lrange.fc_lblk = cpu_to_le32(range->lblk);
+			lrange.fc_len = cpu_to_le32(range->len);
+			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
+					     sizeof(lrange), (u8 *)&lrange, crc))
+				return -ENOSPC;
+			continue;
+		}
+
+		fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
+		ex = (struct ext4_extent *)&fc_ext.fc_ex;
+		ex->ee_block = cpu_to_le32(range->lblk);
+		ex->ee_len = cpu_to_le16(range->len);
+		ext4_ext_store_pblock(ex, range->pblk);
+		if (range->unwritten)
+			ext4_ext_mark_unwritten(ex);
+		else
+			ext4_ext_mark_initialized(ex);
+
+		if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
+				     sizeof(fc_ext), (u8 *)&fc_ext, crc))
+			return -ENOSPC;
+	}
+
+	return 0;
+}
+
+static void ext4_fc_free_ranges(struct list_head *head)
+{
+	struct ext4_fc_range *range, *range_n;
+
+	list_for_each_entry_safe(range, range_n, head, list) {
+		list_del(&range->list);
+		kfree(range);
+	}
+}
+
+static void ext4_fc_free_inode_snap(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_inode_snap *snap = ei->i_fc_snap;
+
+	if (!snap)
+		return;
+
+	ext4_fc_free_ranges(&snap->data_list);
+	kfree(snap);
+	ei->i_fc_snap = NULL;
+}
+
+static int ext4_fc_snapshot_inode_data(struct inode *inode,
+				       struct list_head *ranges)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	ext4_lblk_t start_lblk, end_lblk, cur_lblk;
+	struct ext4_map_blocks map;
 	int ret;
 
 	spin_lock(&ei->i_fc_lock);
@@ -880,18 +963,21 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 		spin_unlock(&ei->i_fc_lock);
 		return 0;
 	}
-	old_blk_size = ei->i_fc_lblk_start;
-	new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
+	start_lblk = ei->i_fc_lblk_start;
+	end_lblk = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
 	ei->i_fc_lblk_len = 0;
 	spin_unlock(&ei->i_fc_lock);
 
-	cur_lblk_off = old_blk_size;
-	ext4_debug("will try writing %d to %d for inode %llu\n",
-		   cur_lblk_off, new_blk_size, inode->i_ino);
+	cur_lblk = start_lblk;
+	ext4_debug("snapshot data ranges %u-%u for inode %llu\n",
+		   start_lblk, end_lblk,
+		   (unsigned long long)inode->i_ino);
+
+	while (cur_lblk <= end_lblk) {
+		struct ext4_fc_range *range;
 
-	while (cur_lblk_off <= new_blk_size) {
-		map.m_lblk = cur_lblk_off;
-		map.m_len = new_blk_size - cur_lblk_off + 1;
+		map.m_lblk = cur_lblk;
+		map.m_len = end_lblk - cur_lblk + 1;
 		ret = ext4_map_blocks(NULL, inode, &map,
 				      EXT4_GET_BLOCKS_IO_SUBMIT |
 				      EXT4_EX_NOCACHE);
@@ -899,17 +985,21 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 			return -ECANCELED;
 
 		if (map.m_len == 0) {
-			cur_lblk_off++;
+			cur_lblk++;
 			continue;
 		}
 
+		range = kmalloc(sizeof(*range), GFP_NOFS);
+		if (!range)
+			return -ENOMEM;
+
+		range->lblk = map.m_lblk;
+		range->len = map.m_len;
+		range->pblk = 0;
+		range->unwritten = false;
+
 		if (ret == 0) {
-			lrange.fc_ino = cpu_to_le32(inode->i_ino);
-			lrange.fc_lblk = cpu_to_le32(map.m_lblk);
-			lrange.fc_len = cpu_to_le32(map.m_len);
-			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE,
-					    sizeof(lrange), (u8 *)&lrange, crc))
-				return -ENOSPC;
+			range->tag = EXT4_FC_TAG_DEL_RANGE;
 		} else {
 			unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ?
 				EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN;
@@ -917,26 +1007,67 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
 			/* Limit the number of blocks in one extent */
 			map.m_len = min(max, map.m_len);
 
-			fc_ext.fc_ino = cpu_to_le32(inode->i_ino);
-			ex = (struct ext4_extent *)&fc_ext.fc_ex;
-			ex->ee_block = cpu_to_le32(map.m_lblk);
-			ex->ee_len = cpu_to_le16(map.m_len);
-			ext4_ext_store_pblock(ex, map.m_pblk);
-			if (map.m_flags & EXT4_MAP_UNWRITTEN)
-				ext4_ext_mark_unwritten(ex);
-			else
-				ext4_ext_mark_initialized(ex);
-			if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE,
-					    sizeof(fc_ext), (u8 *)&fc_ext, crc))
-				return -ENOSPC;
+			range->tag = EXT4_FC_TAG_ADD_RANGE;
+			range->len = map.m_len;
+			range->pblk = map.m_pblk;
+			range->unwritten = !!(map.m_flags & EXT4_MAP_UNWRITTEN);
 		}
 
-		cur_lblk_off += map.m_len;
+		INIT_LIST_HEAD(&range->list);
+		list_add_tail(&range->list, ranges);
+
+		cur_lblk += map.m_len;
 	}
 
 	return 0;
 }
 
+static int ext4_fc_snapshot_inode(struct inode *inode)
+{
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	struct ext4_fc_inode_snap *snap;
+	int inode_len = EXT4_GOOD_OLD_INODE_SIZE;
+	struct ext4_iloc iloc;
+	LIST_HEAD(ranges);
+	int ret;
+	int alloc_ctx;
+
+	ret = ext4_get_inode_loc_noio(inode, &iloc);
+	if (ret)
+		return ret;
+
+	if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
+		inode_len = EXT4_INODE_SIZE(inode->i_sb);
+	else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE)
+		inode_len += ei->i_extra_isize;
+
+	snap = kmalloc(struct_size(snap, inode_buf, inode_len), GFP_NOFS);
+	if (!snap) {
+		brelse(iloc.bh);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&snap->data_list);
+	snap->inode_len = inode_len;
+
+	memcpy(snap->inode_buf, (u8 *)ext4_raw_inode(&iloc), inode_len);
+	brelse(iloc.bh);
+
+	ret = ext4_fc_snapshot_inode_data(inode, &ranges);
+	if (ret) {
+		kfree(snap);
+		ext4_fc_free_ranges(&ranges);
+		return ret;
+	}
+
+	alloc_ctx = ext4_fc_lock(inode->i_sb);
+	ext4_fc_free_inode_snap(inode);
+	ei->i_fc_snap = snap;
+	list_splice_tail_init(&ranges, &snap->data_list);
+	ext4_fc_unlock(inode->i_sb, alloc_ctx);
+
+	return 0;
+}
+
 
 /* Flushes data of all the inodes in the commit queue. */
 static int ext4_fc_flush_data(journal_t *journal)
@@ -987,6 +1118,11 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 		 */
 		if (list_empty(&fc_dentry->fcd_dilist))
 			continue;
+		/*
+		 * For EXT4_FC_TAG_CREAT, fcd_dilist is linked on the created
+		 * inode's i_fc_dilist list (kept singular), so we can recover the
+		 * inode through it.
+		 */
 		ei = list_first_entry(&fc_dentry->fcd_dilist,
 				struct ext4_inode_info, i_fc_dilist);
 		inode = &ei->vfs_inode;
@@ -1011,6 +1147,88 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
 	return 0;
 }
 
+static int ext4_fc_snapshot_inodes(journal_t *journal)
+{
+	struct super_block *sb = journal->j_private;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_inode_info *iter;
+	struct ext4_fc_dentry_update *fc_dentry;
+	struct inode **inodes;
+	unsigned int nr_inodes = 0;
+	unsigned int i = 0;
+	int ret = 0;
+	int alloc_ctx;
+
+	alloc_ctx = ext4_fc_lock(sb);
+	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list)
+		nr_inodes++;
+
+	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		struct ext4_inode_info *ei;
+
+		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
+			continue;
+		if (list_empty(&fc_dentry->fcd_dilist))
+			continue;
+
+		/* See the comment in ext4_fc_commit_dentry_updates(). */
+		ei = list_first_entry(&fc_dentry->fcd_dilist,
+				      struct ext4_inode_info, i_fc_dilist);
+		if (!list_empty(&ei->i_fc_list))
+			continue;
+
+		nr_inodes++;
+	}
+	ext4_fc_unlock(sb, alloc_ctx);
+
+	if (!nr_inodes)
+		return 0;
+
+	inodes = kvcalloc(nr_inodes, sizeof(*inodes), GFP_NOFS);
+	if (!inodes)
+		return -ENOMEM;
+
+	alloc_ctx = ext4_fc_lock(sb);
+	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+		inodes[i] = igrab(&iter->vfs_inode);
+		if (inodes[i])
+			i++;
+	}
+
+	list_for_each_entry(fc_dentry, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
+		struct ext4_inode_info *ei;
+
+		if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT)
+			continue;
+		if (list_empty(&fc_dentry->fcd_dilist))
+			continue;
+
+		/* See the comment in ext4_fc_commit_dentry_updates(). */
+		ei = list_first_entry(&fc_dentry->fcd_dilist,
+				      struct ext4_inode_info, i_fc_dilist);
+		if (!list_empty(&ei->i_fc_list))
+			continue;
+
+		inodes[i] = igrab(&ei->vfs_inode);
+		if (inodes[i])
+			i++;
+	}
+	ext4_fc_unlock(sb, alloc_ctx);
+
+	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
+		ret = ext4_fc_snapshot_inode(inodes[nr_inodes]);
+		if (ret)
+			break;
+	}
+
+	for (nr_inodes = 0; nr_inodes < i; nr_inodes++) {
+		if (inodes[nr_inodes])
+			iput(inodes[nr_inodes]);
+	}
+	kvfree(inodes);
+	return ret;
+}
+
 static int ext4_fc_perform_commit(journal_t *journal)
 {
 	struct super_block *sb = journal->j_private;
@@ -1082,7 +1300,11 @@ static int ext4_fc_perform_commit(journal_t *journal)
 				     EXT4_STATE_FC_COMMITTING);
 	}
 	ext4_fc_unlock(sb, alloc_ctx);
+
+	ret = ext4_fc_snapshot_inodes(journal);
 	jbd2_journal_unlock_updates(journal);
+	if (ret)
+		return ret;
 
 	/*
 	 * Step 5: If file system device is different from journal device,
@@ -1281,6 +1503,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 					struct ext4_inode_info,
 					i_fc_list);
 		list_del_init(&ei->i_fc_list);
+		ext4_fc_free_inode_snap(&ei->vfs_inode);
 		ext4_clear_inode_state(&ei->vfs_inode,
 				       EXT4_STATE_FC_COMMITTING);
 		if (tid_geq(tid, ei->i_sync_tid)) {
@@ -1313,6 +1536,14 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
 					     struct ext4_fc_dentry_update,
 					     fcd_list);
 		list_del_init(&fc_dentry->fcd_list);
+		if (fc_dentry->fcd_op == EXT4_FC_TAG_CREAT &&
+		    !list_empty(&fc_dentry->fcd_dilist)) {
+			/* See the comment in ext4_fc_commit_dentry_updates(). */
+			ei = list_first_entry(&fc_dentry->fcd_dilist,
+					      struct ext4_inode_info,
+					      i_fc_dilist);
+			ext4_fc_free_inode_snap(&ei->vfs_inode);
+		}
 		list_del_init(&fc_dentry->fcd_dilist);
 
 		release_dentry_name_snapshot(&fc_dentry->fcd_name);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c2c2d6ac7f3d..4678612f82e8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5025,6 +5025,57 @@ int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
 	return ret;
 }
 
+/*
+ * ext4_get_inode_loc_noio() is a best-effort variant of ext4_get_inode_loc().
+ * It looks up the inode table block in the buffer cache and returns -EAGAIN if
+ * the block is not present or not uptodate, without starting any I/O.
+ */
+int ext4_get_inode_loc_noio(struct inode *inode, struct ext4_iloc *iloc)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ext4_group_desc *gdp;
+	struct buffer_head *bh;
+	ext4_fsblk_t block;
+	int inodes_per_block, inode_offset;
+	unsigned long ino = inode->i_ino;
+
+	iloc->bh = NULL;
+	if (ino < EXT4_ROOT_INO ||
+	    ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
+		return -EFSCORRUPTED;
+
+	iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
+	gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
+	if (!gdp)
+		return -EIO;
+
+	/* Figure out the offset within the block group inode table. */
+	inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb));
+	iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+
+	block = ext4_inode_table(sb, gdp);
+	if (block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) ||
+	    block >= ext4_blocks_count(EXT4_SB(sb)->s_es)) {
+		ext4_error(sb,
+			   "Invalid inode table block %llu in block_group %u",
+			   block, iloc->block_group);
+		return -EFSCORRUPTED;
+	}
+	block += inode_offset / inodes_per_block;
+
+	bh = sb_find_get_block(sb, block);
+	if (!bh)
+		return -EAGAIN;
+	if (!ext4_buffer_uptodate(bh)) {
+		brelse(bh);
+		return -EAGAIN;
+	}
+
+	iloc->bh = bh;
+	return 0;
+}
+
 
 int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
 			  struct ext4_iloc *iloc)
-- 
2.53.0

^ permalink raw reply related

* [RFC v8 0/7] ext4: fast commit: snapshot inode state for FC log
From: Li Chen @ 2026-05-15  9:18 UTC (permalink / raw)
  To: Zhang Yi, Theodore Ts'o, Andreas Dilger
  Cc: Steven Rostedt, Masami Hiramatsu, Mathieu Desnoyers, linux-ext4,
	linux-trace-kernel, linux-kernel

Hi,

(This RFC v8 series is rebased onto linux-next master as of 2026-05-09,
commit e98d21c170b0 ("Add linux-next specific files for 20260508"), and
depends on patch "ext4: fix fast commit wait/wake bit mapping on
64-bit" [0]).

Zhang Yi in RFC v3 review pointed out that postponing lockdep assertions only
masks the issue, and that sleeping in ext4_fc_track_inode() while holding
i_data_sem can form a real ABBA deadlock if the fast commit writer also needs
i_data_sem while the inode is in FC_COMMITTING.

Zhang Yi suggested two possible directions to address the root cause:

1. "Ha, the solution seems to have already been listed in the TODOs in
fast_commit.c.

  Change ext4_fc_commit() to lookup logical to physical mapping using extent
  status tree. This would get rid of the need to call ext4_fc_track_inode()
  before acquiring i_data_sem. To do that we would need to ensure that
  modified extents from the extent status tree are not evicted from memory."

2. "Alternatively, recording the mapped range of tracking might also be
feasible."

This series implements a hybrid way: it implements approach 2 by snapshotting inode image
and mapped ranges at commit time, and consuming only snapshots during log
writing.

Approach 2 still needs a mapping source while building the snapshot
(logical-to-physical and unwritten/hole semantics). Calling ext4_map_blocks()
there would take i_data_sem and can block inside the
jbd2_journal_lock_updates() window, which risks deadlocks or unbounded stalls.
So the snapshot path uses approach 1's extent status lookups as a best-effort
mapping source to avoid ext4_map_blocks().

I did not fully implement approach 1 (making extent status lookups
authoritative by preventing reclaim of needed entries) because that would need
additional pinning/integration under memory pressure and a larger correctness
surface. Instead, the extent status tree is treated as a cache and the
snapshot path falls back to full commit on cache misses or unstable mappings
(e.g. delayed allocation).

Lock inversion / deadlock model (before):

CPU0 (metadata update)               CPU1 (fast commit)
--------------------               -----------------
... hold i_data_sem (A)             mutex_lock(s_fc_lock) (B)
    ext4_fc_track_inode()             ext4_fc_write_inode_data()
      mutex_lock(s_fc_lock) (B)         ext4_map_blocks()
      wait FC_COMMITTING (sleep)          down_read(i_data_sem) (A)

This creates i_data_sem (A) -> s_fc_lock (B) on update paths, and
s_fc_lock (B) -> i_data_sem (A) on commit paths. Once CPU0 sleeps while
holding (A), CPU1 can block on (A) while holding (B), completing the ABBA
cycle.

New model (this series):

CPU0 (metadata update)               CPU1 (fast commit)
--------------------               -----------------
... maybe hold i_data_sem (A)        jbd2_journal_lock_updates()
    ext4_fc_track_*()                 snapshot inode + ranges (no map_blocks)
      mutex_lock(s_fc_lock) (B)       jbd2_journal_unlock_updates()
      if FC_COMMITTING: set FC_REQUEUE s_fc_lock (B)
      no sleep                         write FC log from snapshots only
                                    cleanup: clear COMMITTING, requeue if set

The commit path no longer takes i_data_sem while holding s_fc_lock, and
tracking no longer sleeps waiting for FC_COMMITTING. If an inode is updated
during a fast commit, EXT4_STATE_FC_REQUEUE records that fact and the inode
is moved to FC_Q_STAGING for the next commit.
The only remaining FC_COMMITTING waiter is ext4_fc_del(). It checks
FC_COMMITTING and FC_FLUSHING_DATA while holding s_fc_lock, drops s_fc_lock
around the sleep, and rechecks FC_COMMITTING after a FC_FLUSHING_DATA wait
before deleting the inode from the fast commit lists. This keeps inode
lifetime/deletion synchronized with the commit thread's transition from
FLUSHING_DATA to COMMITTING.

This series snapshots the on-disk inode and tracked data ranges while journal
updates are locked and existing handles are drained. The log writing phase then
serializes only snapshots, so it no longer needs to call ext4_map_blocks() and
take i_data_sem under s_fc_lock. This is done in two steps: patch 1 drops
ext4_map_blocks() from log writing by introducing commit-time snapshots, and
patch 5 drops ext4_map_blocks() from the snapshot path by using the extent
status cache. The snapshot also records whether a mapped extent is unwritten,
so the ADD_RANGE records (and replay) preserve unwritten semantics.

Snapshotting runs under jbd2_journal_lock_updates(). Since a cache miss in
ext4_get_inode_loc() can start synchronous inode table I/O and stall handle
starts for milliseconds, patch 1 uses ext4_get_inode_loc_noio() and falls back
to full commit if the inode table block is not present or not uptodate.

ext4_fc_track_inode() also stops waiting for FC_COMMITTING. Updates during an
ongoing fast commit are marked with EXT4_STATE_FC_REQUEUE and are replayed in
the next fast commit, while ext4_fc_del() waits for FC_COMMITTING so an inode
cannot be removed while the commit thread is still using it.

The extent status tree is a cache, not an authoritative source, so the snapshot
path falls back to full commit on cache misses or unstable mappings (e.g.
delayed allocation). This includes cases where extent status entries are not
present (or have been reclaimed) under memory pressure. The snapshot path does
not try to rebuild mappings by calling ext4_map_blocks(); instead it simply
marks the transaction fast commit ineligible.

To keep the updates-locked window bounded, the snapshot path caps the number of
snapshotted inodes and ranges per fast commit (currently 1024 inodes and 2048
ranges) and falls back to full commit when the cap is exceeded. The series also
handles the journal inode i_data_sem lockdep false positive via subclassing;
journal inode mapping may still take i_data_sem even when data inode mapping is
avoided.

Patch 6 adds the ext4_fc_lock_updates tracepoint to quantify the updates-locked
window and snapshot fallback reasons. Patch 7 extends
/proc/fs/ext4/<sb_id>/fc_info with best-effort snapshot counters. If the /proc
interface is undesirable, I can drop patch 7 and keep the tracepoint only, or
drop even both.

Testing and measurement were done on a QEMU/KVM guest with virtio-pmem + dax
(ext4 -O fast_commit, mounted dax,noatime). The workload does python3 500x
{4K write + fsync}, fallocate 256M, and python3 500x {creat + fsync(dir)}.
Over 3 cold boots, ext4_fc_lock_updates reported locked_ns p50 2.88-2.92 us,
p99 <= 6.71 us, and max <= 102.71 us, with snap_err always 0. Under stress-ng
memory pressure (stress-ng --vm 4 --vm-bytes 75% --timeout 60s), locked_ns p50
2.94 us, p99 <= 4.97 us, and max <= 20.07 us. The fc_info snapshot failure
counters stayed at 0.
These hold times are in the low microseconds range, and the caps keep the
worst case bounded.

Comments and guidance are very welcome. Please let me know if there are any
concerns about correctness, corner cases, or better approaches.

RFC v7 -> RFC v8:
- Base the series on "ext4: fix fast commit wait/wake bit
  mapping on 64-bit", which the fast commit wait/wake paths now depend on.
- Factor out small ext4_fc_wait_inode_state()/ext4_fc_wake_inode_state()
  helpers so the repeated FC state wait/wake mapping stays in one place.
- Use trace_call__ext4_fc_lock_updates() at the guarded tracepoint call site
  so the static branch is not checked twice.
- Address the Sashiko feedback around the commit-time snapshot lifecycle and
  snapshot stats accounting, including the FC_COMMITTING /
  FC_FLUSHING_DATA transition and stale snapshot sizing fallback. [3][4]

RFC v6 -> RFC v7:
- Rebase onto linux-next master as of 2026-05-09, commit e98d21c170b0
  ("Add linux-next specific files for 20260508").
- Address Sashiko review feedback for RFC v6. [2]
- Fix the reported snapshot range arithmetic issue near EXT_MAX_BLOCKS to
  avoid cur_lblk / range wraparound in the snapshot walk.
- Report successfully snapshotted inode counts in ext4_fc_lock_updates when
  snapshotting stops early, as reported by Sashiko.
- Use READ_ONCE() + div64_u64() for the fc_info lock_updates average, as
  reported by Sashiko.

RFC v5 -> RFC v6:
- Rebase onto linux-next master as of 2026-04-08.
- Address tracepoint review feedback by relying on enum auto-increment for
  snap_err values and by switching the guarded ext4_fc_lock_updates call site
  to trace_call__ext4_fc_lock_updates() to avoid the double static_branch. [1]
- Keep lock window accounting unconditional for fc_info while using the guarded
  direct tracepoint call.
- Fix the inode debug print format exposed by the rebase.

RFC v4 -> RFC v5:
- Patch 6: Make ext4_fc_lock_updates snap_err human readable via
  TRACE_DEFINE_ENUM() + __print_symbolic(), using a single TRACE_SNAP_ERR
  mapping while keeping the enum values stable for tooling.

RFC v3 -> RFC v4:
- Replace lockdep_assert movement with removing the wait in
  ext4_fc_track_inode() and using EXT4_STATE_FC_REQUEUE to capture updates
  during an ongoing fast commit.
- Replace dropping s_fc_lock around log writing with commit-time snapshots of
  inode image and mapped ranges (recording the mapped range of tracking as
  suggested by Zhang Yi) so log writing consumes only snapshots.
- Avoid inode table I/O under jbd2_journal_lock_updates() via
  ext4_get_inode_loc_noio() and fallback to full commit on cache misses.
- Use the extent status cache for snapshot mappings and fall back to full
  commit on cache misses or unstable mappings (e.g. delayed allocation).
- Add tracepoint and /proc snapshot stats to quantify the updates-locked window
  and snapshot fallback reasons.

RFC v2 -> RFC v3:
- rebase on top of
  https://lore.kernel.org/linux-ext4/20251223131342.287864-1-me@linux.beauty/T/#u

RFC v1 -> RFC v2:
- patch 1: move comments to correct place
- patch 2: add it to patchset.
- add missing RFC prefix

RFC v1: https://lore.kernel.org/linux-ext4/20251222032655.87056-1-me@linux.beauty/T/#u
RFC v2: https://lore.kernel.org/linux-ext4/20251222151906.24607-1-me@linux.beauty/T/#t
RFC v3: https://lore.kernel.org/linux-ext4/20251224032943.134063-1-me@linux.beauty/
RFC v4: https://lore.kernel.org/all/20260120112538.132774-1-me@linux.beauty/
RFC v5: https://lore.kernel.org/all/20260317084624.457185-1-me@linux.beauty/t/#u
RFC v6: https://lore.kernel.org/all/20260408112020.716706-1-me@linux.beauty/
RFC v7: https://lore.kernel.org/all/20260511084304.1559557-1-me@linux.beauty/

[0]: https://lore.kernel.org/all/20260513085818.552432-1-me@linux.beauty/
[1]: https://lore.kernel.org/all/acZJl8QUYEq8voqQ@BLRRASHENOY1.amd.com/T/#u
[2]: https://sashiko.dev/#/patchset/20260408112020.716706-1-me%40linux.beauty
[3]: https://sashiko.dev/#/patchset/20260511084304.1559557-1-me%40linux.beauty?part=4
[4]: https://sashiko.dev/#/patchset/20260511084304.1559557-1-me%40linux.beauty?part=7

Thanks,

Li Chen (7):
  ext4: fast commit: snapshot inode state before writing log
  ext4: lockdep: handle i_data_sem subclassing for special inodes
  ext4: fast commit: avoid waiting for FC_COMMITTING
  ext4: fast commit: avoid self-deadlock in inode snapshotting
  ext4: fast commit: avoid i_data_sem by dropping ext4_map_blocks() in
    snapshots
  ext4: fast commit: add lock_updates tracepoint
  ext4: fast commit: export snapshot stats in fc_info

 fs/ext4/ext4.h              |  73 ++++-
 fs/ext4/fast_commit.c       | 768 +++++++++++++++++++++++++++++++++++---------
 fs/ext4/inode.c             |  51 +++
 fs/ext4/super.c             |   9 +
 include/trace/events/ext4.h |  53 ++++
 5 files changed, 805 insertions(+), 149 deletions(-)

-- 
2.53.0

^ permalink raw reply

* [PATCH v3] iomap: add simple read path for small direct I/O
From: Fengnan Chang @ 2026-05-15  8:40 UTC (permalink / raw)
  To: brauner, djwong, hch, ojaswin, dgc, linux-xfs, linux-fsdevel,
	linux-ext4, linux-kernel, lidiangang
  Cc: Fengnan Chang

When running 4K random read workloads on high-performance Gen5 NVMe
SSDs, the software overhead in the iomap direct I/O path
(__iomap_dio_rw) becomes a significant bottleneck.

Using io_uring with poll mode for a 4K randread test on a raw block
device:
taskset -c 30 ./t/io_uring -p1 -d512 -b4096 -s32 -c32 -F1 -B1 -R1 -X1
-n1 -P1 /dev/nvme10n1
Result: ~3.2M IOPS

Running the exact same workload on ext4 and XFS:
taskset -c 30 ./t/io_uring -p1 -d512 -b4096 -s32 -c32 -F1 -B1 -R1 -X1
-n1 -P1 /mnt/testfile
Result: ~1.92M IOPS

Profiling the ext4 workload reveals that a significant portion of CPU
time is spent on memory allocation and the iomap state machine
iteration:
  5.33%  [kernel]  [k] __iomap_dio_rw
  3.26%  [kernel]  [k] iomap_iter
  2.37%  [kernel]  [k] iomap_dio_bio_iter
  2.35%  [kernel]  [k] kfree
  1.33%  [kernel]  [k] iomap_dio_complete

Introduce simple reads to reduce the overhead of iomap, simple read path
is triggered when the request satisfies:
- I/O size is <= inode blocksize (fits in a single block, no splits).
- No custom `iomap_dio_ops` (dops) registered by the filesystem.

After this optimization, the heavy generic functions disappear from the
profile, replaced by a single streamlined execution path:
  4.83%  [kernel]  [k] iomap_dio_simple_read

With this patch, 4K random read IOPS on ext4 increases from 1.92M to
2.19M in the original single-core io_uring poll-mode workload.

Below are the test results using fio:

  fs    workload       qd    simple=0      simple=1      gain
  ext4  libaio         1     18,295        18,314        +0.10%
  ext4  libaio         64    458,374       473,557       +3.31%
  ext4  libaio         256   456,944       471,865       +3.27%
  ext4  libaio         1024  459,058       476,433       +3.78%
  ext4  io_uring       1     18,882        18,897        +0.08%
  ext4  io_uring       64    552,607       576,712       +4.36%
  ext4  io_uring       256   552,330       576,603       +4.39%
  ext4  io_uring       1024  557,330       584,227       +4.83%
  ext4  io_uring_poll  1     19,387        19,407        +0.10%
  ext4  io_uring_poll  64    794,073       843,926       +6.28%
  ext4  io_uring_poll  256   794,081       852,765       +7.39%
  ext4  io_uring_poll  1024  679,857       739,773       +8.81%
  xfs   libaio         1     18,314        18,344        +0.16%
  xfs   libaio         64    459,008       477,249       +3.97%
  xfs   libaio         256   455,308       475,960       +4.54%
  xfs   libaio         1024  461,499       477,098       +3.38%
  xfs   io_uring       1     18,867        18,895        +0.15%
  xfs   io_uring       64    556,197       582,650       +4.76%
  xfs   io_uring       256   556,946       582,802       +4.64%
  xfs   io_uring       1024  562,361       591,056       +5.10%
  xfs   io_uring_poll  1     19,380        19,406        +0.14%
  xfs   io_uring_poll  64    797,734       849,029       +6.43%
  xfs   io_uring_poll  256   796,156       852,550       +7.08%
  xfs   io_uring_poll  1024  677,254       735,667       +8.62%

v3:
Test data updated based on v7.1-rc3.

Signed-off-by: Fengnan Chang <changfengnan@bytedance.com>
---
 fs/iomap/direct-io.c | 384 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 373 insertions(+), 11 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index b0a6549b38487..8b86ad7df4bbd 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -10,6 +10,9 @@
 #include <linux/iomap.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/fserror.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
 #include "internal.h"
 #include "trace.h"
 
@@ -237,23 +240,29 @@ static void iomap_dio_done(struct iomap_dio *dio)
 	iomap_dio_complete_work(&dio->aio.work);
 }
 
-static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
+static inline void iomap_dio_bio_release_pages(struct bio *bio,
+		unsigned int dio_flags, bool error)
 {
-	struct iomap_dio *dio = bio->bi_private;
-
 	if (bio_integrity(bio))
 		fs_bio_integrity_free(bio);
 
-	if (dio->flags & IOMAP_DIO_BOUNCE) {
-		bio_iov_iter_unbounce(bio, !!dio->error,
-				dio->flags & IOMAP_DIO_USER_BACKED);
+	if (dio_flags & IOMAP_DIO_BOUNCE) {
+		bio_iov_iter_unbounce(bio, error,
+				dio_flags & IOMAP_DIO_USER_BACKED);
 		bio_put(bio);
-	} else if (dio->flags & IOMAP_DIO_USER_BACKED) {
+	} else if (dio_flags & IOMAP_DIO_USER_BACKED) {
 		bio_check_pages_dirty(bio);
 	} else {
 		bio_release_pages(bio, false);
 		bio_put(bio);
 	}
+}
+
+static void __iomap_dio_bio_end_io(struct bio *bio, bool inline_completion)
+{
+	struct iomap_dio *dio = bio->bi_private;
+
+	iomap_dio_bio_release_pages(bio, dio->flags, !!dio->error);
 
 	/* Do not touch bio below, we just gave up our reference. */
 
@@ -398,6 +407,14 @@ static ssize_t iomap_dio_bio_iter_one(struct iomap_iter *iter,
 	return ret;
 }
 
+static inline unsigned int iomap_dio_alignment(struct inode *inode,
+		struct block_device *bdev, unsigned int dio_flags)
+{
+	if (dio_flags & IOMAP_DIO_FSBLOCK_ALIGNED)
+		return i_blocksize(inode);
+	return bdev_logical_block_size(bdev);
+}
+
 static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 {
 	const struct iomap *iomap = &iter->iomap;
@@ -416,10 +433,7 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio)
 	 * File systems that write out of place and always allocate new blocks
 	 * need each bio to be block aligned as that's the unit of allocation.
 	 */
-	if (dio->flags & IOMAP_DIO_FSBLOCK_ALIGNED)
-		alignment = fs_block_size;
-	else
-		alignment = bdev_logical_block_size(iomap->bdev);
+	alignment = iomap_dio_alignment(inode, iomap->bdev, dio->flags);
 
 	if ((pos | length) & (alignment - 1))
 		return -EINVAL;
@@ -891,12 +905,352 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(__iomap_dio_rw);
 
+struct iomap_dio_simple_read {
+	struct kiocb		*iocb;
+	size_t			size;
+	unsigned int		dio_flags;
+	atomic_t		state;
+	union {
+		struct task_struct	*waiter;
+		struct work_struct	work;
+	};
+	/*
+	 * Align @bio to a cacheline boundary so that, combined with the
+	 * front_pad passed to bioset_init(), the bio sits at the start of
+	 * a cacheline in memory returned by the (HWCACHE-aligned) bio
+	 * slab.  This keeps the hot fields block layer touches on submit
+	 * and completion (bi_iter, bi_status, ...) within a single line.
+	 */
+	struct bio	bio ____cacheline_aligned_in_smp;
+};
+
+static struct bio_set iomap_dio_simple_read_pool;
+
+/*
+ * In the async simple read path, we need to prevent bio_endio() from
+ * triggering iocb->ki_complete() before the submitter has returned
+ * -EIOCBQUEUED. Otherwise, the caller might free the iocb concurrently.
+ *
+ * We use a three-state rendezvous to synchronize the submitter and end_io:
+ *
+ * IOMAP_DIO_SIMPLE_SUBMITTING: Initial state set before submitting the bio.
+ *
+ * IOMAP_DIO_SIMPLE_QUEUED: The submitter has safely queued the IO and will
+ * return -EIOCBQUEUED. If end_io sees this state, it takes over and calls
+ * ki_complete().
+ *
+ * IOMAP_DIO_SIMPLE_DONE: end_io fired before the submitter finished the
+ * submit path. end_io sets this state and does nothing else. The submitter
+ * will see this state and handle the completion synchronously (bypassing
+ * ki_complete() and returning the actual result).
+ */
+enum {
+	IOMAP_DIO_SIMPLE_SUBMITTING = 0,
+	IOMAP_DIO_SIMPLE_QUEUED,
+	IOMAP_DIO_SIMPLE_DONE,
+};
+
+static ssize_t iomap_dio_simple_read_finish(struct kiocb *iocb,
+		struct bio *bio, ssize_t ret)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct iomap_dio_simple_read *sr = bio->bi_private;
+
+	if (likely(!ret)) {
+		ret = sr->size;
+		iocb->ki_pos += ret;
+	} else {
+		fserror_report_io(inode, FSERR_DIRECTIO_READ, iocb->ki_pos,
+				  sr->size, ret, GFP_NOFS);
+	}
+
+	iomap_dio_bio_release_pages(bio, sr->dio_flags, ret < 0);
+
+	return ret;
+}
+
+static ssize_t iomap_dio_simple_read_complete(struct kiocb *iocb,
+		struct bio *bio)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	ssize_t ret;
+
+	WRITE_ONCE(iocb->private, NULL);
+
+	ret = iomap_dio_simple_read_finish(iocb, bio,
+			blk_status_to_errno(bio->bi_status));
+
+	inode_dio_end(inode);
+	trace_iomap_dio_complete(iocb, ret < 0 ? ret : 0, ret > 0 ? ret : 0);
+	return ret;
+}
+
+static void iomap_dio_simple_read_complete_work(struct work_struct *work)
+{
+	struct iomap_dio_simple_read *sr =
+		container_of(work, struct iomap_dio_simple_read, work);
+	struct kiocb *iocb = sr->iocb;
+	ssize_t ret;
+
+	ret = iomap_dio_simple_read_complete(iocb, &sr->bio);
+	iocb->ki_complete(iocb, ret);
+}
+
+static void iomap_dio_simple_read_async_done(struct iomap_dio_simple_read *sr)
+{
+	struct kiocb *iocb = sr->iocb;
+
+	if (unlikely(sr->bio.bi_status)) {
+		struct inode *inode = file_inode(iocb->ki_filp);
+
+		INIT_WORK(&sr->work, iomap_dio_simple_read_complete_work);
+		queue_work(inode->i_sb->s_dio_done_wq, &sr->work);
+		return;
+	}
+
+	iomap_dio_simple_read_complete_work(&sr->work);
+}
+
+static void iomap_dio_simple_read_end_io(struct bio *bio)
+{
+	struct iomap_dio_simple_read *sr = bio->bi_private;
+
+	if (sr->waiter) {
+		struct task_struct *waiter = sr->waiter;
+
+		WRITE_ONCE(sr->waiter, NULL);
+		blk_wake_io_task(waiter);
+		return;
+	}
+
+	if (likely(atomic_read(&sr->state) == IOMAP_DIO_SIMPLE_QUEUED) ||
+	    atomic_cmpxchg(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING,
+			   IOMAP_DIO_SIMPLE_DONE) == IOMAP_DIO_SIMPLE_QUEUED)
+		iomap_dio_simple_read_async_done(sr);
+}
+
+static inline bool iomap_dio_simple_read_supported(struct kiocb *iocb,
+		struct iov_iter *iter, unsigned int dio_flags)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	size_t count = iov_iter_count(iter);
+
+	if (iov_iter_rw(iter) != READ)
+		return false;
+	if (!count)
+		return false;
+	/*
+	 * Simple read is an optimization for small IO. Filter out large IO
+	 * early as it's the most common case to fail for typical direct IO
+	 * workloads.
+	 */
+	if (count > inode->i_sb->s_blocksize)
+		return false;
+	if (dio_flags & (IOMAP_DIO_FORCE_WAIT | IOMAP_DIO_PARTIAL))
+		return false;
+	if (iocb->ki_pos + count > i_size_read(inode))
+		return false;
+
+	return true;
+}
+
+static ssize_t iomap_dio_simple_read(struct kiocb *iocb,
+		struct iov_iter *iter, const struct iomap_ops *ops,
+		void *private, unsigned int dio_flags)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	size_t count = iov_iter_count(iter);
+	int nr_pages;
+	struct iomap_dio_simple_read *sr;
+	unsigned int alignment;
+	struct iomap_iter iomi = {
+		.inode		= inode,
+		.pos		= iocb->ki_pos,
+		.len		= count,
+		.flags		= IOMAP_DIRECT,
+		.private	= private,
+	};
+	struct bio *bio;
+	bool wait_for_completion = is_sync_kiocb(iocb);
+	ssize_t ret;
+
+	if (dio_flags & IOMAP_DIO_BOUNCE)
+		nr_pages = bio_iov_bounce_nr_vecs(iter, REQ_OP_READ);
+	else
+		nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
+
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		iomi.flags |= IOMAP_NOWAIT;
+
+	ret = kiocb_write_and_wait(iocb, count);
+	if (ret)
+		return ret;
+
+	inode_dio_begin(inode);
+
+	ret = ops->iomap_begin(inode, iomi.pos, count, iomi.flags,
+			       &iomi.iomap, &iomi.srcmap);
+	if (ret) {
+		inode_dio_end(inode);
+		return ret;
+	}
+
+	if (iomi.iomap.type != IOMAP_MAPPED ||
+	    iomi.iomap.offset > iomi.pos ||
+	    iomi.iomap.offset + iomi.iomap.length < iomi.pos + count ||
+	    (iomi.iomap.flags & IOMAP_F_INTEGRITY)) {
+		ret = -ENOTBLK;
+		goto out_iomap_end;
+	}
+
+	alignment = iomap_dio_alignment(inode, iomi.iomap.bdev, dio_flags);
+	if ((iomi.pos | count) & (alignment - 1)) {
+		ret = -EINVAL;
+		goto out_iomap_end;
+	}
+
+	if (!wait_for_completion && unlikely(!inode->i_sb->s_dio_done_wq)) {
+		ret = sb_init_dio_done_wq(inode->i_sb);
+		if (ret < 0)
+			goto out_iomap_end;
+	}
+
+	trace_iomap_dio_rw_begin(iocb, iter, dio_flags, 0);
+
+	if (user_backed_iter(iter))
+		dio_flags |= IOMAP_DIO_USER_BACKED;
+
+	bio = bio_alloc_bioset(iomi.iomap.bdev, nr_pages,
+			       REQ_OP_READ | REQ_SYNC | REQ_IDLE,
+			       GFP_KERNEL, &iomap_dio_simple_read_pool);
+	sr = container_of(bio, struct iomap_dio_simple_read, bio);
+
+	fscrypt_set_bio_crypt_ctx(bio, inode, iomi.pos, GFP_KERNEL);
+	sr->iocb = iocb;
+	sr->dio_flags = dio_flags;
+
+	bio->bi_iter.bi_sector = iomap_sector(&iomi.iomap, iomi.pos);
+	bio->bi_ioprio = iocb->ki_ioprio;
+	bio->bi_private = sr;
+	bio->bi_end_io = iomap_dio_simple_read_end_io;
+
+	if (dio_flags & IOMAP_DIO_BOUNCE)
+		ret = bio_iov_iter_bounce(bio, iter, count);
+	else
+		ret = bio_iov_iter_get_pages(bio, iter, alignment - 1);
+	if (unlikely(ret))
+		goto out_bio_put;
+
+	if (bio->bi_iter.bi_size != count) {
+		iov_iter_revert(iter, bio->bi_iter.bi_size);
+		ret = -ENOTBLK;
+		goto out_bio_release_pages;
+	}
+
+	sr->size = bio->bi_iter.bi_size;
+
+	if ((dio_flags & IOMAP_DIO_USER_BACKED) &&
+	    !(dio_flags & IOMAP_DIO_BOUNCE))
+		bio_set_pages_dirty(bio);
+
+	if (iocb->ki_flags & IOCB_NOWAIT)
+		bio->bi_opf |= REQ_NOWAIT;
+	if ((iocb->ki_flags & IOCB_HIPRI) && !wait_for_completion) {
+		bio->bi_opf |= REQ_POLLED;
+		bio_set_polled(bio, iocb);
+		WRITE_ONCE(iocb->private, bio);
+	}
+
+	if (wait_for_completion) {
+		sr->waiter = current;
+		blk_crypto_submit_bio(bio);
+	} else {
+		atomic_set(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING);
+		sr->waiter = NULL;
+		blk_crypto_submit_bio(bio);
+		ret = -EIOCBQUEUED;
+	}
+
+	if (ops->iomap_end)
+		ops->iomap_end(inode, iomi.pos, count, count, iomi.flags,
+			       &iomi.iomap);
+
+	if (wait_for_completion) {
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!READ_ONCE(sr->waiter))
+				break;
+			blk_io_schedule();
+		}
+		__set_current_state(TASK_RUNNING);
+
+		ret = iomap_dio_simple_read_finish(iocb, bio,
+				blk_status_to_errno(bio->bi_status));
+		inode_dio_end(inode);
+		trace_iomap_dio_complete(iocb, ret < 0 ? ret : 0,
+					 ret > 0 ? ret : 0);
+	} else if (atomic_cmpxchg(&sr->state, IOMAP_DIO_SIMPLE_SUBMITTING,
+				  IOMAP_DIO_SIMPLE_QUEUED) ==
+		   IOMAP_DIO_SIMPLE_DONE) {
+		ret = iomap_dio_simple_read_complete(iocb, bio);
+	} else {
+		trace_iomap_dio_rw_queued(inode, iomi.pos, count);
+	}
+
+	return ret;
+
+out_bio_release_pages:
+	if (dio_flags & IOMAP_DIO_BOUNCE)
+		bio_iov_iter_unbounce(bio, true, false);
+	else
+		bio_release_pages(bio, false);
+out_bio_put:
+	bio_put(bio);
+out_iomap_end:
+	if (ops->iomap_end)
+		ops->iomap_end(inode, iomi.pos, count, 0, iomi.flags,
+			       &iomi.iomap);
+	inode_dio_end(inode);
+	return ret;
+}
+
 ssize_t
 iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
 		unsigned int dio_flags, void *private, size_t done_before)
 {
 	struct iomap_dio *dio;
+	ssize_t ret;
+
+	/*
+	 * Fast path for small, block-aligned reads that map to a single
+	 * contiguous on-disk extent.
+	 *
+	 * @dops must be NULL: a non-NULL @dops means the caller wants its
+	 * ->end_io / ->submit_io hooks invoked, and in particular wants its
+	 * bios to be allocated from the filesystem-private @dops->bio_set
+	 * (whose front_pad sizes a filesystem-private wrapper around the
+	 * bio).  The fast path instead allocates from the shared
+	 * iomap_dio_simple_read_pool, whose front_pad matches
+	 * struct iomap_dio_simple_read; the two wrappers are not
+	 * interchangeable, so we must fall back to __iomap_dio_rw() in
+	 * that case.
+	 *
+	 * @done_before must be zero: a non-zero caller-accumulated residual
+	 * cannot be carried through a single-bio inline completion.
+	 *
+	 * -ENOTBLK is the private sentinel returned by iomap_dio_simple_read()
+	 * when it decides the request does not fit the fast path.
+	 * In that case we proceed to the generic __iomap_dio_rw() slow
+	 * path.  Any other errno is a real result and is propagated as-is,
+	 * in particular -EAGAIN for IOCB_NOWAIT must reach the caller.
+	 */
+	if (!dops && !done_before &&
+	    iomap_dio_simple_read_supported(iocb, iter, dio_flags)) {
+		ret = iomap_dio_simple_read(iocb, iter, ops, private, dio_flags);
+		if (ret != -ENOTBLK)
+			return ret;
+	}
 
 	dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
 			     done_before);
@@ -905,3 +1259,11 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 	return iomap_dio_complete(dio);
 }
 EXPORT_SYMBOL_GPL(iomap_dio_rw);
+
+static int __init iomap_dio_init(void)
+{
+	return bioset_init(&iomap_dio_simple_read_pool, 4,
+			   offsetof(struct iomap_dio_simple_read, bio),
+			   BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);
+}
+fs_initcall(iomap_dio_init);
-- 
2.39.5 (Apple Git-154)

^ permalink raw reply related

* Re: [PATCH 4/4] iomap: fix out-of-bounds bitmap_set() with zero-length range
From: Zhang Yi @ 2026-05-15  1:57 UTC (permalink / raw)
  To: Joanne Koong
  Cc: linux-fsdevel, linux-xfs, linux-ext4, brauner, djwong, hch,
	yi.zhang, yizhang089, yangerkun, yukuai
In-Reply-To: <CAJnrk1bHM9aGXbk+LJYJZrfbtLvbwsqwrJgY2KkT5AyVfBjdrg@mail.gmail.com>

On 5/14/2026 11:08 PM, Joanne Koong wrote:
> On Wed, May 13, 2026 at 11:35 PM Zhang Yi <yi.zhang@huaweicloud.com> wrote:
>>
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> ifs_set_range_dirty() and ifs_set_range_uptodate() compute last_blk
>> as (off + len - 1) >> i_blkbits.  When off is 0 and len is 0, the
>> unsigned subtraction underflows to SIZE_MAX, producing a huge
>> last_blk and nr_blks value that causes bitmap_set() to write far
>> beyond the ifs->state allocation.
>>
>> Regarding ifs_set_range_uptodate(), it is temporarily safe because len
>> cannot be passed in as 0. However, for ifs_set_range_dirty() this is
>> reachable from __iomap_write_end(): when copy_folio_from_iter_atomic()
>> returns 0 (e.g. user buffer fault) and the folio is already uptodate,
>> the guard at the top of __iomap_write_end() does not trigger because
>> !folio_test_uptodate() is false, and iomap_set_range_dirty() is called
>> with copied == 0.
> 
> Is the Fixes: 4ce02c679722  ("iomap: Add per-block dirty state
> tracking to improve performance") tag needed for this?
> 
>>
>> Add a !len guard to both functions before the computation, so that a
>> zero-length range is a no-op.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>  fs/iomap/buffered-io.c | 23 +++++++++++++++--------
>>  1 file changed, 15 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index 27ab33edbdee..6fe5f7e998fd 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -67,11 +67,14 @@ static bool ifs_set_range_uptodate(struct folio *folio,
>>                 struct iomap_folio_state *ifs, size_t off, size_t len)
>>  {
>>         struct inode *inode = folio->mapping->host;
>> -       unsigned int first_blk = off >> inode->i_blkbits;
>> -       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>> -       unsigned int nr_blks = last_blk - first_blk + 1;
>> +       unsigned int first_blk, last_blk;
>>
>> -       bitmap_set(ifs->state, first_blk, nr_blks);
>> +       if (!len)
>> +               return true;
> 
> I think both callers of ifs_set_range_uptodate() use the return value
> to decide whether to mark the folio uptodate or not - does this still
> need to return ifs_is_fully_uptodate(folio, ifs) instead of always
> true?
> 

Yeah, you are right! I missed that, will fix.

Thanks,
Yi.

> Thanks,
> Joanne
> 
>> +
>> +       first_blk = off >> inode->i_blkbits;
>> +       last_blk = (off + len - 1) >> inode->i_blkbits;
>> +       bitmap_set(ifs->state, first_blk, last_blk - first_blk + 1);
>>         return ifs_is_fully_uptodate(folio, ifs);
>>  }


^ permalink raw reply

* Re: [PATCH 4/4] iomap: fix out-of-bounds bitmap_set() with zero-length range
From: Zhang Yi @ 2026-05-15  1:50 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: linux-fsdevel, linux-xfs, linux-ext4, brauner, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260514181053.GB9555@frogsfrogsfrogs>

On 5/15/2026 2:10 AM, Darrick J. Wong wrote:
> On Thu, May 14, 2026 at 02:29:55PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> ifs_set_range_dirty() and ifs_set_range_uptodate() compute last_blk
>> as (off + len - 1) >> i_blkbits.  When off is 0 and len is 0, the
>> unsigned subtraction underflows to SIZE_MAX, producing a huge
>> last_blk and nr_blks value that causes bitmap_set() to write far
>> beyond the ifs->state allocation.
>>
>> Regarding ifs_set_range_uptodate(), it is temporarily safe because len
>> cannot be passed in as 0. However, for ifs_set_range_dirty() this is
>> reachable from __iomap_write_end(): when copy_folio_from_iter_atomic()
>> returns 0 (e.g. user buffer fault) and the folio is already uptodate,
>> the guard at the top of __iomap_write_end() does not trigger because
>> !folio_test_uptodate() is false, and iomap_set_range_dirty() is called
>> with copied == 0.
>>
>> Add a !len guard to both functions before the computation, so that a
>> zero-length range is a no-op.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>>  fs/iomap/buffered-io.c | 23 +++++++++++++++--------
>>  1 file changed, 15 insertions(+), 8 deletions(-)
>>
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index 27ab33edbdee..6fe5f7e998fd 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -67,11 +67,14 @@ static bool ifs_set_range_uptodate(struct folio *folio,
>>  		struct iomap_folio_state *ifs, size_t off, size_t len)
>>  {
>>  	struct inode *inode = folio->mapping->host;
>> -	unsigned int first_blk = off >> inode->i_blkbits;
>> -	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>> -	unsigned int nr_blks = last_blk - first_blk + 1;
>> +	unsigned int first_blk, last_blk;
>>  
>> -	bitmap_set(ifs->state, first_blk, nr_blks);
>> +	if (!len)
>> +		return true;
>> +
>> +	first_blk = off >> inode->i_blkbits;
>> +	last_blk = (off + len - 1) >> inode->i_blkbits;
>> +	bitmap_set(ifs->state, first_blk, last_blk - first_blk + 1);
>>  	return ifs_is_fully_uptodate(folio, ifs);
>>  }
>>  
>> @@ -203,13 +206,17 @@ static void ifs_set_range_dirty(struct folio *folio,
>>  {
>>  	struct inode *inode = folio->mapping->host;
>>  	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
>> -	unsigned int first_blk = (off >> inode->i_blkbits);
>> -	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>> -	unsigned int nr_blks = last_blk - first_blk + 1;
>> +	unsigned int first_blk, last_blk;
>>  	unsigned long flags;
>>  
>> +	if (!len)
>> +		return;
>> +
>> +	first_blk = off >> inode->i_blkbits;
>> +	last_blk = (off + len - 1) >> inode->i_blkbits;
>>  	spin_lock_irqsave(&ifs->state_lock, flags);
>> -	bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
>> +	bitmap_set(ifs->state, first_blk + blks_per_folio,
>> +		   last_blk - first_blk + 1);
> 
> I'm curious about the inconsistency in the computations between
> ifs_clear_range_dirty and ifs_set_range_dirty now.  In the function that
> clears dirty bits, off/len are rounded inwards:
> 
> 	unsigned int first_blk = round_up(off, i_blocksize(inode)) >>
> 				 inode->i_blkbits;
> 	unsigned int last_blk = (off + len) >> inode->i_blkbits;
> 	unsigned long flags;
> 
> 	if (first_blk >= last_blk)
> 		return;
> 
> 	spin_lock_irqsave(&ifs->state_lock, flags);
> 	bitmap_clear(ifs->state, first_blk + blks_per_folio,
> 		     last_blk - first_blk);
> 
> but here we're still rounding outwards:
> 
> 	if (!len)
> 		return;
> 
> 	first_blk = off >> inode->i_blkbits;
> 	last_blk = (off + len - 1) >> inode->i_blkbits;
> 	spin_lock_irqsave(&ifs->state_lock, flags);
> 	bitmap_set(ifs->state, first_blk + blks_per_folio,
> 		   last_blk - first_blk + 1);
> 
> That doesn't quite sound right to me without an explanation in the code,
> which currently lacks one.  I *think* the reason for the discrepancy is
> that if we want to dirty part of an fsblock, we need to mark the whole
> block dirty in the ifs so that all the blocks get written out; but when
> we're clearing dirty bits, we want to leave an fsblock dirty if we only
> wrote back part of that fsblock.  Does that sound right?
> 
> --D
> 

Yes, that's right. The primary purpose of clearing the dirty bit is for
invalidating a partial folio(e.g., when punching hole). Only when a full
fsblock has been invalidated can its corresponding dirty bit be cleared.
Write-back operations never seem to write back only part of a fsblock.

I can add comments for them in my next iteration.

Thanks,
Yi.



^ permalink raw reply

* Re: [PATCH 1/4] iomap: correct the range of a partial dirty clear
From: Zhang Yi @ 2026-05-15  1:20 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: linux-fsdevel, linux-xfs, linux-ext4, brauner, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260514180320.GA9555@frogsfrogsfrogs>

On 5/15/2026 2:03 AM, Darrick J. Wong wrote:
> On Thu, May 14, 2026 at 02:29:52PM +0800, Zhang Yi wrote:
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> The block range calculation in ifs_clear_range_dirty() is incorrect when
>> partially clearing a range in a folio. We cannot clear the dirty bit of
>> the first block or the last block if the start or end offset is not
>> blocksize-aligned. This has not yet caused any issues since we always
>> clear a whole folio in iomap_writeback_folio().
>>
>> Fix this by rounding up the first block to blocksize alignment, and
>> calculate the last block by rounding down (using truncation). Correct
>> the nr_blks calculation accordingly.
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> 
> Cc: <stable@vger.kernel.org> # v6.6
> Fixes: 4ce02c67972211 ("iomap: Add per-block dirty state tracking to improve performance")
> 
>> ---
>> This is modified from:
>>  https://lore.kernel.org/linux-fsdevel/20240812121159.3775074-2-yi.zhang@huaweicloud.com/
>> Changes:
>>  - Use round_up() instead of DIV_ROUND_UP() to prevent wasted integer
>>    division.
>>
>>  fs/iomap/buffered-io.c | 12 ++++++++----
>>  1 file changed, 8 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
>> index d7b648421a70..64351a448a8b 100644
>> --- a/fs/iomap/buffered-io.c
>> +++ b/fs/iomap/buffered-io.c
>> @@ -176,13 +176,17 @@ static void ifs_clear_range_dirty(struct folio *folio,
>>  {
>>  	struct inode *inode = folio->mapping->host;
>>  	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
>> -	unsigned int first_blk = (off >> inode->i_blkbits);
>> -	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
>> -	unsigned int nr_blks = last_blk - first_blk + 1;
>> +	unsigned int first_blk = round_up(off, i_blocksize(inode)) >>
>> +				 inode->i_blkbits;
> 
> Ok, so now we round off up to the next fsblock to compute first_blk...
> 
>> +	unsigned int last_blk = (off + len) >> inode->i_blkbits;
> 
> ...and last_blk (which is really the next block number after the range
> that we're undirtying) is now rounded down.  Presumably off/len have to
> be aligned to fsblock granularity so we'll never have to deal with
> unaligned situations like (off=324,len=1), right?

Yeah, that's right.

> 
>>  	unsigned long flags;
>>  
>> +	if (first_blk >= last_blk)
> 
> Do we need this check?  When would the test actually be true?

Yes, we do. Following your example, when off = 324 and len = 1, we have
first_block = 1 but last_block = 0. In this case, we must add this check
to prevent an inversion when calculating the length to be zeroed out.

Thanks,
Yi.

> 
> --D
> 
>> +		return;
>> +
>>  	spin_lock_irqsave(&ifs->state_lock, flags);
>> -	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
>> +	bitmap_clear(ifs->state, first_blk + blks_per_folio,
>> +		     last_blk - first_blk);
>>  	spin_unlock_irqrestore(&ifs->state_lock, flags);
>>  }
>>  
>> -- 
>> 2.52.0
>>
>>
> 


^ permalink raw reply

* Re: [PATCH 4/4] iomap: fix out-of-bounds bitmap_set() with zero-length range
From: Darrick J. Wong @ 2026-05-14 18:10 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-fsdevel, linux-xfs, linux-ext4, brauner, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260514062955.1183976-5-yi.zhang@huaweicloud.com>

On Thu, May 14, 2026 at 02:29:55PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> ifs_set_range_dirty() and ifs_set_range_uptodate() compute last_blk
> as (off + len - 1) >> i_blkbits.  When off is 0 and len is 0, the
> unsigned subtraction underflows to SIZE_MAX, producing a huge
> last_blk and nr_blks value that causes bitmap_set() to write far
> beyond the ifs->state allocation.
> 
> Regarding ifs_set_range_uptodate(), it is temporarily safe because len
> cannot be passed in as 0. However, for ifs_set_range_dirty() this is
> reachable from __iomap_write_end(): when copy_folio_from_iter_atomic()
> returns 0 (e.g. user buffer fault) and the folio is already uptodate,
> the guard at the top of __iomap_write_end() does not trigger because
> !folio_test_uptodate() is false, and iomap_set_range_dirty() is called
> with copied == 0.
> 
> Add a !len guard to both functions before the computation, so that a
> zero-length range is a no-op.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/iomap/buffered-io.c | 23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 27ab33edbdee..6fe5f7e998fd 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -67,11 +67,14 @@ static bool ifs_set_range_uptodate(struct folio *folio,
>  		struct iomap_folio_state *ifs, size_t off, size_t len)
>  {
>  	struct inode *inode = folio->mapping->host;
> -	unsigned int first_blk = off >> inode->i_blkbits;
> -	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> -	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned int first_blk, last_blk;
>  
> -	bitmap_set(ifs->state, first_blk, nr_blks);
> +	if (!len)
> +		return true;
> +
> +	first_blk = off >> inode->i_blkbits;
> +	last_blk = (off + len - 1) >> inode->i_blkbits;
> +	bitmap_set(ifs->state, first_blk, last_blk - first_blk + 1);
>  	return ifs_is_fully_uptodate(folio, ifs);
>  }
>  
> @@ -203,13 +206,17 @@ static void ifs_set_range_dirty(struct folio *folio,
>  {
>  	struct inode *inode = folio->mapping->host;
>  	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> -	unsigned int first_blk = (off >> inode->i_blkbits);
> -	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> -	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned int first_blk, last_blk;
>  	unsigned long flags;
>  
> +	if (!len)
> +		return;
> +
> +	first_blk = off >> inode->i_blkbits;
> +	last_blk = (off + len - 1) >> inode->i_blkbits;
>  	spin_lock_irqsave(&ifs->state_lock, flags);
> -	bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
> +	bitmap_set(ifs->state, first_blk + blks_per_folio,
> +		   last_blk - first_blk + 1);

I'm curious about the inconsistency in the computations between
ifs_clear_range_dirty and ifs_set_range_dirty now.  In the function that
clears dirty bits, off/len are rounded inwards:

	unsigned int first_blk = round_up(off, i_blocksize(inode)) >>
				 inode->i_blkbits;
	unsigned int last_blk = (off + len) >> inode->i_blkbits;
	unsigned long flags;

	if (first_blk >= last_blk)
		return;

	spin_lock_irqsave(&ifs->state_lock, flags);
	bitmap_clear(ifs->state, first_blk + blks_per_folio,
		     last_blk - first_blk);

but here we're still rounding outwards:

	if (!len)
		return;

	first_blk = off >> inode->i_blkbits;
	last_blk = (off + len - 1) >> inode->i_blkbits;
	spin_lock_irqsave(&ifs->state_lock, flags);
	bitmap_set(ifs->state, first_blk + blks_per_folio,
		   last_blk - first_blk + 1);

That doesn't quite sound right to me without an explanation in the code,
which currently lacks one.  I *think* the reason for the discrepancy is
that if we want to dirty part of an fsblock, we need to mark the whole
block dirty in the ifs so that all the blocks get written out; but when
we're clearing dirty bits, we want to leave an fsblock dirty if we only
wrote back part of that fsblock.  Does that sound right?

--D

^ permalink raw reply

* Re: [PATCH 1/4] iomap: correct the range of a partial dirty clear
From: Darrick J. Wong @ 2026-05-14 18:03 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-fsdevel, linux-xfs, linux-ext4, brauner, hch, yi.zhang,
	yizhang089, yangerkun, yukuai
In-Reply-To: <20260514062955.1183976-2-yi.zhang@huaweicloud.com>

On Thu, May 14, 2026 at 02:29:52PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> The block range calculation in ifs_clear_range_dirty() is incorrect when
> partially clearing a range in a folio. We cannot clear the dirty bit of
> the first block or the last block if the start or end offset is not
> blocksize-aligned. This has not yet caused any issues since we always
> clear a whole folio in iomap_writeback_folio().
> 
> Fix this by rounding up the first block to blocksize alignment, and
> calculate the last block by rounding down (using truncation). Correct
> the nr_blks calculation accordingly.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Cc: <stable@vger.kernel.org> # v6.6
Fixes: 4ce02c67972211 ("iomap: Add per-block dirty state tracking to improve performance")

> ---
> This is modified from:
>  https://lore.kernel.org/linux-fsdevel/20240812121159.3775074-2-yi.zhang@huaweicloud.com/
> Changes:
>  - Use round_up() instead of DIV_ROUND_UP() to prevent wasted integer
>    division.
> 
>  fs/iomap/buffered-io.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index d7b648421a70..64351a448a8b 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -176,13 +176,17 @@ static void ifs_clear_range_dirty(struct folio *folio,
>  {
>  	struct inode *inode = folio->mapping->host;
>  	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
> -	unsigned int first_blk = (off >> inode->i_blkbits);
> -	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> -	unsigned int nr_blks = last_blk - first_blk + 1;
> +	unsigned int first_blk = round_up(off, i_blocksize(inode)) >>
> +				 inode->i_blkbits;

Ok, so now we round off up to the next fsblock to compute first_blk...

> +	unsigned int last_blk = (off + len) >> inode->i_blkbits;

...and last_blk (which is really the next block number after the range
that we're undirtying) is now rounded down.  Presumably off/len have to
be aligned to fsblock granularity so we'll never have to deal with
unaligned situations like (off=324,len=1), right?

>  	unsigned long flags;
>  
> +	if (first_blk >= last_blk)

Do we need this check?  When would the test actually be true?

--D

> +		return;
> +
>  	spin_lock_irqsave(&ifs->state_lock, flags);
> -	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
> +	bitmap_clear(ifs->state, first_blk + blks_per_folio,
> +		     last_blk - first_blk);
>  	spin_unlock_irqrestore(&ifs->state_lock, flags);
>  }
>  
> -- 
> 2.52.0
> 
> 

^ permalink raw reply

* Re: [PATCH 4/4] iomap: fix out-of-bounds bitmap_set() with zero-length range
From: Joanne Koong @ 2026-05-14 15:08 UTC (permalink / raw)
  To: Zhang Yi
  Cc: linux-fsdevel, linux-xfs, linux-ext4, brauner, djwong, hch,
	yi.zhang, yizhang089, yangerkun, yukuai
In-Reply-To: <20260514062955.1183976-5-yi.zhang@huaweicloud.com>

On Wed, May 13, 2026 at 11:35 PM Zhang Yi <yi.zhang@huaweicloud.com> wrote:
>
> From: Zhang Yi <yi.zhang@huawei.com>
>
> ifs_set_range_dirty() and ifs_set_range_uptodate() compute last_blk
> as (off + len - 1) >> i_blkbits.  When off is 0 and len is 0, the
> unsigned subtraction underflows to SIZE_MAX, producing a huge
> last_blk and nr_blks value that causes bitmap_set() to write far
> beyond the ifs->state allocation.
>
> Regarding ifs_set_range_uptodate(), it is temporarily safe because len
> cannot be passed in as 0. However, for ifs_set_range_dirty() this is
> reachable from __iomap_write_end(): when copy_folio_from_iter_atomic()
> returns 0 (e.g. user buffer fault) and the folio is already uptodate,
> the guard at the top of __iomap_write_end() does not trigger because
> !folio_test_uptodate() is false, and iomap_set_range_dirty() is called
> with copied == 0.

Is the Fixes: 4ce02c679722  ("iomap: Add per-block dirty state
tracking to improve performance") tag needed for this?

>
> Add a !len guard to both functions before the computation, so that a
> zero-length range is a no-op.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
>  fs/iomap/buffered-io.c | 23 +++++++++++++++--------
>  1 file changed, 15 insertions(+), 8 deletions(-)
>
> diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
> index 27ab33edbdee..6fe5f7e998fd 100644
> --- a/fs/iomap/buffered-io.c
> +++ b/fs/iomap/buffered-io.c
> @@ -67,11 +67,14 @@ static bool ifs_set_range_uptodate(struct folio *folio,
>                 struct iomap_folio_state *ifs, size_t off, size_t len)
>  {
>         struct inode *inode = folio->mapping->host;
> -       unsigned int first_blk = off >> inode->i_blkbits;
> -       unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
> -       unsigned int nr_blks = last_blk - first_blk + 1;
> +       unsigned int first_blk, last_blk;
>
> -       bitmap_set(ifs->state, first_blk, nr_blks);
> +       if (!len)
> +               return true;

I think both callers of ifs_set_range_uptodate() use the return value
to decide whether to mark the folio uptodate or not - does this still
need to return ifs_is_fully_uptodate(folio, ifs) instead of always
true?

Thanks,
Joanne

> +
> +       first_blk = off >> inode->i_blkbits;
> +       last_blk = (off + len - 1) >> inode->i_blkbits;
> +       bitmap_set(ifs->state, first_blk, last_blk - first_blk + 1);
>         return ifs_is_fully_uptodate(folio, ifs);
>  }

^ permalink raw reply

* Re: [PATCH] doc: correct date in release notes
From: Theodore Ts'o @ 2026-05-14 15:01 UTC (permalink / raw)
  To: Ext4 Developers List, jinzhiguang; +Cc: Theodore Ts'o
In-Reply-To: <20260324091259.1167-1-jinzhiguang@kylinos.cn>


On Tue, 24 Mar 2026 17:12:59 +0800, jinzhiguang wrote:
> Correct the date in the RelNotes/v1.47.4.txt from 2025 to 2026.

Applied, thanks!

[1/1] doc: correct date in release notes
      commit: c22405c8880b31fd4c8574feccea989643e588f1

Best regards,
-- 
Theodore Ts'o <tytso@mit.edu>

^ permalink raw reply

* Re: [PATCH 09/12] swap: push down setting sis->bdev into ->swap_activate
From: Darrick J. Wong @ 2026-05-14 14:37 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Christoph Hellwig, Andrew Morton, Chris Li, Kairui Song,
	Christian Brauner, Jens Axboe, David Sterba, Theodore Ts'o,
	Jaegeuk Kim, Chao Yu, Trond Myklebust, Anna Schumaker,
	Namjae Jeon, Hyunchul Lee, Steve French, Paulo Alcantara,
	Carlos Maiolino, Naohiro Aota, linux-xfs, linux-fsdevel,
	linux-doc, linux-mm, linux-block, linux-btrfs, linux-ext4,
	linux-f2fs-devel, linux-nfs, linux-cifs
In-Reply-To: <b37ca8a7-289e-45a0-8cbd-eb14d7453b97@kernel.org>

On Wed, May 13, 2026 at 04:58:37PM +0900, Damien Le Moal wrote:
> On 5/13/26 16:46, Christoph Hellwig wrote:
> > On Wed, May 13, 2026 at 04:44:53PM +0900, Damien Le Moal wrote:
> >> Hmmm... With zonefs, swap files can be created on top of conventional zone
> >> files. So enforcing "no swap on zoned device" here would break that.
> > 
> > We can check that none of the extents fall onto sequential zones instead
> > of just devices.
> > 
> > I still wonder why you bother with swap to zonefs at all, though.
> 
> Yeah. I do not think anyone actually use that... But since it is there from the
> start, kind of stuck with it now.

Ahh, right, I forgot that zoned devices can have conventional zones
where swap would actually work.  Question withdrawn.

--D

^ permalink raw reply

* Re: [PATCH] ext4: fix fast commit wait/wake bit mapping on 64-bit
From: Zhang Yi @ 2026-05-14 12:00 UTC (permalink / raw)
  To: Li Chen, Theodore Ts'o
  Cc: Andreas Dilger, Baokun Li, Jan Kara, Ojaswin Mujoo,
	Ritesh Harjani, linux-ext4, linux-kernel, Sashiko AI review
In-Reply-To: <20260513085818.552432-1-me@linux.beauty>

On 5/13/2026 4:58 PM, Li Chen wrote:
> On 64-bit, ext4 dynamic inode states live in the upper half of i_flags,
> and ext4_test_inode_state() applies the corresponding +32 offset.
> 
> The fast-commit wait and wake paths open-coded the wait key with the raw
> EXT4_STATE_* value. Add small helpers for the state wait word and bit,
> and use them for the FC_COMMITTING and FC_FLUSHING_DATA waits so the wait
> key follows the same mapping as the state helpers.
> 
> Fixes: 857d32f26181 ("ext4: rework fast commit commit path")
> Reported-by: Sashiko AI review <sashiko-bot@kernel.org>
> Signed-off-by: Li Chen <chenl311@chinatelecom.cn>

Ha, This looks good to me! Thanks.

Reviewed-by: Zhang Yi <yi.zhang@huawei.com>

> ---
>  fs/ext4/ext4.h        | 20 +++++++++++++++++
>  fs/ext4/fast_commit.c | 50 ++++++++++++++++---------------------------
>  2 files changed, 38 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 94283a991e5c..6569d1d575a0 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2000,6 +2000,8 @@ EXT4_INODE_BIT_FNS(flag, flags, 0)
>  static inline int ext4_test_inode_state(struct inode *inode, int bit);
>  static inline void ext4_set_inode_state(struct inode *inode, int bit);
>  static inline void ext4_clear_inode_state(struct inode *inode, int bit);
> +static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode);
> +static inline int ext4_inode_state_wait_bit(int bit);
>  #if (BITS_PER_LONG < 64)
>  EXT4_INODE_BIT_FNS(state, state_flags, 0)
>  
> @@ -2015,6 +2017,24 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
>  	/* We depend on the fact that callers will set i_flags */
>  }
>  #endif
> +
> +static inline unsigned long *ext4_inode_state_wait_word(struct inode *inode)
> +{
> +#if (BITS_PER_LONG < 64)
> +	return &EXT4_I(inode)->i_state_flags;
> +#else
> +	return &EXT4_I(inode)->i_flags;
> +#endif
> +}
> +
> +static inline int ext4_inode_state_wait_bit(int bit)
> +{
> +#if (BITS_PER_LONG < 64)
> +	return bit;
> +#else
> +	return bit + 32;
> +#endif
> +}
>  #else
>  /* Assume that user mode programs are passing in an ext4fs superblock, not
>   * a kernel struct super_block.  This will allow us to call the feature-test
> diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
> index b3c22636251d..1775bce9649a 100644
> --- a/fs/ext4/fast_commit.c
> +++ b/fs/ext4/fast_commit.c
> @@ -239,6 +239,8 @@ void ext4_fc_del(struct inode *inode)
>  	struct ext4_inode_info *ei = EXT4_I(inode);
>  	struct ext4_fc_dentry_update *fc_dentry;
>  	wait_queue_head_t *wq;
> +	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
> +	int wait_bit = ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
>  	int alloc_ctx;
>  
>  	if (ext4_fc_disabled(inode->i_sb))
> @@ -268,17 +270,9 @@ void ext4_fc_del(struct inode *inode)
>  	WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
>  		&& !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
>  	while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
> -#if (BITS_PER_LONG < 64)
> -		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
> -				EXT4_STATE_FC_FLUSHING_DATA);
> -		wq = bit_waitqueue(&ei->i_state_flags,
> -				   EXT4_STATE_FC_FLUSHING_DATA);
> -#else
> -		DEFINE_WAIT_BIT(wait, &ei->i_flags,
> -				EXT4_STATE_FC_FLUSHING_DATA);
> -		wq = bit_waitqueue(&ei->i_flags,
> -				   EXT4_STATE_FC_FLUSHING_DATA);
> -#endif
> +		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
> +
> +		wq = bit_waitqueue(wait_word, wait_bit);
>  		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
>  		if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
>  			ext4_fc_unlock(inode->i_sb, alloc_ctx);
> @@ -542,6 +536,8 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
>  {
>  	struct ext4_inode_info *ei = EXT4_I(inode);
>  	wait_queue_head_t *wq;
> +	unsigned long *wait_word = ext4_inode_state_wait_word(inode);
> +	int wait_bit = ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
>  	int ret;
>  
>  	if (S_ISDIR(inode->i_mode))
> @@ -564,17 +560,9 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
>  	lockdep_assert_not_held(&ei->i_data_sem);
>  
>  	while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
> -#if (BITS_PER_LONG < 64)
> -		DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
> -				EXT4_STATE_FC_COMMITTING);
> -		wq = bit_waitqueue(&ei->i_state_flags,
> -				   EXT4_STATE_FC_COMMITTING);
> -#else
> -		DEFINE_WAIT_BIT(wait, &ei->i_flags,
> -				EXT4_STATE_FC_COMMITTING);
> -		wq = bit_waitqueue(&ei->i_flags,
> -				   EXT4_STATE_FC_COMMITTING);
> -#endif
> +		DEFINE_WAIT_BIT(wait, wait_word, wait_bit);
> +
> +		wq = bit_waitqueue(wait_word, wait_bit);
>  		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
>  		if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
>  			schedule();
> @@ -1034,6 +1022,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  	int ret = 0;
>  	u32 crc = 0;
>  	int alloc_ctx;
> +	int flushing_wait_bit =
> +		ext4_inode_state_wait_bit(EXT4_STATE_FC_FLUSHING_DATA);
>  
>  	/*
>  	 * Step 1: Mark all inodes on s_fc_q[MAIN] with
> @@ -1059,11 +1049,8 @@ static int ext4_fc_perform_commit(journal_t *journal)
>  	list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
>  		ext4_clear_inode_state(&iter->vfs_inode,
>  				       EXT4_STATE_FC_FLUSHING_DATA);
> -#if (BITS_PER_LONG < 64)
> -		wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
> -#else
> -		wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
> -#endif
> +		wake_up_bit(ext4_inode_state_wait_word(&iter->vfs_inode),
> +			    flushing_wait_bit);
>  	}
>  
>  	/*
> @@ -1279,6 +1266,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
>  	struct ext4_inode_info *ei;
>  	struct ext4_fc_dentry_update *fc_dentry;
>  	int alloc_ctx;
> +	int committing_wait_bit =
> +		ext4_inode_state_wait_bit(EXT4_STATE_FC_COMMITTING);
>  
>  	if (full && sbi->s_fc_bh)
>  		sbi->s_fc_bh = NULL;
> @@ -1315,11 +1304,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
>  		 * barrier in prepare_to_wait() in ext4_fc_track_inode().
>  		 */
>  		smp_mb();
> -#if (BITS_PER_LONG < 64)
> -		wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
> -#else
> -		wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
> -#endif
> +		wake_up_bit(ext4_inode_state_wait_word(&ei->vfs_inode),
> +			    committing_wait_bit);
>  	}
>  
>  	while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) {


^ permalink raw reply

* Re: [RFC v7 6/7] ext4: fast commit: add lock_updates tracepoint
From: Li Chen @ 2026-05-14 11:43 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Zhang Yi, Theodore Ts'o, Andreas Dilger, Baokun Li, Jan Kara,
	Ojaswin Mujoo, Ritesh Harjani, Zhang Yi, Masami Hiramatsu,
	Mathieu Desnoyers, linux-ext4, linux-kernel, linux-trace-kernel
In-Reply-To: <20260513135741.12ddb97d@gandalf.local.home>

Hi Steven,

 ---- On Thu, 14 May 2026 01:57:41 +0800  Steven Rostedt <rostedt@goodmis.org> wrote --- 
 > On Mon, 11 May 2026 16:43:01 +0800
 > Li Chen <me@linux.beauty> wrote:
 > 
 > > @@ -1346,8 +1383,15 @@ static int ext4_fc_perform_commit(journal_t *journal)
 > >      }
 > >      ext4_fc_unlock(sb, alloc_ctx);
 > >  
 > > -    ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size);
 > > +    ret = ext4_fc_snapshot_inodes(journal, inodes, inodes_size,
 > > +                      &snap_inodes, &snap_ranges, &snap_err);
 > >      jbd2_journal_unlock_updates(journal);
 > > +    if (trace_ext4_fc_lock_updates_enabled()) {
 > > +        locked_ns = ktime_to_ns(ktime_sub(ktime_get(), lock_start));
 > > +        trace_ext4_fc_lock_updates(sb, commit_tid, locked_ns,
 > > +                       snap_inodes, snap_ranges, ret,
 > > +                       snap_err);
 > 
 > Please change this to:
 > 
 >         trace_call__ext4_fc_lock_updates(...)
 > 
 > As the "trace_ext4_fc_lock_updates_enabled()" already has the static
 > branch. No need to do it twice anymore. 7.1 introduced the
 > "trace_call__foo()" that will do a direct call to the tracepoints
 > registered, without the need for another static branch.

Thanks, will do it.


Regards,
Li​


^ permalink raw reply

* Re: [RFC v7 4/7] ext4: fast commit: avoid self-deadlock in inode snapshotting
From: Li Chen @ 2026-05-14  7:28 UTC (permalink / raw)
  To: sashiko; +Cc: tytso, Linux ext4
In-Reply-To: <20260512034640.71DA1C2BCB0@smtp.kernel.org>

Hi sashiko,

 ---- On Tue, 12 May 2026 11:46:39 +0800   <sashiko-bot@kernel.org> wrote --- 
 > Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
 > - [Critical] Removing igrab() exposes a critical use-after-free vulnerability because ext4_fc_del() can miss the EXT4_STATE_FC_COMMITTING state transition and free the inode while it is actively being snapshotted.
 > --
 > 
 > commit 39c5450f50154240ff764cf3a63e011d8e973fd4
 > Author: Li Chen <me@linux.beauty>
 > 
 > ext4: fast commit: avoid self-deadlock in inode snapshotting
 > 
 > This commit removes igrab()/iput() during fast commit snapshotting, relying
 > instead on the EXT4_STATE_FC_COMMITTING flag to keep inodes pinned while
 > their pointers are stored in the inodes[] array for snapshotting.
 > 
 > > diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
 > > index 273bf34031aeb..f9bb18c0b5497 100644
 > > --- a/fs/ext4/fast_commit.c
 > > +++ b/fs/ext4/fast_commit.c
 > > @@ -1195,13 +1195,12 @@ static int ext4_fc_snapshot_inodes(journal_t *journal)
 > >  
 > >      alloc_ctx = ext4_fc_lock(sb);
 > >      list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
 > > -        inodes[i] = igrab(&iter->vfs_inode);
 > > -        if (inodes[i])
 > > -            i++;
 > > +        inodes[i++] = &iter->vfs_inode;
 > >      }
 > 
 > Is it possible for this change to cause a use-after-free of the inode?
 > 
 > If ext4_fc_del() is called during inode eviction, it waits sequentially for
 > EXT4_STATE_FC_COMMITTING and then EXT4_STATE_FC_FLUSHING_DATA. However, it
 > doesn't appear to loop back to re-check EXT4_STATE_FC_COMMITTING after
 > waiting for the flush bit.
 > 
 > Could a sequence like this occur?
 > 
 > 1. ext4_fc_perform_commit() sets EXT4_STATE_FC_FLUSHING_DATA.
 > 2. Eviction calls ext4_fc_del(), which sees EXT4_STATE_FC_COMMITTING is
 >    false, but EXT4_STATE_FC_FLUSHING_DATA is true, so it waits on the
 >    flush bit.
 > 3. ext4_fc_perform_commit() clears EXT4_STATE_FC_FLUSHING_DATA, waking up
 >    ext4_fc_del().
 > 4. Before ext4_fc_del() resumes, ext4_fc_perform_commit() sets
 >    EXT4_STATE_FC_COMMITTING and saves the unpinned inode pointer into the
 >    inodes[] array.
 > 5. ext4_fc_del() resumes, skips re-checking EXT4_STATE_FC_COMMITTING,
 >    removes the inode from the fast commit list, and returns.
 > 6. The VFS completely frees the inode.
 > 7. ext4_fc_snapshot_inodes() iterates over the inodes[] array and passes
 >    the now dangling pointer to ext4_fc_snapshot_inode().
 > 
 > Does ext4_fc_del() need to re-verify the state flags after waking up, or
 > is there another mechanism preventing the inode from being freed before
 > ext4_fc_snapshot_inode() accesses it?
 > 
 > [ ... ]
 > 
 > -- 
 > Sashiko AI review · https://sashiko.dev/#/patchset/20260511084304.1559557-1-me@linux.beauty?part=4
 > 

Yes, I think you're right. I missed that window.

I'll make ext4_fc_del() loop back after waiting on FC_FLUSHING_DATA, so it checks FC_COMMITTING again
under s_fc_lock before removing the inode from the FC lists.

Regards,
Li​


^ permalink raw reply

* [PATCH 2/4] iomap: support invalidating partial folios
From: Zhang Yi @ 2026-05-14  6:29 UTC (permalink / raw)
  To: linux-fsdevel, linux-xfs
  Cc: linux-ext4, brauner, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260514062955.1183976-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

Current iomap_invalidate_folio() can only invalidate an entire folio. If
we truncate a partial folio on a filesystem where the block size is
smaller than the folio size, it will leave behind dirty bits for the
truncated or punched blocks. During the write-back process, it will
attempt to map the invalid hole range. Fortunately, this has not caused
any real problems so far because the ->writeback_range() function
corrects the length.

However, the implementation of FALLOC_FL_ZERO_RANGE in ext4 depends on
the support for invalidating partial folios. When ext4 partially zeroes
out a dirty and unwritten folio, it does not perform a flush first like
XFS. Therefore, if the dirty bits of the corresponding area cannot be
cleared, the zeroed area after writeback remains in the written state
rather than reverting to the unwritten state. Fix this by supporting
invalidation of partial folios.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
---
This is taken from:
 https://lore.kernel.org/linux-fsdevel/20240812121159.3775074-3-yi.zhang@huaweicloud.com/
No code changes, only update the commit message to explain why Ext4
needs this.

 fs/iomap/buffered-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 64351a448a8b..876c2f507f58 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -761,6 +761,8 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
 		WARN_ON_ONCE(folio_test_writeback(folio));
 		folio_cancel_dirty(folio);
 		ifs_free(folio);
+	} else {
+		iomap_clear_range_dirty(folio, offset, len);
 	}
 }
 EXPORT_SYMBOL_GPL(iomap_invalidate_folio);
-- 
2.52.0


^ permalink raw reply related

* [PATCH 1/4] iomap: correct the range of a partial dirty clear
From: Zhang Yi @ 2026-05-14  6:29 UTC (permalink / raw)
  To: linux-fsdevel, linux-xfs
  Cc: linux-ext4, brauner, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260514062955.1183976-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

The block range calculation in ifs_clear_range_dirty() is incorrect when
partially clearing a range in a folio. We cannot clear the dirty bit of
the first block or the last block if the start or end offset is not
blocksize-aligned. This has not yet caused any issues since we always
clear a whole folio in iomap_writeback_folio().

Fix this by rounding up the first block to blocksize alignment, and
calculate the last block by rounding down (using truncation). Correct
the nr_blks calculation accordingly.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
This is modified from:
 https://lore.kernel.org/linux-fsdevel/20240812121159.3775074-2-yi.zhang@huaweicloud.com/
Changes:
 - Use round_up() instead of DIV_ROUND_UP() to prevent wasted integer
   division.

 fs/iomap/buffered-io.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index d7b648421a70..64351a448a8b 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -176,13 +176,17 @@ static void ifs_clear_range_dirty(struct folio *folio,
 {
 	struct inode *inode = folio->mapping->host;
 	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
-	unsigned int first_blk = (off >> inode->i_blkbits);
-	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
-	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned int first_blk = round_up(off, i_blocksize(inode)) >>
+				 inode->i_blkbits;
+	unsigned int last_blk = (off + len) >> inode->i_blkbits;
 	unsigned long flags;
 
+	if (first_blk >= last_blk)
+		return;
+
 	spin_lock_irqsave(&ifs->state_lock, flags);
-	bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks);
+	bitmap_clear(ifs->state, first_blk + blks_per_folio,
+		     last_blk - first_blk);
 	spin_unlock_irqrestore(&ifs->state_lock, flags);
 }
 
-- 
2.52.0


^ permalink raw reply related

* [PATCH 3/4] iomap: fix incorrect did_zero setting in iomap_zero_iter()
From: Zhang Yi @ 2026-05-14  6:29 UTC (permalink / raw)
  To: linux-fsdevel, linux-xfs
  Cc: linux-ext4, brauner, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260514062955.1183976-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

The did_zero output parameter was unconditionally set after the loop,
which is incorrect. It should only be set when the zeroing operation
actually completes, not when IOMAP_F_STALE is set or when
IOMAP_F_FOLIO_BATCH is set but !folio causes the loop to break early,
or when iomap_iter_advance() returns an error.

This causes did_zero to be incorrectly set when zeroing a clean
unwritten extent because the loop exits early without actually zeroing
any data.

Fix it by using a local variable to track whether any folio was actually
zeroed, and only set did_zero after the loop if zeroing happened.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
---
This is taken from:
 https://lore.kernel.org/linux-fsdevel/20260310082250.3535486-1-yi.zhang@huaweicloud.com/
No changes.

 fs/iomap/buffered-io.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 876c2f507f58..27ab33edbdee 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1542,6 +1542,7 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 		const struct iomap_write_ops *write_ops)
 {
 	u64 bytes = iomap_length(iter);
+	bool zeroed = false;
 	int status;
 
 	do {
@@ -1560,6 +1561,8 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 		/* a NULL folio means we're done with a folio batch */
 		if (!folio) {
 			status = iomap_iter_advance_full(iter);
+			if (status)
+				return status;
 			break;
 		}
 
@@ -1570,6 +1573,7 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 				bytes);
 
 		folio_zero_range(folio, offset, bytes);
+		zeroed = true;
 		folio_mark_accessed(folio);
 
 		ret = iomap_write_end(iter, bytes, bytes, folio);
@@ -1579,10 +1583,10 @@ static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
 
 		status = iomap_iter_advance(iter, bytes);
 		if (status)
-			break;
+			return status;
 	} while ((bytes = iomap_length(iter)) > 0);
 
-	if (did_zero)
+	if (did_zero && zeroed)
 		*did_zero = true;
 	return status;
 }
-- 
2.52.0


^ permalink raw reply related

* [PATCH 4/4] iomap: fix out-of-bounds bitmap_set() with zero-length range
From: Zhang Yi @ 2026-05-14  6:29 UTC (permalink / raw)
  To: linux-fsdevel, linux-xfs
  Cc: linux-ext4, brauner, djwong, hch, yi.zhang, yi.zhang, yizhang089,
	yangerkun, yukuai
In-Reply-To: <20260514062955.1183976-1-yi.zhang@huaweicloud.com>

From: Zhang Yi <yi.zhang@huawei.com>

ifs_set_range_dirty() and ifs_set_range_uptodate() compute last_blk
as (off + len - 1) >> i_blkbits.  When off is 0 and len is 0, the
unsigned subtraction underflows to SIZE_MAX, producing a huge
last_blk and nr_blks value that causes bitmap_set() to write far
beyond the ifs->state allocation.

Regarding ifs_set_range_uptodate(), it is temporarily safe because len
cannot be passed in as 0. However, for ifs_set_range_dirty() this is
reachable from __iomap_write_end(): when copy_folio_from_iter_atomic()
returns 0 (e.g. user buffer fault) and the folio is already uptodate,
the guard at the top of __iomap_write_end() does not trigger because
!folio_test_uptodate() is false, and iomap_set_range_dirty() is called
with copied == 0.

Add a !len guard to both functions before the computation, so that a
zero-length range is a no-op.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/iomap/buffered-io.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 27ab33edbdee..6fe5f7e998fd 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -67,11 +67,14 @@ static bool ifs_set_range_uptodate(struct folio *folio,
 		struct iomap_folio_state *ifs, size_t off, size_t len)
 {
 	struct inode *inode = folio->mapping->host;
-	unsigned int first_blk = off >> inode->i_blkbits;
-	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
-	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned int first_blk, last_blk;
 
-	bitmap_set(ifs->state, first_blk, nr_blks);
+	if (!len)
+		return true;
+
+	first_blk = off >> inode->i_blkbits;
+	last_blk = (off + len - 1) >> inode->i_blkbits;
+	bitmap_set(ifs->state, first_blk, last_blk - first_blk + 1);
 	return ifs_is_fully_uptodate(folio, ifs);
 }
 
@@ -203,13 +206,17 @@ static void ifs_set_range_dirty(struct folio *folio,
 {
 	struct inode *inode = folio->mapping->host;
 	unsigned int blks_per_folio = i_blocks_per_folio(inode, folio);
-	unsigned int first_blk = (off >> inode->i_blkbits);
-	unsigned int last_blk = (off + len - 1) >> inode->i_blkbits;
-	unsigned int nr_blks = last_blk - first_blk + 1;
+	unsigned int first_blk, last_blk;
 	unsigned long flags;
 
+	if (!len)
+		return;
+
+	first_blk = off >> inode->i_blkbits;
+	last_blk = (off + len - 1) >> inode->i_blkbits;
 	spin_lock_irqsave(&ifs->state_lock, flags);
-	bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks);
+	bitmap_set(ifs->state, first_blk + blks_per_folio,
+		   last_blk - first_blk + 1);
 	spin_unlock_irqrestore(&ifs->state_lock, flags);
 }
 
-- 
2.52.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox