Linux-mm Archive on lore.kernel.org
 help / color / mirror / Atom feed
From: Jan Kara <jack@suse.cz>
To: <linux-fsdevel@vger.kernel.org>
Cc: <linux-mm@kvack.org>, Matthew Wilcox <willy@infradead.org>,
	Jan Kara <jack@suse.cz>
Subject: [PATCH 3/4] fs: Add throttling to deferred inode reclaim
Date: Wed, 29 Apr 2026 20:00:53 +0200	[thread overview]
Message-ID: <20260429180056.29598-7-jack@suse.cz> (raw)
In-Reply-To: <20260429174850.18223-1-jack@suse.cz>

Deferring difficult inode reclaim from prune_icache_sb() to a workqueue
removes the natural feedback loop of blocking tasks in direct reclaim
until they make space for new allocations. This can result in the list
of deferred inodes growing without bound, possibly pushing the
machine into a reclaim storm or OOM.

Add a throttling mechanism slowing down tasks in
mark_inode_reclaim_deferred() if the list of deferred inodes to reclaim
grows over the limit. We measure the average time it takes to reclaim an
inode on the deferred list and block tasks proportionally to that.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/inode.c                       | 94 +++++++++++++++++++++++++++++---
 include/linux/fs/super_types.h   |  2 +
 include/trace/events/writeback.h | 51 +++++++++++++++++
 3 files changed, 139 insertions(+), 8 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 448e3d7ee48e..fe39f96fbc80 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -941,6 +941,7 @@ EXPORT_SYMBOL_GPL(evict_inodes);
 struct inodes_to_prune {
 	struct list_head freeable;
 	struct list_head deferred;
+	int deferred_count;
 };
 
 /*
@@ -1013,9 +1014,10 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 	WARN_ON(inode_state_read(inode) & I_NEW);
 	inode_state_set(inode, I_FREEING);
 	/* Inode will take long time to cleanup. Offload that to worker. */
-	if (inode_state_read(inode) & I_DEFER_RECLAIM)
+	if (inode_state_read(inode) & I_DEFER_RECLAIM) {
 		list_lru_isolate_move(lru, &inode->i_lru, &lists->deferred);
-	else
+		lists->deferred_count++;
+	} else
 		list_lru_isolate_move(lru, &inode->i_lru, &lists->freeable);
 	spin_unlock(&inode->i_lock);
 
@@ -1052,27 +1054,58 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
 		if (list_empty(&reclaim->list))
 			queue_work(system_dfl_wq, &reclaim->work);
 		list_splice_tail(&lists.deferred, &reclaim->list);
+		reclaim->len += lists.deferred_count;
 		spin_unlock(&reclaim->lock);
 	}
 	return freed;
 }
 
+static void inode_reclaim_update_stat(struct inode_deferred_reclaim *reclaim,
+				      struct super_block *sb, unsigned int n,
+				      u64 start)
+{
+	u64 end = ktime_get_ns();
+	u32 delay;
+
+	delay = div_u64(end - start, n);
+	/* Smooth delay updates with exponential moving average */
+	reclaim->delay = (63 * (u64)reclaim->delay + delay) / 64;
+
+	trace_inode_reclaim_update_stat(sb, n, delay, reclaim->delay);
+}
+
 static void inode_reclaim_deferred(struct work_struct *work)
 {
 	struct inode_deferred_reclaim *reclaim =
 		container_of(work, struct inode_deferred_reclaim, work);
+	struct super_block *sb = NULL;
 	struct inode *inode;
+	u64 start;
+	unsigned int batch = 0;
 
 	spin_lock(&reclaim->lock);
 	while (!list_empty(&reclaim->list)) {
 		inode = list_first_entry(&reclaim->list, struct inode, i_lru);
 		list_del_init(&inode->i_lru);
+		reclaim->len--;
 		spin_unlock(&reclaim->lock);
+		if (!sb)
+			sb = inode->i_sb;
+		if (!batch)
+			start = ktime_get_ns();
 		evict(inode);
+		batch++;
+		/* Batch stat updates to avoid excessive computations */
+		if (batch >= 64 || need_resched()) {
+			inode_reclaim_update_stat(reclaim, sb, batch, start);
+			batch = 0;
+		}
 		cond_resched();
 		spin_lock(&reclaim->lock);
 	}
 	spin_unlock(&reclaim->lock);
+	if (batch)
+		inode_reclaim_update_stat(reclaim, sb, batch, start);
 }
 
 static struct inode_deferred_reclaim *inode_deferred_reclaim_alloc(
@@ -1091,20 +1124,65 @@ static struct inode_deferred_reclaim *inode_deferred_reclaim_alloc(
 	return sb->s_inode_reclaim;
 }
 
+/*
+ * Size of deferred reclaim list from which we start throttling tasks creating
+ * inodes marked for deferred reclaim.
+ */
+#define INODE_DEFERRED_RECLAIM_LIMIT 8192
+
+static void throttle_inode_deferred_reclaim(struct inode *inode)
+{
+	unsigned int len;
+	struct inode_deferred_reclaim *reclaim =
+				READ_ONCE(inode->i_sb->s_inode_reclaim);
+
+	if (!reclaim)
+		reclaim = inode_deferred_reclaim_alloc(inode->i_sb);
+
+	/*
+	 * If inodes with deferred reclaim are accumulating too much, slow down
+	 * tasks creating them. This doesn't provide any kind of guarantee on
+	 * the length of the deferred list since lots of inodes with
+	 * I_DEFER_RECLAIM can be already present in the inode cache and we
+	 * have no control when they reach the deferred list. But if the
+	 * pressure on the deferred list is sustained, the balance should
+	 * eventually be established.
+	 */
+	len = READ_ONCE(reclaim->len);
+	if (len > INODE_DEFERRED_RECLAIM_LIMIT) {
+		u64 delay = READ_ONCE(reclaim->delay);
+
+		if (!delay)
+			return;
+		/*
+		 * Scale the delay based on how much we exceed the limit. Wait
+		 * at most 4x as long as estimated time to reclaim the inode.
+		 */
+		len = min(len, 5 * INODE_DEFERRED_RECLAIM_LIMIT);
+		delay = div_u64(delay * (len - INODE_DEFERRED_RECLAIM_LIMIT),
+				INODE_DEFERRED_RECLAIM_LIMIT);
+		trace_mark_inode_reclaim_deferred_throttle(inode, len, delay);
+
+		schedule_timeout_killable(nsecs_to_jiffies(delay));
+	}
+}
+
 void mark_inode_reclaim_deferred(struct inode *inode)
 {
-	struct inode_deferred_reclaim *reclaim;
+	bool throttle = false;
 
 	if (inode_state_read_once(inode) & I_DEFER_RECLAIM)
 		return;
 
-	reclaim = READ_ONCE(inode->i_sb->s_inode_reclaim);
-	if (!reclaim)
-		reclaim = inode_deferred_reclaim_alloc(inode->i_sb);
-
 	spin_lock(&inode->i_lock);
-	inode_state_set(inode, I_DEFER_RECLAIM);
+	if (!(inode_state_read(inode) & I_DEFER_RECLAIM)) {
+		inode_state_set(inode, I_DEFER_RECLAIM);
+		throttle = true;
+	}
 	spin_unlock(&inode->i_lock);
+
+	if (throttle)
+		throttle_inode_deferred_reclaim(inode);
 }
 EXPORT_SYMBOL_GPL(mark_inode_reclaim_deferred);
 
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 00744ae5be18..533256892550 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -133,6 +133,8 @@ struct inode_deferred_reclaim {
 	struct list_head	list;
 	struct work_struct	work;
 	spinlock_t		lock;
+	unsigned int		len;
+	u32			delay;
 };
 
 struct super_block {
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index bdac0d685a98..c0ae39b4dc7b 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -879,6 +879,57 @@ DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback,
 	TP_ARGS(inode)
 );
 
+TRACE_EVENT(inode_reclaim_update_stat,
+	TP_PROTO(
+		struct super_block *sb,
+		unsigned int n,
+		u32 batch_delay,
+		u32 avg_delay
+	),
+	TP_ARGS(sb, n, batch_delay, avg_delay),
+
+	TP_STRUCT__entry(
+		__field(dev_t,		dev)
+		__field(unsigned int,	n)
+		__field(u32,		batch_delay)
+		__field(u32,		avg_delay)
+	),
+
+	TP_fast_assign(
+		__entry->dev = sb->s_dev;
+		__entry->n = n;
+		__entry->batch_delay = batch_delay;
+		__entry->avg_delay = avg_delay;
+	),
+
+	TP_printk("dev %d,%d batch size %u batch delay %u ns avg delay %u ns",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->n,
+		  __entry->batch_delay, __entry->avg_delay)
+);
+
+TRACE_EVENT(mark_inode_reclaim_deferred_throttle,
+	TP_PROTO(struct inode *inode, unsigned int len, u64 delay),
+	TP_ARGS(inode, len, delay),
+
+	TP_STRUCT__entry(
+		__field(u64,		ino)
+		__field(dev_t,		dev)
+		__field(unsigned int,	len)
+		__field(u64,		delay)
+	),
+
+	TP_fast_assign(
+		__entry->ino = inode->i_ino;
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->len = len;
+		__entry->delay = delay;
+	),
+
+	TP_printk("dev %d,%d ino %llu deferred list len %u delay %llu ns",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->len, __entry->delay)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
-- 
2.51.0



  parent reply	other threads:[~2026-04-29 18:01 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-04-29 18:00 [PATCH RFC 0/4] fs: Deferred inode reclaim Jan Kara
2026-04-29 18:00 ` [PATCH 1/4] fs: Avoid inode dirtying on last iput Jan Kara
2026-04-29 18:00 ` [PATCH 2/4] fs: Basic infrastructure for offloading inode reclaim Jan Kara
2026-04-29 18:00 ` Jan Kara [this message]
2026-04-29 18:00 ` [PATCH 4/4] ext4: Defer inode reclaim if it has preallocations Jan Kara

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260429180056.29598-7-jack@suse.cz \
    --to=jack@suse.cz \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=willy@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox