From: Josef Bacik <josef@toxicpanda.com>
To: linux-fsdevel@vger.kernel.org, linux-btrfs@vger.kernel.org,
kernel-team@fb.com, linux-ext4@vger.kernel.org,
linux-xfs@vger.kernel.org, brauner@kernel.org,
viro@ZenIV.linux.org.uk
Subject: [PATCH 14/50] fs: maintain a list of pinned inodes
Date: Thu, 21 Aug 2025 16:18:25 -0400 [thread overview]
Message-ID: <cbca76c429c4f3418cc219deb1a9eb917a77cde0.1755806649.git.josef@toxicpanda.com> (raw)
In-Reply-To: <cover.1755806649.git.josef@toxicpanda.com>
Currently we have relied on dirty inodes and inodes with cache on them
to simply be left hanging around on the system outside of an LRU. The
only way to make sure these inodes are eventually reclaimed is because
dirty writeback will grab a reference on the inode and then iput it when
it's done, potentially getting it on the LRU. For the cached case the
page cache deletion path will call inode_add_lru when the inode no
longer has cached pages in order to make sure the inode object can be
freed eventually. In the unmount case we walk all inodes and free them
so this all works out fine.
But we want to eliminate 0 i_count objects as a concept, so we need a
mechanism to hold a reference on these pinned inodes. To that end, add a
list to the super block that contains any inodes that are cached for one
reason or another.
When we call inode_add_lru(), if the inode falls into one of these
categories, we will add it to the cached inode list and hold an
i_obj_count reference. If the inode does not fall into one of these
categories it will be moved to the normal LRU, which is already holds an
i_obj_count reference.
The dirty case we will delete it from the LRU if it is on one, and then
the iput after the writeout will make sure it's placed onto the correct
list at that point.
The page cache case will migrate it when it calls inode_add_lru() when
deleting pages from the page cache.
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
---
fs/fs-writeback.c | 8 +++
fs/inode.c | 102 +++++++++++++++++++++++++++++--
fs/internal.h | 1 +
fs/super.c | 3 +
include/linux/fs.h | 11 ++++
include/trace/events/writeback.h | 3 +-
6 files changed, 121 insertions(+), 7 deletions(-)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d2e1fb1a0787..111a9d8215bf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -2736,6 +2736,14 @@ static void wait_sb_inodes(struct super_block *sb)
continue;
}
__iget(inode);
+
+ /*
+ * We could have potentially ended up on the cached LRU list, so
+ * remove ourselves from this list now that we have a reference,
+ * the iput will handle placing it back on the appropriate LRU
+ * list if necessary.
+ */
+ inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
rcu_read_unlock();
diff --git a/fs/inode.c b/fs/inode.c
index 94769b356224..adcba0a4d776 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -319,6 +319,23 @@ void free_inode_nonrcu(struct inode *inode)
}
EXPORT_SYMBOL(free_inode_nonrcu);
+/*
+ * Some inodes need to stay pinned in memory because they are dirty or there are
+ * cached pages that the VM wants to keep around to avoid thrashing. This does
+ * the appropriate checks to see if we want to sheild this inode from periodic
+ * reclaim. Must be called with ->i_lock held.
+ */
+static bool inode_needs_cached(struct inode *inode)
+{
+ lockdep_assert_held(&inode->i_lock);
+
+ if (inode->i_state & (I_DIRTY_ALL | I_SYNC))
+ return true;
+ if (!mapping_shrinkable(&inode->i_data))
+ return true;
+ return false;
+}
+
static void i_callback(struct rcu_head *head)
{
struct inode *inode = container_of(head, struct inode, i_rcu);
@@ -532,20 +549,67 @@ void ihold(struct inode *inode)
}
EXPORT_SYMBOL(ihold);
+static void inode_add_cached_lru(struct inode *inode)
+{
+ lockdep_assert_held(&inode->i_lock);
+
+ if (inode->i_state & I_CACHED_LRU)
+ return;
+ if (!list_empty(&inode->i_lru))
+ return;
+
+ inode->i_state |= I_CACHED_LRU;
+ spin_lock(&inode->i_sb->s_cached_inodes_lock);
+ list_add(&inode->i_lru, &inode->i_sb->s_cached_inodes);
+ spin_unlock(&inode->i_sb->s_cached_inodes_lock);
+ iobj_get(inode);
+}
+
+static bool __inode_del_cached_lru(struct inode *inode)
+{
+ lockdep_assert_held(&inode->i_lock);
+
+ if (!(inode->i_state & I_CACHED_LRU))
+ return false;
+
+ inode->i_state &= ~I_CACHED_LRU;
+ spin_lock(&inode->i_sb->s_cached_inodes_lock);
+ list_del_init(&inode->i_lru);
+ spin_unlock(&inode->i_sb->s_cached_inodes_lock);
+ return true;
+}
+
+static bool inode_del_cached_lru(struct inode *inode)
+{
+ if (__inode_del_cached_lru(inode)) {
+ iobj_put(inode);
+ return true;
+ }
+ return false;
+}
+
static void __inode_add_lru(struct inode *inode, bool rotate)
{
- if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE))
+ bool need_ref = true;
+
+ lockdep_assert_held(&inode->i_lock);
+
+ if (inode->i_state & (I_FREEING | I_WILL_FREE))
return;
if (atomic_read(&inode->i_count))
return;
if (!(inode->i_sb->s_flags & SB_ACTIVE))
return;
- if (!mapping_shrinkable(&inode->i_data))
+ if (inode_needs_cached(inode)) {
+ inode_add_cached_lru(inode);
return;
+ }
+ need_ref = __inode_del_cached_lru(inode) == false;
if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) {
- iobj_get(inode);
inode->i_state |= I_LRU;
+ if (need_ref)
+ iobj_get(inode);
this_cpu_inc(nr_unused);
} else if (rotate) {
inode->i_state |= I_REFERENCED;
@@ -573,8 +637,19 @@ void inode_add_lru(struct inode *inode)
__inode_add_lru(inode, false);
}
-static void inode_lru_list_del(struct inode *inode)
+/*
+ * Caller must be holding it's own i_count reference on this inode in order to
+ * prevent this being the final iput.
+ *
+ * Needs inode->i_lock held.
+ */
+void inode_lru_list_del(struct inode *inode)
{
+ lockdep_assert_held(&inode->i_lock);
+
+ if (inode_del_cached_lru(inode))
+ return;
+
if (!(inode->i_state & I_LRU))
return;
@@ -950,6 +1025,22 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
if (!spin_trylock(&inode->i_lock))
return LRU_SKIP;
+ /*
+ * This inode is either dirty or has page cache we want to keep around,
+ * so move it to the cached list.
+ *
+ * We drop the extra i_obj_count reference we grab when adding it to the
+ * cached lru.
+ */
+ if (inode_needs_cached(inode)) {
+ list_lru_isolate(lru, &inode->i_lru);
+ inode_add_cached_lru(inode);
+ iobj_put(inode);
+ spin_unlock(&inode->i_lock);
+ this_cpu_dec(nr_unused);
+ return LRU_REMOVED;
+ }
+
/*
* Inodes can get referenced, redirtied, or repopulated while
* they're already on the LRU, and this can make them
@@ -957,8 +1048,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
* sync, or the last page cache deletion will requeue them.
*/
if (atomic_read(&inode->i_count) ||
- (inode->i_state & ~I_REFERENCED) ||
- !mapping_shrinkable(&inode->i_data)) {
+ (inode->i_state & ~I_REFERENCED)) {
list_lru_isolate(lru, &inode->i_lru);
inode->i_state &= ~I_LRU;
spin_unlock(&inode->i_lock);
diff --git a/fs/internal.h b/fs/internal.h
index 38e8aab27bbd..17ecee7056d5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -207,6 +207,7 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
bool in_group_or_capable(struct mnt_idmap *idmap,
const struct inode *inode, vfsgid_t vfsgid);
+void inode_lru_list_del(struct inode *inode);
/*
* fs-writeback.c
diff --git a/fs/super.c b/fs/super.c
index a038848e8d1f..bf3e6d9055af 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -364,6 +364,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
spin_lock_init(&s->s_inode_list_lock);
INIT_LIST_HEAD(&s->s_inodes_wb);
spin_lock_init(&s->s_inode_wblist_lock);
+ INIT_LIST_HEAD(&s->s_cached_inodes);
+ spin_lock_init(&s->s_cached_inodes_lock);
s->s_count = 1;
atomic_set(&s->s_active, 1);
@@ -409,6 +411,7 @@ static void __put_super(struct super_block *s)
WARN_ON(s->s_dentry_lru.node);
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
+ WARN_ON(!list_empty(&s->s_cached_inodes));
call_rcu(&s->rcu, destroy_super_rcu);
}
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 509e696a4df0..8384ed81a5ad 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -749,6 +749,9 @@ is_uncached_acl(struct posix_acl *acl)
* ->i_lru is on the LRU and those that are using ->i_lru
* for some other means.
*
+ * I_CACHED_LRU Inode is cached because it is dirty or isn't shrinkable,
+ * and thus is on the s_cached_inode_lru list.
+ *
* Q: What is the difference between I_WILL_FREE and I_FREEING?
*
* __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
@@ -786,6 +789,7 @@ enum inode_state_bits {
INODE_BIT(I_SYNC_QUEUED),
INODE_BIT(I_PINNING_NETFS_WB),
INODE_BIT(I_LRU),
+ INODE_BIT(I_CACHED_LRU),
};
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
@@ -1584,6 +1588,13 @@ struct super_block {
spinlock_t s_inode_wblist_lock;
struct list_head s_inodes_wb; /* writeback inodes */
+
+ /*
+ * Cached inodes, any inodes that their reference is held by another
+ * mechanism, such as dirty inodes or unshrinkable inodes.
+ */
+ spinlock_t s_cached_inodes_lock;
+ struct list_head s_cached_inodes;
} __randomize_layout;
static inline struct user_namespace *i_user_ns(const struct inode *inode)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 486f85aca84d..6949329c744a 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -29,7 +29,8 @@
{I_SYNC_QUEUED, "I_SYNC_QUEUED"}, \
{I_PINNING_NETFS_WB, "I_PINNING_NETFS_WB"}, \
{I_LRU_ISOLATING, "I_LRU_ISOLATING"}, \
- {I_LRU, "I_LRU"} \
+ {I_LRU, "I_LRU"}, \
+ {I_CACHED_LRU, "I_CACHED_LRU"} \
)
/* enums need to be exported to user space */
--
2.49.0
next prev parent reply other threads:[~2025-08-21 20:20 UTC|newest]
Thread overview: 85+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-08-21 20:18 [PATCH 00/50] fs: rework inode reference counting Josef Bacik
2025-08-21 20:18 ` [PATCH 01/50] fs: add an i_obj_count refcount to the inode Josef Bacik
2025-08-21 20:18 ` [PATCH 02/50] fs: make the i_state flags an enum Josef Bacik
2025-08-22 11:08 ` Christian Brauner
2025-08-22 13:31 ` Josef Bacik
2025-08-22 14:36 ` David Sterba
2025-08-22 11:18 ` Sun YangKai
2025-08-22 11:42 ` [PATCH 02/50] " Alan Huang
2025-08-22 12:11 ` Sun YangKai
2025-08-22 14:40 ` [PATCH 02/50] fs: " Josef Bacik
2025-08-21 20:18 ` [PATCH 03/50] fs: hold an i_obj_count reference in wait_sb_inodes Josef Bacik
2025-08-21 20:18 ` [PATCH 04/50] fs: hold an i_obj_count reference for the i_wb_list Josef Bacik
2025-08-22 11:27 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 05/50] fs: hold an i_obj_count reference for the i_io_list Josef Bacik
2025-08-21 20:18 ` [PATCH 06/50] fs: hold an i_obj_count reference in writeback_sb_inodes Josef Bacik
2025-08-22 12:20 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 07/50] fs: hold an i_obj_count reference while on the hashtable Josef Bacik
2025-08-21 20:18 ` [PATCH 08/50] fs: hold an i_obj_count reference while on the LRU list Josef Bacik
2025-08-21 20:18 ` [PATCH 09/50] fs: hold an i_obj_count reference while on the sb inode list Josef Bacik
2025-08-21 20:18 ` [PATCH 10/50] fs: stop accessing ->i_count directly in f2fs and gfs2 Josef Bacik
2025-08-22 12:38 ` (subset) " Christian Brauner
2025-08-21 20:18 ` [PATCH 11/50] fs: hold an i_obj_count when we have an i_count reference Josef Bacik
2025-08-21 20:18 ` [PATCH 12/50] fs: rework iput logic Josef Bacik
2025-08-22 12:54 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 13/50] fs: add an I_LRU flag to the inode Josef Bacik
2025-08-21 20:18 ` Josef Bacik [this message]
2025-08-22 14:55 ` [PATCH 14/50] fs: maintain a list of pinned inodes Christian Brauner
2025-08-21 20:18 ` [PATCH 15/50] fs: delete the inode from the LRU list on lookup Josef Bacik
2025-08-22 15:27 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 16/50] fs: change evict_inodes to use iput instead of evict directly Josef Bacik
2025-08-25 9:07 ` Christian Brauner
2025-08-25 19:35 ` Josef Bacik
2025-08-26 9:56 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 17/50] fs: hold a full ref while the inode is on a LRU Josef Bacik
2025-08-25 9:20 ` Christian Brauner
2025-08-25 10:40 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 18/50] fs: disallow 0 reference count inodes Josef Bacik
2025-08-25 10:54 ` Christian Brauner
2025-08-25 19:26 ` Josef Bacik
2025-08-26 9:28 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 19/50] fs: make evict_inodes add to the dispose list under the i_lock Josef Bacik
2025-08-21 20:18 ` [PATCH 20/50] fs: convert i_count to refcount_t Josef Bacik
2025-08-22 12:10 ` Amir Goldstein
2025-08-22 13:56 ` kernel test robot
2025-08-25 11:03 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 21/50] fs: use refcount_inc_not_zero in igrab Josef Bacik
2025-08-25 11:21 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 22/50] fs: use inode_tryget in find_inode* Josef Bacik
2025-08-25 11:26 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 23/50] fs: update find_inode_*rcu to check the i_count count Josef Bacik
2025-08-25 11:27 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 24/50] fs: use igrab in insert_inode_locked Josef Bacik
2025-08-21 20:18 ` [PATCH 25/50] fs: remove I_WILL_FREE|I_FREEING check from __inode_add_lru Josef Bacik
2025-08-21 20:18 ` [PATCH 26/50] fs: remove I_WILL_FREE|I_FREEING check in inode_pin_lru_isolating Josef Bacik
2025-08-21 20:18 ` [PATCH 27/50] fs: use inode_tryget in evict_inodes Josef Bacik
2025-08-25 11:43 ` Christian Brauner
2025-08-25 18:22 ` Josef Bacik
2025-08-21 20:18 ` [PATCH 28/50] fs: change evict_dentries_for_decrypted_inodes to use refcount Josef Bacik
2025-08-21 20:18 ` [PATCH 29/50] block: use igrab in sync_bdevs Josef Bacik
2025-08-21 20:18 ` [PATCH 30/50] bcachefs: use the refcount instead of I_WILL_FREE|I_FREEING Josef Bacik
2025-08-21 20:18 ` [PATCH 31/50] btrfs: don't check I_WILL_FREE|I_FREEING Josef Bacik
2025-08-21 20:18 ` [PATCH 32/50] fs: use igrab in drop_pagecache_sb Josef Bacik
2025-08-21 20:18 ` [PATCH 33/50] fs: stop checking I_FREEING in d_find_alias_rcu Josef Bacik
2025-08-21 20:18 ` [PATCH 34/50] ext4: stop checking I_WILL_FREE|IFREEING in ext4_check_map_extents_env Josef Bacik
2025-08-21 20:18 ` [PATCH 35/50] fs: remove I_WILL_FREE|I_FREEING from fs-writeback.c Josef Bacik
2025-08-25 11:46 ` Christian Brauner
2025-08-21 20:18 ` [PATCH 36/50] gfs2: remove I_WILL_FREE|I_FREEING usage Josef Bacik
2025-08-21 20:18 ` [PATCH 37/50] fs: remove I_WILL_FREE|I_FREEING check from dquot.c Josef Bacik
2025-08-21 20:18 ` [PATCH 38/50] notify: remove I_WILL_FREE|I_FREEING checks in fsnotify_unmount_inodes Josef Bacik
2025-08-21 20:18 ` [PATCH 39/50] xfs: remove I_FREEING check Josef Bacik
2025-08-21 20:18 ` [PATCH 40/50] landlock: remove I_FREEING|I_WILL_FREE check Josef Bacik
2025-08-21 20:18 ` [PATCH 41/50] fs: change inode_is_dirtytime_only to use refcount Josef Bacik
2025-08-21 20:18 ` [PATCH 42/50] btrfs: remove references to I_FREEING Josef Bacik
2025-08-21 20:18 ` [PATCH 43/50] ext4: remove reference to I_FREEING in inode.c Josef Bacik
2025-08-21 20:18 ` [PATCH 44/50] ext4: remove reference to I_FREEING in orphan.c Josef Bacik
2025-08-21 20:18 ` [PATCH 45/50] pnfs: use i_count refcount to determine if the inode is going away Josef Bacik
2025-08-21 20:18 ` [PATCH 46/50] fs: remove some spurious I_FREEING references in inode.c Josef Bacik
2025-08-21 20:18 ` [PATCH 47/50] xfs: remove reference to I_FREEING|I_WILL_FREE Josef Bacik
2025-08-21 20:18 ` [PATCH 48/50] ocfs2: do not set I_WILL_FREE Josef Bacik
2025-08-21 20:19 ` [PATCH 49/50] fs: remove I_FREEING|I_WILL_FREE Josef Bacik
2025-08-25 11:53 ` Christian Brauner
2025-08-21 20:19 ` [PATCH 50/50] fs: add documentation explaining the reference count rules for inodes Josef Bacik
2025-08-25 11:56 ` Christian Brauner
2025-08-22 10:51 ` [PATCH 00/50] fs: rework inode reference counting Christian Brauner
2025-08-22 13:30 ` Josef Bacik
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=cbca76c429c4f3418cc219deb1a9eb917a77cde0.1755806649.git.josef@toxicpanda.com \
--to=josef@toxicpanda.com \
--cc=brauner@kernel.org \
--cc=kernel-team@fb.com \
--cc=linux-btrfs@vger.kernel.org \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-xfs@vger.kernel.org \
--cc=viro@ZenIV.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).