From: Mateusz Guzik <mjguzik@gmail.com>
To: brauner@kernel.org
Cc: viro@zeniv.linux.org.uk, jack@suse.cz,
linux-kernel@vger.kernel.org, linux-fsdevel@vger.kernel.org,
Mateusz Guzik <mjguzik@gmail.com>
Subject: [PATCH v5 4/4] fs: allow lockless ->i_count bumps as long as it does not transition 0->1
Date: Tue, 31 Mar 2026 18:08:51 +0200 [thread overview]
Message-ID: <20260331160851.3854954-5-mjguzik@gmail.com> (raw)
In-Reply-To: <20260331160851.3854954-1-mjguzik@gmail.com>
With this change only 0->1 and 1->0 transitions need the lock.
I verified that all places which look at the refcount either only care about
it staying 0 (and have the lock enforce it) or don't hold the inode lock
to begin with (making the above change irrelevant to their correctness or
lack thereof).
I also confirmed nfs and btrfs call into the affected routines a lot and now
avoid the lock in the common case, shaving off some atomics.
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
fs/dcache.c | 4 +++
fs/inode.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 4 +--
3 files changed, 71 insertions(+), 2 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 9ceab142896f..b63450ebb85c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2033,6 +2033,10 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
__d_instantiate(entry, inode);
spin_unlock(&entry->d_lock);
WARN_ON(!(inode_state_read(inode) & I_NEW));
+ /*
+ * Paired with igrab_try_lockless()
+ */
+ smp_wmb();
inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
diff --git a/fs/inode.c b/fs/inode.c
index 013470e6d144..03472be4e1a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1029,6 +1029,7 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
}
static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked);
+static bool igrab_try_lockless(struct inode *inode);
/*
* Called with the inode lock held.
@@ -1053,6 +1054,11 @@ static struct inode *find_inode(struct super_block *sb,
continue;
if (!test(inode, data))
continue;
+ if (igrab_try_lockless(inode)) {
+ rcu_read_unlock();
+ *isnew = false;
+ return inode;
+ }
spin_lock(&inode->i_lock);
if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
__wait_on_freeing_inode(inode, hash_locked, true);
@@ -1095,6 +1101,11 @@ static struct inode *find_inode_fast(struct super_block *sb,
continue;
if (inode->i_sb != sb)
continue;
+ if (igrab_try_lockless(inode)) {
+ rcu_read_unlock();
+ *isnew = false;
+ return inode;
+ }
spin_lock(&inode->i_lock);
if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
__wait_on_freeing_inode(inode, hash_locked, true);
@@ -1212,6 +1223,10 @@ void unlock_new_inode(struct inode *inode)
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
WARN_ON(!(inode_state_read(inode) & I_NEW));
+ /*
+ * Paired with igrab_try_lockless()
+ */
+ smp_wmb();
inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
@@ -1223,6 +1238,10 @@ void discard_new_inode(struct inode *inode)
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
WARN_ON(!(inode_state_read(inode) & I_NEW));
+ /*
+ * Paired with igrab_try_lockless()
+ */
+ smp_wmb();
inode_state_clear(inode, I_NEW);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
@@ -1582,6 +1601,14 @@ EXPORT_SYMBOL(ihold);
struct inode *igrab(struct inode *inode)
{
+ /*
+ * Read commentary above igrab_try_lockless() for an explanation why this works.
+ */
+ if (atomic_add_unless(&inode->i_count, 1, 0)) {
+ VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode);
+ return inode;
+ }
+
spin_lock(&inode->i_lock);
if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) {
__iget(inode);
@@ -1599,6 +1626,44 @@ struct inode *igrab(struct inode *inode)
}
EXPORT_SYMBOL(igrab);
+/*
+ * igrab_try_lockless - special inode refcount acquire primitive for the inode hash
+ * (don't use elsewhere!)
+ *
+ * It provides lockless refcount acquire in the common case of no problematic
+ * flags being set and the count being > 0.
+ *
+ * There are 4 state flags to worry about and the routine makes sure to not bump the
+ * ref if any of them is present.
+ *
+ * I_NEW and I_CREATING can only legally get set *before* the inode becomes visible
+ * during lookup. Thus if the flags are not spotted, they are guaranteed to not be
+ * a factor. However, we need an acquire fence before returning the inode just
+ * in case we raced against clearing the state to make sure our consumer picks up
+ * any other changes made prior. atomic_add_unless provides a full fence, which
+ * takes care of it.
+ *
+ * I_FREEING and I_WILL_FREE can only legally get set if ->i_count == 0 and it is
+ * illegal to bump the ref if either is present. Consequently if atomic_add_unless
+ * managed to replace a non-0 value with a bigger one, we have a guarantee neither
+ * of these flags is set. Note this means explicitly checking these flags below
+ * is not necessary; it is only done because it does not cost anything on top of the
+ * load which already needs to be done to handle the other flags.
+ */
+static bool igrab_try_lockless(struct inode *inode)
+{
+ if (inode_state_read_once(inode) & (I_NEW | I_CREATING | I_FREEING | I_WILL_FREE))
+ return false;
+ /*
+ * Paired with routines clearing I_NEW
+ */
+ if (atomic_add_unless(&inode->i_count, 1, 0)) {
+ VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode);
+ return true;
+ }
+ return false;
+}
+
/**
* ilookup5_nowait - search for an inode in the inode cache
* @sb: super block of file system to search
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 07363fce4406..119e0a3d2f42 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2234,8 +2234,8 @@ static inline int icount_read_once(const struct inode *inode)
}
/*
- * returns the refcount on the inode. The lock guarantees no new references
- * are added, but references can be dropped as long as the result is > 0.
+ * returns the refcount on the inode. The lock guarantees no 0->1 or 1->0 transitions
+ * of the count are going to take place; otherwise the count can change arbitrarily.
*/
static inline int icount_read(const struct inode *inode)
{
--
2.48.1
next prev parent reply other threads:[~2026-03-31 16:09 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-03-31 16:08 [PATCH v5 0/4] assorted ->i_count changes + extension of lockless handling Mateusz Guzik
2026-03-31 16:08 ` [PATCH v5 1/4] fs: add icount_read_once() and stop open-coding ->i_count loads Mateusz Guzik
2026-04-01 17:29 ` Jan Kara
2026-03-31 16:08 ` [PATCH v5 2/4] fs: relocate and tidy up ihold() Mateusz Guzik
2026-04-01 17:28 ` Jan Kara
2026-03-31 16:08 ` [PATCH v5 3/4] fs: handle potential filesystems which use I_DONTCACHE and drop the lock in ->drop_inode Mateusz Guzik
2026-04-01 17:45 ` Jan Kara
2026-04-01 18:50 ` Mateusz Guzik
2026-04-09 13:40 ` Christian Brauner
2026-04-09 14:55 ` Mateusz Guzik
2026-04-10 9:41 ` Christian Brauner
2026-03-31 16:08 ` Mateusz Guzik [this message]
2026-04-08 17:01 ` [PATCH v5 4/4] fs: allow lockless ->i_count bumps as long as it does not transition 0->1 Jan Kara
2026-04-18 12:32 ` Mateusz Guzik
2026-04-20 7:56 ` Jan Kara
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260331160851.3854954-5-mjguzik@gmail.com \
--to=mjguzik@gmail.com \
--cc=brauner@kernel.org \
--cc=jack@suse.cz \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=viro@zeniv.linux.org.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.