* [PATCH v4 1/4] fs: add icount_read_once() and stop open-coding ->i_count loads
2026-03-30 12:25 [PATCH v4 0/4] assorted ->i_count changes + extension of lockless handling Mateusz Guzik
@ 2026-03-30 12:25 ` Mateusz Guzik
2026-03-30 12:26 ` [PATCH v4 2/4] fs: relocate and tidy up ihold() Mateusz Guzik
` (2 subsequent siblings)
3 siblings, 0 replies; 5+ messages in thread
From: Mateusz Guzik @ 2026-03-30 12:25 UTC (permalink / raw)
To: brauner; +Cc: viro, jack, linux-kernel, linux-fsdevel, Mateusz Guzik
Similarly to inode_state_read_once(), it makes the caller spell out
they acknowledge instability of the returned value.
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
arch/powerpc/platforms/cell/spufs/file.c | 2 +-
fs/btrfs/inode.c | 2 +-
fs/ceph/mds_client.c | 2 +-
fs/ext4/ialloc.c | 4 ++--
fs/hpfs/inode.c | 2 +-
fs/inode.c | 12 ++++++------
fs/nfs/inode.c | 4 ++--
fs/smb/client/inode.c | 2 +-
fs/ubifs/super.c | 2 +-
fs/xfs/xfs_inode.c | 2 +-
fs/xfs/xfs_trace.h | 2 +-
include/linux/fs.h | 13 +++++++++++++
include/trace/events/filelock.h | 2 +-
security/landlock/fs.c | 2 +-
14 files changed, 33 insertions(+), 20 deletions(-)
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 10fa9b844fcc..f6de8c1169d5 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -1430,7 +1430,7 @@ static int spufs_mfc_open(struct inode *inode, struct file *file)
if (ctx->owner != current->mm)
return -EINVAL;
- if (icount_read(inode) != 1)
+ if (icount_read_once(inode) != 1)
return -EBUSY;
mutex_lock(&ctx->mapping_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8d97a8ad3858..f36c49e83c04 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4741,7 +4741,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root)
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
- if (icount_read(&inode->vfs_inode) > 1)
+ if (icount_read_once(&inode->vfs_inode) > 1)
d_prune_aliases(&inode->vfs_inode);
min_ino = btrfs_ino(inode) + 1;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b1746273f186..2cb3c919d40d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2223,7 +2223,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg)
int count;
dput(dentry);
d_prune_aliases(inode);
- count = icount_read(inode);
+ count = icount_read_once(inode);
if (count == 1)
(*remaining)--;
doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n",
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 3fd8f0099852..8c80d5087516 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -252,10 +252,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
"nonexistent device\n", __func__, __LINE__);
return;
}
- if (icount_read(inode) > 1) {
+ if (icount_read_once(inode) > 1) {
ext4_msg(sb, KERN_ERR, "%s:%d: inode #%llu: count=%d",
__func__, __LINE__, inode->i_ino,
- icount_read(inode));
+ icount_read_once(inode));
return;
}
if (inode->i_nlink) {
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 0e932cc8be1b..1b4fcf760aad 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -184,7 +184,7 @@ void hpfs_write_inode(struct inode *i)
struct hpfs_inode_info *hpfs_inode = hpfs_i(i);
struct inode *parent;
if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return;
- if (hpfs_inode->i_rddir_off && !icount_read(i)) {
+ if (hpfs_inode->i_rddir_off && !icount_read_once(i)) {
if (*hpfs_inode->i_rddir_off)
pr_err("write_inode: some position still there\n");
kfree(hpfs_inode->i_rddir_off);
diff --git a/fs/inode.c b/fs/inode.c
index 5ad169d51728..1f5a383ccf27 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -907,7 +907,7 @@ void evict_inodes(struct super_block *sb)
again:
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
- if (icount_read(inode))
+ if (icount_read_once(inode))
continue;
spin_lock(&inode->i_lock);
@@ -1926,7 +1926,7 @@ static void iput_final(struct inode *inode)
int drop;
WARN_ON(inode_state_read(inode) & I_NEW);
- VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
+ VFS_BUG_ON_INODE(icount_read(inode) != 0, inode);
if (op->drop_inode)
drop = op->drop_inode(inode);
@@ -1945,7 +1945,7 @@ static void iput_final(struct inode *inode)
* Re-check ->i_count in case the ->drop_inode() hooks played games.
* Note we only execute this if the verdict was to drop the inode.
*/
- VFS_BUG_ON_INODE(atomic_read(&inode->i_count) != 0, inode);
+ VFS_BUG_ON_INODE(icount_read(inode) != 0, inode);
if (drop) {
inode_state_set(inode, I_FREEING);
@@ -1989,7 +1989,7 @@ void iput(struct inode *inode)
* equal to one, then two CPUs racing to further drop it can both
* conclude it's fine.
*/
- VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 1, inode);
+ VFS_BUG_ON_INODE(icount_read_once(inode) < 1, inode);
if (atomic_add_unless(&inode->i_count, -1, 1))
return;
@@ -2023,7 +2023,7 @@ EXPORT_SYMBOL(iput);
void iput_not_last(struct inode *inode)
{
VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_CLEAR), inode);
- VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode);
+ VFS_BUG_ON_INODE(icount_read_once(inode) < 2, inode);
WARN_ON(atomic_sub_return(1, &inode->i_count) == 0);
}
@@ -3046,7 +3046,7 @@ void dump_inode(struct inode *inode, const char *reason)
}
state = inode_state_read_once(inode);
- count = atomic_read(&inode->i_count);
+ count = icount_read_once(inode);
if (!sb ||
get_kernel_nofault(s_type, &sb->s_type) || !s_type ||
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 98a8f0de1199..22834eddd5b1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -608,7 +608,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
inode->i_sb->s_id,
(unsigned long long)NFS_FILEID(inode),
nfs_display_fhandle_hash(fh),
- icount_read(inode));
+ icount_read_once(inode));
out:
return inode;
@@ -2261,7 +2261,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
dfprintk(VFS, "NFS: %s(%s/%llu fh_crc=0x%08x ct=%d info=0x%llx)\n",
__func__, inode->i_sb->s_id, inode->i_ino,
nfs_display_fhandle_hash(NFS_FH(inode)),
- icount_read(inode), fattr->valid);
+ icount_read_once(inode), fattr->valid);
if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) {
/* Only a mounted-on-fileid? Just exit */
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 888f9e35f14b..ab35e35b16d7 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -2842,7 +2842,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
}
cifs_dbg(FYI, "Update attributes: %s inode 0x%p count %d dentry: 0x%p d_time %ld jiffies %ld\n",
- full_path, inode, icount_read(inode),
+ full_path, inode, icount_read_once(inode),
dentry, cifs_get_time(dentry), jiffies);
again:
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9a77d8b64ffa..38972786817e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -358,7 +358,7 @@ static void ubifs_evict_inode(struct inode *inode)
goto out;
dbg_gen("inode %llu, mode %#x", inode->i_ino, (int)inode->i_mode);
- ubifs_assert(c, !icount_read(inode));
+ ubifs_assert(c, !icount_read_once(inode));
truncate_inode_pages_final(&inode->i_data);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index beaa26ec62da..4f659eba6ae5 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1046,7 +1046,7 @@ xfs_itruncate_extents_flags(
int error = 0;
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- if (icount_read(VFS_I(ip)))
+ if (icount_read_once(VFS_I(ip)))
xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
if (whichfork == XFS_DATA_FORK)
ASSERT(new_size <= XFS_ISIZE(ip));
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5e8190fe2be9..cbdec40826b3 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1156,7 +1156,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class,
TP_fast_assign(
__entry->dev = VFS_I(ip)->i_sb->s_dev;
__entry->ino = ip->i_ino;
- __entry->count = icount_read(VFS_I(ip));
+ __entry->count = icount_read_once(VFS_I(ip));
__entry->pincount = atomic_read(&ip->i_pincount);
__entry->iflags = ip->i_flags;
__entry->caller_ip = caller_ip;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 8afbe2ef2686..07363fce4406 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2225,8 +2225,21 @@ static inline void mark_inode_dirty_sync(struct inode *inode)
__mark_inode_dirty(inode, I_DIRTY_SYNC);
}
+/*
+ * returns the refcount on the inode. it can change arbitrarily.
+ */
+static inline int icount_read_once(const struct inode *inode)
+{
+ return atomic_read(&inode->i_count);
+}
+
+/*
+ * returns the refcount on the inode. The lock guarantees no new references
+ * are added, but references can be dropped as long as the result is > 0.
+ */
static inline int icount_read(const struct inode *inode)
{
+ lockdep_assert_held(&inode->i_lock);
return atomic_read(&inode->i_count);
}
diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h
index 116774886244..c8c8847bb6f6 100644
--- a/include/trace/events/filelock.h
+++ b/include/trace/events/filelock.h
@@ -190,7 +190,7 @@ TRACE_EVENT(generic_add_lease,
__entry->i_ino = inode->i_ino;
__entry->wcount = atomic_read(&inode->i_writecount);
__entry->rcount = atomic_read(&inode->i_readcount);
- __entry->icount = icount_read(inode);
+ __entry->icount = icount_read_once(inode);
__entry->owner = fl->c.flc_owner;
__entry->flags = fl->c.flc_flags;
__entry->type = fl->c.flc_type;
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index c1ecfe239032..32d560f12dbd 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1278,7 +1278,7 @@ static void hook_sb_delete(struct super_block *const sb)
struct landlock_object *object;
/* Only handles referenced inodes. */
- if (!icount_read(inode))
+ if (!icount_read_once(inode))
continue;
/*
--
2.48.1
^ permalink raw reply related [flat|nested] 5+ messages in thread* [PATCH v4 4/4] fs: allow lockless ->i_count bumps as long as it does not transition 0->1
2026-03-30 12:25 [PATCH v4 0/4] assorted ->i_count changes + extension of lockless handling Mateusz Guzik
` (2 preceding siblings ...)
2026-03-30 12:26 ` [PATCH v4 3/4] fs: handle potential filesystems which use I_DONTCACHE and drop the lock in ->drop_inode Mateusz Guzik
@ 2026-03-30 12:26 ` Mateusz Guzik
3 siblings, 0 replies; 5+ messages in thread
From: Mateusz Guzik @ 2026-03-30 12:26 UTC (permalink / raw)
To: brauner; +Cc: viro, jack, linux-kernel, linux-fsdevel, Mateusz Guzik
With this change only 0->1 and 1->0 transitions need the lock.
I verified all places which look at the refcount either only care about
it staying 0 (and have the lock enforce it) or don't hold the inode lock
to begin with (making the above change irrelevant to their correcness or
lack thereof).
I also confirmed nfs and btrfs like to call into these a lot and now
avoid the lock in the common case, shaving off some atomics.
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
fs/dcache.c | 4 +++
fs/inode.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++
include/linux/fs.h | 4 +--
3 files changed, 71 insertions(+), 2 deletions(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 9ceab142896f..b63450ebb85c 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2033,6 +2033,10 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
__d_instantiate(entry, inode);
spin_unlock(&entry->d_lock);
WARN_ON(!(inode_state_read(inode) & I_NEW));
+ /*
+ * Paired with igrab_try_lockless()
+ */
+ smp_wmb();
inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
diff --git a/fs/inode.c b/fs/inode.c
index a06edd557e84..5e5cddd76a9d 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1029,6 +1029,7 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
}
static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked);
+static bool igrab_try_lockless(struct inode *inode);
/*
* Called with the inode lock held.
@@ -1053,6 +1054,11 @@ static struct inode *find_inode(struct super_block *sb,
continue;
if (!test(inode, data))
continue;
+ if (igrab_try_lockless(inode)) {
+ rcu_read_unlock();
+ *isnew = false;
+ return inode;
+ }
spin_lock(&inode->i_lock);
if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
__wait_on_freeing_inode(inode, hash_locked, true);
@@ -1095,6 +1101,11 @@ static struct inode *find_inode_fast(struct super_block *sb,
continue;
if (inode->i_sb != sb)
continue;
+ if (igrab_try_lockless(inode)) {
+ rcu_read_unlock();
+ *isnew = false;
+ return inode;
+ }
spin_lock(&inode->i_lock);
if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
__wait_on_freeing_inode(inode, hash_locked, true);
@@ -1212,6 +1223,10 @@ void unlock_new_inode(struct inode *inode)
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
WARN_ON(!(inode_state_read(inode) & I_NEW));
+ /*
+ * Paired with igrab_try_lockless()
+ */
+ smp_wmb();
inode_state_clear(inode, I_NEW | I_CREATING);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
@@ -1223,6 +1238,10 @@ void discard_new_inode(struct inode *inode)
lockdep_annotate_inode_mutex_key(inode);
spin_lock(&inode->i_lock);
WARN_ON(!(inode_state_read(inode) & I_NEW));
+ /*
+ * Paired with igrab_try_lockless()
+ */
+ smp_wmb();
inode_state_clear(inode, I_NEW);
inode_wake_up_bit(inode, __I_NEW);
spin_unlock(&inode->i_lock);
@@ -1582,6 +1601,14 @@ EXPORT_SYMBOL(ihold);
struct inode *igrab(struct inode *inode)
{
+ /*
+ * Read commentary above igrab_try_lockless() for an explanation why this works.
+ */
+ if (atomic_add_unless(&inode->i_count, 1, 0)) {
+ VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode);
+ return inode;
+ }
+
spin_lock(&inode->i_lock);
if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) {
__iget(inode);
@@ -1599,6 +1626,44 @@ struct inode *igrab(struct inode *inode)
}
EXPORT_SYMBOL(igrab);
+/*
+ * igrab_try_lockless - special inode refcount acquire primitive for the inode hash
+ * (don't use elsewhere!)
+ *
+ * It provides lockless refcount acquire in the common case of no problematic
+ * flags being set and the count being > 0.
+ *
+ * There are 4 state flags to worry about and the routine makes sure to not bump the
+ * ref if any of them is present.
+ *
+ * I_NEW and I_CREATING can only legally get set *before* the inode becomes visible
+ * during lookup. Thus if the flags are not spotted, they are guaranteed to not be
+ * a factor. However, we need an acquire fence before returning the inode just
+ * in case we raced against clearing the state to make sure our consumer picks up
+ * any other changes made prior. atomic_add_unless provides a full fence, which
+ * takes care of it.
+ *
+ * I_FREEING and I_WILL_FREE can only legally get set if ->i_count == 0 and it is
+ * illegal to bump the ref if either is present. Consequently if atomic_add_unless
+ * managed to replaced a non-0 value with a bigger one, we have a guarantee neither
+ * of these flags is set. Note this means explicitly checking of these flags below
+ * is not necessary, it is only done because it does not cost anything on top of the
+ * load which already needs to be done to handle the other flags.
+ */
+static bool igrab_try_lockless(struct inode *inode)
+{
+ if (inode_state_read_once(inode) & (I_NEW | I_CREATING | I_FREEING | I_WILL_FREE))
+ return false;
+ /*
+ * Paired with routines clearing I_NEW
+ */
+ if (atomic_add_unless(&inode->i_count, 1, 0)) {
+ VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode);
+ return true;
+ }
+ return false;
+}
+
/**
* ilookup5_nowait - search for an inode in the inode cache
* @sb: super block of file system to search
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 07363fce4406..119e0a3d2f42 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2234,8 +2234,8 @@ static inline int icount_read_once(const struct inode *inode)
}
/*
- * returns the refcount on the inode. The lock guarantees no new references
- * are added, but references can be dropped as long as the result is > 0.
+ * returns the refcount on the inode. The lock guarantees no 0->1 or 1->0 transitions
+ * of the count are going to take place, otherwise it changes arbitrarily.
*/
static inline int icount_read(const struct inode *inode)
{
--
2.48.1
^ permalink raw reply related [flat|nested] 5+ messages in thread