* [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree()
@ 2026-01-22 20:20 Al Viro
2026-01-23 0:19 ` Linus Torvalds
` (2 more replies)
0 siblings, 3 replies; 33+ messages in thread
From: Al Viro @ 2026-01-22 20:20 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov,
Max Kellermann
There's a case in which shrink_dcache_tree() ends up busy-waiting: if some
dentry in the subtree in question is found to be in process of being evicted
by another thread. We need to wait for that to finish so that parent would
no longer be pinned, to avoid the situations when nothing in the tree is
busy, but shrink_dcache_tree() fails to evict some directory only because
memory pressure initiated eviction of some of its children before we got to
evicting those ourselves. That would be bogus both for shrink_dcache_parent()
and for shrink_dcache_for_umount().
Unfortunately, we have nothing to wait on. That had led to the possibility
of busy-waiting - getting through the iteration of shrink_dcache_tree() main
loop without having made any progress. That's Not Nice(tm) and that had been
discussed quite a few times since at least 2018. Recently it became obvious
that this goes beyond "not nice" - on sufficiently contrived setup it's
possible to get a livelock there, with both threads involved tied to the same
CPU, shrink_dcache_tree() one with higher priority than the thread that has
given CPU up on might_sleep() in the very beginning of iput() during eviction
of the dentry in the tree shrink_dcache_tree() is busy-waiting for.
Let's get rid of that busy-waiting. Constraints are
* don't grow struct dentry
* don't slow the normal case of __dentry_kill() down
and it turns out to be doable.
Dentries in question are
* already marked dead (negative ->d_count)
* already negative
* already unhashed
* already not in in-lookup hash
* yet to get removed from ->d_sib and get DCACHE_DENTRY_KILLED in
flags.
Neither ->d_alias nor the fields overlapping it (->d_rcu and ->d_in_lookup_hash)
are going to be accessed for these dentries until after dentry_unlist(). What's
more, ->d_alias.next is guaranteed to be NULL.
So we can embed struct completion into struct select_data and (ab)use
->d_alias.next for linked list of struct select_data instances.
If dentry_unlist() finds ->d_alias.next non-NULL, it carefully goes over that
list and calls complete() for each of those.
That way select_collect2() can treat negative ->d_count the same way it deals
with dentries on other thread's shrink list - grab rcu_read_lock(), stash the
dentry into data.victim and tell d_walk() to stop.
If shrink_dcache_parent() runs into that case, it should attach its select_data
to victim dentry, evict whatever normal eviction candidates it has gathered
and wait for completion. Voila...
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/dcache.c b/fs/dcache.c
index dc2fff4811d1..6db72a684d8d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -605,6 +605,54 @@ void d_drop(struct dentry *dentry)
}
EXPORT_SYMBOL(d_drop);
+struct select_data {
+ struct dentry *start;
+ union {
+ long found;
+ struct {
+ struct dentry *victim;
+ struct select_data *next;
+ struct completion completion;
+ };
+ };
+ struct list_head dispose;
+};
+
+/*
+ * shrink_dcache_parent() needs to be notified when dentry in process of
+ * being evicted finally gets unlisted. Such dentries are
+ * already with negative ->d_count
+ * already negative
+ * already not in in-lookup hash
+ * reachable only via ->d_sib.
+ *
+ * Neither ->d_alias, nor ->d_rcu, nor ->d_in_lookup_hash are going to be
+ * accessed for those, so we can (ab)use ->d_alias.next for list of
+ * select_data of waiters. Initially it's going to be NULL and as long
+ * as dentry_unlist() returns it to that state we are fine.
+ */
+static inline void d_add_waiter(struct dentry *dentry, struct select_data *p)
+{
+ struct select_data *v = (void *)dentry->d_u.d_alias.next;
+ init_completion(&p->completion);
+ p->next = v;
+ dentry->d_u.d_alias.next = (void *)p;
+}
+
+static inline void d_complete_waiters(struct dentry *dentry)
+{
+ struct select_data *v = (void *)dentry->d_u.d_alias.next;
+ if (unlikely(v)) {
+ /* some shrink_dcache_tree() instances are waiting */
+ dentry->d_u.d_alias.next = NULL;
+ while (v) {
+ struct completion *r = &v->completion;
+ v = v->next;
+ complete(r);
+ }
+ }
+}
+
static inline void dentry_unlist(struct dentry *dentry)
{
struct dentry *next;
@@ -613,6 +661,7 @@ static inline void dentry_unlist(struct dentry *dentry)
* attached to the dentry tree
*/
dentry->d_flags |= DCACHE_DENTRY_KILLED;
+ d_complete_waiters(dentry);
if (unlikely(hlist_unhashed(&dentry->d_sib)))
return;
__hlist_del(&dentry->d_sib);
@@ -1499,15 +1548,6 @@ int d_set_mounted(struct dentry *dentry)
* constraints.
*/
-struct select_data {
- struct dentry *start;
- union {
- long found;
- struct dentry *victim;
- };
- struct list_head dispose;
-};
-
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
struct select_data *data = _data;
@@ -1559,6 +1599,10 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
return D_WALK_QUIT;
}
to_shrink_list(dentry, &data->dispose);
+ } else if (dentry->d_lockref.count < 0) {
+ rcu_read_lock();
+ data->victim = dentry;
+ return D_WALK_QUIT;
}
/*
* We can return to the caller if we have found some (this
@@ -1598,12 +1642,26 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount)
data.victim = NULL;
d_walk(parent, &data, select_collect2);
if (data.victim) {
- spin_lock(&data.victim->d_lock);
- if (!lock_for_kill(data.victim)) {
- spin_unlock(&data.victim->d_lock);
+ struct dentry *v = data.victim;
+
+ spin_lock(&v->d_lock);
+ if (v->d_lockref.count < 0 &&
+ !(v->d_flags & DCACHE_DENTRY_KILLED)) {
+ // It's busy dying; have it notify us once
+ // it becomes invisible to d_walk().
+ d_add_waiter(v, &data);
+ spin_unlock(&v->d_lock);
+ rcu_read_unlock();
+ if (!list_empty(&data.dispose))
+ shrink_dentry_list(&data.dispose);
+ wait_for_completion(&data.completion);
+ continue;
+ }
+ if (!lock_for_kill(v)) {
+ spin_unlock(&v->d_lock);
rcu_read_unlock();
} else {
- shrink_kill(data.victim);
+ shrink_kill(v);
}
}
if (!list_empty(&data.dispose))
^ permalink raw reply related [flat|nested] 33+ messages in thread* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-22 20:20 [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() Al Viro @ 2026-01-23 0:19 ` Linus Torvalds 2026-01-23 0:36 ` Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 " Al Viro 2 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-01-23 0:19 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Thu, 22 Jan 2026 at 12:18, Al Viro <viro@zeniv.linux.org.uk> wrote: > > +static inline void d_add_waiter(struct dentry *dentry, struct select_data *p) > +{ > + struct select_data *v = (void *)dentry->d_u.d_alias.next; > + init_completion(&p->completion); > + p->next = v; > + dentry->d_u.d_alias.next = (void *)p; > +} I tend to not love it when I see new users of completions - I've seen too many mis-uses - but this does seem to be a good use-case for them. That said, I absolutely abhor your cast. Christ - that 'd_u' is *already* a union, exactly because that thing gets used for different things - just add a new union member, instead of mis-using an existing union member that then requires you to cast the data to a different form. Yes, you had an explanation for why you used d_alias.next, but please make that explanation be in the union itself, not in the commit message of something that mis-uses the union. Please? That way there's no need for a cast, and you can name that new union member something that also clarifies things on a source level ("eviction_completion" or whatever). Or am I missing something? Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-23 0:19 ` Linus Torvalds @ 2026-01-23 0:36 ` Al Viro 2026-01-24 4:36 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-01-23 0:36 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Thu, Jan 22, 2026 at 04:19:56PM -0800, Linus Torvalds wrote: > On Thu, 22 Jan 2026 at 12:18, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > +static inline void d_add_waiter(struct dentry *dentry, struct select_data *p) > > +{ > > + struct select_data *v = (void *)dentry->d_u.d_alias.next; > > + init_completion(&p->completion); > > + p->next = v; > > + dentry->d_u.d_alias.next = (void *)p; > > +} > > I tend to not love it when I see new users of completions - I've seen > too many mis-uses - but this does seem to be a good use-case for them. > > That said, I absolutely abhor your cast. Christ - that 'd_u' is > *already* a union, exactly because that thing gets used for different > things - just add a new union member, instead of mis-using an existing > union member that then requires you to cast the data to a different > form. > > Yes, you had an explanation for why you used d_alias.next, but please > make that explanation be in the union itself, not in the commit > message of something that mis-uses the union. Please? > > That way there's no need for a cast, and you can name that new union > member something that also clarifies things on a source level > ("eviction_completion" or whatever). > > Or am I missing something? In practice it doesn't really matter, but we don't want to initialize that field to NULL - no good place for doing that. Sure, the entire d_alias has been subject to hlist_del_init() or INIT_HLIST_NODE(), so any pointer field unioned with it will end up being NULL without any assignments to it, but... ugh. 
"We have a union of two-pointer struct, a pointer and some other stuff; we'd set both members of that struct member to NULL and count upon the pointer member of union having been zeroed by that" leaves a bad taste. ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-23 0:36 ` Al Viro @ 2026-01-24 4:36 ` Al Viro 2026-01-24 4:46 ` Linus Torvalds 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-01-24 4:36 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Fri, Jan 23, 2026 at 12:36:51AM +0000, Al Viro wrote: > On Thu, Jan 22, 2026 at 04:19:56PM -0800, Linus Torvalds wrote: > > On Thu, 22 Jan 2026 at 12:18, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > > +static inline void d_add_waiter(struct dentry *dentry, struct select_data *p) > > > +{ > > > + struct select_data *v = (void *)dentry->d_u.d_alias.next; > > > + init_completion(&p->completion); > > > + p->next = v; > > > + dentry->d_u.d_alias.next = (void *)p; > > > +} > > > > I tend to not love it when I see new users of completions - I've seen > > too many mis-uses - but this does seem to be a good use-case for them. > > > > That said, I absolutely abhor your cast. Christ - that 'd_u' is > > *already* a union, exactly because that thing gets used for different > > things - just add a new union member, instead of mis-using an existing > > union member that then requires you to cast the data to a different > > form. > > > > Yes, you had an explanation for why you used d_alias.next, but please > > make that explanation be in the union itself, not in the commit > > message of something that mis-uses the union. Please? > > > > That way there's no need for a cast, and you can name that new union > > member something that also clarifies things on a source level > > ("eviction_completion" or whatever). > > > > Or am I missing something? > > In practice it doesn't really matter, but we don't want to initialize > that field to NULL - no good place for doing that. 
Sure, the entire > d_alias has been subject to hlist_del_init() or INIT_HLIST_NODE(), so > any pointer field unioned with it will end up being NULL without > any assignments to it, but... ugh. "We have a union of two-pointer > struct, a pointer and some other stuff; we'd set both members of that > struct member to NULL and count upon the pointer member of union > having been zeroed by that" leaves a bad taste. FWIW, there's another reason, but it's not one I'm fond of. There are few places where we use hlist_unhashed(&dentry->d_u.d_alias) as a debugging check. I _think_ that none of those are reachable for dentries in that state, but then all of them are of "it should evaluate true unless we have a kernel bug" kind. I'm not saying it's a good reason. As the matter of fact, the tests are misspelled checks for dentry being negative. _If_ we kill those, we could declare the state of ->d_u.d_alias undefined for negative dentries and have transitions to that state explicitly clear the new field instead. IOW, hlist_del_init() in dentry_unlink_inode() becomes __hlist_del(&dentry->d_u.d_alias); // !hlist_unhashed() guaranteed dentry->d_u.shrink_waiters = NULL; INIT_HLIST_NODE(&dentry->d_u.d_alias) in __d_alloc() and __d_lookup_unhash() simply dentry->d_u.shrink_waiters = NULL; Then we are out of the nasal demon country. Debugging tests are interesting, though. We have them in dentry_free() - WARN_ON if on alias list; what we want is rather "have DENTRY_KILLED in flags", and I would probably add "no PERSISTENT" as well as "no LRU_LIST" there. d_instantiate(), d_instantiate_new(), d_make_persistent(), d_mark_tmpfile() - a mix of WARN_ON() and BUG_ON(); in all cases it should be "it must be negative here", and in all cases it's done before ->d_lock is taken. Not wanting an oops while holding ->d_lock (or ->i_lock, for that matter) on anything is understandable, and stability is actually provided by the callers, but... 
it's still confusing for readers, especially since the real proof of stability is nowhere near trivial. I'd probably go for d_really_is_negative() in all of those; it might make sense to unify some of that boilerplate, but that's for the next cycle... ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-24 4:36 ` Al Viro @ 2026-01-24 4:46 ` Linus Torvalds 2026-01-24 5:36 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-01-24 4:46 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Fri, 23 Jan 2026 at 20:34, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > In practice it doesn't really matter, but we don't want to initialize > > that field to NULL - no good place for doing that. Sure, the entire > > d_alias has been subject to hlist_del_init() or INIT_HLIST_NODE(), so > > any pointer field unioned with it will end up being NULL without > > any assignments to it, but... ugh. "We have a union of two-pointer > > struct, a pointer and some other stuff; we'd set both members of that > > struct member to NULL and count upon the pointer member of union > > having been zeroed by that" leaves a bad taste. > > FWIW, there's another reason, but it's not one I'm fond of. There are few > places where we use hlist_unhashed(&dentry->d_u.d_alias) as a debugging > check. I _think_ that none of those are reachable for dentries in that state, > but then all of them are of "it should evaluate true unless we have a kernel > bug" kind. > Note that I'm not complaining about re-using the 'dentry->d_u.d_alias.next' field. I'm complaining about re-using it WITH A CAST, AND WITHOUT DOCUMENTING IT IN THE TYPE DECLARATION. So by all means use that field, and use that value, but use it *BY* making it a proper union member, and by documenting the lifetime change of that union member. None of this "randomly cast a different pointer to 'void *' where it will then be implicitly cast to to 'struct select_data *'" garbage. 
Just make a proper "struct select_data *" union member that aliases that "d_alias.next" field, and the compiler will generate the EXACT same code, except the source code will be cleaner, and you won't need any hacky pointer casts. And document how that field is NULL when the dentry is killed, and how that NULL 'dentry->d_u.d_alias.next' field at that point becomes a NULL 'dentry->d_u.d_select_data' field. You don't need to describe 'struct select_data', you just need to declare it. IOW, something like this: --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -89,6 +89,11 @@ union shortname_store { #define d_lock d_lockref.lock #define d_iname d_shortname.string +// The d_alias list becomes a 'struct select_data' pointer when +// the dentry is killed (so the NULL d_alias.next pointer becomes +// a NULL 'struct select_data *' +struct select_data; + struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ @@ -128,6 +133,7 @@ struct dentry { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; + struct select_data *d_select_data; /* only for killed dentries */ } d_u; }; but I obviously didn't test it. Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-24 4:46 ` Linus Torvalds @ 2026-01-24 5:36 ` Al Viro 2026-01-24 17:45 ` Linus Torvalds 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-01-24 5:36 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Fri, Jan 23, 2026 at 08:46:52PM -0800, Linus Torvalds wrote: > Just make a proper "struct select_data *" union member that aliases > that "d_alias.next" field, and the compiler will generate the EXACT > same code, except the source code will be cleaner, and you won't need > any hacky pointer casts. > > And document how that field is NULL when the dentry is killed, and how > that NULL 'dentry->d_u.d_alias.next' field at that point becomes a > NULL 'dentry->d_u.d_select_data' field. > > You don't need to describe 'struct select_data', you just need to > declare it. IOW, something like this: As the matter of fact, that _was_ the previous iteration of that patch - see http://ftp.linux.org.uk/people/viro/y8 The only trouble is that as soon as some joker slaps __randomize_layout on struct hlist_node they'll start flipping from sharing with ->next to sharing with ->pprev, at random. I'm not saying it wouldn't work, but I would rather have the proofs of correctness less subtle. And it's not even hard to do - the only rule added would be that ->d_u.d_alias should never be accessed for negative dentries and never without ->i_lock on the inode of dentry in question. The only places where it does not hold at the moment are those WARN_ON() and we'll be better off having those spelled in less obscure way; we want to verify that dentry is negative, so let's express that in the idiomatic way. And that's it - with that done, we can add a field, obviously with forward declaration of struct select_data, etc. and have it explicitly initialized whenever dentry goes negative. Instead of zeroing ->d_u.d_alias.{next,pprev} as we do now. 
Currently !hlist_unlinked(&dentry->d_u.d_alias) is equivalent to dentry->d_inode != NULL, with identical stability requirements. And nobody ever traverses that hlist without ->i_lock - no RCU accesses there. We do have lockless checks that list is not empty (right before grabbing ->i_lock and rechecking), but those come from the inode side; "are there any aliases for this inode" rather than "is this dentry an alias for anything (== positive)". I'm putting together short documentation on d_inode/d_alias/i_dentry/type bits in d_flags; should be done by tomorrow morning... PS: a fun catch while doing that code audit - AFAICS, we don't really need to play with fake root dentry for NFS anymore; the reason why it used to be needed had been gone since 2013 as an unnoticed side effect of switching shrink_dcache_for_umount() to use of d_walk()... Obviously needs a review from NFS folks, but if they see no problems with that, it would be nice to get rid of that kludge, as in diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index f13d25d95b85..2ac8404e1a15 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -32,35 +32,6 @@ #define NFSDBG_FACILITY NFSDBG_CLIENT -/* - * Set the superblock root dentry. - * Note that this function frees the inode in case of error. - */ -static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode) -{ - /* The mntroot acts as the dummy root dentry for this superblock */ - if (sb->s_root == NULL) { - sb->s_root = d_make_root(inode); - if (sb->s_root == NULL) - return -ENOMEM; - ihold(inode); - /* - * Ensure that this dentry is invisible to d_find_alias(). - * Otherwise, it may be spliced into the tree by - * d_splice_alias if a parent directory from the same - * filesystem gets mounted at a later time. - * This again causes shrink_dcache_for_umount_subtree() to - * Oops, since the test for IS_ROOT() will fail. 
- */ - spin_lock(&d_inode(sb->s_root)->i_lock); - spin_lock(&sb->s_root->d_lock); - hlist_del_init(&sb->s_root->d_u.d_alias); - spin_unlock(&sb->s_root->d_lock); - spin_unlock(&d_inode(sb->s_root)->i_lock); - } - return 0; -} - /* * get a root dentry from the root filehandle */ @@ -99,10 +70,6 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) goto out_fattr; } - error = nfs_superblock_set_dummy_root(s, inode); - if (error != 0) - goto out_fattr; - /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root @@ -115,7 +82,6 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) goto out_fattr; } - security_d_instantiate(root, inode); spin_lock(&root->d_lock); if (IS_ROOT(root) && !root->d_fsdata && !(root->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -123,6 +89,8 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) name = NULL; } spin_unlock(&root->d_lock); + if (!s->s_root) + s->s_root = dget(root); fc->root = root; if (server->caps & NFS_CAP_SECURITY_LABEL) kflags |= SECURITY_LSM_NATIVE_LABELS; ^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-24 5:36 ` Al Viro @ 2026-01-24 17:45 ` Linus Torvalds 2026-01-24 18:43 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-01-24 17:45 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Fri, 23 Jan 2026 at 21:34, Al Viro <viro@zeniv.linux.org.uk> wrote: > > The only trouble is that as soon as some joker slaps __randomize_layout > on struct hlist_node they'll start flipping from sharing with ->next to > sharing with ->pprev, at random. If somebody starts using randomize_layout on core data structures, they get what they deserve. We have tons of data structures that are *NOT* randomizable. In fact, RANDSTRUCT is so broken in general that we actually taint the kernel if you enable that crazy option in the first place. So no, "what if somebody enables it on random things" is not even remotely worth worrying about. Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-24 17:45 ` Linus Torvalds @ 2026-01-24 18:43 ` Al Viro 2026-01-24 19:32 ` Linus Torvalds 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-01-24 18:43 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Sat, Jan 24, 2026 at 09:45:54AM -0800, Linus Torvalds wrote: > On Fri, 23 Jan 2026 at 21:34, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > The only trouble is that as soon as some joker slaps __randomize_layout > > on struct hlist_node they'll start flipping from sharing with ->next to > > sharing with ->pprev, at random. > > If somebody starts using randomize_layout on core data structures, > they get what they deserve. > > We have tons of data structures that are *NOT* randomizable. > > In fact, RANDSTRUCT is so broken in general that we actually taint the > kernel if you enable that crazy option in the first place. So no, > "what if somebody enables it on random things" is not even remotely > worth worrying about. Very much agreed, but we *do* have that on e.g. struct path (two pointers), as well as struct inode, struct file, struct mount, etc. As far as VFS goes, those are core data structures... While we are at it, does anybody have objections to making dentry->d_u anonymous? We are already using anonymous union members in struct dentry, so compiler support is no longer a consideration. Another thing in the same area: #define for_each_alias(dentry, inode) \ hlist_for_each_entry(dentry, &(inode)->i_dentry, d_u.d_alias) to avoid the boilerplate. Objections? ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-24 18:43 ` Al Viro @ 2026-01-24 19:32 ` Linus Torvalds 2026-01-24 20:28 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-01-24 19:32 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Sat, 24 Jan 2026 at 10:41, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > In fact, RANDSTRUCT is so broken in general that we actually taint the > > kernel if you enable that crazy option in the first place. So no, > > "what if somebody enables it on random things" is not even remotely > > worth worrying about. > > Very much agreed, but we *do* have that on e.g. struct path (two pointers), > as well as struct inode, struct file, struct mount, etc. As far as VFS goes, > those are core data structures... I certainly wouldn't mind if we remove the 'struct path' one in particular. It's insanely pointless to do randstruct on two fields. The inode and file structs make a lot more sense, in that they are obviously not only much bigger, but they are prime candidates for the kind of attacks that randstruct is designed for. > While we are at it, does anybody have objections to making dentry->d_u anonymous? > We are already using anonymous union members in struct dentry, so compiler support > is no longer a consideration. > > Another thing in the same area: > > #define for_each_alias(dentry, inode) \ > hlist_for_each_entry(dentry, &(inode)->i_dentry, d_u.d_alias) > > to avoid the boilerplate. Objections? No objection to either of those from me. Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() 2026-01-24 19:32 ` Linus Torvalds @ 2026-01-24 20:28 ` Al Viro 0 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-01-24 20:28 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann On Sat, Jan 24, 2026 at 11:32:59AM -0800, Linus Torvalds wrote: > On Sat, 24 Jan 2026 at 10:41, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > In fact, RANDSTRUCT is so broken in general that we actually taint the > > > kernel if you enable that crazy option in the first place. So no, > > > "what if somebody enables it on random things" is not even remotely > > > worth worrying about. > > > > Very much agreed, but we *do* have that on e.g. struct path (two pointers), > > as well as struct inode, struct file, struct mount, etc. As far as VFS goes, > > those are core data structures... > > I certainly wouldn't mind if we remove the 'struct path' one in > particular. It's insanely pointless to do randstruct on two fields. <wry> Frankly, each time I talk to these folks I keep hearing "Every bit is wanted/Every bit is good/Every bit is needed/In your neighborhood" set to the obvious tune... </wry> ^ permalink raw reply [flat|nested] 33+ messages in thread
* [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() 2026-01-22 20:20 [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() Al Viro 2026-01-23 0:19 ` Linus Torvalds @ 2026-04-02 18:08 ` Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 1/4] for_each_alias(): helper macro for iterating through dentries of given inode Al Viro ` (4 more replies) 2026-04-04 8:07 ` [RFC PATCH v3 " Al Viro 2 siblings, 5 replies; 33+ messages in thread From: Al Viro @ 2026-04-02 18:08 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara [This got stalled last cycle; back then it had been a single patch, now it's carved up, and hopefully more readable for that. I think Linus' objections should be addressed by that variant - no casts, no void * conversions, etc. As the matter of fact, no type-punning and no reliance upon the layouts of overlapping members of struct dentry either] There's a case in which shrink_dcache_tree() ends up busy-waiting: if some dentry in the subtree in question is found to be in process of being evicted by another thread. We need to wait for that to finish so that parent would no longer be pinned, to avoid the situations when nothing in the tree is busy, but shrink_dcache_tree() fails to evict some directory only because memory pressure initiated eviction of some of its children before we got to evicting those ourselves. That would be bogus both for shrink_dcache_parent() and for shrink_dcache_for_umount(). Unfortunately, we have nothing to wait on. That had led to the possibility of busy-waiting - getting through the iteration of shrink_dcache_tree() main loop without having made any progress. That's Not Nice(tm) and that had been discussed quite a few times since at least 2018. 
Recently it became obvious that this goes beyond "not nice" - on sufficiently contrived setup it's possible to get a livelock there, with both threads involved tied to the same CPU, shrink_dcache_tree() one with higher priority than the thread that has given CPU up on might_sleep() in the very beginning of iput() during eviction of the dentry in the tree shrink_dcache_tree() is busy-waiting for. Let's get rid of that busy-waiting. Constraints are * don't grow struct dentry * don't slow the normal case of __dentry_kill() down Satisfying those turns out to be possible. Dentries in question are * already marked dying (negative ->d_count) * already negative * already unhashed * already not in in-lookup hash * still visible in the dentry tree (reachable via ->d_sib) * yet to get DCACHE_DENTRY_KILLED in flags. Neither ->d_alias (aside of several pathological places, dealt with in #3/4) nor the members overlapping it (->d_rcu and ->d_in_lookup_hash) are going to be accessed for these dentries until after dentry_unlist(). So we can embed struct completion into struct select_data and store a reference to linked list of select_data instances in a new member (->waiters) colocated with ->d_alias. That allows to have shrink_dcache_parent() insert its select_data to ->waiters of dying dentry it needs to wait for and use wait_for_completion(), with complete() coming from dentry_unlist() in case it sees non-NULL ->waiters. No more busy-waiting there... Branch (currently -rc6-based) lives in git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.dcache-busy-wait individual patches in followups. Please, review. It does survive local testing, but extra eyes would be very welcome. Note: in -next CIFS has grown a local copy of d_mark_tmpfile(), which would need the same modification of sanity checks the original gets in #3/4 here. I wonder if that cifs_d_mark_tmpfile() should be turned into d_mark_tmpfile_name() and lifted into fs/dcache.c... 
Anyway, that's a separate story... Shortlog: Al Viro (4): for_each_alias(): helper macro for iterating through dentries of given inode struct dentry: make ->d_u anonymous dcache.c: more idiomatic "positives are not allowed" sanity checks get rid of busy-waiting in shrink_dcache_tree() Diffstat: Documentation/filesystems/porting.rst | 10 ++ fs/affs/amigaffs.c | 2 +- fs/ceph/mds_client.c | 4 +- fs/dcache.c | 138 ++++++++++++++++++-------- fs/exportfs/expfs.c | 2 +- fs/inode.c | 2 +- fs/nfs/dir.c | 4 +- fs/nfs/getroot.c | 2 +- fs/notify/fsnotify.c | 2 +- fs/ocfs2/dcache.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/smb/client/inode.c | 2 +- include/linux/dcache.h | 24 ++++- 13 files changed, 140 insertions(+), 56 deletions(-) Series overview: First a couple of cleanups to reduce the amount of noise: 1) for_each_alias(): helper macro for iterating through dentries of given inode Most of the places using d_alias are loops iterating through all aliases for given inode; introduce a helper macro (for_each_alias(dentry, inode)) and convert open-coded instances of such loop to it. They are easier to read that way and it reduces the noise on the next steps. 2) struct dentry: make ->d_u anonymous Making ->d_rcu and (then) ->d_child overlapping dates back to 2006; anon unions support had been added to gcc only in 4.6 (2011) and the minimal gcc version hadn't been bumped to that until 4.19 (2018). These days there's no reason not to keep that union named. Next we get rid of the aforementioned pathologies: 3) dcache.c: more idiomatic "positives are not allowed" sanity checks Several functions have BUG_ON/WARN_ON sanity checks that want to verify that dentry is not positive and instead of looking at ->d_inode (as we do in all other places that check that) they look at ->d_alias. 
Just use the normal helpers instead - that way we no longer even look at ->d_alias for negative dentries Now everything is ready for adding a new member and killing busy-wait: 4) get rid of busy-waiting in shrink_dcache_tree() If shrink_dcache_parent() runs into a potential victim that is already dying, it must wait for that dentry to go away. To avoid busy-waiting we need some object to wait on and a way for dentry_unlist() to see that we need to be notified. The obvious place for the object to wait on would be inside struct select_data we have on our stack frame. We will store a pointer to select_data in victim dentry; if there's more than one thread wanting to wait for the same dentry to finish dying, we'll have their select_data linked into a list, with reference in dentry pointing to the head of that list. * add struct completion and a pointer to the next select_data into struct select_data; that will be what we wait on and the list linkage. * add a new member (->waiters, opaque pointer to struct select_data) to struct dentry. It is defined for negative live dentries that are not in-lookup ones and it will remain NULL for almost all of them. It does not conflict with ->d_rcu (defined for killed dentries), ->d_alias (defined for positive dentries, all live) or ->d_in_lookup_hash (defined for in-lookup dentries, all live negative). That allows to colocate all four members. * make sure that all places where dentry enters the state where ->waiters is defined (live, negative, not-in-lookup) initialize ->waiters to NULL. * if select_collect2() runs into a dentry that is already dying, have its caller insert its select_data into the head of list that hangs off dentry->waiters and wait for completion. * if dentry_unlist() sees non-NULL ->waiters, have it carefully walk through the select_data on that list, calling complete() for each. ^ permalink raw reply [flat|nested] 33+ messages in thread
* [RFC PATCH v2 1/4] for_each_alias(): helper macro for iterating through dentries of given inode 2026-04-02 18:08 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Al Viro @ 2026-04-02 18:08 ` Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 2/4] struct dentry: make ->d_u anonymous Al Viro ` (3 subsequent siblings) 4 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-02 18:08 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara Most of the places using d_alias are loops iterating through all aliases for given inode; introduce a helper macro (for_each_alias(dentry, inode)) and convert open-coded instances of such loop to it. They are easier to read that way and it reduces the noise on the next steps. You _must_ hold inode->i_lock over that thing. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- Documentation/filesystems/porting.rst | 10 ++++++++++ fs/affs/amigaffs.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/dcache.c | 6 +++--- fs/exportfs/expfs.c | 2 +- fs/nfs/dir.c | 2 +- fs/notify/fsnotify.c | 2 +- fs/ocfs2/dcache.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/smb/client/inode.c | 2 +- include/linux/dcache.h | 4 ++++ 11 files changed, 25 insertions(+), 11 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 52ff1d19405b..9a9babd9ec48 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1361,3 +1361,13 @@ to match what strlen() would return if it was ran on the string. However, if the string is freely accessible for the duration of inode's lifetime, consider using inode_set_cached_link() instead. 
+ +--- + +**recommended** + +If you really need to iterate through dentries for given inode, use +for_each_alias(dentry, inode) instead of hlist_for_each_entry; better +yet, see if any of the exported primitives could be used instead of +the entire loop. You still need to hold ->i_lock of the inode over +either form of manual loop. diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index fd669daa4e7b..91966b1f41f6 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -126,7 +126,7 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino) { struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + for_each_alias(dentry, inode) { if (entry_ino == (u32)(long)dentry->d_fsdata) { dentry->d_fsdata = (void *)inode->i_ino; break; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b1746273f186..f839109fb66f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4614,7 +4614,7 @@ static struct dentry* d_find_primary(struct inode *inode) goto out_unlock; } - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { spin_lock(&alias->d_lock); if (!d_unhashed(alias) && (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { diff --git a/fs/dcache.c b/fs/dcache.c index 7ba1801d8132..e069b6ec4ec0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -790,7 +790,7 @@ void d_mark_dontcache(struct inode *inode) struct dentry *de; spin_lock(&inode->i_lock); - hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { + for_each_alias(de, inode) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); @@ -1040,7 +1040,7 @@ static struct dentry *__d_find_alias(struct inode *inode) if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); @@ -1133,7 +1133,7 @@ void 
d_prune_aliases(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) + for_each_alias(dentry, inode) d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 6c9be60a3e48..f67b3ce672fc 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -52,7 +52,7 @@ find_acceptable_alias(struct dentry *result, inode = result->d_inode; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + for_each_alias(dentry, inode) { dget(dentry); spin_unlock(&inode->i_lock); if (toput) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2402f57c8e7d..5a0bd8113e3a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1471,7 +1471,7 @@ static void nfs_clear_verifier_file(struct inode *inode) struct dentry *alias; struct inode *dir; - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { spin_lock(&alias->d_lock); dir = d_inode_rcu(alias->d_parent); if (!dir || diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 9995de1710e5..b7198c4744e3 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -76,7 +76,7 @@ void fsnotify_set_children_dentry_flags(struct inode *inode) spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. 
Since this is a * directory, there damn well better only be one item on this list */ - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { struct dentry *child; /* run all of the children of the original inode and fix their diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index c4ba968e778b..e06774fd89d8 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -145,7 +145,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + for_each_alias(dentry, inode) { spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { trace_ocfs2_find_local_alias(dentry->d_name.len, diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index ff3dbd1ca61f..f8dfa534b566 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -904,7 +904,7 @@ static void ovl_drop_nlink(struct dentry *dentry) /* Try to find another, hashed alias */ spin_lock(&inode->i_lock); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { if (alias != dentry && !d_unhashed(alias)) break; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 888f9e35f14b..e2b4ad9bd0bd 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1595,7 +1595,7 @@ inode_has_hashed_dentries(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + for_each_alias(dentry, inode) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { spin_unlock(&inode->i_lock); return true; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 898c60d21c92..7f1dbc7121d7 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -615,4 +615,8 @@ void set_default_d_op(struct super_block *, const struct dentry_operations *); struct dentry *d_make_persistent(struct dentry *, struct inode *); void 
d_make_discardable(struct dentry *dentry); +/* inode->i_lock must be held over that */ +#define for_each_alias(dentry, inode) \ + hlist_for_each_entry(dentry, &(inode)->i_dentry, d_u.d_alias) + #endif /* __LINUX_DCACHE_H */ -- 2.47.3 ^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v2 2/4] struct dentry: make ->d_u anonymous 2026-04-02 18:08 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 1/4] for_each_alias(): helper macro for iterating through dentries of given inode Al Viro @ 2026-04-02 18:08 ` Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 3/4] dcache.c: more idiomatic "positives are not allowed" sanity checks Al Viro ` (2 subsequent siblings) 4 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-02 18:08 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara Making ->d_rcu and (then) ->d_child overlapping dates back to 2006; anon unions support had been added to gcc only in 4.6 (2011) and the minimal gcc version hadn't been bumped to that until 4.19 (2018). These days there's no reason not to keep that union named. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/ceph/mds_client.c | 2 +- fs/dcache.c | 48 +++++++++++++++++++++--------------------- fs/inode.c | 2 +- fs/nfs/dir.c | 2 +- fs/nfs/getroot.c | 2 +- include/linux/dcache.h | 4 ++-- 6 files changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f839109fb66f..a5eb99c3c36b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4608,7 +4608,7 @@ static struct dentry* d_find_primary(struct inode *inode) goto out_unlock; if (S_ISDIR(inode->i_mode)) { - alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); if (!IS_ROOT(alias)) dn = dget(alias); goto out_unlock; diff --git a/fs/dcache.c b/fs/dcache.c index e069b6ec4ec0..4378eb8a00bb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -40,7 +40,7 @@ /* * Usage: * dcache->d_inode->i_lock protects: - * - i_dentry, d_u.d_alias, d_inode of aliases + * - i_dentry, d_alias, d_inode of aliases * dcache_hash_bucket 
lock protects: * - the dcache hash table * s_roots bl list spinlock protects: @@ -55,7 +55,7 @@ * - d_unhashed() * - d_parent and d_chilren * - childrens' d_sib and d_parent - * - d_u.d_alias, d_inode + * - d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock @@ -341,14 +341,14 @@ static inline struct external_name *external_name(struct dentry *dentry) static void __d_free(struct rcu_head *head) { - struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + struct dentry *dentry = container_of(head, struct dentry, d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { - struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + struct dentry *dentry = container_of(head, struct dentry, d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } @@ -428,19 +428,19 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) static void dentry_free(struct dentry *dentry) { - WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(!hlist_unhashed(&dentry->d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { - call_rcu(&dentry->d_u.d_rcu, __d_free_external); + call_rcu(&dentry->d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) - __d_free(&dentry->d_u.d_rcu); + __d_free(&dentry->d_rcu); else - call_rcu(&dentry->d_u.d_rcu, __d_free); + call_rcu(&dentry->d_rcu, __d_free); } /* @@ -455,7 +455,7 @@ static void dentry_unlink_inode(struct dentry * dentry) raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); - hlist_del_init(&dentry->d_u.d_alias); + hlist_del_init(&dentry->d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); @@ -1010,7 +1010,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode) if 
(hlist_empty(&inode->i_dentry)) return NULL; - alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); lockref_get(&alias->d_lockref); return alias; } @@ -1093,9 +1093,9 @@ struct dentry *d_find_alias_rcu(struct inode *inode) // used without having I_FREEING set, which means no aliases left if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { - de = hlist_entry(l->first, struct dentry, d_u.d_alias); + de = hlist_entry(l->first, struct dentry, d_alias); } else { - hlist_for_each_entry(de, l, d_u.d_alias) + hlist_for_each_entry(de, l, d_alias) if (!d_unhashed(de)) break; } @@ -1787,7 +1787,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); - INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_HLIST_NODE(&dentry->d_alias); INIT_HLIST_NODE(&dentry->d_sib); if (dentry->d_op && dentry->d_op->d_init) { @@ -1980,7 +1980,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) if ((dentry->d_flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + hlist_add_head(&dentry->d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); @@ -2004,7 +2004,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { - BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); @@ -2024,7 +2024,7 @@ EXPORT_SYMBOL(d_instantiate); */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { - 
BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); @@ -2087,7 +2087,7 @@ static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) spin_lock(&new->d_lock); __d_set_inode_and_type(new, inode, add_flags); - hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); + hlist_add_head(&new->d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&sb->s_roots); hlist_bl_add_head(&new->d_hash, &sb->s_roots); @@ -2658,7 +2658,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, * we unlock the chain. All fields are stable in everything * we encounter. */ - hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { + hlist_bl_for_each_entry(dentry, node, b, d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) @@ -2700,7 +2700,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, } rcu_read_unlock(); new->d_wait = wq; - hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); + hlist_bl_add_head(&new->d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: @@ -2725,11 +2725,11 @@ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; - __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); + __hlist_bl_del(&dentry->d_in_lookup_hash); d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); - INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_HLIST_NODE(&dentry->d_alias); INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } @@ -2760,7 +2760,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode, d_set_d_op(dentry, ops); if (inode) { unsigned add_flags = d_flags_for_inode(inode); - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + hlist_add_head(&dentry->d_alias, &inode->i_dentry); 
raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); @@ -2795,7 +2795,7 @@ EXPORT_SYMBOL(d_add); struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) { - WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(!hlist_unhashed(&dentry->d_alias)); WARN_ON(!inode); security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); @@ -3185,7 +3185,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) struct dentry *dentry = file->f_path.dentry; BUG_ON(dname_external(dentry) || - !hlist_unhashed(&dentry->d_u.d_alias) || + !hlist_unhashed(&dentry->d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); diff --git a/fs/inode.c b/fs/inode.c index cc12b68e021b..9e1ab333d382 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -754,7 +754,7 @@ void dump_mapping(const struct address_space *mapping) return; } - dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + dentry_ptr = container_of(dentry_first, struct dentry, d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5a0bd8113e3a..f2f1b036f2f1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1490,7 +1490,7 @@ static void nfs_clear_verifier_directory(struct inode *dir) if (hlist_empty(&dir->i_dentry)) return; this_parent = - hlist_entry(dir->i_dentry.first, struct dentry, d_u.d_alias); + hlist_entry(dir->i_dentry.first, struct dentry, d_alias); spin_lock(&this_parent->d_lock); nfs_unset_verifier_delegated(&this_parent->d_time); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index f13d25d95b85..eef0736beb67 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -54,7 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i */ 
spin_lock(&d_inode(sb->s_root)->i_lock); spin_lock(&sb->s_root->d_lock); - hlist_del_init(&sb->s_root->d_u.d_alias); + hlist_del_init(&sb->s_root->d_alias); spin_unlock(&sb->s_root->d_lock); spin_unlock(&d_inode(sb->s_root)->i_lock); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7f1dbc7121d7..f939d2ed10a3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -128,7 +128,7 @@ struct dentry { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; - } d_u; + }; }; /* @@ -617,6 +617,6 @@ void d_make_discardable(struct dentry *dentry); /* inode->i_lock must be held over that */ #define for_each_alias(dentry, inode) \ - hlist_for_each_entry(dentry, &(inode)->i_dentry, d_u.d_alias) + hlist_for_each_entry(dentry, &(inode)->i_dentry, d_alias) #endif /* __LINUX_DCACHE_H */ -- 2.47.3 ^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v2 3/4] dcache.c: more idiomatic "positives are not allowed" sanity checks 2026-04-02 18:08 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 1/4] for_each_alias(): helper macro for iterating through dentries of given inode Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 2/4] struct dentry: make ->d_u anonymous Al Viro @ 2026-04-02 18:08 ` Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() Al Viro 2026-04-02 20:28 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Paulo Alcantara 4 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-02 18:08 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara Several functions have BUG_ON/WARN_ON sanity checks that want to verify that dentry is not positive and instead of looking at ->d_inode (as we do in all other places that check that) they look at ->d_alias. 
Just use the normal helpers instead - that way we no longer even look at ->d_alias for negative dentries Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/dcache.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 4378eb8a00bb..616a445ec720 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -428,7 +428,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) static void dentry_free(struct dentry *dentry) { - WARN_ON(!hlist_unhashed(&dentry->d_alias)); + WARN_ON(d_really_is_positive(dentry)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { @@ -2004,7 +2004,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { - BUG_ON(!hlist_unhashed(&entry->d_alias)); + BUG_ON(d_really_is_positive(entry)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); @@ -2024,7 +2024,7 @@ EXPORT_SYMBOL(d_instantiate); */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { - BUG_ON(!hlist_unhashed(&entry->d_alias)); + BUG_ON(d_really_is_positive(entry)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); @@ -2795,7 +2795,7 @@ EXPORT_SYMBOL(d_add); struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) { - WARN_ON(!hlist_unhashed(&dentry->d_alias)); + WARN_ON(d_really_is_positive(dentry)); WARN_ON(!inode); security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); @@ -3185,7 +3185,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) struct dentry *dentry = file->f_path.dentry; BUG_ON(dname_external(dentry) || - !hlist_unhashed(&dentry->d_alias) || + d_really_is_positive(dentry) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); -- 2.47.3 ^ permalink raw 
reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-02 18:08 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Al Viro ` (2 preceding siblings ...) 2026-04-02 18:08 ` [RFC PATCH v2 3/4] dcache.c: more idiomatic "positives are not allowed" sanity checks Al Viro @ 2026-04-02 18:08 ` Al Viro 2026-04-02 19:52 ` Linus Torvalds 2026-04-02 20:28 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Paulo Alcantara 4 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-04-02 18:08 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara If shrink_dcache_parent() runs into a potential victim that is already dying, it must wait for that dentry to go away. To avoid busy-waiting we need some object to wait on and a way for dentry_unlist() to see that we need to be notified. The obvious place for the object to wait on would be inside struct select_data we have on our stack frame. We will store a pointer to select_data in victim dentry; if there's more than one thread wanting to wait for the same dentry to finish dying, we'll have their select_data linked into a list, with reference in dentry pointing to the head of that list. * add struct completion and a pointer to the next select_data into struct select_data; that will be what we wait on and the list linkage. * add a new member (->waiters, opaque pointer to struct select_data) to struct dentry. It is defined for negative live dentries that are not in-lookup ones and it will remain NULL for almost all of them. It does not conflict with ->d_rcu (defined for killed dentries), ->d_alias (defined for positive dentries, all live) or ->d_in_lookup_hash (defined for in-lookup dentries, all live negative). That allows to colocate all four members. 
* make sure that all places where dentry enters the state where ->waiters is defined (live, negative, not-in-lookup) initialize ->waiters to NULL. * if select_collect2() runs into a dentry that is already dying, have its caller insert its select_data into the head of list that hangs off dentry->waiters and wait for completion. * if dentry_unlist() sees non-NULL ->waiters, have it carefully walk through the select_data on that list, calling complete() for each. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/dcache.c | 90 ++++++++++++++++++++++++++++++++++-------- include/linux/dcache.h | 18 +++++++-- 2 files changed, 89 insertions(+), 19 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 616a445ec720..09a80f7f3d05 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -455,7 +455,9 @@ static void dentry_unlink_inode(struct dentry * dentry) raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); - hlist_del_init(&dentry->d_alias); + if (likely(!hlist_unhashed(&dentry->d_alias))) + __hlist_del(&dentry->d_alias); + dentry->waiters = NULL; raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); @@ -605,6 +607,52 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); +struct select_data { + struct dentry *start; + union { + long found; + struct { + struct dentry *victim; + struct select_data *next; + struct completion completion; + }; + }; + struct list_head dispose; +}; + +/* + * shrink_dcache_parent() needs to be notified when dentry in process of + * being evicted finally gets unlisted. Such dentries are + * already with negative ->d_count + * already negative + * already not in in-lookup hash + * reachable only via ->d_sib. + * + * Use ->waiters for a single-linked list of struct select_data of + * waiters. 
+ */ +static inline void d_add_waiter(struct dentry *dentry, struct select_data *p) +{ + struct select_data *v = dentry->waiters; + init_completion(&p->completion); + p->next = v; + dentry->waiters = p; +} + +static inline void d_complete_waiters(struct dentry *dentry) +{ + struct select_data *v = dentry->waiters; + if (unlikely(v)) { + /* some shrink_dcache_tree() instances are waiting */ + dentry->waiters = NULL; + while (v) { + struct completion *r = &v->completion; + v = v->next; + complete(r); + } + } +} + static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; @@ -613,6 +661,7 @@ static inline void dentry_unlist(struct dentry *dentry) * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; + d_complete_waiters(dentry); if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); @@ -1509,15 +1558,6 @@ int d_set_mounted(struct dentry *dentry) * constraints. */ -struct select_data { - struct dentry *start; - union { - long found; - struct dentry *victim; - }; - struct list_head dispose; -}; - static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; @@ -1569,6 +1609,10 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); + } else if (dentry->d_lockref.count < 0) { + rcu_read_lock(); + data->victim = dentry; + return D_WALK_QUIT; } /* * We can return to the caller if we have found some (this @@ -1608,12 +1652,26 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { - spin_lock(&data.victim->d_lock); - if (!lock_for_kill(data.victim)) { - spin_unlock(&data.victim->d_lock); + struct dentry *v = data.victim; + + spin_lock(&v->d_lock); + if (v->d_lockref.count < 0 && + !(v->d_flags & DCACHE_DENTRY_KILLED)) { + // It's busy dying; have it notify us once + // it becomes 
invisible to d_walk(). + d_add_waiter(v, &data); + spin_unlock(&v->d_lock); + rcu_read_unlock(); + if (!list_empty(&data.dispose)) + shrink_dentry_list(&data.dispose); + wait_for_completion(&data.completion); + continue; + } + if (!lock_for_kill(v)) { + spin_unlock(&v->d_lock); rcu_read_unlock(); } else { - shrink_kill(data.victim); + shrink_kill(v); } } if (!list_empty(&data.dispose)) @@ -1787,7 +1845,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); - INIT_HLIST_NODE(&dentry->d_alias); + dentry->waiters = NULL; INIT_HLIST_NODE(&dentry->d_sib); if (dentry->d_op && dentry->d_op->d_init) { @@ -2729,7 +2787,7 @@ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); - INIT_HLIST_NODE(&dentry->d_alias); + dentry->waiters = NULL; INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f939d2ed10a3..07dac37eb75c 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -88,6 +88,7 @@ union shortname_store { #define d_lock d_lockref.lock #define d_iname d_shortname.string +struct select_data; struct dentry { /* RCU lookup touched fields */ @@ -122,12 +123,23 @@ struct dentry { struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* - * d_alias and d_rcu can share memory + * the following members can share memory - their uses are + * mutually exclusive. 
*/ union { - struct hlist_node d_alias; /* inode alias list */ - struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ + /* positives: inode alias list */ + struct hlist_node d_alias; + /* in-lookup ones (all negative, live): hash chain */ + struct hlist_bl_node d_in_lookup_hash; + /* killed ones: (already negative) used to schedule freeing */ struct rcu_head d_rcu; + /* + * live non-in-lookup negatives: used if shrink_dcache_parent() + * races with eviction by another thread and needs to wait for + * this dentry to get killed . Remains NULL for almost all + * negative dentries. + */ + struct select_data *waiters; }; }; -- 2.47.3 ^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-02 18:08 ` [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() Al Viro @ 2026-04-02 19:52 ` Linus Torvalds 2026-04-02 22:44 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-04-02 19:52 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Thu, 2 Apr 2026 at 11:05, Al Viro <viro@zeniv.linux.org.uk> wrote: > > @@ -455,7 +455,9 @@ static void dentry_unlink_inode(struct dentry * dentry) > > raw_write_seqcount_begin(&dentry->d_seq); > __d_clear_type_and_inode(dentry); > - hlist_del_init(&dentry->d_alias); > + if (likely(!hlist_unhashed(&dentry->d_alias))) > + __hlist_del(&dentry->d_alias); > + dentry->waiters = NULL; > raw_write_seqcount_end(&dentry->d_seq); > spin_unlock(&dentry->d_lock); > spin_unlock(&inode->i_lock); I have no real objections to this patch series, but I'd like this to get a big comment here. There's also a 'hlist_del_init(&dentry->d_alias);' in nfs/getroot.c, and that one stays as such. I thnk that's just because it can't actually be a real alias list at that point, and all that case does is to remove its own inode from the initial list to make it always be empty. And so it doesn't have any of this 'negative live under lookup' case. But I'd like to have some commentary about *this* particular one, because this is the subtle one. In particular, I have paged out the history from the last iteration of this all, and the reason why you changed hlist_del_init() to that 'likely(!hlist_unhashed...' escapes me. I'm sure I could look it up and remind myself, but honestly, I'd rather have it just all explained in the code. Please? Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-02 19:52 ` Linus Torvalds @ 2026-04-02 22:44 ` Al Viro 2026-04-02 22:49 ` Linus Torvalds 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-04-02 22:44 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Thu, Apr 02, 2026 at 12:52:28PM -0700, Linus Torvalds wrote: > There's also a 'hlist_del_init(&dentry->d_alias);' in nfs/getroot.c, > and that one stays as such. > > I thnk that's just because it can't actually be a real alias list at > that point, and all that case does is to remove its own inode from the > initial list to make it always be empty. And so it doesn't have any of > this 'negative live under lookup' case. But I'd like to have some > commentary about *this* particular one, because this is the subtle > one. > > In particular, I have paged out the history from the last iteration of > this all, and the reason why you changed hlist_del_init() to that > 'likely(!hlist_unhashed...' escapes me. > > I'm sure I could look it up and remind myself, but honestly, I'd > rather have it just all explained in the code. No deep reasons, really - it can stay hlist_del_init() too; the only reason why I went for __hlist_del_init() (and got immediately reminded that the sucker needs hlist_unhashed() checked first) was that ->d_alias ceases to exist at that point, so there's no real reason to zero it. OTOH, these two stores (to ->next and ->pprev) are not going to cost much, especially since one of them combines with zeroing ->waiters immediately afterwards... We'd just read both words, and they'd better be in the same cacheline, so the extra store should be pretty much free... Let's go with hlist_del_init() there, possibly with a comment that "init" part is not really needed, but use of __hlist_del() is clumsier and it wouldn't really win us anything. 
Re nfs_superblock_set_dummy_root(): it's really ugly wart and I'm not sure we still need it these days; it _is_ safe, even though it violates all kind of rules for dentry state. It's also a headache on every code audit in the area ;-/ I suspect that we could get rid of the entire dummy root thing and if sb->s_root is still NULL after d_obtain_root() in nfs_get_root(), have root removed from sb->s_roots and stored in ->s_root. Once upon a time we used to oops if shrink_dcache_for_umount() found !IS_ROOT(sb->s_root) (i.e. if root got spliced on top of one of the secondary trees at some point), but that's no longer true - not since 42c326082d8a "switch shrink_dcache_for_umount() to use of d_walk()". Never got around to checking if anything in NFS might get unhappy with such situation; VFS ought to be OK with it. That's definitely a separate story, though; as far as this series is concerned, dummy root is out of scope - it's still positive, even though it's removed from alias list. Which is yet another reason why these weirdly spelled sanity checks were wrong - we never hit any of those for the dummy root, but they would wrongly treat it as negative if we ever did... ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-02 22:44 ` Al Viro @ 2026-04-02 22:49 ` Linus Torvalds 2026-04-02 23:16 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-04-02 22:49 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Thu, 2 Apr 2026 at 15:41, Al Viro <viro@zeniv.linux.org.uk> wrote: > > No deep reasons, really - it can stay hlist_del_init() too; the only > reason why I went for __hlist_del_init() (and got immediately reminded > that the sucker needs hlist_unhashed() checked first) was that ->d_alias > ceases to exist at that point, so there's no real reason to zero it. Ok, so that's what I thought it was, but then the whole "now you're checking whether it was hashed" confused me. > Let's go with hlist_del_init() there, possibly with a comment that "init" > part is not really needed, but use of __hlist_del() is clumsier and it > wouldn't really win us anything. Agreed. A comment about how the "state" of that union changes from being a hlist to being that waiter, at that point is all that is needed, but I'd prefer the "hlist_del_init()". In fact, the __hlist_del() *without* the init part would work fine for me with the comment. But it was really the added if (likely(!hlist_unhashed(&dentry->d_alias))) that made me go "why is _that_ needed?" Afaik, it's a perfectly find hlist entry at that point, and no need to add the hlist_unhashed() check. No? Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-02 22:49 ` Linus Torvalds @ 2026-04-02 23:16 ` Al Viro 2026-04-03 0:29 ` Linus Torvalds 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-04-02 23:16 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Thu, Apr 02, 2026 at 03:49:39PM -0700, Linus Torvalds wrote: > In fact, the __hlist_del() *without* the init part would work fine for > me with the comment. > > But it was really the added > > if (likely(!hlist_unhashed(&dentry->d_alias))) > > that made me go "why is _that_ needed?" > > Afaik, it's a perfectly find hlist entry at that point, and no need to > add the hlist_unhashed() check. No? The incremental I've got right now is diff --git a/fs/dcache.c b/fs/dcache.c index 09a80f7f3d05..b83c279f34a0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -455,8 +455,15 @@ static void dentry_unlink_inode(struct dentry * dentry) raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); - if (likely(!hlist_unhashed(&dentry->d_alias))) - __hlist_del(&dentry->d_alias); + hlist_del_init(&dentry->d_alias); + /* + * dentry becomes negative, so the space occupied by ->d_alias + * belongs to ->waiters now; we could use __hlist_del() instead + * of hlist_del_init(), if not for the stunt pulled by nfs + * dummy root dentries - positive dentry *not* included into + * the alias list of its inode. Open-coding hlist_del_init() + * and removing zeroing would be too clumsy... + */ dentry->waiters = NULL; raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); ^ permalink raw reply related [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-02 23:16 ` Al Viro @ 2026-04-03 0:29 ` Linus Torvalds 2026-04-03 2:15 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-04-03 0:29 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Thu, 2 Apr 2026 at 16:13, Al Viro <viro@zeniv.linux.org.uk> wrote: > > The incremental I've got right now is LGTM. Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-03 0:29 ` Linus Torvalds @ 2026-04-03 2:15 ` Al Viro 2026-04-04 0:02 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-04-03 2:15 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Thu, Apr 02, 2026 at 05:29:28PM -0700, Linus Torvalds wrote: > On Thu, 2 Apr 2026 at 16:13, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > The incremental I've got right now is > > LGTM. Force-pushed... ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-03 2:15 ` Al Viro @ 2026-04-04 0:02 ` Al Viro 2026-04-04 0:04 ` Linus Torvalds 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-04-04 0:02 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Fri, Apr 03, 2026 at 03:15:22AM +0100, Al Viro wrote: > On Thu, Apr 02, 2026 at 05:29:28PM -0700, Linus Torvalds wrote: > > On Thu, 2 Apr 2026 at 16:13, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > > > The incremental I've got right now is > > > > LGTM. > > Force-pushed... FWIW, I wonder if we would be better off with something like struct completion_list { struct completion completion; struct completion_list *next; }; embedded into select_data, with dentry->waiters being a pointer to that. It's just that dragging struct select_data into that, even as an opaque pointer, tastes wrong ;-/ Preferences? ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-04 0:02 ` Al Viro @ 2026-04-04 0:04 ` Linus Torvalds 2026-04-04 18:54 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-04-04 0:04 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Fri, 3 Apr 2026 at 16:58, Al Viro <viro@zeniv.linux.org.uk> wrote: > > FWIW, I wonder if we would be better off with something like > > struct completion_list { > struct completion completion; > struct completion_list *next; > }; > > embedded into select_data, with dentry->waiters being a pointer > to that. It's just that dragging struct select_data into that, > even as an opaque pointer, tastes wrong ;-/ > > Preferences? I have no strong preferences in this area. The opaque pointer is fine by me, because it's type-safe without exposing internal implementation or having people being able to much around with the fields. But if you prefer a completion list, i don't see that being a problem either. So entirely up to you. Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-04 0:04 ` Linus Torvalds @ 2026-04-04 18:54 ` Al Viro 2026-04-04 19:04 ` Linus Torvalds 0 siblings, 1 reply; 33+ messages in thread From: Al Viro @ 2026-04-04 18:54 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Fri, Apr 03, 2026 at 05:04:26PM -0700, Linus Torvalds wrote: > On Fri, 3 Apr 2026 at 16:58, Al Viro <viro@zeniv.linux.org.uk> wrote: > > > > FWIW, I wonder if we would be better off with something like > > > > struct completion_list { > > struct completion completion; > > struct completion_list *next; > > }; > > > > embedded into select_data, with dentry->waiters being a pointer > > to that. It's just that dragging struct select_data into that, > > even as an opaque pointer, tastes wrong ;-/ > > > > Preferences? > > I have no strong preferences in this area. > > The opaque pointer is fine by me, because it's type-safe without > exposing internal implementation or having people being able to much > around with the fields. > > But if you prefer a completion list, i don't see that being a problem > either. So entirely up to you. BTW, how much would you hate something like /* * Check if dentry is already dying and if it is arrange to be notified * once it's gone. * * caller must hold rcu_read_lock and dentry->d_lock */ static bool d_deathwatch(struct dentry *dentry, struct completion_list *node) { if (dentry->d_lockref.count >= 0) return false; // it's not dying yet init_completion(&node->completion); if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { // it's already gone complete(&node->completion); return true; } node->next = dentry->waiters; dentry->waiters = node; return true; } with if (data.victim) { struct dentry *v = data.victim; struct completion_list wait; spin_lock(&v->d_lock); if (d_deathwatch(v, &wait)) { // Another thread got around to killing it. 
// if there's something else to evict, do so // then wait for the victim to be gone spin_unlock(&v->d_lock); rcu_read_unlock(); if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); wait_for_completion(&wait.completion); continue; } if (!lock_for_kill(v)) { spin_unlock(&v->d_lock); rcu_read_unlock(); } else { shrink_kill(v); } } in shrink_dentry_tree()? ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-04 18:54 ` Al Viro @ 2026-04-04 19:04 ` Linus Torvalds 2026-04-05 0:04 ` Al Viro 0 siblings, 1 reply; 33+ messages in thread From: Linus Torvalds @ 2026-04-04 19:04 UTC (permalink / raw) To: Al Viro Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Sat, 4 Apr 2026 at 11:51, Al Viro <viro@zeniv.linux.org.uk> wrote: > > BTW, how much would you hate something like Ugh. This is your code, You're in charge in the end. > static bool d_deathwatch(struct dentry *dentry, struct completion_list *node) This looks odd to me. Particularly the "complete it myself" case where you return true, and then shrink the dentry list because you *think* you'll be waiting even if you don't. IOW, if you go down this case, wouldn't it make more sense to have d_deathwatch() return a ternary value, and have a switch case in the caller, and just enumerate the different cases? But I don't know. That's really more of a "that looks like an odd pattern" to me than any kind of objection. Linus ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-04 19:04 ` Linus Torvalds @ 2026-04-05 0:04 ` Al Viro 0 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-05 0:04 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara On Sat, Apr 04, 2026 at 12:04:42PM -0700, Linus Torvalds wrote: > > static bool d_deathwatch(struct dentry *dentry, struct completion_list *node) > > This looks odd to me. Particularly the "complete it myself" case where > you return true. and then shrink the dentry list because you *think* > you'll be waiting even if you don't. Not really - we shrink the list before each rescan anyway. We have collected some dentries into data.dispose and perhaps we'd stopped at finding a victim already in somebody else's shrink list or already dying. 1) no such victim found => shrink data.dispose, rescan 2) victim found and successfully stolen from whatever shrink list it was on => evict the victim, shrink data.dispose, rescan 3) victim was busy dying => shrink data.dispose, wait until it finishes dying, rescan 4) victim has managed to die while we'd been returning from d_walk() => shrink data.dispose, rescan In (3) we could do wait_for_completion() before shrinking data.dispose; that would merge the tails of all 4 cases. However, that would add pointless waits - basically, that's the scenario when another thread is sitting inside iput() and we would be better off evicting whatever else we'd collected. So (3) gets shrinking first, then wait, then go for the next iteration. Sure, we can have (4) as a separate case in the same 3-way switch, but what would it do? 
spin_unlock(&v->d_lock); rcu_read_unlock(); break; // proceeding to shrink_dentry_list() between the // end of if (data.victim) and the end of loop body And right next to it we'd have spin_unlock(&v->d_lock); rcu_read_unlock(); shrink data.dispose wait_for_completion(&wait.completion); continue; for case (3), which differs only in having shrink_dentry_list() right there and followed by wait_for_completion(). Put another way, (4) is equivalent to spin_unlock(&v->d_lock); rcu_read_unlock(); shrink data.dispose continue; > IOW, if you go down this case, wouldn't it make more sense to have > d_deathwatch() return a ternary value, and have a switch case in the > caller, and just enumerate the different cases? See above. > But I don't know. That's really more of a "that looks like an odd > pattern" to me than any kind of objection. FWIW, the thing that had me flipping back and forth between the variants of calling conventions until I went "I've been staring at that for too long to trust my taste" and asked that question was not the difference in location of shrinking - it was self-called complete(). Hell knows; I'll probably leave it as-is for now and return to that next week - it doesn't affect correctness, after all, and I'd rather have that thing in -next in the current form seeing that it fixes a real bug and further modifications would be local to fs/dcache.c anyway. ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() 2026-04-02 18:08 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Al Viro ` (3 preceding siblings ...) 2026-04-02 18:08 ` [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() Al Viro @ 2026-04-02 20:28 ` Paulo Alcantara 2026-04-03 4:46 ` Al Viro 4 siblings, 1 reply; 33+ messages in thread From: Paulo Alcantara @ 2026-04-02 20:28 UTC (permalink / raw) To: Al Viro, Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen Al Viro <viro@zeniv.linux.org.uk> writes: > Note: in -next CIFS has grown a local copy of d_mark_tmpfile(), which would need > the same modification of sanity checks the original gets in #3/4 here. I wonder > if that cifs_d_mark_tmpfile() should be turned into to d_mark_tmpfile_name() and > lifted into fs/dcache.c... Anyway, that's a separate story... Yeah, I would love to have a helper like that and don't mess up with any ->d_name outside fs/dcache.c. That's why I'd suggested it in [1]. I'll send a separate patch adding it as soon as I'm done with testing O_TMPFILE support in CIFS. [1] https://lore.kernel.org/r/20260401011153.1757515-1-pc@manguebit.org ^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() 2026-04-02 20:28 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Paulo Alcantara @ 2026-04-03 4:46 ` Al Viro 0 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-03 4:46 UTC (permalink / raw) To: Paulo Alcantara Cc: Linus Torvalds, linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen On Thu, Apr 02, 2026 at 05:28:58PM -0300, Paulo Alcantara wrote: > Al Viro <viro@zeniv.linux.org.uk> writes: > > > Note: in -next CIFS has grown a local copy of d_mark_tmpfile(), which would need > > the same modification of sanity checks the original gets in #3/4 here. I wonder > > if that cifs_d_mark_tmpfile() should be turned into to d_mark_tmpfile_name() and > > lifted into fs/dcache.c... Anyway, that's a separate story... > > Yeah, I would love to have a helper like that and don't mess up with any > ->d_name outside fs/dcache.c. That's why I'd suggested it in [1]. > > I'll send a separate patch adding it as soon as I'm done with testing > O_TMPFILE support in CIFS. > > [1] https://lore.kernel.org/r/20260401011153.1757515-1-pc@manguebit.org OK... IMO we'd be better off with never-rebased branch adding the helper with cifs branch pulling it and busy-wait branch based on top of the "add the helper" one - no merge conflicts that way... ^ permalink raw reply [flat|nested] 33+ messages in thread
* [RFC PATCH v3 0/4] getting rid of busy-wait in shrink_dcache_parent() 2026-01-22 20:20 [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() Al Viro 2026-01-23 0:19 ` Linus Torvalds 2026-04-02 18:08 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Al Viro @ 2026-04-04 8:07 ` Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 1/4] for_each_alias(): helper macro for iterating through dentries of given inode Al Viro ` (3 more replies) 2 siblings, 4 replies; 33+ messages in thread From: Al Viro @ 2026-04-04 8:07 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara Changes since v2: * separate select_data from the completion list elements - add a separate struct completion_list (private to fs/dcache.c at the moment, but it's completely independent from struct dentry and can be lifted e.g. into linux/completion.h if there are other potential users). Branch (currently -rc6-based) lives in git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git #work.dcache-busy-wait individual patches in followups. Please, review. 
Shortlog: Al Viro (4): for_each_alias(): helper macro for iterating through dentries of given inode struct dentry: make ->d_u anonymous dcache.c: more idiomatic "positives are not allowed" sanity checks get rid of busy-waiting in shrink_dcache_tree() Diffstat: Documentation/filesystems/porting.rst | 10 ++ fs/affs/amigaffs.c | 2 +- fs/ceph/mds_client.c | 4 +- fs/dcache.c | 129 +++++++++++++++++++------- fs/exportfs/expfs.c | 2 +- fs/inode.c | 2 +- fs/nfs/dir.c | 4 +- fs/nfs/getroot.c | 2 +- fs/notify/fsnotify.c | 2 +- fs/ocfs2/dcache.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/smb/client/inode.c | 2 +- include/linux/dcache.h | 24 ++++- 13 files changed, 140 insertions(+), 47 deletions(-) Series overview First a couple of cleanups to reduce the amount of noise: 1) for_each_alias(): helper macro for iterating through dentries of given inode Most of the places using d_alias are loops iterating through all aliases for given inode; introduce a helper macro (for_each_alias(dentry, inode)) and convert open-coded instances of such loop to it. They are easier to read that way and it reduces the noise on the next steps. 2) struct dentry: make ->d_u anonymous Making ->d_rcu and (then) ->d_child overlapping dates back to 2006; anon unions support had been added to gcc only in 4.6 (2011) and the minimal gcc version hadn't been bumped to that until 4.19 (2018). These days there's no reason not to keep that union named. Next we get rid of the aforementioned pathologies: 3) dcache.c: more idiomatic "positives are not allowed" sanity checks Several functions have BUG_ON/WARN_ON sanity checks that want to verify that dentry is not positive and instead of looking at ->d_inode (as we do in all other places that check that) they look at ->d_alias. 
Just use the normal helpers instead - that way we no longer even look at ->d_alias for negative dentries Now that everything is ready for adding a new member and killing busy-wait: 4) get rid of busy-waiting in shrink_dcache_tree() If shrink_dcache_tree() runs into a potential victim that is already dying, it must wait for that dentry to go away. To avoid busy-waiting we need some object to wait on and a way for dentry_unlist() to see that we need to be notified. The obvious place for the object to wait on would be on our stack frame. We will store a pointer to that object (struct completion_list) in victim dentry; if there's more than one thread wanting to wait for the same dentry to finish dying, we'll have their instances linked into a list, with reference in dentry pointing to the head of that list. * new object - struct completion_list. A pair of struct completion and pointer to the next instance. That's what shrink_dcache_tree() will wait on if needed. * add a new member (->waiters, opaque pointer to struct completion_list) to struct dentry. It is defined for negative live dentries that are not in-lookup ones and it will remain NULL for almost all of them. It does not conflict with ->d_rcu (defined for killed dentries), ->d_alias (defined for positive dentries, all live) or ->d_in_lookup_hash (defined for in-lookup dentries, all live negative). That allows to colocate all four members. * make sure that all places where dentry enters the state where ->waiters is defined (live, negative, not-in-lookup) initialize ->waiters to NULL. * if select_collect2() runs into a dentry that is already dying, have its caller insert a local instance of struct completion_list into the head of the list hanging off dentry->waiters and wait for completion. * if dentry_unlist() sees non-NULL ->waiters, have it carefully walk through the completion_list instances in that list, calling complete() for each. 
For now struct completion_list is local to fs/dcache.c; it's obviously dentry-agnostic, and it can be trivially lifted into linux/completion.h if somebody finds a reason to do so... -- 2.47.3 ^ permalink raw reply [flat|nested] 33+ messages in thread
* [RFC PATCH v3 1/4] for_each_alias(): helper macro for iterating through dentries of given inode 2026-04-04 8:07 ` [RFC PATCH v3 " Al Viro @ 2026-04-04 8:07 ` Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 2/4] struct dentry: make ->d_u anonymous Al Viro ` (2 subsequent siblings) 3 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-04 8:07 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara Most of the places using d_alias are loops iterating through all aliases for given inode; introduce a helper macro (for_each_alias(dentry, inode)) and convert open-coded instances of such loop to it. They are easier to read that way and it reduces the noise on the next steps. You _must_ hold inode->i_lock over that thing. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- Documentation/filesystems/porting.rst | 10 ++++++++++ fs/affs/amigaffs.c | 2 +- fs/ceph/mds_client.c | 2 +- fs/dcache.c | 6 +++--- fs/exportfs/expfs.c | 2 +- fs/nfs/dir.c | 2 +- fs/notify/fsnotify.c | 2 +- fs/ocfs2/dcache.c | 2 +- fs/overlayfs/dir.c | 2 +- fs/smb/client/inode.c | 2 +- include/linux/dcache.h | 4 ++++ 11 files changed, 25 insertions(+), 11 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 52ff1d19405b..9a9babd9ec48 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1361,3 +1361,13 @@ to match what strlen() would return if it was ran on the string. However, if the string is freely accessible for the duration of inode's lifetime, consider using inode_set_cached_link() instead. + +--- + +**recommended** + +If you really need to iterate through dentries for given inode, use +for_each_alias(dentry, inode) instead of hlist_for_each_entry; better +yet, see if any of the exported primitives could be used instead of +the entire loop. 
You still need to hold ->i_lock of the inode over +either form of manual loop. diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index fd669daa4e7b..91966b1f41f6 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -126,7 +126,7 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino) { struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + for_each_alias(dentry, inode) { if (entry_ino == (u32)(long)dentry->d_fsdata) { dentry->d_fsdata = (void *)inode->i_ino; break; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b1746273f186..f839109fb66f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4614,7 +4614,7 @@ static struct dentry* d_find_primary(struct inode *inode) goto out_unlock; } - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { spin_lock(&alias->d_lock); if (!d_unhashed(alias) && (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { diff --git a/fs/dcache.c b/fs/dcache.c index 7ba1801d8132..e069b6ec4ec0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -790,7 +790,7 @@ void d_mark_dontcache(struct inode *inode) struct dentry *de; spin_lock(&inode->i_lock); - hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { + for_each_alias(de, inode) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); @@ -1040,7 +1040,7 @@ static struct dentry *__d_find_alias(struct inode *inode) if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { dget_dlock(alias); @@ -1133,7 +1133,7 @@ void d_prune_aliases(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) + for_each_alias(dentry, inode) d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); 
shrink_dentry_list(&dispose); diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 6c9be60a3e48..f67b3ce672fc 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -52,7 +52,7 @@ find_acceptable_alias(struct dentry *result, inode = result->d_inode; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + for_each_alias(dentry, inode) { dget(dentry); spin_unlock(&inode->i_lock); if (toput) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2402f57c8e7d..5a0bd8113e3a 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1471,7 +1471,7 @@ static void nfs_clear_verifier_file(struct inode *inode) struct dentry *alias; struct inode *dir; - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { spin_lock(&alias->d_lock); dir = d_inode_rcu(alias->d_parent); if (!dir || diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 9995de1710e5..b7198c4744e3 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -76,7 +76,7 @@ void fsnotify_set_children_dentry_flags(struct inode *inode) spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. 
Since this is a * directory, there damn well better only be one item on this list */ - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { struct dentry *child; /* run all of the children of the original inode and fix their diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index c4ba968e778b..e06774fd89d8 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -145,7 +145,7 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode, struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + for_each_alias(dentry, inode) { spin_lock(&dentry->d_lock); if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) { trace_ocfs2_find_local_alias(dentry->d_name.len, diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index ff3dbd1ca61f..f8dfa534b566 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -904,7 +904,7 @@ static void ovl_drop_nlink(struct dentry *dentry) /* Try to find another, hashed alias */ spin_lock(&inode->i_lock); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + for_each_alias(alias, inode) { if (alias != dentry && !d_unhashed(alias)) break; } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 888f9e35f14b..e2b4ad9bd0bd 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1595,7 +1595,7 @@ inode_has_hashed_dentries(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { + for_each_alias(dentry, inode) { if (!d_unhashed(dentry) || IS_ROOT(dentry)) { spin_unlock(&inode->i_lock); return true; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 898c60d21c92..7f1dbc7121d7 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -615,4 +615,8 @@ void set_default_d_op(struct super_block *, const struct dentry_operations *); struct dentry *d_make_persistent(struct dentry *, struct inode *); void 
d_make_discardable(struct dentry *dentry); +/* inode->i_lock must be held over that */ +#define for_each_alias(dentry, inode) \ + hlist_for_each_entry(dentry, &(inode)->i_dentry, d_u.d_alias) + #endif /* __LINUX_DCACHE_H */ -- 2.47.3 ^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v3 2/4] struct dentry: make ->d_u anonymous 2026-04-04 8:07 ` [RFC PATCH v3 " Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 1/4] for_each_alias(): helper macro for iterating through dentries of given inode Al Viro @ 2026-04-04 8:07 ` Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 3/4] dcache.c: more idiomatic "positives are not allowed" sanity checks Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 4/4] get rid of busy-waiting in shrink_dcache_tree() Al Viro 3 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-04 8:07 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara Making ->d_rcu and (then) ->d_child overlapping dates back to 2006; anon unions support had been added to gcc only in 4.6 (2011) and the minimal gcc version hadn't been bumped to that until 4.19 (2018). These days there's no reason not to keep that union named. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/ceph/mds_client.c | 2 +- fs/dcache.c | 48 +++++++++++++++++++++--------------------- fs/inode.c | 2 +- fs/nfs/dir.c | 2 +- fs/nfs/getroot.c | 2 +- include/linux/dcache.h | 4 ++-- 6 files changed, 30 insertions(+), 30 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f839109fb66f..a5eb99c3c36b 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4608,7 +4608,7 @@ static struct dentry* d_find_primary(struct inode *inode) goto out_unlock; if (S_ISDIR(inode->i_mode)) { - alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); if (!IS_ROOT(alias)) dn = dget(alias); goto out_unlock; diff --git a/fs/dcache.c b/fs/dcache.c index e069b6ec4ec0..4378eb8a00bb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -40,7 +40,7 @@ /* * Usage: * dcache->d_inode->i_lock protects: - * - i_dentry, d_u.d_alias, d_inode of aliases + * - i_dentry, d_alias, d_inode of aliases * 
dcache_hash_bucket lock protects: * - the dcache hash table * s_roots bl list spinlock protects: @@ -55,7 +55,7 @@ * - d_unhashed() * - d_parent and d_chilren * - childrens' d_sib and d_parent - * - d_u.d_alias, d_inode + * - d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock @@ -341,14 +341,14 @@ static inline struct external_name *external_name(struct dentry *dentry) static void __d_free(struct rcu_head *head) { - struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + struct dentry *dentry = container_of(head, struct dentry, d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { - struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + struct dentry *dentry = container_of(head, struct dentry, d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } @@ -428,19 +428,19 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) static void dentry_free(struct dentry *dentry) { - WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(!hlist_unhashed(&dentry->d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { - call_rcu(&dentry->d_u.d_rcu, __d_free_external); + call_rcu(&dentry->d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) - __d_free(&dentry->d_u.d_rcu); + __d_free(&dentry->d_rcu); else - call_rcu(&dentry->d_u.d_rcu, __d_free); + call_rcu(&dentry->d_rcu, __d_free); } /* @@ -455,7 +455,7 @@ static void dentry_unlink_inode(struct dentry * dentry) raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); - hlist_del_init(&dentry->d_u.d_alias); + hlist_del_init(&dentry->d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); @@ -1010,7 +1010,7 @@ static struct dentry * __d_find_any_alias(struct 
inode *inode) if (hlist_empty(&inode->i_dentry)) return NULL; - alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); lockref_get(&alias->d_lockref); return alias; } @@ -1093,9 +1093,9 @@ struct dentry *d_find_alias_rcu(struct inode *inode) // used without having I_FREEING set, which means no aliases left if (likely(!(inode_state_read(inode) & I_FREEING) && !hlist_empty(l))) { if (S_ISDIR(inode->i_mode)) { - de = hlist_entry(l->first, struct dentry, d_u.d_alias); + de = hlist_entry(l->first, struct dentry, d_alias); } else { - hlist_for_each_entry(de, l, d_u.d_alias) + hlist_for_each_entry(de, l, d_alias) if (!d_unhashed(de)) break; } @@ -1787,7 +1787,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); - INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_HLIST_NODE(&dentry->d_alias); INIT_HLIST_NODE(&dentry->d_sib); if (dentry->d_op && dentry->d_op->d_init) { @@ -1980,7 +1980,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) if ((dentry->d_flags & (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + hlist_add_head(&dentry->d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); @@ -2004,7 +2004,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { - BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); @@ -2024,7 +2024,7 @@ EXPORT_SYMBOL(d_instantiate); */ void d_instantiate_new(struct dentry *entry, struct inode *inode) 
{ - BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); + BUG_ON(!hlist_unhashed(&entry->d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); @@ -2087,7 +2087,7 @@ static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) spin_lock(&new->d_lock); __d_set_inode_and_type(new, inode, add_flags); - hlist_add_head(&new->d_u.d_alias, &inode->i_dentry); + hlist_add_head(&new->d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&sb->s_roots); hlist_bl_add_head(&new->d_hash, &sb->s_roots); @@ -2658,7 +2658,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, * we unlock the chain. All fields are stable in everything * we encounter. */ - hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { + hlist_bl_for_each_entry(dentry, node, b, d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) @@ -2700,7 +2700,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, } rcu_read_unlock(); new->d_wait = wq; - hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b); + hlist_bl_add_head(&new->d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: @@ -2725,11 +2725,11 @@ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; - __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); + __hlist_bl_del(&dentry->d_in_lookup_hash); d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); - INIT_HLIST_NODE(&dentry->d_u.d_alias); + INIT_HLIST_NODE(&dentry->d_alias); INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } @@ -2760,7 +2760,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode, d_set_d_op(dentry, ops); if (inode) { unsigned add_flags = d_flags_for_inode(inode); - hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); + hlist_add_head(&dentry->d_alias, &inode->i_dentry); 
raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); @@ -2795,7 +2795,7 @@ EXPORT_SYMBOL(d_add); struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) { - WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(!hlist_unhashed(&dentry->d_alias)); WARN_ON(!inode); security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); @@ -3185,7 +3185,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) struct dentry *dentry = file->f_path.dentry; BUG_ON(dname_external(dentry) || - !hlist_unhashed(&dentry->d_u.d_alias) || + !hlist_unhashed(&dentry->d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); diff --git a/fs/inode.c b/fs/inode.c index cc12b68e021b..9e1ab333d382 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -754,7 +754,7 @@ void dump_mapping(const struct address_space *mapping) return; } - dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + dentry_ptr = container_of(dentry_first, struct dentry, d_alias); if (get_kernel_nofault(dentry, dentry_ptr) || !dentry.d_parent || !dentry.d_name.name) { pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5a0bd8113e3a..f2f1b036f2f1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1490,7 +1490,7 @@ static void nfs_clear_verifier_directory(struct inode *dir) if (hlist_empty(&dir->i_dentry)) return; this_parent = - hlist_entry(dir->i_dentry.first, struct dentry, d_u.d_alias); + hlist_entry(dir->i_dentry.first, struct dentry, d_alias); spin_lock(&this_parent->d_lock); nfs_unset_verifier_delegated(&this_parent->d_time); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index f13d25d95b85..eef0736beb67 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -54,7 +54,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i */ 
spin_lock(&d_inode(sb->s_root)->i_lock); spin_lock(&sb->s_root->d_lock); - hlist_del_init(&sb->s_root->d_u.d_alias); + hlist_del_init(&sb->s_root->d_alias); spin_unlock(&sb->s_root->d_lock); spin_unlock(&d_inode(sb->s_root)->i_lock); } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 7f1dbc7121d7..f939d2ed10a3 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -128,7 +128,7 @@ struct dentry { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; - } d_u; + }; }; /* @@ -617,6 +617,6 @@ void d_make_discardable(struct dentry *dentry); /* inode->i_lock must be held over that */ #define for_each_alias(dentry, inode) \ - hlist_for_each_entry(dentry, &(inode)->i_dentry, d_u.d_alias) + hlist_for_each_entry(dentry, &(inode)->i_dentry, d_alias) #endif /* __LINUX_DCACHE_H */ -- 2.47.3 ^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v3 3/4] dcache.c: more idiomatic "positives are not allowed" sanity checks 2026-04-04 8:07 ` [RFC PATCH v3 " Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 1/4] for_each_alias(): helper macro for iterating through dentries of given inode Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 2/4] struct dentry: make ->d_u anonymous Al Viro @ 2026-04-04 8:07 ` Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 4/4] get rid of busy-waiting in shrink_dcache_tree() Al Viro 3 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-04 8:07 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara Several functions have BUG_ON/WARN_ON sanity checks that want to verify that dentry is not positive and instead of looking at ->d_inode (as we do in all other places that check that) they look at ->d_alias. Just use the normal helpers instead - that way we no longer even look at ->d_alias for negative dentries Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/dcache.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 4378eb8a00bb..616a445ec720 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -428,7 +428,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry) static void dentry_free(struct dentry *dentry) { - WARN_ON(!hlist_unhashed(&dentry->d_alias)); + WARN_ON(d_really_is_positive(dentry)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->count))) { @@ -2004,7 +2004,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) void d_instantiate(struct dentry *entry, struct inode * inode) { - BUG_ON(!hlist_unhashed(&entry->d_alias)); + BUG_ON(d_really_is_positive(entry)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); @@ -2024,7 +2024,7 @@ EXPORT_SYMBOL(d_instantiate); */ void 
d_instantiate_new(struct dentry *entry, struct inode *inode) { - BUG_ON(!hlist_unhashed(&entry->d_alias)); + BUG_ON(d_really_is_positive(entry)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); @@ -2795,7 +2795,7 @@ EXPORT_SYMBOL(d_add); struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) { - WARN_ON(!hlist_unhashed(&dentry->d_alias)); + WARN_ON(d_really_is_positive(dentry)); WARN_ON(!inode); security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); @@ -3185,7 +3185,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) struct dentry *dentry = file->f_path.dentry; BUG_ON(dname_external(dentry) || - !hlist_unhashed(&dentry->d_alias) || + d_really_is_positive(dentry) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); -- 2.47.3 ^ permalink raw reply related [flat|nested] 33+ messages in thread
* [RFC PATCH v3 4/4] get rid of busy-waiting in shrink_dcache_tree() 2026-04-04 8:07 ` [RFC PATCH v3 " Al Viro ` (2 preceding siblings ...) 2026-04-04 8:07 ` [RFC PATCH v3 3/4] dcache.c: more idiomatic "positives are not allowed" sanity checks Al Viro @ 2026-04-04 8:07 ` Al Viro 3 siblings, 0 replies; 33+ messages in thread From: Al Viro @ 2026-04-04 8:07 UTC (permalink / raw) To: Linus Torvalds Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov, Max Kellermann, Eric Sandeen, Paulo Alcantara If shrink_dcache_tree() runs into a potential victim that is already dying, it must wait for that dentry to go away. To avoid busy-waiting we need some object to wait on and a way for dentry_unlist() to see that we need to be notified. The obvious place for the object to wait on would be on our stack frame. We will store a pointer to that object (struct completion_list) in victim dentry; if there's more than one thread wanting to wait for the same dentry to finish dying, we'll have their instances linked into a list, with reference in dentry pointing to the head of that list. * new object - struct completion_list. A pair of struct completion and pointer to the next instance. That's what shrink_dcache_tree() will wait on if needed. * add a new member (->waiters, opaque pointer to struct completion_list) to struct dentry. It is defined for negative live dentries that are not in-lookup ones and it will remain NULL for almost all of them. It does not conflict with ->d_rcu (defined for killed dentries), ->d_alias (defined for positive dentries, all live) or ->d_in_lookup_hash (defined for in-lookup dentries, all live negative). That allows to colocate all four members. * make sure that all places where dentry enters the state where ->waiters is defined (live, negative, not-in-lookup) initialize ->waiters to NULL. 
* if select_collect2() runs into a dentry that is already dying, have its caller insert a local instance of struct completion_list into the head of the list hanging off dentry->waiters and wait for completion. * if dentry_unlist() sees non-NULL ->waiters, have it carefully walk through the completion_list instances in that list, calling complete() for each. For now struct completion_list is local to fs/dcache.c; it's obviously dentry-agnostic, and it can be trivially lifted into linux/completion.h if somebody finds a reason to do so... Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> --- fs/dcache.c | 79 ++++++++++++++++++++++++++++++++++++++---- include/linux/dcache.h | 18 ++++++++-- 2 files changed, 88 insertions(+), 9 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 616a445ec720..0c8faeee02e2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -456,6 +456,15 @@ static void dentry_unlink_inode(struct dentry * dentry) raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_alias); + /* + * dentry becomes negative, so the space occupied by ->d_alias + * belongs to ->waiters now; we could use __hlist_del() instead + * of hlist_del_init(), if not for the stunt pulled by nfs + * dummy root dentries - positive dentry *not* included into + * the alias list of its inode. Open-coding hlist_del_init() + * and removing zeroing would be too clumsy... + */ + dentry->waiters = NULL; raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); @@ -605,6 +614,44 @@ void d_drop(struct dentry *dentry) } EXPORT_SYMBOL(d_drop); +struct completion_list { + struct completion_list *next; + struct completion completion; +}; + +/* + * shrink_dcache_tree() needs to be notified when dentry in process of + * being evicted finally gets unlisted. Such dentries are + * already with negative ->d_count + * already negative + * already not in in-lookup hash + * reachable only via ->d_sib. 
+ * + * Use ->waiters for a single-linked list of struct completion_list of + * waiters. + */ +static inline void d_add_waiter(struct dentry *dentry, struct completion_list *p) +{ + struct completion_list *v = dentry->waiters; + init_completion(&p->completion); + p->next = v; + dentry->waiters = p; +} + +static inline void d_complete_waiters(struct dentry *dentry) +{ + struct completion_list *v = dentry->waiters; + if (unlikely(v)) { + /* some shrink_dcache_tree() instances are waiting */ + dentry->waiters = NULL; + while (v) { + struct completion *r = &v->completion; + v = v->next; + complete(r); + } + } +} + static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; @@ -613,6 +660,7 @@ static inline void dentry_unlist(struct dentry *dentry) * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; + d_complete_waiters(dentry); if (unlikely(hlist_unhashed(&dentry->d_sib))) return; __hlist_del(&dentry->d_sib); @@ -1569,6 +1617,10 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) return D_WALK_QUIT; } to_shrink_list(dentry, &data->dispose); + } else if (dentry->d_lockref.count < 0) { + rcu_read_lock(); + data->victim = dentry; + return D_WALK_QUIT; } /* * We can return to the caller if we have found some (this @@ -1608,12 +1660,27 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { - spin_lock(&data.victim->d_lock); - if (!lock_for_kill(data.victim)) { - spin_unlock(&data.victim->d_lock); + struct dentry *v = data.victim; + + spin_lock(&v->d_lock); + if (v->d_lockref.count < 0 && + !(v->d_flags & DCACHE_DENTRY_KILLED)) { + struct completion_list wait; + // It's busy dying; have it notify us once + // it becomes invisible to d_walk(). 
+ d_add_waiter(v, &wait); + spin_unlock(&v->d_lock); + rcu_read_unlock(); + if (!list_empty(&data.dispose)) + shrink_dentry_list(&data.dispose); + wait_for_completion(&wait.completion); + continue; + } + if (!lock_for_kill(v)) { + spin_unlock(&v->d_lock); rcu_read_unlock(); } else { - shrink_kill(data.victim); + shrink_kill(v); } } if (!list_empty(&data.dispose)) @@ -1787,7 +1854,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_HLIST_HEAD(&dentry->d_children); - INIT_HLIST_NODE(&dentry->d_alias); + dentry->waiters = NULL; INIT_HLIST_NODE(&dentry->d_sib); if (dentry->d_op && dentry->d_op->d_init) { @@ -2729,7 +2796,7 @@ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) d_wait = dentry->d_wait; dentry->d_wait = NULL; hlist_bl_unlock(b); - INIT_HLIST_NODE(&dentry->d_alias); + dentry->waiters = NULL; INIT_LIST_HEAD(&dentry->d_lru); return d_wait; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f939d2ed10a3..19098253f2dd 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -88,6 +88,7 @@ union shortname_store { #define d_lock d_lockref.lock #define d_iname d_shortname.string +struct completion_list; struct dentry { /* RCU lookup touched fields */ @@ -122,12 +123,23 @@ struct dentry { struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* - * d_alias and d_rcu can share memory + * the following members can share memory - their uses are + * mutually exclusive. 
*/ union { - struct hlist_node d_alias; /* inode alias list */ - struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ + /* positives: inode alias list */ + struct hlist_node d_alias; + /* in-lookup ones (all negative, live): hash chain */ + struct hlist_bl_node d_in_lookup_hash; + /* killed ones: (already negative) used to schedule freeing */ struct rcu_head d_rcu; + /* + * live non-in-lookup negatives: used if shrink_dcache_tree() + * races with eviction by another thread and needs to wait for + * this dentry to get killed. Remains NULL for almost all + * negative dentries. + */ + struct completion_list *waiters; }; }; -- 2.47.3 ^ permalink raw reply related [flat|nested] 33+ messages in thread
end of thread, other threads:[~2026-04-05 0:00 UTC | newest] Thread overview: 33+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2026-01-22 20:20 [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() Al Viro 2026-01-23 0:19 ` Linus Torvalds 2026-01-23 0:36 ` Al Viro 2026-01-24 4:36 ` Al Viro 2026-01-24 4:46 ` Linus Torvalds 2026-01-24 5:36 ` Al Viro 2026-01-24 17:45 ` Linus Torvalds 2026-01-24 18:43 ` Al Viro 2026-01-24 19:32 ` Linus Torvalds 2026-01-24 20:28 ` Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 1/4] for_each_alias(): helper macro for iterating through dentries of given inode Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 2/4] struct dentry: make ->d_u anonymous Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 3/4] dcache.c: more idiomatic "positives are not allowed" sanity checks Al Viro 2026-04-02 18:08 ` [RFC PATCH v2 4/4] get rid of busy-waiting in shrink_dcache_tree() Al Viro 2026-04-02 19:52 ` Linus Torvalds 2026-04-02 22:44 ` Al Viro 2026-04-02 22:49 ` Linus Torvalds 2026-04-02 23:16 ` Al Viro 2026-04-03 0:29 ` Linus Torvalds 2026-04-03 2:15 ` Al Viro 2026-04-04 0:02 ` Al Viro 2026-04-04 0:04 ` Linus Torvalds 2026-04-04 18:54 ` Al Viro 2026-04-04 19:04 ` Linus Torvalds 2026-04-05 0:04 ` Al Viro 2026-04-02 20:28 ` [RFC PATCH v2 0/4] getting rid of busy-wait in shrink_dcache_parent() Paulo Alcantara 2026-04-03 4:46 ` Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 " Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 1/4] for_each_alias(): helper macro for iterating through dentries of given inode Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 2/4] struct dentry: make ->d_u anonymous Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 3/4] dcache.c: more idiomatic "positives are not allowed" sanity checks Al Viro 2026-04-04 8:07 ` [RFC PATCH v3 4/4] get rid of busy-waiting in shrink_dcache_tree() Al Viro
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox