public inbox for linux-fsdevel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree()
@ 2026-01-22 20:20 Al Viro
  2026-01-23  0:19 ` Linus Torvalds
  0 siblings, 1 reply; 10+ messages in thread
From: Al Viro @ 2026-01-22 20:20 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-fsdevel, Christian Brauner, Jan Kara, Nikolay Borisov,
	Max Kellermann

There's a case in which shrink_dcache_tree() ends up busy-waiting: if some
dentry in the subtree in question is found to be in process of being evicted
by another thread.  We need to wait for that to finish so that parent would
no longer be pinned, to avoid the situations when nothing in the tree is
busy, but shrink_dcache_tree() fails to evict some directory only because
memory pressure initiated eviction of some of its children before we got to
evicting those ourselves.  That would be bogus both for shrink_dcache_parent()
and for shrink_dcache_for_umount().

Unfortunately, we have nothing to wait on.  That had led to the possibility
of busy-waiting - getting through the iteration of shrink_dcache_tree() main
loop without having made any progress.  That's Not Nice(tm) and that had been
discussed quite a few times since at least 2018.  Recently it became obvious
that this goes beyond "not nice" - on sufficiently contrived setup it's
possible to get a livelock there, with both threads involved tied to the same
CPU, shrink_dcache_tree() one with higher priority than the thread that has
given CPU up on might_sleep() in the very beginning of iput() during eviction
of the dentry in the tree shrink_dcache_tree() is busy-waiting for.

Let's get rid of that busy-waiting.  Constraints are
        * don't grow struct dentry
        * don't slow the normal case of __dentry_kill() down
and it turns out to be doable.

Dentries in question are
        * already marked dead (negative ->d_count)
        * already negative
        * already unhashed
        * already not in in-lookup hash
        * yet to get removed from ->d_sib and get DCACHE_DENTRY_KILLED in
flags.

Neither ->d_alias nor the fields overlapping it (->d_rcu and ->d_in_lookup_hash)
are going to be accessed for these dentries until after dentry_unlist().  What's
more, ->d_alias.next is guaranteed to be NULL.

So we can embed struct completion into struct select_data and (ab)use
->d_alias.next for linked list of struct select_data instances.

If dentry_unlist() finds ->d_alias.next non-NULL, it carefully goes over that
list and calls complete() for each of those.

That way select_collect2() can treat negative ->d_count the same way it deals
with dentries on other thread's shrink list - grab rcu_read_lock(), stash the
dentry into data.victim and tell d_walk() to stop.

If shrink_dcache_parent() runs into that case, it should attach its select_data
to victim dentry, evict whatever normal eviction candidates it has gathered
and wait for completion.  Voila...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/dcache.c b/fs/dcache.c
index dc2fff4811d1..6db72a684d8d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -605,6 +605,54 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
+struct select_data {
+	struct dentry *start;
+	union {
+		long found;
+		struct {
+			struct dentry *victim;
+			struct select_data *next;
+			struct completion completion;
+		};
+	};
+	struct list_head dispose;
+};
+
+/*
+ *  shrink_dcache_parent() needs to be notified when dentry in process of
+ *  being evicted finally gets unlisted.  Such dentries are
+ *	already with negative ->d_count
+ *	already negative
+ *	already not in in-lookup hash
+ *	reachable only via ->d_sib.
+ *
+ *  Neither ->d_alias, nor ->d_rcu, nor ->d_in_lookup_hash are going to be
+ *  accessed for those, so we can (ab)use ->d_alias.next for list of
+ *  select_data of waiters.  Initially it's going to be NULL and as long
+ *  as dentry_unlist() returns it to that state we are fine.
+ */
+static inline void d_add_waiter(struct dentry *dentry, struct select_data *p)
+{
+	struct select_data *v = (void *)dentry->d_u.d_alias.next;
+	init_completion(&p->completion);
+	p->next = v;
+	dentry->d_u.d_alias.next = (void *)p;
+}
+
+static inline void d_complete_waiters(struct dentry *dentry)
+{
+	struct select_data *v = (void *)dentry->d_u.d_alias.next;
+	if (unlikely(v)) {
+		/* some shrink_dcache_tree() instances are waiting */
+		dentry->d_u.d_alias.next = NULL;
+		while (v) {
+			struct completion *r = &v->completion;
+			v = v->next;
+			complete(r);
+		}
+	}
+}
+
 static inline void dentry_unlist(struct dentry *dentry)
 {
 	struct dentry *next;
@@ -613,6 +661,7 @@ static inline void dentry_unlist(struct dentry *dentry)
 	 * attached to the dentry tree
 	 */
 	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+	d_complete_waiters(dentry);
 	if (unlikely(hlist_unhashed(&dentry->d_sib)))
 		return;
 	__hlist_del(&dentry->d_sib);
@@ -1499,15 +1548,6 @@ int d_set_mounted(struct dentry *dentry)
  * constraints.
  */
 
-struct select_data {
-	struct dentry *start;
-	union {
-		long found;
-		struct dentry *victim;
-	};
-	struct list_head dispose;
-};
-
 static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 {
 	struct select_data *data = _data;
@@ -1559,6 +1599,10 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
 			return D_WALK_QUIT;
 		}
 		to_shrink_list(dentry, &data->dispose);
+	} else if (dentry->d_lockref.count < 0) {
+		rcu_read_lock();
+		data->victim = dentry;
+		return D_WALK_QUIT;
 	}
 	/*
 	 * We can return to the caller if we have found some (this
@@ -1598,12 +1642,26 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount)
 		data.victim = NULL;
 		d_walk(parent, &data, select_collect2);
 		if (data.victim) {
-			spin_lock(&data.victim->d_lock);
-			if (!lock_for_kill(data.victim)) {
-				spin_unlock(&data.victim->d_lock);
+			struct dentry *v = data.victim;
+
+			spin_lock(&v->d_lock);
+			if (v->d_lockref.count < 0 &&
+			    !(v->d_flags & DCACHE_DENTRY_KILLED)) {
+				// It's busy dying; have it notify us once
+				// it becomes invisible to d_walk().
+				d_add_waiter(v, &data);
+				spin_unlock(&v->d_lock);
+				rcu_read_unlock();
+				if (!list_empty(&data.dispose))
+					shrink_dentry_list(&data.dispose);
+				wait_for_completion(&data.completion);
+				continue;
+			}
+			if (!lock_for_kill(v)) {
+				spin_unlock(&v->d_lock);
 				rcu_read_unlock();
 			} else {
-				shrink_kill(data.victim);
+				shrink_kill(v);
 			}
 		}
 		if (!list_empty(&data.dispose))

^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2026-01-24 20:26 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2026-01-22 20:20 [PATCH][RFC] get rid of busy-wait in shrink_dcache_tree() Al Viro
2026-01-23  0:19 ` Linus Torvalds
2026-01-23  0:36   ` Al Viro
2026-01-24  4:36     ` Al Viro
2026-01-24  4:46       ` Linus Torvalds
2026-01-24  5:36         ` Al Viro
2026-01-24 17:45           ` Linus Torvalds
2026-01-24 18:43             ` Al Viro
2026-01-24 19:32               ` Linus Torvalds
2026-01-24 20:28                 ` Al Viro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox