linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Al Viro <viro@zeniv.linux.org.uk>
To: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: linux-fsdevel@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Christian Brauner <brauner@kernel.org>
Subject: Re: [BUG] propagate_umount() breakage
Date: Thu, 15 May 2025 12:41:50 +0100	[thread overview]
Message-ID: <20250515114150.GA3221059@ZenIV> (raw)
In-Reply-To: <20250513035622.GE2023217@ZenIV>

On Tue, May 13, 2025 at 04:56:22AM +0100, Al Viro wrote:

> And yes, it's going to be posted along with the proof of correctness -
> I've had it with the amount of subtle corner cases in that area ;-/

FWIW, it seems to survive testing; it's _not_ final - I'm still editing
the promised proof, but by this point it's stylistic work.  I hadn't
touched that kind of formal writing for quite a while, so the text is clumsy.
The code changes are pretty much in the final shape.  Current diff of
code (all in fs/pnode.*) relative to mainline:

diff --git a/fs/pnode.c b/fs/pnode.c
index fb77427df39e..32ecebbc1d19 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -24,11 +24,6 @@ static inline struct mount *first_slave(struct mount *p)
 	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
 }
 
-static inline struct mount *last_slave(struct mount *p)
-{
-	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
-}
-
 static inline struct mount *next_slave(struct mount *p)
 {
 	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
@@ -136,6 +131,23 @@ void change_mnt_propagation(struct mount *mnt, int type)
 	}
 }
 
+static struct mount *__propagation_next(struct mount *m,
+					 struct mount *origin)
+{
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			return (next == origin) ? NULL : next;
+		} else if (m->mnt_slave.next != &master->mnt_slave_list)
+			return next_slave(m);
+
+		/* back at master */
+		m = master;
+	}
+}
+
 /*
  * get the next mount in the propagation tree.
  * @m: the mount seen last
@@ -153,31 +165,26 @@ static struct mount *propagation_next(struct mount *m,
 	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
 		return first_slave(m);
 
-	while (1) {
-		struct mount *master = m->mnt_master;
-
-		if (master == origin->mnt_master) {
-			struct mount *next = next_peer(m);
-			return (next == origin) ? NULL : next;
-		} else if (m->mnt_slave.next != &master->mnt_slave_list)
-			return next_slave(m);
+	return __propagation_next(m, origin);
+}
 
-		/* back at master */
-		m = master;
-	}
+static inline bool peers(const struct mount *m1, const struct mount *m2)
+{
+	return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
 }
 
 static struct mount *skip_propagation_subtree(struct mount *m,
 						struct mount *origin)
 {
 	/*
-	 * Advance m such that propagation_next will not return
-	 * the slaves of m.
+	 * Advance m past everything that gets propagation from it.
 	 */
-	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
-		m = last_slave(m);
+	struct mount *p = __propagation_next(m, origin);
 
-	return m;
+	while (p && peers(m, p))
+		p = __propagation_next(p, origin);
+
+	return p;
 }
 
 static struct mount *next_group(struct mount *m, struct mount *origin)
@@ -216,11 +223,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 static struct mount *last_dest, *first_source, *last_source, *dest_master;
 static struct hlist_head *list;
 
-static inline bool peers(const struct mount *m1, const struct mount *m2)
-{
-	return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
-}
-
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 {
 	struct mount *child;
@@ -463,109 +465,144 @@ void propagate_mount_unlock(struct mount *mnt)
 	}
 }
 
-static void umount_one(struct mount *mnt, struct list_head *to_umount)
+static LIST_HEAD(to_umount);
+static LIST_HEAD(candidates);
+
+static inline struct mount *first_candidate(void)
 {
-	CLEAR_MNT_MARK(mnt);
-	mnt->mnt.mnt_flags |= MNT_UMOUNT;
-	list_del_init(&mnt->mnt_child);
-	list_del_init(&mnt->mnt_umounting);
-	move_from_ns(mnt, to_umount);
+	if (list_empty(&candidates))
+		return NULL;
+	return list_first_entry(&candidates, struct mount, mnt_umounting);
 }
 
-/*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
- */
-static bool __propagate_umount(struct mount *mnt,
-			       struct list_head *to_umount,
-			       struct list_head *to_restore)
+static inline bool is_candidate(struct mount *m)
 {
-	bool progress = false;
-	struct mount *child;
-
-	/*
-	 * The state of the parent won't change if this mount is
-	 * already unmounted or marked as without children.
-	 */
-	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
-		goto out;
+	return !list_empty(&m->mnt_umounting);
+}
 
-	/* Verify topper is the only grandchild that has not been
-	 * speculatively unmounted.
-	 */
-	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
-		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-			continue;
-		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
-			continue;
-		/* Found a mounted child */
-		goto children;
-	}
+static inline bool will_be_unmounted(struct mount *m)
+{
+	return m->mnt.mnt_flags & MNT_UMOUNT;
+}
 
-	/* Mark mounts that can be unmounted if not locked */
-	SET_MNT_MARK(mnt);
-	progress = true;
+static void umount_one(struct mount *m)
+{
+	m->mnt.mnt_flags |= MNT_UMOUNT;
+	list_del_init(&m->mnt_child);
+	move_from_ns(m, &to_umount);
+}
 
-	/* If a mount is without children and not locked umount it. */
-	if (!IS_MNT_LOCKED(mnt)) {
-		umount_one(mnt, to_umount);
-	} else {
-children:
-		list_move_tail(&mnt->mnt_umounting, to_restore);
-	}
-out:
-	return progress;
+static void remove_candidate(struct mount *m)
+{
+	CLEAR_MNT_MARK(m);
+	list_del_init(&m->mnt_umounting);
 }
 
-static void umount_list(struct list_head *to_umount,
-			struct list_head *to_restore)
+static void gather_candidates(struct list_head *set)
 {
-	struct mount *mnt, *child, *tmp;
-	list_for_each_entry(mnt, to_umount, mnt_list) {
-		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
-			/* topper? */
-			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-				list_move_tail(&child->mnt_umounting, to_restore);
-			else
-				umount_one(child, to_umount);
+	LIST_HEAD(visited);
+	struct mount *m, *p, *q;
+
+	list_for_each_entry(m, set, mnt_list) {
+		if (is_candidate(m))
+			continue;
+		list_add(&m->mnt_umounting, &visited);
+		p = m->mnt_parent;
+		q = propagation_next(p, p);
+		while (q) {
+			struct mount *child = __lookup_mnt(&q->mnt,
+							   m->mnt_mountpoint);
+			if (child) {
+				struct list_head *head;
+
+				if (is_candidate(child)) {
+					q = skip_propagation_subtree(q, p);
+					continue;
+				}
+				if (will_be_unmounted(child))
+					head = &visited;
+				else
+					head = &candidates;
+				list_add(&child->mnt_umounting, head);
+			}
+			q = propagation_next(q, p);
 		}
 	}
+	while (!list_empty(&visited))	// empty visited
+		list_del_init(visited.next);
 }
 
-static void restore_mounts(struct list_head *to_restore)
+static struct mount *trim_one(struct mount *m)
 {
-	/* Restore mounts to a clean working state */
-	while (!list_empty(to_restore)) {
-		struct mount *mnt, *parent;
-		struct mountpoint *mp;
-
-		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
-		CLEAR_MNT_MARK(mnt);
-		list_del_init(&mnt->mnt_umounting);
-
-		/* Should this mount be reparented? */
-		mp = mnt->mnt_mp;
-		parent = mnt->mnt_parent;
-		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
-			mp = parent->mnt_mp;
-			parent = parent->mnt_parent;
+	bool remove_this = false, found = false, umount_this = false;
+	struct mount *n;
+	struct list_head *next;
+
+	list_for_each_entry(n, &m->mnt_mounts, mnt_child) {
+		if (!is_candidate(n)) {
+			found = true;
+			if (n->mnt_mountpoint != m->mnt.mnt_root) {
+				remove_this = true;
+				break;
+			}
 		}
-		if (parent != mnt->mnt_parent) {
-			mnt_change_mountpoint(parent, mp, mnt);
-			mnt_notify_add(mnt);
+	}
+	if (found) {
+		struct mount *this = m;
+		for (struct mount *p = this->mnt_parent;
+		     !IS_MNT_MARKED(this) && is_candidate(p);
+		     this = p, p = p->mnt_parent) {
+			SET_MNT_MARK(this);
+			if (this->mnt_mountpoint != p->mnt.mnt_root)
+				remove_candidate(p);
 		}
+	} else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) {
+		remove_this = true;
+		umount_this = true;
 	}
+	next = m->mnt_umounting.next;
+	if (remove_this) {
+		remove_candidate(m);
+		if (umount_this)
+			umount_one(m);
+	}
+	if (next == &candidates)
+		return NULL;
+
+	return list_entry(next, struct mount, mnt_umounting);
 }
 
-static void cleanup_umount_visitations(struct list_head *visited)
+static void handle_locked(struct mount *m)
 {
-	while (!list_empty(visited)) {
-		struct mount *mnt =
-			list_first_entry(visited, struct mount, mnt_umounting);
-		list_del_init(&mnt->mnt_umounting);
+	struct mount *cutoff = m, *p;
+
+	for (p = m; is_candidate(p); p = p->mnt_parent) {
+		remove_candidate(p);
+		if (!IS_MNT_LOCKED(p))
+			cutoff = p->mnt_parent;
+	}
+	if (will_be_unmounted(p))
+		cutoff = p;
+	while (m != cutoff) {
+		umount_one(m);
+		m = m->mnt_parent;
 	}
 }
 
+static void reparent(struct mount *m)
+{
+	struct mount *p = m;
+	struct mountpoint *mp;
+
+	do {
+		mp = p->mnt_mp;
+		p = p->mnt_parent;
+	} while (will_be_unmounted(p));
+
+	mnt_change_mountpoint(p, mp, m);
+	mnt_notify_add(m);
+}
+
 /*
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
@@ -573,71 +610,27 @@ static void cleanup_umount_visitations(struct list_head *visited)
  *
  * vfsmount lock must be held for write
  */
-int propagate_umount(struct list_head *list)
+void propagate_umount(struct list_head *set)
 {
-	struct mount *mnt;
-	LIST_HEAD(to_restore);
-	LIST_HEAD(to_umount);
-	LIST_HEAD(visited);
+	struct mount *m;
 
-	/* Find candidates for unmounting */
-	list_for_each_entry_reverse(mnt, list, mnt_list) {
-		struct mount *parent = mnt->mnt_parent;
-		struct mount *m;
+	gather_candidates(set);
 
-		/*
-		 * If this mount has already been visited it is known that it's
-		 * entire peer group and all of their slaves in the propagation
-		 * tree for the mountpoint has already been visited and there is
-		 * no need to visit them again.
-		 */
-		if (!list_empty(&mnt->mnt_umounting))
-			continue;
+	// make it non-shifting
+	for (m = first_candidate(); m; m = trim_one(m))
+		;
 
-		list_add_tail(&mnt->mnt_umounting, &visited);
-		for (m = propagation_next(parent, parent); m;
-		     m = propagation_next(m, parent)) {
-			struct mount *child = __lookup_mnt(&m->mnt,
-							   mnt->mnt_mountpoint);
-			if (!child)
-				continue;
-
-			if (!list_empty(&child->mnt_umounting)) {
-				/*
-				 * If the child has already been visited it is
-				 * know that it's entire peer group and all of
-				 * their slaves in the propgation tree for the
-				 * mountpoint has already been visited and there
-				 * is no need to visit this subtree again.
-				 */
-				m = skip_propagation_subtree(m, parent);
-				continue;
-			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
-				/*
-				 * We have come across a partially unmounted
-				 * mount in a list that has not been visited
-				 * yet. Remember it has been visited and
-				 * continue about our merry way.
-				 */
-				list_add_tail(&child->mnt_umounting, &visited);
-				continue;
-			}
+	// ... and non-revealing
+	while (!list_empty(&candidates))
+		handle_locked(first_candidate());
 
-			/* Check the child and parents while progress is made */
-			while (__propagate_umount(child,
-						  &to_umount, &to_restore)) {
-				/* Is the parent a umount candidate? */
-				child = child->mnt_parent;
-				if (list_empty(&child->mnt_umounting))
-					break;
-			}
-		}
+	// now to_umount consists of all non-rejected candidates
+	// deal with reparenting of remaining overmounts on those
+	list_for_each_entry(m, &to_umount, mnt_list) {
+		while (!list_empty(&m->mnt_mounts)) // should be at most one
+			reparent(list_first_entry(&m->mnt_mounts,
+						  struct mount, mnt_child));
 	}
 
-	umount_list(&to_umount, &to_restore);
-	restore_mounts(&to_restore);
-	cleanup_umount_visitations(&visited);
-	list_splice_tail(&to_umount, list);
-
-	return 0;
+	list_splice_tail_init(&to_umount, set);
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 34b6247af01d..d84a397bfd43 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -39,7 +39,7 @@ static inline void set_mnt_shared(struct mount *mnt)
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);

  reply	other threads:[~2025-05-15 11:41 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-11 23:27 [BUG] propagate_umount() breakage Al Viro
2025-05-12  4:50 ` Eric W. Biederman
2025-05-13  3:56   ` Al Viro
2025-05-15 11:41     ` Al Viro [this message]
2025-05-15 11:47       ` Al Viro
2025-05-16  5:21         ` [RFC][CFT][PATCH] Rewrite of propagate_umount() (was Re: [BUG] propagate_umount() breakage) Al Viro
2025-05-19 18:11           ` Linus Torvalds
2025-05-19 21:35             ` Al Viro
2025-05-19 22:08               ` Linus Torvalds
2025-05-19 22:26                 ` Linus Torvalds
2025-05-20 22:27               ` Eric W. Biederman
2025-05-20 23:08                 ` Al Viro
2025-05-23  2:10                   ` [RFC][CFT][PATCH v2] Rewrite of propagate_umount() Al Viro
     [not found]               ` <20250520075317.GB2023217@ZenIV>
     [not found]                 ` <87y0uqlvxs.fsf@email.froward.int.ebiederm.org>
     [not found]                   ` <20250520231854.GF2023217@ZenIV>
     [not found]                     ` <20250521023219.GA1309405@ZenIV>
     [not found]                       ` <20250617041754.GA1591763@ZenIV>
2025-06-17 21:18                         ` [PATCH][RFC] replace collect_mounts()/drop_collected_mounts() with safer variant Al Viro
2025-05-20 11:10           ` [RFC][CFT][PATCH] Rewrite of propagate_umount() (was Re: [BUG] propagate_umount() breakage) Christian Brauner
2025-05-21  2:11             ` Al Viro

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250515114150.GA3221059@ZenIV \
    --to=viro@zeniv.linux.org.uk \
    --cc=brauner@kernel.org \
    --cc=ebiederm@xmission.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).