All of lore.kernel.org
 help / color / mirror / Atom feed
From: Al Viro <viro@zeniv.linux.org.uk>
To: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: linux-fsdevel@vger.kernel.org,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Christian Brauner <brauner@kernel.org>
Subject: Re: [BUG] propagate_umount() breakage
Date: Thu, 15 May 2025 12:41:50 +0100	[thread overview]
Message-ID: <20250515114150.GA3221059@ZenIV> (raw)
In-Reply-To: <20250513035622.GE2023217@ZenIV>

On Tue, May 13, 2025 at 04:56:22AM +0100, Al Viro wrote:

> And yes, it's going to be posted along with the proof of correctness -
> I've had it with the amount of subtle corner cases in that area ;-/

FWIW, it seems to survive testing; it's _not_ final - I'm still editing
the promised proof, but by this point it's stylistic work.  I hadn't
touched that kind of formal writing for quite a while, so the text is clumsy.
The code changes are pretty much in the final shape.  Current diff of
code (all in fs/pnode.*) relative to mainline:

diff --git a/fs/pnode.c b/fs/pnode.c
index fb77427df39e..32ecebbc1d19 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -24,11 +24,6 @@ static inline struct mount *first_slave(struct mount *p)
 	return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave);
 }
 
-static inline struct mount *last_slave(struct mount *p)
-{
-	return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave);
-}
-
 static inline struct mount *next_slave(struct mount *p)
 {
 	return list_entry(p->mnt_slave.next, struct mount, mnt_slave);
@@ -136,6 +131,23 @@ void change_mnt_propagation(struct mount *mnt, int type)
 	}
 }
 
+static struct mount *__propagation_next(struct mount *m,
+					 struct mount *origin)
+{
+	while (1) {
+		struct mount *master = m->mnt_master;
+
+		if (master == origin->mnt_master) {
+			struct mount *next = next_peer(m);
+			return (next == origin) ? NULL : next;
+		} else if (m->mnt_slave.next != &master->mnt_slave_list)
+			return next_slave(m);
+
+		/* back at master */
+		m = master;
+	}
+}
+
 /*
  * get the next mount in the propagation tree.
  * @m: the mount seen last
@@ -153,31 +165,26 @@ static struct mount *propagation_next(struct mount *m,
 	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
 		return first_slave(m);
 
-	while (1) {
-		struct mount *master = m->mnt_master;
-
-		if (master == origin->mnt_master) {
-			struct mount *next = next_peer(m);
-			return (next == origin) ? NULL : next;
-		} else if (m->mnt_slave.next != &master->mnt_slave_list)
-			return next_slave(m);
+	return __propagation_next(m, origin);
+}
 
-		/* back at master */
-		m = master;
-	}
+static inline bool peers(const struct mount *m1, const struct mount *m2)
+{
+	return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
 }
 
 static struct mount *skip_propagation_subtree(struct mount *m,
 						struct mount *origin)
 {
 	/*
-	 * Advance m such that propagation_next will not return
-	 * the slaves of m.
+	 * Advance m past everything that gets propagation from it.
 	 */
-	if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list))
-		m = last_slave(m);
+	struct mount *p = __propagation_next(m, origin);
 
-	return m;
+	while (p && peers(m, p))
+		p = __propagation_next(p, origin);
+
+	return p;
 }
 
 static struct mount *next_group(struct mount *m, struct mount *origin)
@@ -216,11 +223,6 @@ static struct mount *next_group(struct mount *m, struct mount *origin)
 static struct mount *last_dest, *first_source, *last_source, *dest_master;
 static struct hlist_head *list;
 
-static inline bool peers(const struct mount *m1, const struct mount *m2)
-{
-	return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
-}
-
 static int propagate_one(struct mount *m, struct mountpoint *dest_mp)
 {
 	struct mount *child;
@@ -463,109 +465,144 @@ void propagate_mount_unlock(struct mount *mnt)
 	}
 }
 
-static void umount_one(struct mount *mnt, struct list_head *to_umount)
+static LIST_HEAD(to_umount);
+static LIST_HEAD(candidates);
+
+static inline struct mount *first_candidate(void)
 {
-	CLEAR_MNT_MARK(mnt);
-	mnt->mnt.mnt_flags |= MNT_UMOUNT;
-	list_del_init(&mnt->mnt_child);
-	list_del_init(&mnt->mnt_umounting);
-	move_from_ns(mnt, to_umount);
+	if (list_empty(&candidates))
+		return NULL;
+	return list_first_entry(&candidates, struct mount, mnt_umounting);
 }
 
-/*
- * NOTE: unmounting 'mnt' naturally propagates to all other mounts its
- * parent propagates to.
- */
-static bool __propagate_umount(struct mount *mnt,
-			       struct list_head *to_umount,
-			       struct list_head *to_restore)
+static inline bool is_candidate(struct mount *m)
 {
-	bool progress = false;
-	struct mount *child;
-
-	/*
-	 * The state of the parent won't change if this mount is
-	 * already unmounted or marked as without children.
-	 */
-	if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED))
-		goto out;
+	return !list_empty(&m->mnt_umounting);
+}
 
-	/* Verify topper is the only grandchild that has not been
-	 * speculatively unmounted.
-	 */
-	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
-		if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-			continue;
-		if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child))
-			continue;
-		/* Found a mounted child */
-		goto children;
-	}
+static inline bool will_be_unmounted(struct mount *m)
+{
+	return m->mnt.mnt_flags & MNT_UMOUNT;
+}
 
-	/* Mark mounts that can be unmounted if not locked */
-	SET_MNT_MARK(mnt);
-	progress = true;
+static void umount_one(struct mount *m)
+{
+	m->mnt.mnt_flags |= MNT_UMOUNT;
+	list_del_init(&m->mnt_child);
+	move_from_ns(m, &to_umount);
+}
 
-	/* If a mount is without children and not locked umount it. */
-	if (!IS_MNT_LOCKED(mnt)) {
-		umount_one(mnt, to_umount);
-	} else {
-children:
-		list_move_tail(&mnt->mnt_umounting, to_restore);
-	}
-out:
-	return progress;
+static void remove_candidate(struct mount *m)
+{
+	CLEAR_MNT_MARK(m);
+	list_del_init(&m->mnt_umounting);
 }
 
-static void umount_list(struct list_head *to_umount,
-			struct list_head *to_restore)
+static void gather_candidates(struct list_head *set)
 {
-	struct mount *mnt, *child, *tmp;
-	list_for_each_entry(mnt, to_umount, mnt_list) {
-		list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) {
-			/* topper? */
-			if (child->mnt_mountpoint == mnt->mnt.mnt_root)
-				list_move_tail(&child->mnt_umounting, to_restore);
-			else
-				umount_one(child, to_umount);
+	LIST_HEAD(visited);
+	struct mount *m, *p, *q;
+
+	list_for_each_entry(m, set, mnt_list) {
+		if (is_candidate(m))
+			continue;
+		list_add(&m->mnt_umounting, &visited);
+		p = m->mnt_parent;
+		q = propagation_next(p, p);
+		while (q) {
+			struct mount *child = __lookup_mnt(&q->mnt,
+							   m->mnt_mountpoint);
+			if (child) {
+				struct list_head *head;
+
+				if (is_candidate(child)) {
+					q = skip_propagation_subtree(q, p);
+					continue;
+				}
+				if (will_be_unmounted(child))
+					head = &visited;
+				else
+					head = &candidates;
+				list_add(&child->mnt_umounting, head);
+			}
+			q = propagation_next(q, p);
 		}
 	}
+	while (!list_empty(&visited))	// empty visited
+		list_del_init(visited.next);
 }
 
-static void restore_mounts(struct list_head *to_restore)
+static struct mount *trim_one(struct mount *m)
 {
-	/* Restore mounts to a clean working state */
-	while (!list_empty(to_restore)) {
-		struct mount *mnt, *parent;
-		struct mountpoint *mp;
-
-		mnt = list_first_entry(to_restore, struct mount, mnt_umounting);
-		CLEAR_MNT_MARK(mnt);
-		list_del_init(&mnt->mnt_umounting);
-
-		/* Should this mount be reparented? */
-		mp = mnt->mnt_mp;
-		parent = mnt->mnt_parent;
-		while (parent->mnt.mnt_flags & MNT_UMOUNT) {
-			mp = parent->mnt_mp;
-			parent = parent->mnt_parent;
+	bool remove_this = false, found = false, umount_this = false;
+	struct mount *n;
+	struct list_head *next;
+
+	list_for_each_entry(n, &m->mnt_mounts, mnt_child) {
+		if (!is_candidate(n)) {
+			found = true;
+			if (n->mnt_mountpoint != m->mnt.mnt_root) {
+				remove_this = true;
+				break;
+			}
 		}
-		if (parent != mnt->mnt_parent) {
-			mnt_change_mountpoint(parent, mp, mnt);
-			mnt_notify_add(mnt);
+	}
+	if (found) {
+		struct mount *this = m;
+		for (struct mount *p = this->mnt_parent;
+		     !IS_MNT_MARKED(this) && is_candidate(p);
+		     this = p, p = p->mnt_parent) {
+			SET_MNT_MARK(this);
+			if (this->mnt_mountpoint != p->mnt.mnt_root)
+				remove_candidate(p);
 		}
+	} else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) {
+		remove_this = true;
+		umount_this = true;
 	}
+	next = m->mnt_umounting.next;
+	if (remove_this) {
+		remove_candidate(m);
+		if (umount_this)
+			umount_one(m);
+	}
+	if (next == &candidates)
+		return NULL;
+
+	return list_entry(next, struct mount, mnt_umounting);
 }
 
-static void cleanup_umount_visitations(struct list_head *visited)
+static void handle_locked(struct mount *m)
 {
-	while (!list_empty(visited)) {
-		struct mount *mnt =
-			list_first_entry(visited, struct mount, mnt_umounting);
-		list_del_init(&mnt->mnt_umounting);
+	struct mount *cutoff = m, *p;
+
+	for (p = m; is_candidate(p); p = p->mnt_parent) {
+		remove_candidate(p);
+		if (!IS_MNT_LOCKED(p))
+			cutoff = p->mnt_parent;
+	}
+	if (will_be_unmounted(p))
+		cutoff = p;
+	while (m != cutoff) {
+		umount_one(m);
+		m = m->mnt_parent;
 	}
 }
 
+static void reparent(struct mount *m)
+{
+	struct mount *p = m;
+	struct mountpoint *mp;
+
+	do {
+		mp = p->mnt_mp;
+		p = p->mnt_parent;
+	} while (will_be_unmounted(p));
+
+	mnt_change_mountpoint(p, mp, m);
+	mnt_notify_add(m);
+}
+
 /*
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
@@ -573,71 +610,27 @@ static void cleanup_umount_visitations(struct list_head *visited)
  *
  * vfsmount lock must be held for write
  */
-int propagate_umount(struct list_head *list)
+void propagate_umount(struct list_head *set)
 {
-	struct mount *mnt;
-	LIST_HEAD(to_restore);
-	LIST_HEAD(to_umount);
-	LIST_HEAD(visited);
+	struct mount *m;
 
-	/* Find candidates for unmounting */
-	list_for_each_entry_reverse(mnt, list, mnt_list) {
-		struct mount *parent = mnt->mnt_parent;
-		struct mount *m;
+	gather_candidates(set);
 
-		/*
-		 * If this mount has already been visited it is known that it's
-		 * entire peer group and all of their slaves in the propagation
-		 * tree for the mountpoint has already been visited and there is
-		 * no need to visit them again.
-		 */
-		if (!list_empty(&mnt->mnt_umounting))
-			continue;
+	// make it non-shifting
+	for (m = first_candidate(); m; m = trim_one(m))
+		;
 
-		list_add_tail(&mnt->mnt_umounting, &visited);
-		for (m = propagation_next(parent, parent); m;
-		     m = propagation_next(m, parent)) {
-			struct mount *child = __lookup_mnt(&m->mnt,
-							   mnt->mnt_mountpoint);
-			if (!child)
-				continue;
-
-			if (!list_empty(&child->mnt_umounting)) {
-				/*
-				 * If the child has already been visited it is
-				 * know that it's entire peer group and all of
-				 * their slaves in the propgation tree for the
-				 * mountpoint has already been visited and there
-				 * is no need to visit this subtree again.
-				 */
-				m = skip_propagation_subtree(m, parent);
-				continue;
-			} else if (child->mnt.mnt_flags & MNT_UMOUNT) {
-				/*
-				 * We have come across a partially unmounted
-				 * mount in a list that has not been visited
-				 * yet. Remember it has been visited and
-				 * continue about our merry way.
-				 */
-				list_add_tail(&child->mnt_umounting, &visited);
-				continue;
-			}
+	// ... and non-revealing
+	while (!list_empty(&candidates))
+		handle_locked(first_candidate());
 
-			/* Check the child and parents while progress is made */
-			while (__propagate_umount(child,
-						  &to_umount, &to_restore)) {
-				/* Is the parent a umount candidate? */
-				child = child->mnt_parent;
-				if (list_empty(&child->mnt_umounting))
-					break;
-			}
-		}
+	// now to_umount consists of all non-rejected candidates
+	// deal with reparenting of remaining overmounts on those
+	list_for_each_entry(m, &to_umount, mnt_list) {
+		while (!list_empty(&m->mnt_mounts)) // should be at most one
+			reparent(list_first_entry(&m->mnt_mounts,
+						  struct mount, mnt_child));
 	}
 
-	umount_list(&to_umount, &to_restore);
-	restore_mounts(&to_restore);
-	cleanup_umount_visitations(&visited);
-	list_splice_tail(&to_umount, list);
-
-	return 0;
+	list_splice_tail_init(&to_umount, set);
 }
diff --git a/fs/pnode.h b/fs/pnode.h
index 34b6247af01d..d84a397bfd43 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -39,7 +39,7 @@ static inline void set_mnt_shared(struct mount *mnt)
 void change_mnt_propagation(struct mount *, int);
 int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 		struct hlist_head *);
-int propagate_umount(struct list_head *);
+void propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
 void mnt_release_group_id(struct mount *);

  reply	other threads:[~2025-05-15 11:41 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-11 23:27 [BUG] propagate_umount() breakage Al Viro
2025-05-12  4:50 ` Eric W. Biederman
2025-05-13  3:56   ` Al Viro
2025-05-15 11:41     ` Al Viro [this message]
2025-05-15 11:47       ` Al Viro
2025-05-16  5:21         ` [RFC][CFT][PATCH] Rewrite of propagate_umount() (was Re: [BUG] propagate_umount() breakage) Al Viro
2025-05-19 18:11           ` Linus Torvalds
2025-05-19 21:35             ` Al Viro
2025-05-19 22:08               ` Linus Torvalds
2025-05-19 22:26                 ` Linus Torvalds
2025-05-20 22:27               ` Eric W. Biederman
2025-05-20 23:08                 ` Al Viro
2025-05-23  2:10                   ` [RFC][CFT][PATCH v2] Rewrite of propagate_umount() Al Viro
     [not found]               ` <20250520075317.GB2023217@ZenIV>
     [not found]                 ` <87y0uqlvxs.fsf@email.froward.int.ebiederm.org>
     [not found]                   ` <20250520231854.GF2023217@ZenIV>
     [not found]                     ` <20250521023219.GA1309405@ZenIV>
     [not found]                       ` <20250617041754.GA1591763@ZenIV>
2025-06-17 21:18                         ` [PATCH][RFC] replace collect_mounts()/drop_collected_mounts() with safer variant Al Viro
2025-05-20 11:10           ` [RFC][CFT][PATCH] Rewrite of propagate_umount() (was Re: [BUG] propagate_umount() breakage) Christian Brauner
2025-05-21  2:11             ` Al Viro

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250515114150.GA3221059@ZenIV \
    --to=viro@zeniv.linux.org.uk \
    --cc=brauner@kernel.org \
    --cc=ebiederm@xmission.com \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=torvalds@linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.