* [PATCH][RFC] ->mnt_devname is never NULL
@ 2025-04-21 3:35 Al Viro
2025-04-21 7:56 ` Christian Brauner
0 siblings, 1 reply; 13+ messages in thread
From: Al Viro @ 2025-04-21 3:35 UTC (permalink / raw)
To: linux-fsdevel
Not since 8f2918898eb5 "new helpers: vfs_create_mount(), fc_mount()"
back in 2018. Get rid of the dead checks...
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/namespace.c b/fs/namespace.c
index d9ca80dcc544..fa17762268f5 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -355,12 +355,13 @@ static struct mount *alloc_vfsmnt(const char *name)
if (err)
goto out_free_cache;
- if (name) {
+ if (name)
mnt->mnt_devname = kstrdup_const(name,
GFP_KERNEL_ACCOUNT);
- if (!mnt->mnt_devname)
- goto out_free_id;
- }
+ else
+ mnt->mnt_devname = "none";
+ if (!mnt->mnt_devname)
+ goto out_free_id;
#ifdef CONFIG_SMP
mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
@@ -1268,7 +1269,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
if (!fc->root)
return ERR_PTR(-EINVAL);
- mnt = alloc_vfsmnt(fc->source ?: "none");
+ mnt = alloc_vfsmnt(fc->source);
if (!mnt)
return ERR_PTR(-ENOMEM);
@@ -5491,7 +5492,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
seq->buf[seq->count] = '\0';
seq->count = start;
seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
- } else if (r->mnt_devname) {
+ } else {
seq_puts(seq, r->mnt_devname);
}
return 0;
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index e133b507ddf3..5c555db68aa2 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -111,7 +111,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
} else {
- mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+ mangle(m, r->mnt_devname);
}
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -177,7 +177,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
if (err)
goto out;
} else {
- mangle(m, r->mnt_devname ? r->mnt_devname : "none");
+ mangle(m, r->mnt_devname);
}
seq_puts(m, sb_rdonly(sb) ? " ro" : " rw");
err = show_sb_opts(m, sb);
@@ -199,17 +199,13 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
int err;
/* device */
+ seq_puts(m, "device ");
if (sb->s_op->show_devname) {
- seq_puts(m, "device ");
err = sb->s_op->show_devname(m, mnt_path.dentry);
if (err)
goto out;
} else {
- if (r->mnt_devname) {
- seq_puts(m, "device ");
- mangle(m, r->mnt_devname);
- } else
- seq_puts(m, "no device");
+ mangle(m, r->mnt_devname);
}
/* mount point */
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-21 3:35 [PATCH][RFC] ->mnt_devname is never NULL Al Viro
@ 2025-04-21 7:56 ` Christian Brauner
2025-04-21 16:29 ` Al Viro
0 siblings, 1 reply; 13+ messages in thread
From: Christian Brauner @ 2025-04-21 7:56 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel
On Mon, Apr 21, 2025 at 04:35:09AM +0100, Al Viro wrote:
> Not since 8f2918898eb5 "new helpers: vfs_create_mount(), fc_mount()"
> back in 2018. Get rid of the dead checks...
>
> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> ---
Good idea. Fwiw, I've put this into vfs-6.16.mount with some other minor
stuff. If you're keeping it yourself let me know.
> diff --git a/fs/namespace.c b/fs/namespace.c
> index d9ca80dcc544..fa17762268f5 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -355,12 +355,13 @@ static struct mount *alloc_vfsmnt(const char *name)
> if (err)
> goto out_free_cache;
>
> - if (name) {
> + if (name)
> mnt->mnt_devname = kstrdup_const(name,
> GFP_KERNEL_ACCOUNT);
> - if (!mnt->mnt_devname)
> - goto out_free_id;
> - }
> + else
> + mnt->mnt_devname = "none";
> + if (!mnt->mnt_devname)
> + goto out_free_id;
>
> #ifdef CONFIG_SMP
> mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
> @@ -1268,7 +1269,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
> if (!fc->root)
> return ERR_PTR(-EINVAL);
>
> - mnt = alloc_vfsmnt(fc->source ?: "none");
> + mnt = alloc_vfsmnt(fc->source);
> if (!mnt)
> return ERR_PTR(-ENOMEM);
>
> @@ -5491,7 +5492,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
> seq->buf[seq->count] = '\0';
> seq->count = start;
> seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
> - } else if (r->mnt_devname) {
> + } else {
> seq_puts(seq, r->mnt_devname);
> }
> return 0;
> diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
> index e133b507ddf3..5c555db68aa2 100644
> --- a/fs/proc_namespace.c
> +++ b/fs/proc_namespace.c
> @@ -111,7 +111,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
> if (err)
> goto out;
> } else {
> - mangle(m, r->mnt_devname ? r->mnt_devname : "none");
> + mangle(m, r->mnt_devname);
> }
> seq_putc(m, ' ');
> /* mountpoints outside of chroot jail will give SEQ_SKIP on this */
> @@ -177,7 +177,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
> if (err)
> goto out;
> } else {
> - mangle(m, r->mnt_devname ? r->mnt_devname : "none");
> + mangle(m, r->mnt_devname);
> }
> seq_puts(m, sb_rdonly(sb) ? " ro" : " rw");
> err = show_sb_opts(m, sb);
> @@ -199,17 +199,13 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
> int err;
>
> /* device */
> + seq_puts(m, "device ");
> if (sb->s_op->show_devname) {
> - seq_puts(m, "device ");
> err = sb->s_op->show_devname(m, mnt_path.dentry);
> if (err)
> goto out;
> } else {
> - if (r->mnt_devname) {
> - seq_puts(m, "device ");
> - mangle(m, r->mnt_devname);
> - } else
> - seq_puts(m, "no device");
> + mangle(m, r->mnt_devname);
> }
>
> /* mount point */
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-21 7:56 ` Christian Brauner
@ 2025-04-21 16:29 ` Al Viro
2025-04-21 17:03 ` Al Viro
` (2 more replies)
0 siblings, 3 replies; 13+ messages in thread
From: Al Viro @ 2025-04-21 16:29 UTC (permalink / raw)
To: Christian Brauner; +Cc: linux-fsdevel
On Mon, Apr 21, 2025 at 09:56:20AM +0200, Christian Brauner wrote:
> On Mon, Apr 21, 2025 at 04:35:09AM +0100, Al Viro wrote:
> > Not since 8f2918898eb5 "new helpers: vfs_create_mount(), fc_mount()"
> > back in 2018. Get rid of the dead checks...
> >
> > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > ---
>
> Good idea. Fwiw, I've put this into vfs-6.16.mount with some other minor
> stuff. If you're keeping it yourself let me know.
Not sure... I'm going through documenting the struct mount lifecycle/locking/etc.
and it already looks like there will be more patches, but then some are going
to be #fixes fodder.
Example caught just a couple of minutes ago: do_lock_mount()
if (beneath) {
m = real_mount(mnt);
read_seqlock_excl(&mount_lock);
dentry = dget(m->mnt_mountpoint);
read_sequnlock_excl(&mount_lock);
} else {
dentry = path->dentry;
}
inode_lock(dentry->d_inode);
What's to prevent the 'beneath' case from getting mnt mount --move'd
away *AND* the ex-parent from getting unmounted while we are blocked
in inode_lock? At this point we are not holding any locks whatsoever
(and all mount-related locks nest inside inode_lock(), so we couldn't
hold them there anyway).
Hit that race and watch a very unhappy umount...
BTW, a stylistic note: 'beneath' and '!beneath' cases have very little
in common; I'm pretty sure it would be cleaner to split this function
in two, putting the '!beneath' case back into lock_mount() and calling
the rest lock_mount_beneath()... This kind of boolean arguments is
a bad idea, IME - especially when they affect locking or lifetimes
in any way.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-21 16:29 ` Al Viro
@ 2025-04-21 17:03 ` Al Viro
2025-04-22 3:14 ` [PATCH][RFC] do_lock_mount() races in 'beneath' case Al Viro
2025-04-22 7:43 ` [PATCH][RFC] ->mnt_devname is never NULL Christian Brauner
2025-04-22 7:31 ` Christian Brauner
2025-04-23 22:20 ` Al Viro
2 siblings, 2 replies; 13+ messages in thread
From: Al Viro @ 2025-04-21 17:03 UTC (permalink / raw)
To: Christian Brauner; +Cc: linux-fsdevel
On Mon, Apr 21, 2025 at 05:29:47PM +0100, Al Viro wrote:
> What's to prevent the 'beneath' case from getting mnt mount --move'd
> away *AND* the ex-parent from getting unmounted while we are blocked
> in inode_lock? At this point we are not holding any locks whatsoever
> (and all mount-related locks nest inside inode_lock(), so we couldn't
> hold them there anyway).
>
> Hit that race and watch a very unhappy umount...
While we are at it, in normal case inode_unlock() in unlock_mount()
is safe since we have dentry (and associated mount) pinned by
struct path we'd fed to matching lock_mount(). No longer true for
the 'beneath' case, AFAICS...
^ permalink raw reply [flat|nested] 13+ messages in thread
* [PATCH][RFC] do_lock_mount() races in 'beneath' case
2025-04-21 17:03 ` Al Viro
@ 2025-04-22 3:14 ` Al Viro
2025-04-22 7:47 ` Christian Brauner
2025-04-22 7:43 ` [PATCH][RFC] ->mnt_devname is never NULL Christian Brauner
1 sibling, 1 reply; 13+ messages in thread
From: Al Viro @ 2025-04-22 3:14 UTC (permalink / raw)
To: Christian Brauner; +Cc: linux-fsdevel
On Mon, Apr 21, 2025 at 06:03:19PM +0100, Al Viro wrote:
> On Mon, Apr 21, 2025 at 05:29:47PM +0100, Al Viro wrote:
>
> > What's to prevent the 'beneath' case from getting mnt mount --move'd
> > away *AND* the ex-parent from getting unmounted while we are blocked
> > in inode_lock? At this point we are not holding any locks whatsoever
> > (and all mount-related locks nest inside inode_lock(), so we couldn't
> > hold them there anyway).
> >
> > Hit that race and watch a very unhappy umount...
>
> While we are at it, in normal case inode_unlock() in unlock_mount()
> is safe since we have dentry (and associated mount) pinned by
> struct path we'd fed to matching lock_mount(). No longer true for
> the 'beneath' case, AFAICS...
Completely untested patch follows; 'beneath' case in do_lock_mount() is made
to grab mount reference to match the dentry one (same lifetime; dropped
simultaneously), unlock_mount() unlocks the inode *before* namespace_unlock(),
so we don't depend upon the externally held references.
Comments?
diff --git a/fs/namespace.c b/fs/namespace.c
index fa17762268f5..91d28f283f1c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2827,56 +2827,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
struct vfsmount *mnt = path->mnt;
struct dentry *dentry;
struct mountpoint *mp = ERR_PTR(-ENOENT);
+ struct path under = {};
for (;;) {
- struct mount *m;
+ struct mount *m = real_mount(mnt);
if (beneath) {
- m = real_mount(mnt);
+ path_put(&under);
read_seqlock_excl(&mount_lock);
- dentry = dget(m->mnt_mountpoint);
+ under.mnt = mntget(&m->mnt_parent->mnt);
+ under.dentry = dget(m->mnt_mountpoint);
read_sequnlock_excl(&mount_lock);
+ dentry = under.dentry;
} else {
dentry = path->dentry;
}
inode_lock(dentry->d_inode);
- if (unlikely(cant_mount(dentry))) {
- inode_unlock(dentry->d_inode);
- goto out;
- }
-
namespace_lock();
- if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
+ if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
+ break; // not to be mounted on
+
+ if (beneath && unlikely(m->mnt_mountpoint != dentry ||
+ &m->mnt_parent->mnt != under.mnt)) {
namespace_unlock();
inode_unlock(dentry->d_inode);
- goto out;
+ continue; // got moved
}
mnt = lookup_mnt(path);
- if (likely(!mnt))
+ if (unlikely(mnt)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ path_put(path);
+ path->mnt = mnt;
+ path->dentry = dget(mnt->mnt_root);
+ continue; // got overmounted
+ }
+ mp = get_mountpoint(dentry);
+ if (IS_ERR(mp))
break;
-
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- dput(dentry);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- }
-
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
+ if (beneath) {
+ /*
+ * @under duplicates the references that will stay
+ * at least until namespace_unlock(), so the path_put()
+ * below is safe (and OK to do under namespace_lock -
+ * we are not dropping the final references here).
+ */
+ path_put(&under);
+ }
+ return mp;
}
-
-out:
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
if (beneath)
- dput(dentry);
-
+ path_put(&under);
return mp;
}
@@ -2892,9 +2898,9 @@ static void unlock_mount(struct mountpoint *where)
read_seqlock_excl(&mount_lock);
put_mountpoint(where);
read_sequnlock_excl(&mount_lock);
+ inode_unlock(dentry->d_inode);
namespace_unlock();
- inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-21 16:29 ` Al Viro
2025-04-21 17:03 ` Al Viro
@ 2025-04-22 7:31 ` Christian Brauner
2025-04-22 12:25 ` Al Viro
2025-04-23 22:20 ` Al Viro
2 siblings, 1 reply; 13+ messages in thread
From: Christian Brauner @ 2025-04-22 7:31 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel
On Mon, Apr 21, 2025 at 05:29:47PM +0100, Al Viro wrote:
> On Mon, Apr 21, 2025 at 09:56:20AM +0200, Christian Brauner wrote:
> > On Mon, Apr 21, 2025 at 04:35:09AM +0100, Al Viro wrote:
> > > Not since 8f2918898eb5 "new helpers: vfs_create_mount(), fc_mount()"
> > > back in 2018. Get rid of the dead checks...
> > >
> > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > > ---
> >
> > Good idea. Fwiw, I've put this into vfs-6.16.mount with some other minor
> > stuff. If you're keeping it yourself let me know.
>
> Not sure... I'm going through documenting the struct mount lifecycle/locking/etc.
> and it already looks like there will be more patches, but then some are going
> to be #fixes fodder.
>
> Example caught just a couple of minutes ago: do_lock_mount()
> if (beneath) {
> m = real_mount(mnt);
> read_seqlock_excl(&mount_lock);
> dentry = dget(m->mnt_mountpoint);
> read_sequnlock_excl(&mount_lock);
> } else {
> dentry = path->dentry;
> }
>
> inode_lock(dentry->d_inode);
> What's to prevent the 'beneath' case from getting mnt mount --move'd
> away *AND* the ex-parent from getting unmounted while we are blocked
> in inode_lock? At this point we are not holding any locks whatsoever
> (and all mount-related locks nest inside inode_lock(), so we couldn't
> hold them there anyway).
>
> Hit that race and watch a very unhappy umount...
If it gets unmounted or moved we immediately detect this in the next line:
if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-21 17:03 ` Al Viro
2025-04-22 3:14 ` [PATCH][RFC] do_lock_mount() races in 'beneath' case Al Viro
@ 2025-04-22 7:43 ` Christian Brauner
1 sibling, 0 replies; 13+ messages in thread
From: Christian Brauner @ 2025-04-22 7:43 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel
On Mon, Apr 21, 2025 at 06:03:19PM +0100, Al Viro wrote:
> On Mon, Apr 21, 2025 at 05:29:47PM +0100, Al Viro wrote:
>
> > What's to prevent the 'beneath' case from getting mnt mount --move'd
> > away *AND* the ex-parent from getting unmounted while we are blocked
> > in inode_lock? At this point we are not holding any locks whatsoever
> > (and all mount-related locks nest inside inode_lock(), so we couldn't
> > hold them there anyway).
> >
> > Hit that race and watch a very unhappy umount...
>
> While we are at it, in normal case inode_unlock() in unlock_mount()
> is safe since we have dentry (and associated mount) pinned by
> struct path we'd fed to matching lock_mount(). No longer true for
> the 'beneath' case, AFAICS...
I'm not following. Please explain the issue in detail. Both mount and
dentry are pinned via struct path independent of whether its beneath or
not beneath.
What we pass to unlock_mount() is the mountpoint which pins the relevant
dentry separately. do_lock_mount() keeps @dentry for the mountpoint
pinned until it has taken a separate reference. We only put the
reference to the mountpoint's dentry if we know that the for (;;) will
continue aka not break or when get_mountpoint() has taken it's own
reference.
So really, I'm very confused atm.
Also if this were the case all invasive move mount beneath tests I added
should cause endless splats under any sort of KASAN which they are
constantly run under in a tight loop in my local testing and by syzbot.
For the latter I explicitly added support for it in:
https://github.com/google/syzkaller/commit/058b3a5a6a945a55767811552eb7b9f4a20307f8
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] do_lock_mount() races in 'beneath' case
2025-04-22 3:14 ` [PATCH][RFC] do_lock_mount() races in 'beneath' case Al Viro
@ 2025-04-22 7:47 ` Christian Brauner
0 siblings, 0 replies; 13+ messages in thread
From: Christian Brauner @ 2025-04-22 7:47 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel
On Tue, Apr 22, 2025 at 04:14:48AM +0100, Al Viro wrote:
> On Mon, Apr 21, 2025 at 06:03:19PM +0100, Al Viro wrote:
> > On Mon, Apr 21, 2025 at 05:29:47PM +0100, Al Viro wrote:
> >
> > > What's to prevent the 'beneath' case from getting mnt mount --move'd
> > > away *AND* the ex-parent from getting unmounted while we are blocked
> > > in inode_lock? At this point we are not holding any locks whatsoever
> > > (and all mount-related locks nest inside inode_lock(), so we couldn't
> > > hold them there anyway).
> > >
> > > Hit that race and watch a very unhappy umount...
> >
> > While we are at it, in normal case inode_unlock() in unlock_mount()
> > is safe since we have dentry (and associated mount) pinned by
> > struct path we'd fed to matching lock_mount(). No longer true for
> > the 'beneath' case, AFAICS...
>
> Completely untested patch follows; 'beneath' case in do_lock_mount() is made
> to grab mount reference to match the dentry one (same lifetime; dropped
> simultaneously), unlock_mount() unlocks the inode *before* namespace_unlock(),
> so we don't depend upon the externally held references.
Afaict both isssues you mentioned shouldn't exist. So I'd first like to
have details on how they're supposed to happen before fiddling with the
code, please.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-22 7:31 ` Christian Brauner
@ 2025-04-22 12:25 ` Al Viro
2025-04-22 13:40 ` Christian Brauner
2025-04-23 1:30 ` Al Viro
0 siblings, 2 replies; 13+ messages in thread
From: Al Viro @ 2025-04-22 12:25 UTC (permalink / raw)
To: Christian Brauner; +Cc: linux-fsdevel
On Tue, Apr 22, 2025 at 09:31:14AM +0200, Christian Brauner wrote:
> On Mon, Apr 21, 2025 at 05:29:47PM +0100, Al Viro wrote:
> > On Mon, Apr 21, 2025 at 09:56:20AM +0200, Christian Brauner wrote:
> > > On Mon, Apr 21, 2025 at 04:35:09AM +0100, Al Viro wrote:
> > > > Not since 8f2918898eb5 "new helpers: vfs_create_mount(), fc_mount()"
> > > > back in 2018. Get rid of the dead checks...
> > > >
> > > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > > > ---
> > >
> > > Good idea. Fwiw, I've put this into vfs-6.16.mount with some other minor
> > > stuff. If you're keeping it yourself let me know.
> >
> > Not sure... I'm going through documenting the struct mount lifecycle/locking/etc.
> > and it already looks like there will be more patches, but then some are going
> > to be #fixes fodder.
> >
> > Example caught just a couple of minutes ago: do_lock_mount()
> > if (beneath) {
> > m = real_mount(mnt);
> > read_seqlock_excl(&mount_lock);
> > dentry = dget(m->mnt_mountpoint);
> > read_sequnlock_excl(&mount_lock);
> > } else {
> > dentry = path->dentry;
> > }
> >
> > inode_lock(dentry->d_inode);
> > What's to prevent the 'beneath' case from getting mnt mount --move'd
> > away *AND* the ex-parent from getting unmounted while we are blocked
> > in inode_lock? At this point we are not holding any locks whatsoever
> > (and all mount-related locks nest inside inode_lock(), so we couldn't
> > hold them there anyway).
> >
> > Hit that race and watch a very unhappy umount...
>
> If it gets unmounted or moved we immediately detect this in the next line:
>
> if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
Sure, we would - *AFTER* we get through that inode_lock().
Consider the following setup:
mkdir foo
mkdir bar
mkdir splat
mount -t tmpfs none foo # mount 1
mount -t tmpfs none bar # mount 2
mkdir bar/baz
mount -t tmpfs none bar/baz # mount 3
then
A: move_mount(AT_FDCWD, "foo", AT_FDCWD, "bar/baz", MOVE_MOUNT_BENEATH)
gets to do_move_mount() and into do_lock_mount() called by it.
path->mnt points to mount 3, path->dentry - to its root. Both are pinned.
do_lock_mount() goes into the first iteration of loop. beneath is true,
so it picks dentry - that of #3 mountpoint, i.e. "/baz" on #2 tmpfs instance.
At that point refcount of that dentry is 3 - one from being a positive on
tmpfs, one from being a mountpoint and one more just grabbed by do_lock_mount().
Now we enter inode_lock(dentry->d_inode). Note that at that point A is not
holding any locks. Suppose it gets preempted at this moment for whatever reason.
B: mount --move bar/baz splat
Proceeds without any problems, mount #3 gets moved to "splat". Now refcount
of mount #2 is not pinned by anything and refcount of "/baz" on it is 2, since
it's no longer a mountpoint.
B: umount bar
... and now it hits the fan, since the refcount of mount #2 is not elevated by
anything, so we do not hit -EBUSY and proceed through umount(2) all the way to
kill_litter_super(), which drops the refcount of "/baz" to 1 and calls kill_anon_super().
Which gets to shrink_dcache_for_umount() and from there - to umount_check() on
that dentry. You get yelled at, then you get yelled at again for busy inodes
after umount (that dentry is pinning the inode down), etc. Superblock of #2
is freed.
A: regains CPU. is_mounted() is true (now at splat instead of bar/baz, but still
mounted), ->mnt_mountpoint does not match.
All right, inode_unlock(dentry->d_inode), then dput(dentry) and now the refcount
of that dentry finally hits zero. We get iput() on its inode, followed by
shmem_evict_inode() which is where we finally oops.
As for the second issue... Normal callers of unlock_mount() do have a struct path
somewhere that pins the location we are dealing with. However, 'beneath' case
of do_move_mount() does not - it relies upon the sucker being a mountpoint all
along. Which is fine until you drop namespace_sem. As soon as namespace_unlock()
has been called, there's no warranty that it will _stay_ a mountpoint. Moving
that inode_unlock() before the namespace_unlock() avoids that scenario.
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-22 12:25 ` Al Viro
@ 2025-04-22 13:40 ` Christian Brauner
2025-04-23 1:30 ` Al Viro
1 sibling, 0 replies; 13+ messages in thread
From: Christian Brauner @ 2025-04-22 13:40 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel
> Consider the following setup:
>
> mkdir foo
> mkdir bar
> mkdir splat
> mount -t tmpfs none foo # mount 1
> mount -t tmpfs none bar # mount 2
> mkdir bar/baz
> mount -t tmpfs none bar/baz # mount 3
>
> then
>
> A: move_mount(AT_FDCWD, "foo", AT_FDCWD, "bar/baz", MOVE_MOUNT_BENEATH)
> gets to do_move_mount() and into do_lock_mount() called by it.
>
> path->mnt points to mount 3, path->dentry - to its root. Both are pinned.
> do_lock_mount() goes into the first iteration of loop. beneath is true,
> so it picks dentry - that of #3 mountpoint, i.e. "/baz" on #2 tmpfs instance.
>
> At that point refcount of that dentry is 3 - one from being a positive on
> tmpfs, one from being a mountpoint and one more just grabbed by do_lock_mount().
>
> Now we enter inode_lock(dentry->d_inode). Note that at that point A is not
> holding any locks. Suppose it gets preempted at this moment for whatever reason.
>
> B: mount --move bar/baz splat
> Proceeds without any problems, mount #3 gets moved to "splat". Now refcount
> of mount #2 is not pinned by anything and refcount of "/baz" on it is 2, since
> it's no longer a mountpoint.
>
> B: umount bar
> ... and now it hits the fan, since the refcount of mount #2 is not elevated by
> anything, so we do not hit -EBUSY and proceed through umount(2) all the way to
Ok, what you want to say is that we're not keeping a refcount on
path->mnt where the mountpoint is located that we're looking up in the
beneath case.
First iteration, @path->mnt_3 will hold a reference to mount 3. We now
get a reference to @m->mnt_mountpoint that resides on mnt_2 but
we're not holding a reference to mnt_2.
> As for the second issue... Normal callers of unlock_mount() do have a struct path
> somewhere that pins the location we are dealing with. However, 'beneath' case
> of do_move_mount() does not - it relies upon the sucker being a mountpoint all
> along. Which is fine until you drop namespace_sem. As soon as namespace_unlock()
> has been called, there's no warranty that it will _stay_ a mountpoint. Moving
> that inode_unlock() before the namespace_unlock() avoids that scenario.
Right.
Thanks for the details, that helped!
Can you resend your changes as a proper commit, please? I have a set of
fixes for -rc4 already and ideally I'd like to include and backport this
right away (We also should really have at least a test with exactly that
layout you're describing.).
It'd be nice if we had infra to force context switches so things like
that were reliably testable...
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-22 12:25 ` Al Viro
2025-04-22 13:40 ` Christian Brauner
@ 2025-04-23 1:30 ` Al Viro
1 sibling, 0 replies; 13+ messages in thread
From: Al Viro @ 2025-04-23 1:30 UTC (permalink / raw)
To: Christian Brauner; +Cc: linux-fsdevel
On Tue, Apr 22, 2025 at 01:25:14PM +0100, Al Viro wrote:
> Consider the following setup:
>
> mkdir foo
> mkdir bar
> mkdir splat
> mount -t tmpfs none foo # mount 1
> mount -t tmpfs none bar # mount 2
> mkdir bar/baz
> mount -t tmpfs none bar/baz # mount 3
>
> then
>
> A: move_mount(AT_FDCWD, "foo", AT_FDCWD, "bar/baz", MOVE_MOUNT_BENEATH)
> gets to do_move_mount() and into do_lock_mount() called by it.
>
> path->mnt points to mount 3, path->dentry - to its root. Both are pinned.
> do_lock_mount() goes into the first iteration of loop. beneath is true,
> so it picks dentry - that of #3 mountpoint, i.e. "/baz" on #2 tmpfs instance.
>
> At that point refcount of that dentry is 3 - one from being a positive on
> tmpfs, one from being a mountpoint and one more just grabbed by do_lock_mount().
>
> Now we enter inode_lock(dentry->d_inode). Note that at that point A is not
> holding any locks. Suppose it gets preempted at this moment for whatever reason.
>
> B: mount --move bar/baz splat
> Proceeds without any problems, mount #3 gets moved to "splat". Now refcount
> of mount #2 is not pinned by anything and refcount of "/baz" on it is 2, since
> it's no longer a mountpoint.
>
> B: umount bar
BTW, another way to hit the same would be umount("bar", MNT_DETACH) instead of
mount --move + umount - just one syscall to get in; as long as it hits
namespace_lock() first before the move_mount() does, that's it.
And yes, on mainline with
if (unlikely(strcmp(current->comm, "wanker") == 0))
msleep(60000);
in front of that inode_lock() the above does trigger the spew about busy dentries
and inodes on umount -l and an oops from do_move_mount() -> do_lock_mount() ->
dput() -> iput() once the damn thing notices that mnt got unmounted and tries
to clean up. Commit below + same debugging patch => no problem.
I'd prefer a reproducer with decent hit rate without an artificial delay inserted
just before that inode_lock() - that's fine to demonstrate that bug exists in
mainline (and fixed by the patch), but for CI it's not a good idea...
Any suggestions (as well as more testing the fix itself) would be very welcome...
commit e8dbfe567bc5362ecb5ebadb4457a2110163a901
Author: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue Apr 22 15:27:03 2025 -0400
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
Normally do_lock_mount(path, _) is locking a mountpoint pinned by
*path and at the time when matching unlock_mount() unlocks that
location it is still pinned by the same thing.
Unfortunately, for 'beneath' case it's no longer that simple -
the object being locked is not the one *path points to. It's the
mountpoint of path->mnt. The thing is, without sufficient locking
->mnt_parent may change under us and none of the locks are held
at that point. The rules are
* mount_lock stabilizes m->mnt_parent for any mount m.
* namespace_sem stabilizes m->mnt_parent, provided that
m is mounted.
* if either of the above holds and refcount of m is positive,
we are guaranteed the same for refcount of m->mnt_parent.
namespace_sem nests inside inode_lock(), so do_lock_mount() has
to take inode_lock() before grabbing namespace_sem. It does
recheck that path->mnt is still mounted in the same place after
getting namespace_sem, and it does take care to pin the dentry.
It is needed, since otherwise we might end up with racing mount --move
(or umount) happening while we were getting locks; in that case
dentry would no longer be a mountpoint and could've been evicted
on memory pressure along with its inode - not something you want
when grabbing lock on that inode.
However, pinning a dentry is not enough - the matching mount is
also pinned only by the fact that path->mnt is mounted on top it
and at that point we are not holding any locks whatsoever, so
the same kind of races could end up with all references to
that mount gone just as we are about to enter inode_lock().
If that happens, we are left with filesystem being shut down while
we are holding a dentry reference on it; results are not pretty.
What we need to do is grab both dentry and mount at the same time;
that makes inode_lock() safe *and* avoids the problem with fs getting
shut down under us. After taking namespace_sem we verify that
path->mnt is still mounted (which stabilizes its ->mnt_parent) and
check that it's still mounted at the same place. From that point
on to the matching namespace_unlock() we are guaranteed that
mount/dentry pair we'd grabbed are also pinned by being the mountpoint
of path->mnt, so we can quietly drop both the dentry reference (as
the current code does) and mnt one - it's OK to do under namespace_sem,
since we are not dropping the final refs.
That solves the problem on do_lock_mount() side; unlock_mount()
also has one, since dentry is guaranteed to stay pinned only until
the namespace_unlock(). That's easy to fix - just have inode_unlock()
done earlier, while it's still pinned by mp->m_dentry.
Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/fs/namespace.c b/fs/namespace.c
index fa17762268f5..bbda516444ff 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2827,56 +2827,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
struct vfsmount *mnt = path->mnt;
struct dentry *dentry;
struct mountpoint *mp = ERR_PTR(-ENOENT);
+ struct path under = {};
for (;;) {
- struct mount *m;
+ struct mount *m = real_mount(mnt);
if (beneath) {
- m = real_mount(mnt);
+ path_put(&under);
read_seqlock_excl(&mount_lock);
- dentry = dget(m->mnt_mountpoint);
+ under.mnt = mntget(&m->mnt_parent->mnt);
+ under.dentry = dget(m->mnt_mountpoint);
read_sequnlock_excl(&mount_lock);
+ dentry = under.dentry;
} else {
dentry = path->dentry;
}
inode_lock(dentry->d_inode);
- if (unlikely(cant_mount(dentry))) {
- inode_unlock(dentry->d_inode);
- goto out;
- }
-
namespace_lock();
- if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
+ if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
+ break; // not to be mounted on
+
+ if (beneath && unlikely(m->mnt_mountpoint != dentry ||
+ &m->mnt_parent->mnt != under.mnt)) {
namespace_unlock();
inode_unlock(dentry->d_inode);
- goto out;
+ continue; // got moved
}
mnt = lookup_mnt(path);
- if (likely(!mnt))
+ if (unlikely(mnt)) {
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
+ path_put(path);
+ path->mnt = mnt;
+ path->dentry = dget(mnt->mnt_root);
+ continue; // got overmounted
+ }
+ mp = get_mountpoint(dentry);
+ if (IS_ERR(mp))
break;
-
- namespace_unlock();
- inode_unlock(dentry->d_inode);
- if (beneath)
- dput(dentry);
- path_put(path);
- path->mnt = mnt;
- path->dentry = dget(mnt->mnt_root);
- }
-
- mp = get_mountpoint(dentry);
- if (IS_ERR(mp)) {
- namespace_unlock();
- inode_unlock(dentry->d_inode);
+ if (beneath) {
+ /*
+ * @under duplicates the references that will stay
+ * at least until namespace_unlock(), so the path_put()
+ * below is safe (and OK to do under namespace_lock -
+ * we are not dropping the final references here).
+ */
+ path_put(&under);
+ }
+ return mp;
}
-
-out:
+ namespace_unlock();
+ inode_unlock(dentry->d_inode);
if (beneath)
- dput(dentry);
-
+ path_put(&under);
return mp;
}
@@ -2887,14 +2893,11 @@ static inline struct mountpoint *lock_mount(struct path *path)
static void unlock_mount(struct mountpoint *where)
{
- struct dentry *dentry = where->m_dentry;
-
+ inode_unlock(where->m_dentry->d_inode);
read_seqlock_excl(&mount_lock);
put_mountpoint(where);
read_sequnlock_excl(&mount_lock);
-
namespace_unlock();
- inode_unlock(dentry->d_inode);
}
static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-21 16:29 ` Al Viro
2025-04-21 17:03 ` Al Viro
2025-04-22 7:31 ` Christian Brauner
@ 2025-04-23 22:20 ` Al Viro
2025-04-24 8:56 ` Christian Brauner
2 siblings, 1 reply; 13+ messages in thread
From: Al Viro @ 2025-04-23 22:20 UTC (permalink / raw)
To: Christian Brauner; +Cc: linux-fsdevel
On Mon, Apr 21, 2025 at 05:29:47PM +0100, Al Viro wrote:
> On Mon, Apr 21, 2025 at 09:56:20AM +0200, Christian Brauner wrote:
> > On Mon, Apr 21, 2025 at 04:35:09AM +0100, Al Viro wrote:
> > > Not since 8f2918898eb5 "new helpers: vfs_create_mount(), fc_mount()"
> > > back in 2018. Get rid of the dead checks...
> > >
> > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > > ---
> >
> > Good idea. Fwiw, I've put this into vfs-6.16.mount with some other minor
> > stuff. If you're keeping it yourself let me know.
>
> Not sure... I'm going through documenting the struct mount lifecycle/locking/etc.
> and it already looks like there will be more patches, but then some are going
> to be #fixes fodder.
BTW, could you explain what this is about?
/*
* If this is an attached mount make sure it's located in the callers
* mount namespace. If it's not don't let the caller interact with it.
*
* If this mount doesn't have a parent it's most often simply a
* detached mount with an anonymous mount namespace. IOW, something
* that's simply not attached yet. But there are apparently also users
* that do change mount properties on the rootfs itself. That obviously
* neither has a parent nor is it a detached mount so we cannot
* unconditionally check for detached mounts.
*/
if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
goto out;
Why do you care about mnt_has_parent() here? mnt is the root of subtree you
are operating on, so that condition means
* any subtree (including the entire tree) of caller's mount tree is OK
(fair enough)
* full mount tree of anon namespace is OK
* nothing else is acceptable
What about partial subtrees of anon namespaces? Restriction looks odd...
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [PATCH][RFC] ->mnt_devname is never NULL
2025-04-23 22:20 ` Al Viro
@ 2025-04-24 8:56 ` Christian Brauner
0 siblings, 0 replies; 13+ messages in thread
From: Christian Brauner @ 2025-04-24 8:56 UTC (permalink / raw)
To: Al Viro; +Cc: linux-fsdevel
On Wed, Apr 23, 2025 at 11:20:45PM +0100, Al Viro wrote:
> On Mon, Apr 21, 2025 at 05:29:47PM +0100, Al Viro wrote:
> > On Mon, Apr 21, 2025 at 09:56:20AM +0200, Christian Brauner wrote:
> > > On Mon, Apr 21, 2025 at 04:35:09AM +0100, Al Viro wrote:
> > > > Not since 8f2918898eb5 "new helpers: vfs_create_mount(), fc_mount()"
> > > > back in 2018. Get rid of the dead checks...
> > > >
> > > > Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
> > > > ---
> > >
> > > Good idea. Fwiw, I've put this into vfs-6.16.mount with some other minor
> > > stuff. If you're keeping it yourself let me know.
> >
> > Not sure... I'm going through documenting the struct mount lifecycle/locking/etc.
> > and it already looks like there will be more patches, but then some are going
> > to be #fixes fodder.
>
> BTW, could you explain what this is about?
> /*
> * If this is an attached mount make sure it's located in the callers
> * mount namespace. If it's not don't let the caller interact with it.
> *
> * If this mount doesn't have a parent it's most often simply a
> * detached mount with an anonymous mount namespace. IOW, something
> * that's simply not attached yet. But there are apparently also users
> * that do change mount properties on the rootfs itself. That obviously
> * neither has a parent nor is it a detached mount so we cannot
> * unconditionally check for detached mounts.
> */
> if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
> goto out;
>
> Why do you care about mnt_has_parent() here? mnt is the root of subtree you
> are operating on, so that condition means
> * any subtree (including the entire tree) of caller's mount tree is OK
> (fair enough)
> * full mount tree of anon namespace is OK
> * nothing else is acceptable
> What about partial subtrees of anon namespaces? Restriction looks odd...
No one has ever cared about that ever so far so I specifically only
allowed the root of an anonymous mount namespace and this also keeps it
in line with other calls. I'm not against opening that up if this is a
use-case but so far we haven't had anyone care about mount properties in
detached mount subtrees.
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2025-04-24 8:56 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-21 3:35 [PATCH][RFC] ->mnt_devname is never NULL Al Viro
2025-04-21 7:56 ` Christian Brauner
2025-04-21 16:29 ` Al Viro
2025-04-21 17:03 ` Al Viro
2025-04-22 3:14 ` [PATCH][RFC] do_lock_mount() races in 'beneath' case Al Viro
2025-04-22 7:47 ` Christian Brauner
2025-04-22 7:43 ` [PATCH][RFC] ->mnt_devname is never NULL Christian Brauner
2025-04-22 7:31 ` Christian Brauner
2025-04-22 12:25 ` Al Viro
2025-04-22 13:40 ` Christian Brauner
2025-04-23 1:30 ` Al Viro
2025-04-23 22:20 ` Al Viro
2025-04-24 8:56 ` Christian Brauner
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).