From: David Howells <dhowells@redhat.com>
To: torvalds@transmeta.com, viro@parcelfarce.linux.theplanet.co.uk
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH] VFS mountpoint expiry
Date: Fri, 04 Jul 2003 10:25:54 +0100 [thread overview]
Message-ID: <27682.1057310754@warthog.warthog> (raw)
Hi Linus, Al,
Here's a patch to do mountpoint expiry along the lines suggested by Al.
Mountpoint expiry can be driven in two ways:
(1) From userspace using umount(<path>,MNT_EXPIRE).
If the mountpoint is in use, then -EBUSY is returned.
If not already set, this sets a flag in the vfsmount structure and
returns -EAGAIN.
If the flag is already set, the unmount is actually effected and 0 is
returned.
Note that mixing MNT_EXPIRE with MNT_FORCE or MNT_DETACH will incur a
-EINVAL error.
The flag is cleared should mntput() decrease the vfsmount's mnt_count to
one.
(2) A filesystem can call mark_mounts_for_expiry() with a list of vfsmounts
that should be checked. Such mounts are linked together through a new
list_head (mnt_fslink) in the vfsmount structure. The head of this list
should be retained by the module, and should only be accessed whilst
holding the dcache_lock.
For every vfsmount in the list, this function performs an action similar
to that performed by umount with MNT_EXPIRE.
With regard to mnt_fslink, should a namespace be cloned, every vfsmount
that is on such a list will be cloned as normal and its clone added to the
same list whilst the dcache_lock is held.
Should a "listed" vfsmount be moved, then it will be removed from the
list. Should a bind be performed on a vfsmount, then the resulting new
vfsmount will not be added to the list.
vfsmounts to go on this list can be obtained by playing tricks with a
follow_link() method attached to a directory - this method can then call
do_kern_mount() to obtain a new vfsmount, and then call do_add_mount() to
mount the vfsmount _and_ add it to the module's expiry list under the
appropriate locking.
This more or less permits a filesystem to do automounting at arbitrary
points within a tree without worrying about namespace issues.
Note that mnt_flags should now be accessed atomically with set_bit() and co.
David
diff -ur linux-2.5.74/fs/block_dev.c linux-2.5.74-auto/fs/block_dev.c
--- linux-2.5.74/fs/block_dev.c 2003-07-03 15:35:15.000000000 +0100
+++ linux-2.5.74-auto/fs/block_dev.c 2003-07-03 15:41:15.000000000 +0100
@@ -762,7 +762,7 @@
if (!S_ISBLK(inode->i_mode))
goto fail;
error = -EACCES;
- if (nd.mnt->mnt_flags & MNT_NODEV)
+ if (test_bit(__MNT_NODEV, &nd.mnt->mnt_flags))
goto fail;
error = bd_acquire(inode);
if (error)
diff -ur linux-2.5.74/fs/exec.c linux-2.5.74-auto/fs/exec.c
--- linux-2.5.74/fs/exec.c 2003-07-03 15:35:13.000000000 +0100
+++ linux-2.5.74-auto/fs/exec.c 2003-07-03 15:41:15.000000000 +0100
@@ -459,7 +459,7 @@
if (!err) {
struct inode *inode = nd.dentry->d_inode;
file = ERR_PTR(-EACCES);
- if (!(nd.mnt->mnt_flags & MNT_NOEXEC) &&
+ if (!test_bit(__MNT_NOEXEC, &nd.mnt->mnt_flags) &&
S_ISREG(inode->i_mode)) {
int err = permission(inode, MAY_EXEC);
if (!err && !(inode->i_mode & 0111))
@@ -841,7 +841,7 @@
bprm->e_uid = current->euid;
bprm->e_gid = current->egid;
- if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) {
+ if(!test_bit(__MNT_NOSUID, &bprm->file->f_vfsmnt->mnt_flags)) {
/* Set-uid? */
if (mode & S_ISUID)
bprm->e_uid = inode->i_uid;
diff -ur linux-2.5.74/fs/namei.c linux-2.5.74-auto/fs/namei.c
--- linux-2.5.74/fs/namei.c 2003-07-03 15:35:13.000000000 +0100
+++ linux-2.5.74-auto/fs/namei.c 2003-07-03 15:41:15.000000000 +0100
@@ -269,6 +269,12 @@
mntput(nd->mnt);
}
+void path_release_on_umount(struct nameidata *nd)
+{
+ dput(nd->dentry);
+ _mntput(nd->mnt);
+}
+
/*
* Internal lookup() using the new generic dcache.
* SMP-safe
@@ -1147,7 +1153,7 @@
if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
flag &= ~O_TRUNC;
} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
- if (nd->mnt->mnt_flags & MNT_NODEV)
+ if (test_bit(__MNT_NODEV, &nd->mnt->mnt_flags))
return -EACCES;
flag &= ~O_TRUNC;
diff -ur linux-2.5.74/fs/namespace.c linux-2.5.74-auto/fs/namespace.c
--- linux-2.5.74/fs/namespace.c 2003-07-03 15:35:14.000000000 +0100
+++ linux-2.5.74-auto/fs/namespace.c 2003-07-03 15:41:15.000000000 +0100
@@ -48,6 +48,7 @@
INIT_LIST_HEAD(&mnt->mnt_child);
INIT_LIST_HEAD(&mnt->mnt_mounts);
INIT_LIST_HEAD(&mnt->mnt_list);
+ INIT_LIST_HEAD(&mnt->mnt_fslink);
if (name) {
int size = strlen(name)+1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -84,13 +85,9 @@
return p;
}
-static int check_mnt(struct vfsmount *mnt)
+static inline int check_mnt(struct vfsmount *mnt)
{
- spin_lock(&dcache_lock);
- while (mnt->mnt_parent != mnt)
- mnt = mnt->mnt_parent;
- spin_unlock(&dcache_lock);
- return mnt == current->namespace->root;
+ return mnt->mnt_namespace == current->namespace;
}
static void detach_mnt(struct vfsmount *mnt, struct nameidata *old_nd)
@@ -136,12 +133,17 @@
struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
if (mnt) {
mnt->mnt_flags = old->mnt_flags;
atomic_inc(&sb->s_active);
mnt->mnt_sb = sb;
mnt->mnt_root = dget(root);
mnt->mnt_mountpoint = mnt->mnt_root;
mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = old->mnt_namespace;
+ spin_lock(&dcache_lock);
+ if (!list_empty(&old->mnt_fslink))
+ list_add(&mnt->mnt_fslink, &old->mnt_fslink);
+ spin_unlock(&dcache_lock);
}
return mnt;
}
@@ -203,9 +205,9 @@
{ 0, NULL }
};
static struct proc_fs_info mnt_info[] = {
- { MNT_NOSUID, ",nosuid" },
- { MNT_NODEV, ",nodev" },
- { MNT_NOEXEC, ",noexec" },
+ { __MNT_NOSUID, ",nosuid" },
+ { __MNT_NODEV, ",nodev" },
+ { __MNT_NOEXEC, ",noexec" },
{ 0, NULL }
};
struct proc_fs_info *fs_infop;
@@ -221,7 +223,7 @@
seq_puts(m, fs_infop->str);
}
for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
- if (mnt->mnt_flags & fs_infop->flag)
+ if (test_bit(fs_infop->flag, &mnt->mnt_flags))
seq_puts(m, fs_infop->str);
}
if (mnt->mnt_sb->s_op->show_options)
@@ -262,6 +264,7 @@
while (!list_empty(&kill)) {
mnt = list_entry(kill.next, struct vfsmount, mnt_list);
list_del_init(&mnt->mnt_list);
+ list_del_init(&mnt->mnt_fslink);
if (mnt->mnt_parent == mnt) {
spin_unlock(&dcache_lock);
} else {
@@ -285,6 +288,25 @@
return retval;
/*
+ * If we've been asked to mark for expiry, we only _actually_ effect
+ * the unmount if:
+ * (1) the mark is already set (the mark is cleared by mntput()
+ * reducing the count to 1)
+ * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
+ */
+ if (flags & MNT_EXPIRE) {
+ if (mnt == current->fs->rootmnt ||
+ flags & (MNT_FORCE | MNT_DETACH))
+ return -EINVAL;
+
+ if (atomic_read(&mnt->mnt_count) != 2)
+ return -EBUSY;
+
+ if (!test_and_set_bit(__MNT_EXPIRY_MARK, &mnt->mnt_flags))
+ return -EAGAIN;
+ }
+
+ /*
* If we may have to abort operations to get out of this
* mount, and they will themselves hold resources we must
* allow the fs to do things. In the Unix tradition of
@@ -377,7 +399,7 @@
retval = do_umount(nd.mnt, flags);
dput_and_out:
- path_release(&nd);
+ path_release_on_umount(&nd);
out:
return retval;
}
@@ -530,6 +552,10 @@
}
if (mnt) {
+ spin_lock(&dcache_lock);
+ list_del_init(&mnt->mnt_fslink);
+ spin_unlock(&dcache_lock);
+
err = graft_tree(mnt, nd);
if (err) {
spin_lock(&dcache_lock);
@@ -550,7 +576,8 @@
* on it - tough luck.
*/
-static int do_remount(struct nameidata *nd,int flags,int mnt_flags,void *data)
+static int do_remount(struct nameidata *nd, int flags, unsigned long mnt_flags,
+ void *data)
{
int err;
struct super_block * sb = nd->mnt->mnt_sb;
@@ -622,6 +649,7 @@
detach_mnt(old_nd.mnt, &parent_nd);
attach_mnt(old_nd.mnt, nd);
+ list_del_init(&old_nd.mnt->mnt_fslink);
out2:
spin_unlock(&dcache_lock);
out1:
@@ -634,27 +662,14 @@
return err;
}
-static int do_add_mount(struct nameidata *nd, char *type, int flags,
- int mnt_flags, char *name, void *data)
+int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
+ unsigned long mnt_flags, struct list_head *fslist)
{
- struct vfsmount *mnt;
int err;
- if (!type || !memchr(type, 0, PAGE_SIZE))
- return -EINVAL;
-
- /* we need capabilities... */
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- mnt = do_kern_mount(type, flags, name, data);
- err = PTR_ERR(mnt);
- if (IS_ERR(mnt))
- goto out;
-
down_write(&current->namespace->sem);
/* Something was mounted here while we slept */
- while(d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
+ while (d_mountpoint(nd->dentry) && follow_down(&nd->mnt, &nd->dentry))
;
err = -EINVAL;
if (!check_mnt(nd->mnt))
@@ -662,18 +677,122 @@
/* Refuse the same filesystem on the same mount point */
err = -EBUSY;
- if (nd->mnt->mnt_sb == mnt->mnt_sb && nd->mnt->mnt_root == nd->dentry)
+ if (nd->mnt->mnt_sb == newmnt->mnt_sb &&
+ nd->mnt->mnt_root == nd->dentry)
goto unlock;
- mnt->mnt_flags = mnt_flags;
- err = graft_tree(mnt, nd);
+ newmnt->mnt_flags = mnt_flags;
+ err = graft_tree(newmnt, nd);
+
+ if (err == 0 && fslist) {
+ spin_lock(&dcache_lock);
+ list_add_tail(&newmnt->mnt_fslink, fslist);
+ spin_unlock(&dcache_lock);
+ }
+
unlock:
up_write(&current->namespace->sem);
- mntput(mnt);
-out:
+ mntput(newmnt);
return err;
}
+EXPORT_SYMBOL_GPL(do_add_mount);
+
+static int do_new_mount(struct nameidata *nd, char *type, int flags,
+ unsigned long mnt_flags, char *name, void *data)
+{
+ struct vfsmount *mnt;
+
+ if (!type || !memchr(type, 0, PAGE_SIZE))
+ return -EINVAL;
+
+ /* we need capabilities... */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ mnt = do_kern_mount(type, flags, name, data);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ return do_add_mount(mnt, nd, mnt_flags, NULL);
+}
+
+void mark_mounts_for_expiry(struct list_head *mounts)
+{
+ struct namespace *namespace;
+ struct list_head graveyard, *_p, *_n;
+ struct vfsmount *mnt;
+
+ INIT_LIST_HEAD(&graveyard);
+
+ spin_lock(&dcache_lock);
+
+ list_for_each_safe(_p, _n, mounts) {
+ mnt = list_entry(_p, struct vfsmount, mnt_fslink);
+
+ if (!test_and_set_bit(__MNT_EXPIRY_MARK, &mnt->mnt_flags) ||
+ atomic_read(&mnt->mnt_count) != 1)
+ continue;
+
+ mntget(mnt);
+ list_move(&mnt->mnt_fslink, &graveyard);
+ }
+
+ while (!list_empty(&graveyard)) {
+ mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink);
+ list_del_init(&mnt->mnt_fslink);
+
+ namespace = mnt->mnt_namespace;
+ if (!namespace || atomic_read(&namespace->count) <= 0)
+ continue;
+ get_namespace(namespace);
+
+ spin_unlock(&dcache_lock);
+ down_write(&namespace->sem);
+ spin_lock(&dcache_lock);
+
+ if (atomic_read(&mnt->mnt_count) == 2) {
+ struct dentry *xdentry;
+
+ list_del_init(&mnt->mnt_list);
+ list_del_init(&mnt->mnt_child);
+ list_del_init(&mnt->mnt_hash);
+ mnt->mnt_mountpoint->d_mounted--;
+
+ xdentry = xchg(&mnt->mnt_mountpoint, mnt->mnt_root);
+ mntput(xchg(&mnt->mnt_parent, mnt));
+ spin_unlock(&dcache_lock);
+
+ dput(xdentry);
+
+ if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
+ /* last instance - try to be smart */
+ lock_kernel();
+ DQUOT_OFF(mnt->mnt_sb);
+ acct_auto_close(mnt->mnt_sb);
+ unlock_kernel();
+ }
+
+ mntput(mnt);
+ }
+ else {
+ list_add_tail(&mnt->mnt_fslink, mounts);
+ spin_unlock(&dcache_lock);
+ }
+
+ up_write(&namespace->sem);
+
+ mntput(mnt);
+ put_namespace(namespace);
+
+ spin_lock(&dcache_lock);
+ }
+
+ spin_unlock(&dcache_lock);
+}
+
+EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
+
static int copy_mount_options (const void __user *data, unsigned long *where)
{
int i;
@@ -726,7 +845,7 @@
{
struct nameidata nd;
int retval = 0;
- int mnt_flags = 0;
+ unsigned long mnt_flags = 0;
/* Discard magic */
if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
@@ -765,7 +884,7 @@
else if (flags & MS_MOVE)
retval = do_move_mount(&nd, dev_name);
else
- retval = do_add_mount(&nd, type_page, flags, mnt_flags,
+ retval = do_new_mount(&nd, type_page, flags, mnt_flags,
dev_name, data_page);
dput_out:
path_release(&nd);
@@ -852,6 +971,24 @@
return -ENOMEM;
}
+void __put_namespace(struct namespace *namespace)
+{
+ struct vfsmount *mnt;
+
+ down_write(&namespace->sem);
+ spin_lock(&dcache_lock);
+
+ list_for_each_entry(mnt, &namespace->list, mnt_list) {
+ mnt->mnt_namespace = NULL;
+ }
+
+ umount_tree(namespace->root);
+
+ spin_unlock(&dcache_lock);
+ up_write(&namespace->sem);
+ kfree(namespace);
+}
+
asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
char __user * type, unsigned long flags,
void __user * data)
@@ -1082,6 +1219,7 @@
init_rwsem(&namespace->sem);
list_add(&mnt->mnt_list, &namespace->list);
namespace->root = mnt;
+ mnt->mnt_namespace = namespace;
init_task.namespace = namespace;
read_lock(&tasklist_lock);
diff -ur linux-2.5.74/fs/super.c linux-2.5.74-auto/fs/super.c
--- linux-2.5.74/fs/super.c 2003-07-03 15:35:14.000000000 +0100
+++ linux-2.5.74-auto/fs/super.c 2003-07-03 15:41:15.000000000 +0100
@@ -21,6 +21,7 @@
*/
#include <linux/config.h>
+#include <linux/module.h>
#include <linux/slab.h>
#include <linux/smp_lock.h>
#include <linux/acct.h>
@@ -683,6 +684,7 @@
mnt->mnt_root = dget(sb->s_root);
mnt->mnt_mountpoint = sb->s_root;
mnt->mnt_parent = mnt;
+ mnt->mnt_namespace = current->namespace;
up_write(&sb->s_umount);
put_filesystem(type);
return mnt;
@@ -697,6 +699,8 @@
return (struct vfsmount *)sb;
}
+EXPORT_SYMBOL_GPL(do_kern_mount);
+
struct vfsmount *kern_mount(struct file_system_type *type)
{
return do_kern_mount(type->name, 0, type->name, NULL);
diff -ur linux-2.5.74/include/linux/fs.h linux-2.5.74-auto/include/linux/fs.h
--- linux-2.5.74/include/linux/fs.h 2003-07-03 15:34:43.000000000 +0100
+++ linux-2.5.74-auto/include/linux/fs.h 2003-07-03 15:41:15.000000000 +0100
@@ -572,6 +572,7 @@
#define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */
#define MNT_DETACH 0x00000002 /* Just detach from the tree */
+#define MNT_EXPIRE 0x00000004 /* Mark for expiry */
extern struct list_head super_blocks;
extern spinlock_t sb_lock;
diff -ur linux-2.5.74/include/linux/mount.h linux-2.5.74-auto/include/linux/mount.h
--- linux-2.5.74/include/linux/mount.h 2003-07-03 15:34:43.000000000 +0100
+++ linux-2.5.74-auto/include/linux/mount.h 2003-07-03 15:41:15.000000000 +0100
@@ -14,9 +14,15 @@
#include <linux/list.h>
-#define MNT_NOSUID 1
-#define MNT_NODEV 2
-#define MNT_NOEXEC 4
+#define __MNT_NOSUID 0
+#define __MNT_NODEV 1
+#define __MNT_NOEXEC 2
+#define __MNT_EXPIRY_MARK 3
+
+#define MNT_NOSUID (1 << __MNT_NOSUID)
+#define MNT_NODEV (1 << __MNT_NODEV)
+#define MNT_NOEXEC (1 << __MNT_NOEXEC)
+#define MNT_EXPIRY_MARK (1 << __MNT_EXPIRY_MARK)
struct vfsmount
{
@@ -28,9 +34,11 @@
struct list_head mnt_mounts; /* list of children, anchored here */
struct list_head mnt_child; /* and going through their mnt_child */
atomic_t mnt_count;
- int mnt_flags;
+ unsigned long mnt_flags;
char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
struct list_head mnt_list;
+ struct list_head mnt_fslink; /* link in fs-specific list */
+ struct namespace *mnt_namespace; /* containing namespace */
};
static inline struct vfsmount *mntget(struct vfsmount *mnt)
@@ -42,7 +50,7 @@
extern void __mntput(struct vfsmount *mnt);
-static inline void mntput(struct vfsmount *mnt)
+static inline void _mntput(struct vfsmount *mnt)
{
if (mnt) {
if (atomic_dec_and_test(&mnt->mnt_count))
@@ -50,10 +58,24 @@
}
}
+static inline void mntput(struct vfsmount *mnt)
+{
+ if (mnt) {
+ if (atomic_read(&mnt->mnt_count) == 2)
+ clear_bit(__MNT_EXPIRY_MARK, &mnt->mnt_flags);
+ _mntput(mnt);
+ }
+}
+
extern void free_vfsmnt(struct vfsmount *mnt);
extern struct vfsmount *alloc_vfsmnt(const char *name);
extern struct vfsmount *do_kern_mount(const char *fstype, int flags,
const char *name, void *data);
+extern int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
+ unsigned long mnt_flags, struct list_head *fslist);
+
+extern void mark_mounts_for_expiry(struct list_head *mounts);
+
#endif
#endif /* _LINUX_MOUNT_H */
diff -ur linux-2.5.74/include/linux/namei.h linux-2.5.74-auto/include/linux/namei.h
--- linux-2.5.74/include/linux/namei.h 2003-07-03 15:34:43.000000000 +0100
+++ linux-2.5.74-auto/include/linux/namei.h 2003-07-03 15:41:15.000000000 +0100
@@ -42,6 +42,7 @@
extern int FASTCALL(path_walk(const char *, struct nameidata *));
extern int FASTCALL(link_path_walk(const char *, struct nameidata *));
extern void path_release(struct nameidata *);
+extern void path_release_on_umount(struct nameidata *);
extern struct dentry * lookup_one_len(const char *, struct dentry *, int);
extern struct dentry * lookup_hash(struct qstr *, struct dentry *);
diff -ur linux-2.5.74/include/linux/namespace.h linux-2.5.74-auto/include/linux/namespace.h
--- linux-2.5.74/include/linux/namespace.h 2003-07-03 15:34:43.000000000 +0100
+++ linux-2.5.74-auto/include/linux/namespace.h 2003-07-03 15:41:15.000000000 +0100
@@ -15,16 +15,12 @@
extern void umount_tree(struct vfsmount *);
extern int copy_namespace(int, struct task_struct *);
+extern void __put_namespace(struct namespace *namespace);
+
static inline void put_namespace(struct namespace *namespace)
{
- if (atomic_dec_and_test(&namespace->count)) {
- down_write(&namespace->sem);
- spin_lock(&dcache_lock);
- umount_tree(namespace->root);
- spin_unlock(&dcache_lock);
- up_write(&namespace->sem);
- kfree(namespace);
- }
+ if (atomic_dec_and_test(&namespace->count))
+ __put_namespace(namespace);
}
static inline void exit_namespace(struct task_struct *p)
reply other threads:[~2003-07-04 9:11 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=27682.1057310754@warthog.warthog \
--to=dhowells@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=torvalds@transmeta.com \
--cc=viro@parcelfarce.linux.theplanet.co.uk \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.