From mboxrd@z Thu Jan 1 00:00:00 1970 From: Eric Van Hensbergen Subject: [RFC][2.6 patch] Allow creation of new namespaces during mount system call Date: Tue, 19 Apr 2005 17:13:32 -0500 Message-ID: Reply-To: Eric Van Hensbergen Mime-Version: 1.0 Content-Type: text/plain; charset=US-ASCII Content-Transfer-Encoding: 7BIT Return-path: Received: from wproxy.gmail.com ([64.233.184.200]:27411 "EHLO wproxy.gmail.com") by vger.kernel.org with ESMTP id S261687AbVDSWNe convert rfc822-to-8bit (ORCPT ); Tue, 19 Apr 2005 18:13:34 -0400 Received: by wproxy.gmail.com with SMTP id 68so17814wri for ; Tue, 19 Apr 2005 15:13:32 -0700 (PDT) To: linux-fsdevel@vger.kernel.org, Al Viro Content-Disposition: inline Sender: linux-fsdevel-owner@vger.kernel.org List-Id: linux-fsdevel.vger.kernel.org The motivation behind this patch is to make private namespaces more accessible by allowing their creation at mount/bind time. Based on some of the FUSE permissions discussions, I wanted to look into modifying the mount system calls -- adding a flag that creates a new namespace for the resulting mount. I quickly discovered that what I typically wanted (for the case of running a mount command) was to actually create a new namespace for the parent thread (typically the shell), inherit that namespace, and then perform the mount. It's not clear to me that both options are needed; cloning the parent's namespace seems to be what you want most of the time. In order to minimize code impact I split the copy_namespace function; perhaps the right long-term solution is to change its interface to accommodate the changes. The diff looks more invasive than it really is because I moved the copy_namespace function above do_mount. 
The patch follows: fs/namespace.c | 193 +++++++++++++++++++++++++++++------------------------ include/linux/fs.h | 2 2 files changed, 108 insertions(+), 87 deletions(-) --- linux-2.5/include/linux/fs.h 2005-04-19 17:02:28.530152496 -0500 +++ newns-2.5/include/linux/fs.h 2005-04-19 17:03:52.619368992 -0500 @@ -103,6 +103,8 @@ extern int dir_notify_enable; #define MS_REC 16384 #define MS_VERBOSE 32768 #define MS_POSIXACL (1<<16) /* VFS does not apply the umask */ +#define MS_CLONE_NEWNS (1<<17) /* clone my namespace before mount */ +#define MS_CLONE_NEWPNS (1<<18) /* clone my & my parent namespace */ #define MS_ACTIVE (1<<30) #define MS_NOUSER (1<<31) --- linux-2.5/fs/namespace.c 2005-04-19 17:02:14.551277608 -0500 +++ newns-2.5/fs/namespace.c 2005-04-19 17:03:38.227556880 -0500 @@ -991,6 +991,104 @@ int copy_mount_options(const void __user return 0; } +int update_namespace(struct task_struct *tsk, struct namespace *new_ns ) +{ + struct namespace *namespace = tsk->namespace; + struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; + struct fs_struct *fs = tsk->fs; + struct vfsmount *p, *q; + + if (!namespace) + return 0; + + get_namespace(namespace); + + if (!capable(CAP_SYS_ADMIN)) { + put_namespace(namespace); + return -EPERM; + } + + down_write(&tsk->namespace->sem); + if(!new_ns) { + new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); + if (!new_ns) + goto out; + + atomic_set(&new_ns->count, 1); + init_rwsem(&new_ns->sem); + INIT_LIST_HEAD(&new_ns->list); + + /* First pass: copy the tree topology */ + new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root); + if (!new_ns->root) { + up_write(&tsk->namespace->sem); + kfree(new_ns); + goto out; + } + spin_lock(&vfsmount_lock); + list_add_tail(&new_ns->list, &new_ns->root->mnt_list); + spin_unlock(&vfsmount_lock); + } else + get_namespace(new_ns); + + /* + * Second pass: switch the tsk->fs->* elements and mark new vfsmounts + * as belonging to new namespace. 
We have already acquired a private + * fs_struct, so tsk->fs->lock is not needed. + */ + p = namespace->root; + q = new_ns->root; + while (p) { + q->mnt_namespace = new_ns; + if (fs) { + if (p == fs->rootmnt) { + rootmnt = p; + fs->rootmnt = mntget(q); + } + if (p == fs->pwdmnt) { + pwdmnt = p; + fs->pwdmnt = mntget(q); + } + if (p == fs->altrootmnt) { + altrootmnt = p; + fs->altrootmnt = mntget(q); + } + } + p = next_mnt(p, namespace->root); + q = next_mnt(q, new_ns->root); + } + up_write(&tsk->namespace->sem); + + tsk->namespace = new_ns; + + if (rootmnt) + mntput(rootmnt); + if (pwdmnt) + mntput(pwdmnt); + if (altrootmnt) + mntput(altrootmnt); + + put_namespace(namespace); + return 0; + +out: + put_namespace(namespace); + return -ENOMEM; +} + +int copy_namespace(int flags, struct task_struct *tsk) +{ + if (!tsk->namespace) + return 0; + + if (!(flags & CLONE_NEWNS)) { + get_namespace(tsk->namespace); + return 0; + } + + return update_namespace( tsk, NULL ); +} + /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). @@ -1033,7 +1131,14 @@ long do_mount(char * dev_name, char * di mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; - flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE); + if (flags & MS_CLONE_NEWNS) + copy_namespace( CLONE_NEWNS, current); + if (flags & MS_CLONE_NEWPNS) { + copy_namespace( CLONE_NEWNS, current->real_parent); + update_namespace( current, current->real_parent->namespace); + } + + flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE|MS_CLONE_NEWNS|MS_CLONE_NEWPNS); /* ... 
and get the mountpoint */ retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd); @@ -1059,92 +1164,6 @@ dput_out: return retval; } -int copy_namespace(int flags, struct task_struct *tsk) -{ - struct namespace *namespace = tsk->namespace; - struct namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; - struct fs_struct *fs = tsk->fs; - struct vfsmount *p, *q; - - if (!namespace) - return 0; - - get_namespace(namespace); - - if (!(flags & CLONE_NEWNS)) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - put_namespace(namespace); - return -EPERM; - } - - new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); - if (!new_ns) - goto out; - - atomic_set(&new_ns->count, 1); - init_rwsem(&new_ns->sem); - INIT_LIST_HEAD(&new_ns->list); - - down_write(&tsk->namespace->sem); - /* First pass: copy the tree topology */ - new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root); - if (!new_ns->root) { - up_write(&tsk->namespace->sem); - kfree(new_ns); - goto out; - } - spin_lock(&vfsmount_lock); - list_add_tail(&new_ns->list, &new_ns->root->mnt_list); - spin_unlock(&vfsmount_lock); - - /* - * Second pass: switch the tsk->fs->* elements and mark new vfsmounts - * as belonging to new namespace. We have already acquired a private - * fs_struct, so tsk->fs->lock is not needed. 
- */ - p = namespace->root; - q = new_ns->root; - while (p) { - q->mnt_namespace = new_ns; - if (fs) { - if (p == fs->rootmnt) { - rootmnt = p; - fs->rootmnt = mntget(q); - } - if (p == fs->pwdmnt) { - pwdmnt = p; - fs->pwdmnt = mntget(q); - } - if (p == fs->altrootmnt) { - altrootmnt = p; - fs->altrootmnt = mntget(q); - } - } - p = next_mnt(p, namespace->root); - q = next_mnt(q, new_ns->root); - } - up_write(&tsk->namespace->sem); - - tsk->namespace = new_ns; - - if (rootmnt) - mntput(rootmnt); - if (pwdmnt) - mntput(pwdmnt); - if (altrootmnt) - mntput(altrootmnt); - - put_namespace(namespace); - return 0; - -out: - put_namespace(namespace); - return -ENOMEM; -} - asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, char __user * type, unsigned long flags, void __user * data)