From mboxrd@z Thu Jan  1 00:00:00 1970
From: Eric Van Hensbergen <ericvh@gmail.com>
Subject: [RFC][2.6 patch] Allow creation of new namespaces during mount system call
Date: Tue, 19 Apr 2005 17:13:32 -0500
Message-ID: <a4e6962a05041915132f57de3f@mail.gmail.com>
Reply-To: Eric Van Hensbergen <ericvh@gmail.com>
Mime-Version: 1.0
Content-Type: text/plain; charset=US-ASCII
Content-Transfer-Encoding: 7BIT
Return-path: <linux-fsdevel-owner@vger.kernel.org>
Received: from wproxy.gmail.com ([64.233.184.200]:27411 "EHLO wproxy.gmail.com")
	by vger.kernel.org with ESMTP id S261687AbVDSWNe convert rfc822-to-8bit
	(ORCPT <rfc822;linux-fsdevel@vger.kernel.org>);
	Tue, 19 Apr 2005 18:13:34 -0400
Received: by wproxy.gmail.com with SMTP id 68so17814wri
        for <linux-fsdevel@vger.kernel.org>; Tue, 19 Apr 2005 15:13:32 -0700 (PDT)
To: linux-fsdevel@vger.kernel.org,
	Al Viro <viro@parcelfarce.linux.theplanet.co.uk>
Content-Disposition: inline
Sender: linux-fsdevel-owner@vger.kernel.org
List-Id: linux-fsdevel.vger.kernel.org

The motivation behind this patch is to make private namespaces more
accessible by allowing their creation at mount/bind time.

Based on some of the FUSE permissions discussions, I wanted to check
into modifying the mount system calls -- adding a flag which created a
new namespace for the resulting mount.  I quickly discovered that what
I typically wanted (for the case of running a mount command) was to
actually create a new namespace for the parent thread (typically the
shell), inherit that namespace, and then perform the mount.

It's not clear to me that both options are needed; cloning the parent's
namespace seems to be what you want most of the time.

In order to minimize code impact I split the copy_namespace function;
perhaps the right long-term solution is to change its interface to
accommodate the changes.  Things look a bit more invasive as I moved
the copy_namespace function above do_mount.  The patch follows:

  fs/namespace.c     |  193 +++++++++++++++++++++++++++++------------------------
  include/linux/fs.h |    2 
 2 files changed, 108 insertions(+), 87 deletions(-)

--- linux-2.5/include/linux/fs.h	2005-04-19 17:02:28.530152496 -0500
+++ newns-2.5/include/linux/fs.h	2005-04-19 17:03:52.619368992 -0500
@@ -103,6 +103,8 @@ extern int dir_notify_enable;
 #define MS_REC		16384
 #define MS_VERBOSE	32768
  #define MS_POSIXACL	(1<<16)	/* VFS does not apply the umask */
+#define MS_CLONE_NEWNS	(1<<17) /* clone my namespace before mount */
+#define MS_CLONE_NEWPNS (1<<18) /* clone my & my parent namespace */
 #define MS_ACTIVE	(1<<30)
 #define MS_NOUSER	(1<<31)
 
--- linux-2.5/fs/namespace.c	2005-04-19 17:02:14.551277608 -0500
+++ newns-2.5/fs/namespace.c	2005-04-19 17:03:38.227556880 -0500
@@ -991,6 +991,104 @@ int copy_mount_options(const void __user
 	return 0;
 }
 
+int update_namespace(struct task_struct *tsk, struct namespace *new_ns )
+{
+	struct namespace *namespace = tsk->namespace;
+	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
+	struct fs_struct *fs = tsk->fs;
+	struct vfsmount *p, *q;
+
+	if (!namespace)
+		return 0;
+
+	get_namespace(namespace);
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		put_namespace(namespace);
+		return -EPERM;
+	}
+
+	down_write(&tsk->namespace->sem);
+	if(!new_ns) {
+		new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
+		if (!new_ns)
+			goto out;
+
+		atomic_set(&new_ns->count, 1);
+		init_rwsem(&new_ns->sem);
+		INIT_LIST_HEAD(&new_ns->list);
+
+		/* First pass: copy the tree topology */
+		new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
+		if (!new_ns->root) {
+			up_write(&tsk->namespace->sem);
+			kfree(new_ns);
+			goto out;
+		}
+		spin_lock(&vfsmount_lock);
+		list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
+		spin_unlock(&vfsmount_lock);
+	} else 
+		get_namespace(new_ns);
+
+	/*
+	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
+	 * as belonging to new namespace.  We have already acquired a private
+	 * fs_struct, so tsk->fs->lock is not needed.
+	 */
+	p = namespace->root;
+	q = new_ns->root;
+	while (p) {
+		q->mnt_namespace = new_ns;
+		if (fs) {
+			if (p == fs->rootmnt) {
+				rootmnt = p;
+				fs->rootmnt = mntget(q);
+			}
+			if (p == fs->pwdmnt) {
+				pwdmnt = p;
+				fs->pwdmnt = mntget(q);
+			}
+			if (p == fs->altrootmnt) {
+				altrootmnt = p;
+				fs->altrootmnt = mntget(q);
+			}
+		}
+		p = next_mnt(p, namespace->root);
+		q = next_mnt(q, new_ns->root);
+	}
+	up_write(&tsk->namespace->sem);
+
+	tsk->namespace = new_ns;
+
+	if (rootmnt)
+		mntput(rootmnt);
+	if (pwdmnt)
+		mntput(pwdmnt);
+	if (altrootmnt)
+		mntput(altrootmnt);
+
+	put_namespace(namespace);
+	return 0;
+
+out:
+	put_namespace(namespace);
+	return -ENOMEM;
+}
+
+int copy_namespace(int flags, struct task_struct *tsk)
+{
+	if (!tsk->namespace)
+		return 0;
+
+	if (!(flags & CLONE_NEWNS)) {
+		get_namespace(tsk->namespace);
+		return 0;
+	}
+
+	return update_namespace( tsk, NULL );
+}
+
 /*
  * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
  * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
@@ -1033,7 +1131,14 @@ long do_mount(char * dev_name, char * di
 		mnt_flags |= MNT_NODEV;
 	if (flags & MS_NOEXEC)
 		mnt_flags |= MNT_NOEXEC;
-	flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE);
+        if (flags & MS_CLONE_NEWNS)
+		copy_namespace( CLONE_NEWNS, current);
+	if (flags & MS_CLONE_NEWPNS) {
+		copy_namespace( CLONE_NEWNS, current->real_parent);
+		update_namespace( current, current->real_parent->namespace);
+	}
+
+	flags &= ~(MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_ACTIVE|MS_CLONE_NEWNS|MS_CLONE_NEWPNS);
 
 	/* ... and get the mountpoint */
 	retval = path_lookup(dir_name, LOOKUP_FOLLOW, &nd);
@@ -1059,92 +1164,6 @@ dput_out:
 	return retval;
 }
 
-int copy_namespace(int flags, struct task_struct *tsk)
-{
-	struct namespace *namespace = tsk->namespace;
-	struct namespace *new_ns;
-	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
-	struct fs_struct *fs = tsk->fs;
-	struct vfsmount *p, *q;
-
-	if (!namespace)
-		return 0;
-
-	get_namespace(namespace);
-
-	if (!(flags & CLONE_NEWNS))
-		return 0;
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		put_namespace(namespace);
-		return -EPERM;
-	}
-
-	new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
-	if (!new_ns)
-		goto out;
-
-	atomic_set(&new_ns->count, 1);
-	init_rwsem(&new_ns->sem);
-	INIT_LIST_HEAD(&new_ns->list);
-
-	down_write(&tsk->namespace->sem);
-	/* First pass: copy the tree topology */
-	new_ns->root = copy_tree(namespace->root, namespace->root->mnt_root);
-	if (!new_ns->root) {
-		up_write(&tsk->namespace->sem);
-		kfree(new_ns);
-		goto out;
-	}
-	spin_lock(&vfsmount_lock);
-	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-	spin_unlock(&vfsmount_lock);
-
-	/*
-	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
-	 * as belonging to new namespace.  We have already acquired a private
-	 * fs_struct, so tsk->fs->lock is not needed.
-	 */
-	p = namespace->root;
-	q = new_ns->root;
-	while (p) {
-		q->mnt_namespace = new_ns;
-		if (fs) {
-			if (p == fs->rootmnt) {
-				rootmnt = p;
-				fs->rootmnt = mntget(q);
-			}
-			if (p == fs->pwdmnt) {
-				pwdmnt = p;
-				fs->pwdmnt = mntget(q);
-			}
-			if (p == fs->altrootmnt) {
-				altrootmnt = p;
-				fs->altrootmnt = mntget(q);
-			}
-		}
-		p = next_mnt(p, namespace->root);
-		q = next_mnt(q, new_ns->root);
-	}
-	up_write(&tsk->namespace->sem);
-
-	tsk->namespace = new_ns;
-
-	if (rootmnt)
-		mntput(rootmnt);
-	if (pwdmnt)
-		mntput(pwdmnt);
-	if (altrootmnt)
-		mntput(altrootmnt);
-
-	put_namespace(namespace);
-	return 0;
-
-out:
-	put_namespace(namespace);
-	return -ENOMEM;
-}
-
 asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
 			  char __user * type, unsigned long flags,
 			  void __user * data)