linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 12/18] shared mount handling: bind and rbind
@ 2005-11-08  2:01 Al Viro
  2005-11-08 14:11 ` Miklos Szeredi
  2005-11-09 10:54 ` Miklos Szeredi
  0 siblings, 2 replies; 26+ messages in thread
From: Al Viro @ 2005-11-08  2:01 UTC (permalink / raw)
  To: torvalds; +Cc: linux-kernel, linux-fsdevel, linuxram

From: Ram Pai <linuxram@us.ibm.com>
Date: 1131401990 -0500

Handling of MS_BIND in presense of shared mounts (see
Documentation/sharedsubtree.txt in the end of patch series
for detailed description).

Signed-off-by: Ram Pai (linuxram@us.ibm.com)
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

---

 fs/namespace.c     |  126 +++++++++++++++++++++++++++++++++++++++++++---------
 fs/pnode.c         |   81 +++++++++++++++++++++++++++++++++
 fs/pnode.h         |   14 ++++++
 include/linux/fs.h |    5 ++
 4 files changed, 204 insertions(+), 22 deletions(-)

61916467f11424161a197c5aab53a6bb79619911
diff --git a/fs/namespace.c b/fs/namespace.c
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -28,8 +28,6 @@
 
 extern int __init init_rootfs(void);
 
-#define CL_EXPIRE 	0x01
-
 #ifdef CONFIG_SYSFS
 extern int __init sysfs_init(void);
 #else
@@ -145,13 +143,43 @@ static void detach_mnt(struct vfsmount *
 	old_nd->dentry->d_mounted--;
 }
 
+void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
+			struct vfsmount *child_mnt)
+{
+	child_mnt->mnt_parent = mntget(mnt);
+	child_mnt->mnt_mountpoint = dget(dentry);
+	dentry->d_mounted++;
+}
+
 static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
 {
-	mnt->mnt_parent = mntget(nd->mnt);
-	mnt->mnt_mountpoint = dget(nd->dentry);
-	list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
+	mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
+	list_add_tail(&mnt->mnt_hash, mount_hashtable +
+			hash(nd->mnt, nd->dentry));
 	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
-	nd->dentry->d_mounted++;
+}
+
+/*
+ * the caller must hold vfsmount_lock
+ */
+static void commit_tree(struct vfsmount *mnt)
+{
+	struct vfsmount *parent = mnt->mnt_parent;
+	struct vfsmount *m;
+	LIST_HEAD(head);
+	struct namespace *n = parent->mnt_namespace;
+
+	BUG_ON(parent == mnt);
+
+	list_add_tail(&head, &mnt->mnt_list);
+	list_for_each_entry(m, &head, mnt_list)
+		m->mnt_namespace = n;
+	list_splice(&head, n->list.prev);
+
+	list_add_tail(&mnt->mnt_hash, mount_hashtable +
+				hash(parent, mnt->mnt_mountpoint));
+	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	touch_namespace(n);
 }
 
 static struct vfsmount *next_mnt(struct vfsmount *p, struct vfsmount *root)
@@ -183,7 +211,11 @@ static struct vfsmount *clone_mnt(struct
 		mnt->mnt_root = dget(root);
 		mnt->mnt_mountpoint = mnt->mnt_root;
 		mnt->mnt_parent = mnt;
-		mnt->mnt_namespace = current->namespace;
+
+		if ((flag & CL_PROPAGATION) || IS_MNT_SHARED(old))
+			list_add(&mnt->mnt_share, &old->mnt_share);
+		if (flag & CL_MAKE_SHARED)
+			set_mnt_shared(mnt);
 
 		/* stick the duplicate mount on the same expiry list
 		 * as the original if that was on one */
@@ -379,7 +411,7 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
-static void release_mounts(struct list_head *head)
+void release_mounts(struct list_head *head)
 {
 	struct vfsmount *mnt;
 	while(!list_empty(head)) {
@@ -401,7 +433,7 @@ static void release_mounts(struct list_h
 	}
 }
 
-static void umount_tree(struct vfsmount *mnt, struct list_head *kill)
+void umount_tree(struct vfsmount *mnt, struct list_head *kill)
 {
 	struct vfsmount *p;
 
@@ -581,7 +613,7 @@ static int lives_below_in_same_fs(struct
 	}
 }
 
-static struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 					int flag)
 {
 	struct vfsmount *res, *p, *q, *r, *s;
@@ -626,6 +658,67 @@ Enomem:
 	return NULL;
 }
 
+/*
+ *  @source_mnt : mount tree to be attached
+ *  @nd        : place the mount tree @source_mnt is attached
+ *
+ *  NOTE: in the table below explains the semantics when a source mount
+ *  of a given type is attached to a destination mount of a given type.
+ * 	---------------------------------------------
+ * 	|         BIND MOUNT OPERATION              |
+ * 	|********************************************
+ * 	| source-->| shared        |       private  |
+ * 	| dest     |               |                |
+ * 	|   |      |               |                |
+ * 	|   v      |               |                |
+ * 	|********************************************
+ * 	|  shared  | shared (++)   |     shared (+) |
+ * 	|          |               |                |
+ * 	|non-shared| shared (+)    |      private   |
+ * 	*********************************************
+ * A bind operation clones the source mount and mounts the clone on the
+ * destination mount.
+ *
+ * (++)  the cloned mount is propagated to all the mounts in the propagation
+ * 	 tree of the destination mount and the cloned mount is added to
+ * 	 the peer group of the source mount.
+ * (+)   the cloned mount is created under the destination mount and is marked
+ *       as shared. The cloned mount is added to the peer group of the source
+ *       mount.
+ *
+ * if the source mount is a tree, the operations explained above is
+ * applied to each mount in the tree.
+ * Must be called without spinlocks held, since this function can sleep
+ * in allocations.
+ */
+static int attach_recursive_mnt(struct vfsmount *source_mnt,
+				struct nameidata *nd)
+{
+	LIST_HEAD(tree_list);
+	struct vfsmount *dest_mnt = nd->mnt;
+	struct dentry *dest_dentry = nd->dentry;
+	struct vfsmount *child, *p;
+
+	if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list))
+		return -EINVAL;
+
+	if (IS_MNT_SHARED(dest_mnt)) {
+		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+			set_mnt_shared(p);
+	}
+
+	spin_lock(&vfsmount_lock);
+	mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt);
+	commit_tree(source_mnt);
+
+	list_for_each_entry_safe(child, p, &tree_list, mnt_hash) {
+		list_del_init(&child->mnt_hash);
+		commit_tree(child);
+	}
+	spin_unlock(&vfsmount_lock);
+	return 0;
+}
+
 static int graft_tree(struct vfsmount *mnt, struct nameidata *nd)
 {
 	int err;
@@ -646,17 +739,8 @@ static int graft_tree(struct vfsmount *m
 		goto out_unlock;
 
 	err = -ENOENT;
-	spin_lock(&vfsmount_lock);
-	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry)) {
-		struct list_head head;
-
-		attach_mnt(mnt, nd);
-		list_add_tail(&head, &mnt->mnt_list);
-		list_splice(&head, current->namespace->list.prev);
-		err = 0;
-		touch_namespace(current->namespace);
-	}
-	spin_unlock(&vfsmount_lock);
+	if (IS_ROOT(nd->dentry) || !d_unhashed(nd->dentry))
+		err = attach_recursive_mnt(mnt, nd);
 out_unlock:
 	up(&nd->dentry->d_inode->i_sem);
 	if (!err)
diff --git a/fs/pnode.c b/fs/pnode.c
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -20,9 +20,88 @@ static inline struct vfsmount *next_peer
 void change_mnt_propagation(struct vfsmount *mnt, int type)
 {
 	if (type == MS_SHARED) {
-		mnt->mnt_flags |= MNT_SHARED;
+		set_mnt_shared(mnt);
 	} else {
 		list_del_init(&mnt->mnt_share);
 		mnt->mnt_flags &= ~MNT_PNODE_MASK;
 	}
 }
+
+/*
+ * get the next mount in the propagation tree.
+ * @m: the mount seen last
+ * @origin: the original mount from where the tree walk initiated
+ */
+static struct vfsmount *propagation_next(struct vfsmount *m,
+					 struct vfsmount *origin)
+{
+	m = next_peer(m);
+	if (m == origin)
+		return NULL;
+	return m;
+}
+
+/*
+ * mount 'source_mnt' under the destination 'dest_mnt' at
+ * dentry 'dest_dentry'. And propagate that mount to
+ * all the peer and slave mounts of 'dest_mnt'.
+ * Link all the new mounts into a propagation tree headed at
+ * source_mnt. Also link all the new mounts using ->mnt_list
+ * headed at source_mnt's ->mnt_list
+ *
+ * @dest_mnt: destination mount.
+ * @dest_dentry: destination dentry.
+ * @source_mnt: source mount.
+ * @tree_list : list of heads of trees to be attached.
+ */
+int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
+		    struct vfsmount *source_mnt, struct list_head *tree_list)
+{
+	struct vfsmount *m, *child;
+	int ret = 0;
+	struct vfsmount *prev_dest_mnt = dest_mnt;
+	struct vfsmount *prev_src_mnt  = source_mnt;
+	LIST_HEAD(tmp_list);
+	LIST_HEAD(umount_list);
+
+	for (m = propagation_next(dest_mnt, dest_mnt); m;
+			m = propagation_next(m, dest_mnt)) {
+		int type = CL_PROPAGATION;
+
+		if (IS_MNT_NEW(m))
+			continue;
+
+		if (IS_MNT_SHARED(m))
+			type |= CL_MAKE_SHARED;
+
+		if (!(child = copy_tree(source_mnt, source_mnt->mnt_root,
+						type))) {
+			ret = -ENOMEM;
+			list_splice(tree_list, tmp_list.prev);
+			goto out;
+		}
+
+		if (is_subdir(dest_dentry, m->mnt_root)) {
+			mnt_set_mountpoint(m, dest_dentry, child);
+			list_add_tail(&child->mnt_hash, tree_list);
+		} else {
+			/*
+			 * This can happen if the parent mount was bind mounted
+			 * on some subdirectory of a shared/slave mount.
+			 */
+			list_add_tail(&child->mnt_hash, &tmp_list);
+		}
+		prev_dest_mnt = m;
+		prev_src_mnt  = child;
+	}
+out:
+	spin_lock(&vfsmount_lock);
+	while (!list_empty(&tmp_list)) {
+		child = list_entry(tmp_list.next, struct vfsmount, mnt_hash);
+		list_del_init(&child->mnt_hash);
+		umount_tree(child, &umount_list);
+	}
+	spin_unlock(&vfsmount_lock);
+	release_mounts(&umount_list);
+	return ret;
+}
diff --git a/fs/pnode.h b/fs/pnode.h
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -12,7 +12,21 @@
 #include <linux/mount.h>
 
 #define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
+#define IS_MNT_NEW(mnt)  (!mnt->mnt_namespace)
 #define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~MNT_SHARED)
 
+#define CL_EXPIRE    		0x01
+#define CL_COPY_ALL 		0x04
+#define CL_MAKE_SHARED 		0x08
+#define CL_PROPAGATION 		0x10
+
+static inline void set_mnt_shared(struct vfsmount *mnt)
+{
+	mnt->mnt_flags &= ~MNT_PNODE_MASK;
+	mnt->mnt_flags |= MNT_SHARED;
+}
+
 void change_mnt_propagation(struct vfsmount *, int);
+int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
+		struct list_head *);
 #endif /* _LINUX_PNODE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1251,7 +1251,12 @@ extern int unregister_filesystem(struct 
 extern struct vfsmount *kern_mount(struct file_system_type *);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
+extern void umount_tree(struct vfsmount *, struct list_head *);
+extern void release_mounts(struct list_head *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
+extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
+extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
+				  struct vfsmount *);
 
 extern int vfs_statfs(struct super_block *, struct kstatfs *);
 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-08  2:01 [PATCH 12/18] shared mount handling: bind and rbind Al Viro
@ 2005-11-08 14:11 ` Miklos Szeredi
  2005-11-08 15:48   ` Ram Pai
  2005-11-09 10:54 ` Miklos Szeredi
  1 sibling, 1 reply; 26+ messages in thread
From: Miklos Szeredi @ 2005-11-08 14:11 UTC (permalink / raw)
  To: viro; +Cc: torvalds, linux-kernel, linux-fsdevel, linuxram

>  static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
>  {
> -	mnt->mnt_parent = mntget(nd->mnt);
> -	mnt->mnt_mountpoint = dget(nd->dentry);
> -	list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
> +	mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
> +	list_add_tail(&mnt->mnt_hash, mount_hashtable +
> +			hash(nd->mnt, nd->dentry));

Ram,

IIRC the list_add -> list_add_tail change has been voted down.  Or do
you have new reasons why it's needed?

Miklos

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-08 14:11 ` Miklos Szeredi
@ 2005-11-08 15:48   ` Ram Pai
  2005-11-08 15:55     ` Miklos Szeredi
  0 siblings, 1 reply; 26+ messages in thread
From: Ram Pai @ 2005-11-08 15:48 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Al Viro, torvalds, linux-kernel, linux-fsdevel

On Tue, 2005-11-08 at 06:11, Miklos Szeredi wrote:
> >  static void attach_mnt(struct vfsmount *mnt, struct nameidata *nd)
> >  {
> > -	mnt->mnt_parent = mntget(nd->mnt);
> > -	mnt->mnt_mountpoint = dget(nd->dentry);
> > -	list_add(&mnt->mnt_hash, mount_hashtable + hash(nd->mnt, nd->dentry));
> > +	mnt_set_mountpoint(nd->mnt, nd->dentry, mnt);
> > +	list_add_tail(&mnt->mnt_hash, mount_hashtable +
> > +			hash(nd->mnt, nd->dentry));
> 
> Ram,
> 
> IIRC the list_add -> list_add_tail change has been voted down.  Or do
> you have new reasons why it's needed?

No. As explained in the same earlier threads; without this change the
behavior of shared-subtrees leads to inconsistency and confusion in some
scenarios.

Under the premise that no application should depend on this behavior
(most-recent-mount-visible v/s top-most-mount-visible),  Al Viro
permitted this change. And this is certainly the right behavior.

RP
> 
> Miklos
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-08 15:48   ` Ram Pai
@ 2005-11-08 15:55     ` Miklos Szeredi
  2005-11-09 18:44       ` Ram Pai
  0 siblings, 1 reply; 26+ messages in thread
From: Miklos Szeredi @ 2005-11-08 15:55 UTC (permalink / raw)
  To: linuxram; +Cc: miklos, viro, torvalds, linux-kernel, linux-fsdevel

> No. As explained in the same earlier threads; without this change the
> behavior of shared-subtrees leads to inconsistency and confusion in some
> scenarios.
> 
> Under the premise that no application should depend on this behavior
> (most-recent-mount-visible v/s top-most-mount-visible),

The strongest argument against was that

  mount foo .; umount .

would no longer be a no-op.

> Al Viro permitted this change. And this is certainly the right
> behavior.

Which is a contradiction in term, since you are saying that
applications _do_ depend on it.

Miklos

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-08  2:01 [PATCH 12/18] shared mount handling: bind and rbind Al Viro
  2005-11-08 14:11 ` Miklos Szeredi
@ 2005-11-09 10:54 ` Miklos Szeredi
  2005-11-09 14:31   ` Al Viro
  1 sibling, 1 reply; 26+ messages in thread
From: Miklos Szeredi @ 2005-11-09 10:54 UTC (permalink / raw)
  To: viro; +Cc: torvalds, linux-kernel, linux-fsdevel, linuxram

> +/*
> + * mount 'source_mnt' under the destination 'dest_mnt' at
> + * dentry 'dest_dentry'. And propagate that mount to
> + * all the peer and slave mounts of 'dest_mnt'.
> + * Link all the new mounts into a propagation tree headed at
> + * source_mnt. Also link all the new mounts using ->mnt_list
> + * headed at source_mnt's ->mnt_list
> + *
> + * @dest_mnt: destination mount.
> + * @dest_dentry: destination dentry.
> + * @source_mnt: source mount.
> + * @tree_list : list of heads of trees to be attached.
> + */
> +int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
> +		    struct vfsmount *source_mnt, struct list_head *tree_list)
> +{
> +	struct vfsmount *m, *child;
> +	int ret = 0;
> +	struct vfsmount *prev_dest_mnt = dest_mnt;
> +	struct vfsmount *prev_src_mnt  = source_mnt;
> +	LIST_HEAD(tmp_list);
> +	LIST_HEAD(umount_list);
> +
> +	for (m = propagation_next(dest_mnt, dest_mnt); m;
> +			m = propagation_next(m, dest_mnt)) {
> +		int type = CL_PROPAGATION;
> +
> +		if (IS_MNT_NEW(m))
> +			continue;
> +
> +		if (IS_MNT_SHARED(m))
> +			type |= CL_MAKE_SHARED;
> +
> +		if (!(child = copy_tree(source_mnt, source_mnt->mnt_root,
> +						type))) {
> +			ret = -ENOMEM;
> +			list_splice(tree_list, tmp_list.prev);
> +			goto out;
> +		}
> +
> +		if (is_subdir(dest_dentry, m->mnt_root)) {

Shouldn't this check go before copy_tree()?  Not much point in copying
the tree if we are sure it won't be used.

> +			mnt_set_mountpoint(m, dest_dentry, child);
> +			list_add_tail(&child->mnt_hash, tree_list);
> +		} else {
> +			/*
> +			 * This can happen if the parent mount was bind mounted
> +			 * on some subdirectory of a shared/slave mount.
> +			 */

I can't grok this comment.  Shouldn't it read something like

  ... if 'm' is a result of a bind from a subdirectory of 'dest_dentry'


Miklos

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-09 10:54 ` Miklos Szeredi
@ 2005-11-09 14:31   ` Al Viro
  2005-11-09 15:22     ` Miklos Szeredi
  0 siblings, 1 reply; 26+ messages in thread
From: Al Viro @ 2005-11-09 14:31 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: torvalds, linux-kernel, linux-fsdevel, linuxram

On Wed, Nov 09, 2005 at 11:54:36AM +0100, Miklos Szeredi wrote:
> Shouldn't this check go before copy_tree()?  Not much point in copying
> the tree if we are sure it won't be used.

Incorrect.  Propagation nodes further down the tree can very well be
mountable.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-09 14:31   ` Al Viro
@ 2005-11-09 15:22     ` Miklos Szeredi
  2005-11-09 15:56       ` Al Viro
  0 siblings, 1 reply; 26+ messages in thread
From: Miklos Szeredi @ 2005-11-09 15:22 UTC (permalink / raw)
  To: viro; +Cc: torvalds, linux-kernel, linux-fsdevel, linuxram

> On Wed, Nov 09, 2005 at 11:54:36AM +0100, Miklos Szeredi wrote:
> > Shouldn't this check go before copy_tree()?  Not much point in copying
> > the tree if we are sure it won't be used.
> 
> Incorrect.  Propagation nodes further down the tree can very well be
> mountable.

Can you please give an example.  I'm feeling thick.

What I see in the code is that the the mount tree is copied, then put
on the tmp_list, and at the end the newly copied tree is freed with
umount_tree().

+		if (!(child = copy_tree(source_mnt, source_mnt->mnt_root,
+						type))) {
+			ret = -ENOMEM;
+			list_splice(tree_list, tmp_list.prev);
+			goto out;
+		}
+
+		if (is_subdir(dest_dentry, m->mnt_root)) {
+			mnt_set_mountpoint(m, dest_dentry, child);
+			list_add_tail(&child->mnt_hash, tree_list);
+		} else {
+			/*
+			 * This can happen if the parent mount was bind mounted
+			 * on some subdirectory of a shared/slave mount.
+			 */
+			list_add_tail(&child->mnt_hash, &tmp_list);
+		}
+		prev_dest_mnt = m;
+		prev_src_mnt  = child;
+	}
+out:
+	spin_lock(&vfsmount_lock);
+	while (!list_empty(&tmp_list)) {
+		child = list_entry(tmp_list.next, struct vfsmount, mnt_hash);
+		list_del_init(&child->mnt_hash);
+		umount_tree(child, &umount_list);
+	}
+	spin_unlock(&vfsmount_lock);
+	release_mounts(&umount_list);
+	return ret;

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-09 15:22     ` Miklos Szeredi
@ 2005-11-09 15:56       ` Al Viro
  2005-11-09 16:33         ` Miklos Szeredi
  0 siblings, 1 reply; 26+ messages in thread
From: Al Viro @ 2005-11-09 15:56 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: torvalds, linux-kernel, linux-fsdevel, linuxram

On Wed, Nov 09, 2005 at 04:22:23PM +0100, Miklos Szeredi wrote:
> > On Wed, Nov 09, 2005 at 11:54:36AM +0100, Miklos Szeredi wrote:
> > > Shouldn't this check go before copy_tree()?  Not much point in copying
> > > the tree if we are sure it won't be used.
> > 
> > Incorrect.  Propagation nodes further down the tree can very well be
> > mountable.
> 
> Can you please give an example.  I'm feeling thick.
> 
> What I see in the code is that the the mount tree is copied, then put
> on the tmp_list, and at the end the newly copied tree is freed with
> umount_tree().

Before it gets freed it may end up being copied.  Example: vfsmounts
A and B are peers, C is a slave of that peer group.  It happens to be
on slave list of B.  B has root deeper than A, which, in turn is deeper
than that of C (e.g. A and B had been created by binding subtrees of
C, which had been made slave afterwards).  We bind on something in A,
outside of the subtree mapped by B.

Alternatively, have A -> (B, D) -> C, with C on slave list of B.  Mountpoint
is within subtrees for A, C and D, but not B.  And no, we can't say "skip B,
just make a slave of tree on A and slap it on C" - correct result is to
have T_A -> T_D -> T_C (i.e. tree on C gets propagation from tree on D).
Which kills the variants with not creating that copy and making subsequent
ones directly from the original tree.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-09 15:56       ` Al Viro
@ 2005-11-09 16:33         ` Miklos Szeredi
  0 siblings, 0 replies; 26+ messages in thread
From: Miklos Szeredi @ 2005-11-09 16:33 UTC (permalink / raw)
  To: viro; +Cc: torvalds, linux-kernel, linux-fsdevel, linuxram

> Before it gets freed it may end up being copied.  Example: vfsmounts
> A and B are peers, C is a slave of that peer group.  It happens to be
> on slave list of B.  B has root deeper than A, which, in turn is deeper
> than that of C (e.g. A and B had been created by binding subtrees of
> C, which had been made slave afterwards).  We bind on something in A,
> outside of the subtree mapped by B.
> 
> Alternatively, have A -> (B, D) -> C, with C on slave list of B.  Mountpoint
> is within subtrees for A, C and D, but not B.  And no, we can't say "skip B,
> just make a slave of tree on A and slap it on C" - correct result is to
> have T_A -> T_D -> T_C (i.e. tree on C gets propagation from tree on D).
> Which kills the variants with not creating that copy and making subsequent
> ones directly from the original tree.

OK, I see it now.  What confused me is that from patch 12 it's not yet
obvious, that the copied mount will be used for futher propagation.

Miklos

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-08 15:55     ` Miklos Szeredi
@ 2005-11-09 18:44       ` Ram Pai
  2005-11-09 18:59         ` Linus Torvalds
  0 siblings, 1 reply; 26+ messages in thread
From: Ram Pai @ 2005-11-09 18:44 UTC (permalink / raw)
  To: Miklos Szeredi; +Cc: Al Viro, torvalds, linux-kernel, linux-fsdevel

On Tue, 2005-11-08 at 07:55, Miklos Szeredi wrote:
> > No. As explained in the same earlier threads; without this change the
> > behavior of shared-subtrees leads to inconsistency and confusion in some
> > scenarios.
> > 
> > Under the premise that no application should depend on this behavior
> > (most-recent-mount-visible v/s top-most-mount-visible),
> 
> The strongest argument against was that
> 
>   mount foo .; umount .
> 
> would no longer be a no-op.

It is a no-op even now.  Try it.

What you meant was something like this is no-op now?

P1: cd /tmp1
P2: mount --bind /var /tmp1
P1: mount --bind /bin .
P1: umount .
 
Yes this will not be a noop with my changes. However this behavior is
not documented to be a no-op anywhere. right? And 'umount .' really
doen't make sense. What does it mean? umount  the current mount? or
umount of the mount that is mounted on this dentry?

My changes just changes one behavior and that is:
If multiple mounts are mounted on the same <mount,dentry> combination
the later mounts are obscured by the preceeding mounts. Earlier it was
opposite behavior.

My biggest complaint about the earlier behavior was that the later
mounts obscured not only the earlier mounts on the <mount,dentry> tuple,
but also obscured all the mounts that got stacked on top of the
earlier <mount,dentry>.  It seemed totally unnatural, and confusing
with shared-subtree.  


> > Al Viro permitted this change. And this is certainly the right
> > behavior.
> 
> Which is a contradiction in term, since you are saying that
> applications _do_ depend on it.

no. I said application _should_not_ depend on it, because it is a
undefined semantics.

Sorry I was out yesterday and could reply earlier.
RP

> 
> Miklos

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-09 18:44       ` Ram Pai
@ 2005-11-09 18:59         ` Linus Torvalds
  2005-11-09 19:26           ` Al Viro
                             ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Linus Torvalds @ 2005-11-09 18:59 UTC (permalink / raw)
  To: Ram Pai; +Cc: Miklos Szeredi, Al Viro, linux-kernel, linux-fsdevel



On Wed, 9 Nov 2005, Ram Pai wrote:
>
> And 'umount .' really doen't make sense. What does it mean? umount the 
> current mount? or umount of the mount that is mounted on this dentry?

"umount <directory>" _absolutely_ makes sense, whether "directory" is "." 
or something else. People do it all the time.

Now, if it doesn't unmount the last thing mounted on top of ".", then 
that's a misfeature. It might be a misfeature in the mount program (it 
might scan /etc/mounts top-to-bottom rather than the other way), but the 
kernel should also support it.

> no. I said application _should_not_ depend on it, because it is a
> undefined semantics.

It's definitely neither unusual nor undefined. I do all my umounts by 
directory (in fact, doing it by anything else really _is_ badly defined, 
since a block device can be mounted in many places), and the only sane 
semantics would be to peel off the last mount on that directory.

Now, that doesn't necessarily mean that "list_add_tail()" is wrong. But 
if we add new mounts to the end, then umount remove them from the end too, 
no?

		Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-09 18:59         ` Linus Torvalds
@ 2005-11-09 19:26           ` Al Viro
  2005-11-09 19:28           ` Ram Pai
  2005-11-16  3:29           ` Rob Landley
  2 siblings, 0 replies; 26+ messages in thread
From: Al Viro @ 2005-11-09 19:26 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Ram Pai, Miklos Szeredi, linux-kernel, linux-fsdevel

On Wed, Nov 09, 2005 at 10:59:47AM -0800, Linus Torvalds wrote:
> 
> 
> On Wed, 9 Nov 2005, Ram Pai wrote:
> >
> > And 'umount .' really doen't make sense. What does it mean? umount the 
> > current mount? or umount of the mount that is mounted on this dentry?
> 
> "umount <directory>" _absolutely_ makes sense, whether "directory" is "." 
> or something else. People do it all the time.

With current (and all previous, actually) tree umount . is usually -EBUSY.

The case Mikulas is talking about is much uglier - it's "mount on top
of current directory, then umount .".  _That_ (i.e. when . is overmounted)
happens to work.  And semantics is really, really not well-defined.

Situation with overmounts is nasty - it's *not* just a chain, unfortunately.
I certainly intended it to be such.  However, we can get a *tree* of
overmounts due to side-effects I've missed back then.

We really need it sanitized (and that's what I'm doing right now), but
yes, it *will* cause user-visible changes.  Incidentally, one of those
will be that umount . will work...

The trouble begins since we allow to attach vfsmounts to the *middle* of
overmount chain.  I.e.

mount foo /tmp
cd /tmp
mount bar .
mount baz .

will end up with *two* vfsmounts having root of foo as mountpoint and having
the same mnt_parent.  Which one is seen depends on phase of moon - the only
answer is "whichever is first in mnt_hash chain".  Which is certainly not a
sane answer.  We need explicit rules dealing with effect of overmounts;
anything that seriously relies on details of current behaviour in that sort
of corner cases is very definitely broken.

And "we allow" above should be read as "Al had not thought about that mess
back in 2001" ;-/  Current behaviour in that sort setups is an accident -
as soon as it gets to such forked chains of overmounts sanity exits stage
left.  To be fixed...

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-09 18:59         ` Linus Torvalds
  2005-11-09 19:26           ` Al Viro
@ 2005-11-09 19:28           ` Ram Pai
  2005-11-16  3:29           ` Rob Landley
  2 siblings, 0 replies; 26+ messages in thread
From: Ram Pai @ 2005-11-09 19:28 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Miklos Szeredi, Al Viro, linux-kernel, linux-fsdevel

On Wed, 2005-11-09 at 10:59, Linus Torvalds wrote:
> On Wed, 9 Nov 2005, Ram Pai wrote:
> >
> > And 'umount .' really doen't make sense. What does it mean? umount the 
> > current mount? or umount of the mount that is mounted on this dentry?
> 
> "umount <directory>" _absolutely_ makes sense, whether "directory" is "." 
> or something else. People do it all the time.

ok. so you say 'umount <dir>' means umount something mounted on top of
dir.  In that case, when I say 'umount /tmp' it has to unmount something
that is mounted on top of /tmp, but there cannot be anything mounted on
top of /tmp. In the first place I cannot reach this mount since it is
obscured by the mount on top. right?

eg:  m1 is the root mount
     m2 is the mount on top of tmp on the root mount
     m3 is the mount on top of tmp(root dentry) of m2

'umount /tmp' 
will mean umount something that is mounted on top of root dentry of m3.



> 
> Now, if it doesn't unmount the last thing mounted on top of ".", then 
> that's a misfeature. It might be a misfeature in the mount program (it 
> might scan /etc/mounts top-to-bottom rather than the other way), but the 
> kernel should also support it.
> 
> > no. I said application _should_not_ depend on it, because it is a
> > undefined semantics.
> 
> It's definitely neither unusual nor undefined. I do all my umounts by 
> directory (in fact, doing it by anything else really _is_ badly defined, 
> since a block device can be mounted in many places), and the only sane 
> semantics would be to peel off the last mount on that directory.
> 
> Now, that doesn't necessarily mean that "list_add_tail()" is wrong. But 
> if we add new mounts to the end, then umount remove them from the end too, 
> no?

Yes it does. it removes the mounts that receive propagation from the
tail.  Anything that has been asked to be unmounted explicitly will be
removed irrespective of where it is on the list, and always it is at the
head because lookup_mnt() always returns the one at the head.

RP
> 
> 		Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-09 18:59         ` Linus Torvalds
  2005-11-09 19:26           ` Al Viro
  2005-11-09 19:28           ` Ram Pai
@ 2005-11-16  3:29           ` Rob Landley
  2005-11-16  3:53             ` Linus Torvalds
  2 siblings, 1 reply; 26+ messages in thread
From: Rob Landley @ 2005-11-16  3:29 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ram Pai, Miklos Szeredi, Al Viro, linux-kernel, linux-fsdevel

On Wednesday 09 November 2005 12:59, Linus Torvalds wrote:
> > no. I said application _should_not_ depend on it, because it is a
> > undefined semantics.
>
> It's definitely neither unusual nor undefined. I do all my umounts by
> directory (in fact, doing it by anything else really _is_ badly defined,
> since a block device can be mounted in many places), and the only sane
> semantics would be to peel off the last mount on that directory.

I noticed this upgrading busybox mount a few months back.  I was trying to 
figure out if the correct semantics for umount /dev/block were to umount 
_all_ instances of this block device, or umount just the most recent one.  I 
wound up just passing it through to the kernel and letting it decide, but I 
wasn't sure why it did what it did.

The 2.6 multiple mount semantics are still new enough that the tools are just 
now catching up.  Last I checked, the standard mount was kind of unhappy with 
--bind and --move mounts (they were corrupting /etc/mtab):

http://www.busybox.net/lists/busybox/2005-August/015285.html

The side effects of mount can be really non-obvious at times.  For example, 
while implementing busybox's switch_root I found out that this snippet of 
klibc's run-init.c is slightly wrong:
  if ( chdir(realroot) )
    die("chdir to new root");
/* snip */
  /* Overmount the root */
  if ( mount(".", "/", NULL, MS_MOVE, NULL) )
    die("overmounting root");

  /* chroot, chdir */
  if ( chroot(".") || chdir("/") )
    die("chroot");

The || fallback in the third part won't work.  chroot(".") will get you to the 
new filesystem, but chdir("/") still gets you to the old one, even though 
we've overmounted it.  (I have no idea why.  I assume it's because / is 
special.)

Rob

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  3:29           ` Rob Landley
@ 2005-11-16  3:53             ` Linus Torvalds
  2005-11-16  5:35               ` Al Boldi
  2005-11-16  8:41               ` Rob Landley
  0 siblings, 2 replies; 26+ messages in thread
From: Linus Torvalds @ 2005-11-16  3:53 UTC (permalink / raw)
  To: Rob Landley; +Cc: Ram Pai, Miklos Szeredi, Al Viro, linux-kernel, linux-fsdevel



On Tue, 15 Nov 2005, Rob Landley wrote:
> 
> The || fallback in the third part won't work.  chroot(".") will get you to the 
> new filesystem, but chdir("/") still gets you to the old one, even though 
> we've overmounted it.  (I have no idea why.  I assume it's because / is 
> special.)

'/' is special exactly the same way '.' is: one is shorthand for "current 
process' root", and the other is shorthand for "current process' cwd".

So if you mount over '/', it won't actually do what you think it does: 
because when you open "/", it will continue to open the _old_ "/". Exactly 
the same way that mounting over somebody's cwd won't do what you think it 
does - because the root and the cwd have been looked-up earlier and are 
cached with the process.

This is why we have "pivot_root()" and "chroot()", which can both be used 
to do what you want to do. You mount the new root somewhere else, and then 
you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the 
cwd into the new root too (and only at that point have you "lost" the old 
root - although you can actually get it back if you have some file 
descriptor open to it).

		Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  3:53             ` Linus Torvalds
@ 2005-11-16  5:35               ` Al Boldi
  2005-11-16  8:19                 ` Miklos Szeredi
  2005-11-16  8:47                 ` Rob Landley
  2005-11-16  8:41               ` Rob Landley
  1 sibling, 2 replies; 26+ messages in thread
From: Al Boldi @ 2005-11-16  5:35 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ram Pai, Miklos Szeredi, Al Viro, linux-kernel, linux-fsdevel,
	Rob Landley

Linus Torvalds wrote:
> This is why we have "pivot_root()" and "chroot()", which can both be used
> to do what you want to do. You mount the new root somewhere else, and then
> you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
> cwd into the new root too (and only at that point have you "lost" the old
> root - although you can actually get it back if you have some file
> descriptor open to it).

Wouldn't this constitute a security flaw?

Shouldn't chroot jail you?

--
Al

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  5:35               ` Al Boldi
@ 2005-11-16  8:19                 ` Miklos Szeredi
  2005-11-16  9:10                   ` Rob Landley
  2005-11-16 13:59                   ` Shaya Potter
  2005-11-16  8:47                 ` Rob Landley
  1 sibling, 2 replies; 26+ messages in thread
From: Miklos Szeredi @ 2005-11-16  8:19 UTC (permalink / raw)
  To: a1426z; +Cc: torvalds, linuxram, viro, linux-kernel, linux-fsdevel, rob

> > This is why we have "pivot_root()" and "chroot()", which can both be used
> > to do what you want to do. You mount the new root somewhere else, and then
> > you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
> > cwd into the new root too (and only at that point have you "lost" the old
> > root - although you can actually get it back if you have some file
> > descriptor open to it).
> 
> Wouldn't this constitute a security flaw?
> 
> Shouldn't chroot jail you?

No, chroot should just change the root.

If you don't want to be able to get back the old root, just close all
file descriptors _in addition_ to chroot() and chdir().

Miklos

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  3:53             ` Linus Torvalds
  2005-11-16  5:35               ` Al Boldi
@ 2005-11-16  8:41               ` Rob Landley
  2005-11-16 16:18                 ` Linus Torvalds
  1 sibling, 1 reply; 26+ messages in thread
From: Rob Landley @ 2005-11-16  8:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ram Pai, Miklos Szeredi, Al Viro, linux-kernel, linux-fsdevel

On Tuesday 15 November 2005 21:53, Linus Torvalds wrote:
> So if you mount over '/', it won't actually do what you think it does:
> because when you open "/", it will continue to open the _old_ "/". Exactly
> the same way that mounting over somebody's cwd won't do what you think it
> does - because the root and the cwd have been looked-up earlier and are
> cached with the process.

So does mounting over / actually accomplish anything?  Or is it sort of an 
undermount instead of an overmount, resulting in a mounted but inaccessible 
filesystem?  (In theory, fork() would copy the current cached value of "/", 
and all absolute path lookups are really sort of relative paths from the 
cached "/"...)

I ask because I'm trying to figure out what switch_root's "mount --move . /" 
accomplishes, other than making /proc/mounts look right.  If we just did the 
chroot(".") it'd be functionally the same, and slightly smaller (which 
busybox cares about).

I'm also remembering that while playing around with stuff in a PID 1 shell 
under UML (trying to figure out how to implement pivot_root), I mounted 
something directly on / (which was a NOP) and then umount was also a NOP 
(presumably because it was trying to umount rootfs), meaning I had a mount 
that had simply _leaked_.  It still showed up in /proc/mounts, but was 
totally inaccessable and couldn't be removed either.

I guess that's a "don't do that then".

> This is why we have "pivot_root()" and "chroot()", which can both be used
> to do what you want to do. You mount the new root somewhere else, and then
> you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
> cwd into the new root too (and only at that point have you "lost" the old
> root - although you can actually get it back if you have some file
> descriptor open to it).

So all chroot(2) really does is reset the "/" reference?

In the specific case of "mount --move . /" || chroot ("."), I don't see why we 
need a chdir afterwards, because cwd points to the correct filesystem.  (In 
fact, for a moment there between the mount move and the chroot it's the 
_only_ reference we have to this filesystem.)

Perhaps ".." isn't correct unless we chdir again...?

>   Linus

Rob

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  5:35               ` Al Boldi
  2005-11-16  8:19                 ` Miklos Szeredi
@ 2005-11-16  8:47                 ` Rob Landley
  1 sibling, 0 replies; 26+ messages in thread
From: Rob Landley @ 2005-11-16  8:47 UTC (permalink / raw)
  To: Al Boldi
  Cc: Linus Torvalds, Ram Pai, Miklos Szeredi, Al Viro, linux-kernel,
	linux-fsdevel

On Tuesday 15 November 2005 23:35, Al Boldi wrote:
> Linus Torvalds wrote:
> > This is why we have "pivot_root()" and "chroot()", which can both be used
> > to do what you want to do. You mount the new root somewhere else, and
> > then you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to
> > move the cwd into the new root too (and only at that point have you
> > "lost" the old root - although you can actually get it back if you have
> > some file descriptor open to it).
>
> Wouldn't this constitute a security flaw?
>
> Shouldn't chroot jail you?

A few years ago I had a build script that compiled a new Linux From Scratch 
system I could chroot into, and one of the things in the new chroot 
environment was a different boot loader.  And for testing purposes (and with 
a boot CD standing by) I would chroot into this new environment and run the 
lilo in it to add the new test kernel into the boot option list.

One day, I upgraded to a new kernel version and it stopped working, because 
chroot had acquired some unwanted security feature that prevented lilo from 
properly talking to /dev/hda from within a chroot environment.

I remember being rather put out by this.

Chroot is sometimes used for other purposes than "security".

Rob

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  8:19                 ` Miklos Szeredi
@ 2005-11-16  9:10                   ` Rob Landley
  2005-11-16 10:14                     ` Miklos Szeredi
  2005-11-16 13:59                   ` Shaya Potter
  1 sibling, 1 reply; 26+ messages in thread
From: Rob Landley @ 2005-11-16  9:10 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: a1426z, torvalds, linuxram, viro, linux-kernel, linux-fsdevel

On Wednesday 16 November 2005 02:19, Miklos Szeredi wrote:
> > > This is why we have "pivot_root()" and "chroot()", which can both be
> > > used to do what you want to do. You mount the new root somewhere else,
> > > and then you chroot (or pivot-root) to it. And THEN you do 'chdir("/")'
> > > to move the cwd into the new root too (and only at that point have you
> > > "lost" the old root - although you can actually get it back if you have
> > > some file descriptor open to it).
> >
> > Wouldn't this constitute a security flaw?
> >
> > Shouldn't chroot jail you?
>
> No, chroot should just change the root.
>
> If you don't want to be able to get back the old root, just close all
> file descriptors _in addition_ to chroot() and chdir().

If you try the chdir by filedescriptor trick on the stdin/stdout/stderr fed 
into PID 1 when it's started up by the kernel, which filesystem do you wind 
up in?  (rootfs?)

I ask because switch_root redoes those to point to /dev/console from the real 
root (presumably for security reasons), and this happens _before_ the init on 
the real root gets called, and thus before the real root gets to populate 
its' own dynamic /dev.

I suppose initramfs could make a temporary /dev, do the mknods for console and 
the real root, and then mount --move this tmpdir to the real root's /dev once 
that's available (and then let the real root's udev populate it the rest of 
the way).  Or the real root could have a hard /dev/console living in the 
directory that's going to get overmounted by tmpfs later.  Or just leave 
initramfs accessible until init can switch consoles...

Sigh.  I need to document the requirements here...

Rob

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  9:10                   ` Rob Landley
@ 2005-11-16 10:14                     ` Miklos Szeredi
  0 siblings, 0 replies; 26+ messages in thread
From: Miklos Szeredi @ 2005-11-16 10:14 UTC (permalink / raw)
  To: rob; +Cc: a1426z, torvalds, linuxram, viro, linux-kernel, linux-fsdevel

> > If you don't want to be able to get back the old root, just close all
> > file descriptors _in addition_ to chroot() and chdir().
> 
> If you try the chdir by filedescriptor trick on the stdin/stdout/stderr fed 
> into PID 1 when it's started up by the kernel, which filesystem do you wind 
> up in?  (rootfs?)

You can't fchdir() to a non-directory, so this shouldn't be an issue.

Miklos

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  8:19                 ` Miklos Szeredi
  2005-11-16  9:10                   ` Rob Landley
@ 2005-11-16 13:59                   ` Shaya Potter
  2005-11-16 16:35                     ` Miklos Szeredi
  2005-11-16 20:05                     ` Al Boldi
  1 sibling, 2 replies; 26+ messages in thread
From: Shaya Potter @ 2005-11-16 13:59 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: a1426z, torvalds, linuxram, viro, linux-kernel, linux-fsdevel,
	rob

On Wed, 2005-11-16 at 09:19 +0100, Miklos Szeredi wrote:
> > > This is why we have "pivot_root()" and "chroot()", which can both be used
> > > to do what you want to do. You mount the new root somewhere else, and then
> > > you chroot (or pivot-root) to it. And THEN you do 'chdir("/")' to move the
> > > cwd into the new root too (and only at that point have you "lost" the old
> > > root - although you can actually get it back if you have some file
> > > descriptor open to it).
> > 
> > Wouldn't this constitute a security flaw?
> > 
> > Shouldn't chroot jail you?
> 
> No, chroot should just change the root.
> 
> If you don't want to be able to get back the old root, just close all
> file descriptors _in addition_ to chroot() and chdir().

hah.  As long as you're running as root, chroot() again to a directory
below you, and you effectively broken the chroot and can make a relative
path to the old root. :)

I created a patch years ago that creates a chain of "chroot" points, and
any past chroot point would be considered a place that follow_dotdot
would consider a root.  There didn't seem much interest in the patch
though.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16  8:41               ` Rob Landley
@ 2005-11-16 16:18                 ` Linus Torvalds
  0 siblings, 0 replies; 26+ messages in thread
From: Linus Torvalds @ 2005-11-16 16:18 UTC (permalink / raw)
  To: Rob Landley; +Cc: Ram Pai, Miklos Szeredi, Al Viro, linux-kernel, linux-fsdevel



On Wed, 16 Nov 2005, Rob Landley wrote:
> 
> So does mounting over / actually accomplish anything?  Or is it sort of an 
> undermount instead of an overmount, resulting in a mounted but inaccessible 
> filesystem?

I'd say that _usually_ you're better off using chroot() than mounting over 
"/".

> So all chroot(2) really does is reset the "/" reference?

Yes. Literally. Everything else stays the same, including any open files 
(and cwd).

It's a "flaw" in chroot if you consider it a jail, but it's used for so 
much more than that.  In fact, you shouldn't consider it jail: it's really 
just a small _part_ of the notion of limiting somebody to a specific area.

(The smallest part, in fact. And you should be aware that root can always 
get out of a chdir() if he just has enough tools - and the tools aren't 
even very big. "mknod" + "mount" will do it even in the absense of a way 
to add binaries, as will /proc access).

Note that the most common use of chroot isn't actually the "jail" kind of 
usage, but building and installation environments (ie a lot of package 
building stuff end up using chroot as a way to create the "target 
environment").

> In the specific case of "mount --move . /" || chroot ("."), I don't see why we 
> need a chdir afterwards, because cwd points to the correct filesystem.  (In 
> fact, for a moment there between the mount move and the chroot it's the 
> _only_ reference we have to this filesystem.)
> 
> Perhaps ".." isn't correct unless we chdir again...?

Indeed. The issue ends up being ".." and "getcwd()", which both want to 
know what your root is in order to know where to stop.

		Linus

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16 13:59                   ` Shaya Potter
@ 2005-11-16 16:35                     ` Miklos Szeredi
  2005-11-16 20:05                     ` Al Boldi
  1 sibling, 0 replies; 26+ messages in thread
From: Miklos Szeredi @ 2005-11-16 16:35 UTC (permalink / raw)
  To: spotter; +Cc: a1426z, torvalds, linuxram, viro, linux-kernel, linux-fsdevel,
	rob

> > > Shouldn't chroot jail you?
> > 
> > No, chroot should just change the root.
> > 
> > If you don't want to be able to get back the old root, just close all
> > file descriptors _in addition_ to chroot() and chdir().
> 
> hah.  As long as you're running as root, chroot() again to a directory
> below you, and you effectively broken the chroot and can make a relative
> path to the old root. :)

... AND do 'setuid(nonroot)'.  Root can get out of a chroot() jail in
much more interesting ways.

Miklos

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16 13:59                   ` Shaya Potter
  2005-11-16 16:35                     ` Miklos Szeredi
@ 2005-11-16 20:05                     ` Al Boldi
  2005-11-16 20:21                       ` Shaya Potter
  1 sibling, 1 reply; 26+ messages in thread
From: Al Boldi @ 2005-11-16 20:05 UTC (permalink / raw)
  To: Shaya Potter
  Cc: torvalds, linuxram, viro, linux-kernel, linux-fsdevel, rob,
	Miklos Szeredi

Shaya Potter wrote:
> I created a patch years ago that creates a chain of "chroot" points, and
> any past chroot point would be considered a place that follow_dotdot
> would consider a root.  There didn't seem much interest in the patch
> though.

Could you resubmit this patch for possible inclusion in 2.6.16, and make it a 
runtime option such that the default behaviour remains unchanged?

Thanks!!!

--
Al

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 12/18] shared mount handling: bind and rbind
  2005-11-16 20:05                     ` Al Boldi
@ 2005-11-16 20:21                       ` Shaya Potter
  0 siblings, 0 replies; 26+ messages in thread
From: Shaya Potter @ 2005-11-16 20:21 UTC (permalink / raw)
  To: Al Boldi
  Cc: torvalds, linuxram, viro, linux-kernel, linux-fsdevel, rob,
	Miklos Szeredi

On Wed, 2005-11-16 at 23:05 +0300, Al Boldi wrote:
> Shaya Potter wrote:
> > I created a patch years ago that creates a chain of "chroot" points, and
> > any past chroot point would be considered a place that follow_dotdot
> > would consider a root.  There didn't seem much interest in the patch
> > though.
> 
> Could you resubmit this patch for possible inclusion in 2.6.16, and make it a 
> runtime option such that the default behaviour remains unchanged?

resubmit?  It was a long long time ago (middle of 2.4 era)  I'd have to
find it, who knows if its still around.

Conceptually it was pretty simple, basically instead of just overwriting
the fs struct root/rootmnt you create a linked list of them (appending
on every chroot).  I think I might have had a seperate struct to
maintain that linked list (it wasn't the best code in the world).

follow_dotdot is then modified so that the code

if (nd->dentry == current->fs->root &&
	nd->mnt == current->fs->rootmnt) {
	read_unlock(&current->fs->lock);
	break;
}

instead of being just a single test, loops over every element in the
linked list.  Hence, if you are ever at a "chroot" point, you get a
root, so solves the simple problem of breaking out of a chroot by
calling chroot() again.  In the normal case (no chroot), there should be
no real performance hit, as it would hit the loop once.

However, as others have said, if you can make a device node, then you
can break out of it in other ways.

Now, there are other things one has to take care of.  

1) Obvious: cleaning up the list on process termination.
2) Obvious: propagating list to child processes.

and probably some less obvious things that I don't remember or didn't
even consider at the time.

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2005-11-16 20:21 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-11-08  2:01 [PATCH 12/18] shared mount handling: bind and rbind Al Viro
2005-11-08 14:11 ` Miklos Szeredi
2005-11-08 15:48   ` Ram Pai
2005-11-08 15:55     ` Miklos Szeredi
2005-11-09 18:44       ` Ram Pai
2005-11-09 18:59         ` Linus Torvalds
2005-11-09 19:26           ` Al Viro
2005-11-09 19:28           ` Ram Pai
2005-11-16  3:29           ` Rob Landley
2005-11-16  3:53             ` Linus Torvalds
2005-11-16  5:35               ` Al Boldi
2005-11-16  8:19                 ` Miklos Szeredi
2005-11-16  9:10                   ` Rob Landley
2005-11-16 10:14                     ` Miklos Szeredi
2005-11-16 13:59                   ` Shaya Potter
2005-11-16 16:35                     ` Miklos Szeredi
2005-11-16 20:05                     ` Al Boldi
2005-11-16 20:21                       ` Shaya Potter
2005-11-16  8:47                 ` Rob Landley
2005-11-16  8:41               ` Rob Landley
2005-11-16 16:18                 ` Linus Torvalds
2005-11-09 10:54 ` Miklos Szeredi
2005-11-09 14:31   ` Al Viro
2005-11-09 15:22     ` Miklos Szeredi
2005-11-09 15:56       ` Al Viro
2005-11-09 16:33         ` Miklos Szeredi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).